# **链家二手房数据分析**
### 读取并显示数据
*构建read_data函数读取csv文件，并返回以列表形式存储的header以及data*

In [195]:
def read_data(fname):
    data=[]
    with open(fname,encoding='utf-8') as csvfile:
        for line in csvfile.readlines():
            if line.strip() != '':
                data.append(line.strip().split(','))
    return data[0],data[1:]

*读取链家二手房数据，并打印header和data中的前10组数据*

In [196]:
fname='lianjia.csv'
header,data=read_data(fname)
print(header,'\n',data[:10])

['Direction', 'District', 'Elevator', 'Floor', 'Garden', 'Id', 'Layout', 'Price', 'Region', 'Renovation', 'Size', 'Year'] 
 [['东西', '灯市口', '', '6', '锡拉胡同21号院', '101102647043', '3室1厅', '780', '东城', '精装', '75.0', '1988'], ['南北', '东单', '无电梯', '6', '东华门大街', '101102650978', '2室1厅', '705', '东城', '精装', '60.0', '1988'], ['南西', '崇文门', '有电梯', '16', '新世界中心', '101102672743', '3室1厅', '1400', '东城', '其他', '210.0', '1996'], ['南', '崇文门', '', '7', '兴隆都市馨园', '101102577410', '1室1厅', '420', '东城', '精装', '39.0', '2004'], ['南', '陶然亭', '有电梯', '19', '中海紫御公馆', '101102574696', '2室2厅', '998', '东城', '精装', '90.0', '2010'], ['南北', '广渠门', '有电梯', '18', '幸福家园二期', '101102407993', '2室1厅', '1180', '东城', '其他', '111.0', '2003'], ['南', '西罗园', '无电梯', '6', '西革新里110号院', '101102629841', '1室1厅', '319', '东城', '其他', '42.0', '1992'], ['南', '西罗园', '有电梯', '16', '建予园', '101102378003', '2室1厅', '640', '东城', '其他', '105.0', '1999'], ['南北', '东花市', '有电梯', '9', '富贵园一区', '101102345859', '3室2厅', '1780', '东城', '精装', '161.0', '2003'], ['东北', '东直门'

### 对数据进行描述性统计分析
*定义计算均值的函数*

In [197]:
def mean(lst):
    return sum(lst)/len(lst)

*定义计算方差的函数*

In [198]:
def var(lst):
    lst1=[(i-mean(lst))**2 for i in lst]
    return sum(lst1)/(len(lst1)-1)

*定义计算中位数的函数*

In [199]:
def median(lst):
    lst1=sorted(lst)
    return (lst1[len(lst1)//2-1]+lst1[len(lst1)//2]) if len(lst1)%2==0 else lst1[(len(lst1)+1)//2-1]

*定义计算四分位数的函数*

In [200]:
def quartile_down(lst):
    lst1=sorted(lst)
    quartile_down=len(lst1)/4
    return lst1[int(quartile_down)-1]+(lst1[int(quartile_down)]-lst1[int(quartile_down)-1])*(quartile_down-int(quartile_down))
def quartile_up(lst):
    lst1=sorted(lst)
    quartile_up=len(lst1)*3/4
    return lst1[int(quartile_up)-1]+(lst1[int(quartile_up)]-lst1[int(quartile_up)-1])*(quartile_up-int(quartile_up))

*定义计算众数的函数*

In [201]:
def mode(lst):
    counts=dict()
    for i in lst:
        counts[str(i)]=counts.get(str(i),0)+1
    mode=None
    max=None
    for key,val in counts.items():
        if max is None or val > max:
            mode=float(key)
            max=val
    return mode     

*定义计算偏度的函数*

In [202]:
def skew(lst):
    lst1=[(i-mean(lst))**3 for i in lst]
    return sum(lst1)/(var(lst)**1.5)

*定义计算峰度的函数*

In [203]:
def kurt(lst):
    lst1=[(i-mean(lst))**4 for i in lst]
    return sum(lst1)/(var(lst)**2)

*分别对Price和Size进行描述性统计分析*

In [204]:
price_index=header.index('Price')
size_index=header.index('Size')
price_lst=[float(i[price_index]) for i in data]
size_lst=[float(i[size_index]) for i in data]
print("pirce:\nmean:{}, var:{}, max:{}, min:{}, median:{}, quartile_down:{}, quartile_up:{}, mode:{}, skew:{}, kurt:{}".format(mean(price_lst),var(price_lst),max(price_lst),min(price_lst),median(price_lst),quartile_down(price_lst),quartile_up(price_lst),mode(price_lst),skew(price_lst),kurt(price_lst)))
print("size:\nmean:{}, var:{}, max:{}, min:{}, median:{}, quartile_down:{}, quartile_up:{}, mode:{}, skew:{}, kurt:{}".format(mean(size_lst),var(size_lst),max(size_lst),min(size_lst),median(size_lst),quartile_down(size_lst),quartile_up(size_lst),mode(size_lst),skew(size_lst),kurt(size_lst)))

pirce:
mean:610.6683194661488, var:169292.83602578138, max:6000.0, min:60.0, median:499.0, quartile_down:365.0, quartile_up:717.0, mode:450.0, skew:72258.77241759041, kurt:443855.8043497853
size:
mean:99.14930100941842, var:2599.861599127047, max:1019.0, min:2.0, median:88.0, quartile_down:66.0, quartile_up:118.0, mode:89.0, skew:70609.84530996392, kurt:555323.8991955152


### 利用前500组数据组成样本数据建立Unit Price与装修情况、有无电梯的回归模型
*定义计算斜率的函数*

In [205]:
def slope(y,x1,x2):
    a=sum([y[i]*x1[i] for i in range(len(y))])
    b=sum([y[i]*x2[i] for i in range(len(y))])
    c=sum([x1[i]**2 for i in range(len(y))])
    d=sum([x2[i]**2 for i in range(len(y))])
    e=sum([x1[i]*x2[i] for i in range(len(y))])
    return (a*d-b*e)/(c*d-e**2),(b*c-a*e)/(c*d-e**2)

*定义计算截距和拟合优度R²的函数*

In [206]:
def intercept(y,x1,x2):
    a,b=slope(y,x1,x2)
    return mean(y)-a*mean(x1)+b*mean(x2)

*量化Renovation以及Elevator的元素*

In [207]:
unit_price_lst_s=[price_lst[i]/size_lst[i] for i in range(500)]
reno_lst_s=[i[header.index('Renovation')] for i in data][:500]
count=dict()
for value in reno_lst_s:
    count[value]=count.get(value,0)+1
print(count)
reno_lst_s=[0 if i=='其他' else i for i in reno_lst_s]
reno_lst_s=[1 if i=='毛坯' else i for i in reno_lst_s]
reno_lst_s=[2 if i=='简装' else i for i in reno_lst_s]
reno_lst_s=[3 if i=='精装' else i for i in reno_lst_s]
print(reno_lst_s[:50])
eleva_lst_s=[i[header.index('Elevator')] for i in data][:500]
count=dict()
for value in eleva_lst_s:
    count[value]=count.get(value,0)+1
print(count)
eleva_lst_s=[0 if i=='无电梯' or i=='' else i for i in eleva_lst_s]
eleva_lst_s=[1 if i=='有电梯' else i for i in eleva_lst_s]
print(eleva_lst_s[:50])

{'精装': 219, '其他': 122, '简装': 135, '毛坯': 24}
[3, 3, 0, 3, 3, 0, 0, 0, 3, 3, 3, 0, 3, 3, 0, 2, 2, 3, 0, 3, 2, 3, 0, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 2, 3, 3, 0, 3, 3, 3, 3, 2, 2, 3, 3]
{'': 168, '无电梯': 166, '有电梯': 166}
[0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1]


*计算样本回归模型*

In [208]:
beta0=intercept(unit_price_lst_s,reno_lst_s,eleva_lst_s)
beta1,beta2=slope(unit_price_lst_s,reno_lst_s,eleva_lst_s)
print("unit_price={:.4f}+{:.4f}*renovation+{:.4f}*elevator".format(beta0,beta1,beta2))

unit_price=2.7532+1.4860*renovation+2.4967*elevator


### Size数据可视化
*统计前500组数据组成的样本的Size的区间分布*

In [209]:
size_lst_s=size_lst[:500]
section=dict()
n=100
while n <= (max(size_lst_s)//100+1)*100:
    for i in size_lst_s:
        if n-100 < i <=n:
            section[str(n-100)+'-'+str(n)]=section.get(str(n-100)+'-'+str(n),0)+1
    n+=100
print(section)
sections=list(section.values())
print(sections)

{'0-100': 311, '100-200': 165, '200-300': 12, '300-400': 7, '400-500': 2, '500-600': 2, '600-700': 1}
[311, 165, 12, 7, 2, 2, 1]


*制作坐标轴*

In [210]:
n=100
hist=[[]]
hist[0].append('0')
while n <= (max(size_lst_s)//100+1)*100:
    hist[0].append('-')
    hist[0].append(str(n))
    n+=100
hist[0].append('\n')
print(hist)

[['0', '-', '100', '-', '200', '-', '300', '-', '400', '-', '500', '-', '600', '-', '700', '\n']]


*绘制直方图框架*

In [211]:
for i in range(max(sections)):
    hist.append([' ',' ','   ',' ','   ',' ','   ',' ','   ',' ','   ',' ','   ',' ','   '])
print(hist)

[['0', '-', '100', '-', '200', '-', '300', '-', '400', '-', '500', '-', '600', '-', '700', '\n'], [' ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   '], [' ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   '], [' ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   '], [' ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   '], [' ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   '], [' ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   '], [' ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   '], [' ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   '], [' ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   '], [' ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', '   ', ' ', 

*绘制直方图*

In [212]:
for i in range(1,len(hist)):
    for j in range(len(sections)):
        if j==0:
            if i <= sections[j]:
                hist[i][0]='*'
                hist[i][1]='*'
                if sections[j] >= sections[j+1]:
                    hist[i][2]='***'
        else:
            if i <= sections[j]:
                if sections[j] > sections[j-1]:
                    hist[i][0+j*2]="***"
                hist[i][1+j*2]="*"
                if j != len(sections)-1:
                    if sections[j] >= sections[j+1]:
                        hist[i][2+j*2]='***'
                else:
                    hist[i][2+j*2]='***'
    hist[i].append('\n')
hist_str=''
for i in range(len(hist),0,-1):
    for j in hist[i-1]:
        hist_str+=j
print(hist_str)

*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****                        
*****     