In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score
import matplotlib.pyplot as plt
%matplotlib inline



# 1. 训练

 ## 1. 1数据整理
读入数据，观察得到数据没有缺失值。数据主要存在两个问题，第一个就是没有下雨的RAINFOW值为NR，第二个就是存在大量的字符型数字，不利于将来处理，因此首先要解决这两个问题

In [2]:
data = pd.read_csv('train.csv', engine='python')
data = data.iloc[:, range(1, data.shape[1])]
test = pd.read_csv('test.csv', header=None)

In [3]:
data.head()

Unnamed: 0,factor,0,1,2,3,4,5,6,7,8,...,14,15,16,17,18,19,20,21,22,23
0,AMB_TEMP,14.0,14.0,14.0,13.0,12.0,12.0,12.0,12.0,15.0,...,22.0,22.0,21.0,19.0,17.0,16.0,15.0,15.0,15.0,15.0
1,CH4,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,...,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8
2,CO,0.51,0.41,0.39,0.37,0.35,0.3,0.37,0.47,0.78,...,0.37,0.37,0.47,0.69,0.56,0.45,0.38,0.35,0.36,0.32
3,NMHC,0.2,0.15,0.13,0.12,0.11,0.06,0.1,0.13,0.26,...,0.1,0.13,0.14,0.23,0.18,0.12,0.1,0.09,0.1,0.08
4,NO,0.9,0.6,0.5,1.7,1.8,1.5,1.9,2.2,6.6,...,2.5,2.2,2.5,2.3,2.1,1.9,1.5,1.6,1.8,1.5


In [4]:
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,id_0,AMB_TEMP,15.0,14.0,14.0,13.0,13.0,13.0,13.0,13.0,12.0
1,id_0,CH4,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8
2,id_0,CO,0.36,0.35,0.34,0.33,0.33,0.34,0.34,0.37,0.42
3,id_0,NMHC,0.11,0.09,0.09,0.1,0.1,0.1,0.1,0.11,0.12
4,id_0,NO,0.6,0.4,0.3,0.3,0.3,0.7,0.8,0.8,0.9


`step1` 首先将所有RAINFALL中的NR值替换成0， 这里也可以考虑替换成-1， 最后可以修改下测试效果

In [5]:
NR = -1
test = test.replace('NR', NR)
data = data.replace('NR', NR)

`step2` 接下来将所有字符串型数字的格式改成浮点数，方便后面的操作

In [6]:
for i in np.arange(0, 24):
    str_i = str(i)
    data[str_i] = data[str_i].astype('float')

for i in np.arange(2, 11):
    test[i] = test[i].astype('float')

In [7]:
data.head()

Unnamed: 0,factor,0,1,2,3,4,5,6,7,8,...,14,15,16,17,18,19,20,21,22,23
0,AMB_TEMP,14.0,14.0,14.0,13.0,12.0,12.0,12.0,12.0,15.0,...,22.0,22.0,21.0,19.0,17.0,16.0,15.0,15.0,15.0,15.0
1,CH4,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,...,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8
2,CO,0.51,0.41,0.39,0.37,0.35,0.3,0.37,0.47,0.78,...,0.37,0.37,0.47,0.69,0.56,0.45,0.38,0.35,0.36,0.32
3,NMHC,0.2,0.15,0.13,0.12,0.11,0.06,0.1,0.13,0.26,...,0.1,0.13,0.14,0.23,0.18,0.12,0.1,0.09,0.1,0.08
4,NO,0.9,0.6,0.5,1.7,1.8,1.5,1.9,2.2,6.6,...,2.5,2.2,2.5,2.3,2.1,1.9,1.5,1.6,1.8,1.5


In [8]:
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,id_0,AMB_TEMP,15.0,14.0,14.0,13.0,13.0,13.0,13.0,13.0,12.0
1,id_0,CH4,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8
2,id_0,CO,0.36,0.35,0.34,0.33,0.33,0.34,0.34,0.37,0.42
3,id_0,NMHC,0.11,0.09,0.09,0.1,0.1,0.1,0.1,0.11,0.12
4,id_0,NO,0.6,0.4,0.3,0.3,0.3,0.7,0.8,0.8,0.9


## 1.2 相关系数
将数据堆积成列， 计算其相关系数， 只取相关系数高一点的前几项

In [9]:
words = np.array(data.factor)[:18]
arrange_data = pd.DataFrame(columns=words)

In [10]:
for i in np.arange(int((data.shape[0]) / 18)):
    select = data.iloc[range(i*18, i*18 + 18), :]
    value = (select.iloc[:,1:]).T
    value.columns = words
    arrange_data = arrange_data.append(value)

In [11]:
corr = abs(arrange_data.corr()['PM2.5']).sort_values()
print(corr)
##这里先考虑相关系数d大于0.3
corr_list = list(corr[corr > 0.2].index)

AMB_TEMP      0.017127
NO            0.029970
WS_HR         0.045458
RAINFALL      0.070448
WIND_SPEED    0.084703
WIND_DIREC    0.156990
WD_HR         0.186138
CH4           0.254657
RH            0.264196
CO            0.283119
NMHC          0.291778
THC           0.352159
O3            0.356670
SO2           0.370831
NOx           0.375564
NO2           0.449113
PM10          0.776426
PM2.5         1.000000
Name: PM2.5, dtype: float64


## 1.3 训练模型
计算X_train, Y_train, 调用sckit_learn进行训练

In [12]:
arrange_data = arrange_data.loc[:, corr_list]

In [13]:
arrange_data.head()

Unnamed: 0,CH4,RH,CO,NMHC,THC,O3,SO2,NOx,NO2,PM10,PM2.5
0,1.8,77.0,0.51,0.2,2.0,16.0,1.8,17.0,16.0,56.0,26.0
1,1.8,68.0,0.41,0.15,2.0,30.0,2.0,9.8,9.2,50.0,39.0
2,1.8,67.0,0.39,0.13,2.0,27.0,1.7,8.7,8.2,48.0,36.0
3,1.8,74.0,0.37,0.12,1.9,23.0,1.6,8.6,6.9,35.0,35.0
4,1.8,72.0,0.35,0.11,1.9,24.0,1.9,8.5,6.8,25.0,31.0


In [14]:
X_train = []
Y_train = []

In [15]:
for i in np.arange(int((arrange_data.shape[0]) / 24 / 20)):
    current_month = arrange_data.iloc[np.arange(i * 24 * 20, (i + 1)*24*20), :]
    for j in np.arange(len(current_month) - 9):
        x = current_month.iloc[range(j, j + 9), :]
        y = current_month.iloc[j+9, :]
        X_train.append(list(np.array(x).flatten()))
        Y_train.append([y['PM2.5']])

In [26]:
model = LinearRegression(normalize=True, fit_intercept=True)
np.average(cross_val_score(model, X_train, Y_train, cv=10))

0.8258906192238383

In [27]:
model.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

整理测试数据集

# 2.测试

## 2.1 整理test_data 

In [28]:
arrange_test_data = pd.DataFrame(columns=words)

for i in np.arange(int((test.shape[0]) / 18)):
    select = test.iloc[range(i*18, i*18 + 18), :]
    value = (select.iloc[:,2:]).T
    value.columns = words
    arrange_test_data = arrange_test_data.append(value)
    
arrange_test_data = arrange_test_data.loc[:, corr_list]

In [29]:
arrange_test_data.head()

Unnamed: 0,CH4,RH,CO,NMHC,THC,O3,SO2,NOx,NO2,PM10,PM2.5
2,1.8,75.0,0.36,0.11,1.9,36.0,1.2,9.9,9.3,51.0,27.0
3,1.8,71.0,0.35,0.09,1.8,44.0,1.2,7.5,7.1,51.0,13.0
4,1.8,71.0,0.34,0.09,1.8,45.0,1.2,6.4,6.1,31.0,24.0
5,1.8,73.0,0.33,0.1,1.9,44.0,1.6,5.9,5.7,40.0,29.0
6,1.8,74.0,0.33,0.1,1.9,44.0,1.5,5.8,5.5,34.0,41.0


## 2.2 计算Yhat 

In [30]:
X_test = []
for i in np.arange(int((arrange_test_data.shape[0]) / 9)):
    current_day = arrange_test_data.iloc[np.arange(i * 9, (i + 1)* 9), :]
    X_test.append(list(np.array(current_day).flatten()))
    
Yhat = (model.predict(X_test)).flatten()
real=pd.read_csv('https://ntumlta.github.io/2017fall-ml-hw1/ans.csv')
Y = np.array(real.value)

In [33]:
abs(Yhat - Y).sum() / len(Y)

4.896611907713555