In [1]:
%autosave 5

Autosaving every 5 seconds


In [2]:
from sklearn.model_selection import train_test_split, KFold
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder




In [3]:
def normalize(df_in, col_name):
    max_value = df_in[col_name].max()
    min_value = df_in[col_name].min()
    df_out = df_in.copy()
    df_out[col_name] = (df_in[col_name] - min_value) / (max_value - min_value)
    return df_out

In [4]:
df = pd.read_csv("Metro_Interstate_Traffic_Volume.csv")

In [5]:
df.head()



Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918


In [6]:
df.shape

(48204, 9)

In [7]:
df['traffic_volume'].max()

7280

In [8]:
df['traffic_volume'].min()

0

In [9]:
df.columns

Index(['holiday', 'temp', 'rain_1h', 'snow_1h', 'clouds_all', 'weather_main',
       'weather_description', 'date_time', 'traffic_volume'],
      dtype='object')

In [10]:
df[df.duplicated(keep=False)].count()


holiday                34
temp                   34
rain_1h                34
snow_1h                34
clouds_all             34
weather_main           34
weather_description    34
date_time              34
traffic_volume         34
dtype: int64

In [11]:
df.drop_duplicates(keep = 'first', inplace = True)


In [12]:
pd.set_option('display.max_rows', None)
df.isnull().sum()

holiday                0
temp                   0
rain_1h                0
snow_1h                0
clouds_all             0
weather_main           0
weather_description    0
date_time              0
traffic_volume         0
dtype: int64

In [13]:
print(df['holiday'].unique())
print(df['weather_main'].unique())
print(df['weather_description'].unique())

['None' 'Columbus Day' 'Veterans Day' 'Thanksgiving Day' 'Christmas Day'
 'New Years Day' 'Washingtons Birthday' 'Memorial Day' 'Independence Day'
 'State Fair' 'Labor Day' 'Martin Luther King Jr Day']
['Clouds' 'Clear' 'Rain' 'Drizzle' 'Mist' 'Haze' 'Fog' 'Thunderstorm'
 'Snow' 'Squall' 'Smoke']
['scattered clouds' 'broken clouds' 'overcast clouds' 'sky is clear'
 'few clouds' 'light rain' 'light intensity drizzle' 'mist' 'haze' 'fog'
 'proximity shower rain' 'drizzle' 'moderate rain' 'heavy intensity rain'
 'proximity thunderstorm' 'thunderstorm with light rain'
 'proximity thunderstorm with rain' 'heavy snow' 'heavy intensity drizzle'
 'snow' 'thunderstorm with heavy rain' 'freezing rain' 'shower snow'
 'light rain and snow' 'light intensity shower rain' 'SQUALLS'
 'thunderstorm with rain' 'proximity thunderstorm with drizzle'
 'thunderstorm' 'Sky is Clear' 'very heavy rain'
 'thunderstorm with light drizzle' 'light snow'
 'thunderstorm with drizzle' 'smoke' 'shower drizzle' 'light 

In [14]:
df['holiday'] = LabelEncoder().fit_transform(df['holiday'])
df['weather_main'] = LabelEncoder().fit_transform(df['weather_main'])
df['weather_description'] = LabelEncoder().fit_transform(df['weather_description'])

In [15]:
df.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,7,288.28,0.0,0.0,40,1,24,2012-10-02 09:00:00,5545
1,7,289.36,0.0,0.0,75,1,2,2012-10-02 10:00:00,4516
2,7,289.58,0.0,0.0,90,1,19,2012-10-02 11:00:00,4767
3,7,290.13,0.0,0.0,90,1,19,2012-10-02 12:00:00,5026
4,7,291.14,0.0,0.0,75,1,2,2012-10-02 13:00:00,4918


In [16]:
data = df

In [17]:
data['date_time'] = pd.to_datetime(data['date_time'])
data['date'] = data['date_time'].dt.date
data['hour'] = data['date_time'].dt.hour
data['weekday'] = data['date_time'].dt.weekday


data = data.drop(columns=['date_time'])

data.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,traffic_volume,date,hour,weekday
0,7,288.28,0.0,0.0,40,1,24,5545,2012-10-02,9,1
1,7,289.36,0.0,0.0,75,1,2,4516,2012-10-02,10,1
2,7,289.58,0.0,0.0,90,1,19,4767,2012-10-02,11,1
3,7,290.13,0.0,0.0,90,1,19,5026,2012-10-02,12,1
4,7,291.14,0.0,0.0,75,1,2,4918,2012-10-02,13,1


In [18]:
correl = data.corr()
correl.style.background_gradient(cmap = 'coolwarm')

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,traffic_volume,hour,weekday
holiday,1.0,-0.000473,6.6e-05,0.000432,0.007074,-0.004316,-0.002713,0.018677,0.026043,0.021454
temp,-0.000473,1.0,0.00907,-0.019756,-0.101968,-0.033434,-0.049437,0.130161,0.112048,-0.007824
rain_1h,6.6e-05,0.00907,1.0,-9e-05,0.004818,0.009537,0.010777,0.004715,0.003422,-0.00692
snow_1h,0.000432,-0.019756,-9e-05,1.0,0.027934,0.036642,0.005104,0.000736,0.009852,-0.014929
clouds_all,0.007074,-0.101968,0.004818,0.027934,1.0,0.50082,-0.341822,0.067138,0.054522,-0.039816
weather_main,-0.004316,-0.033434,0.009537,0.036642,0.50082,1.0,-0.127635,-0.040149,-0.053599,-0.038731
weather_description,-0.002713,-0.049437,0.010777,0.005104,-0.341822,-0.127635,1.0,-0.067533,-0.019611,0.031962
traffic_volume,0.018677,0.130161,0.004715,0.000736,0.067138,-0.040149,-0.067533,1.0,0.3523,-0.149551
hour,0.026043,0.112048,0.003422,0.009852,0.054522,-0.053599,-0.019611,0.3523,1.0,-0.003808
weekday,0.021454,-0.007824,-0.00692,-0.014929,-0.039816,-0.038731,0.031962,-0.149551,-0.003808,1.0


# primeira abordagem


In [20]:
data2 = df

In [21]:
data2['date_time'] = pd.to_datetime(data2['date_time'])
data2['year'] = data2['date_time'].dt.year
data2['month'] = data2['date_time'].dt.month
data2['day'] = data2['date_time'].dt.day
data2['hour'] = data2['date_time'].dt.hour
data2['weekday'] = data2['date_time'].dt.weekday


data2 = data2.drop(columns=['date_time'])

data2.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,traffic_volume,date,hour,weekday,year,month,day
0,7,288.28,0.0,0.0,40,1,24,5545,2012-10-02,9,1,2012,10,2
1,7,289.36,0.0,0.0,75,1,2,4516,2012-10-02,10,1,2012,10,2
2,7,289.58,0.0,0.0,90,1,19,4767,2012-10-02,11,1,2012,10,2
3,7,290.13,0.0,0.0,90,1,19,5026,2012-10-02,12,1,2012,10,2
4,7,291.14,0.0,0.0,75,1,2,4918,2012-10-02,13,1,2012,10,2


In [22]:
    data2 = normalize(data2, 'temp')
    data2 = normalize(data2, 'rain_1h')
    data = normalize(data2, 'snow_1h')
    data2 = normalize(data2, 'clouds_all')
    #data2 = normalize(data2, 'traffic_volume')
    data2 = normalize(data2, 'hour')
    data2 = normalize(data2, 'weekday')
    data2 = normalize(data2, 'holiday')
    data2 = normalize(data2, 'date')
    data2 = normalize(data2, 'year')
    data2 = normalize(data2, 'month')
    data2 = normalize(data2, 'day')



        
    

In [23]:
data2.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,traffic_volume,date,hour,weekday,year,month,day
0,0.636364,0.929726,0.0,0.0,0.4,1,24,5545,0.0,0.391304,0.166667,0.0,0.818182,0.033333
1,0.636364,0.933209,0.0,0.0,0.75,1,2,4516,0.0,0.434783,0.166667,0.0,0.818182,0.033333
2,0.636364,0.933918,0.0,0.0,0.9,1,19,4767,0.0,0.478261,0.166667,0.0,0.818182,0.033333
3,0.636364,0.935692,0.0,0.0,0.9,1,19,5026,0.0,0.521739,0.166667,0.0,0.818182,0.033333
4,0.636364,0.938949,0.0,0.0,0.75,1,2,4918,0.0,0.565217,0.166667,0.0,0.818182,0.033333


In [24]:
correl = data2.corr()
correl.style.background_gradient(cmap = 'coolwarm')

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,traffic_volume,date,hour,weekday,year,month,day
holiday,1.0,-0.000473,6.6e-05,0.000432,0.007074,-0.004316,-0.002713,0.018677,-0.002872,0.026043,0.021454,-0.001426,-0.01011,0.007694
temp,-0.000473,1.0,0.00907,-0.019756,-0.101968,-0.033434,-0.049437,0.130161,0.170399,0.112048,-0.007824,0.134916,0.223943,0.022677
rain_1h,6.6e-05,0.00907,1.0,-9e-05,0.004818,0.009537,0.010777,0.004715,0.000613,0.003422,-0.00692,0.000443,0.0013,-0.002293
snow_1h,0.000432,-0.019756,-9e-05,1.0,0.027934,0.036642,0.005104,0.000736,-0.000268,0.009852,-0.014929,-0.003514,0.020422,0.015798
clouds_all,0.007074,-0.101968,0.004818,0.027934,1.0,0.50082,-0.341822,0.067138,-0.074492,0.054522,-0.039816,-0.072855,-0.009118,0.048425
weather_main,-0.004316,-0.033434,0.009537,0.036642,0.50082,1.0,-0.127635,-0.040149,0.041203,-0.053599,-0.038731,0.036449,0.026351,0.023058
weather_description,-0.002713,-0.049437,0.010777,0.005104,-0.341822,-0.127635,1.0,-0.067533,0.114941,-0.019611,0.031962,0.119352,-0.035962,-0.032775
traffic_volume,0.018677,0.130161,0.004715,0.000736,0.067138,-0.040149,-0.067533,1.0,0.004249,0.3523,-0.149551,0.004697,-0.00248,-0.00776
date,-0.002872,0.170399,0.000613,-0.000268,-0.074492,0.041203,0.114941,0.004249,1.0,-0.007499,-0.010818,0.988639,-0.00917,0.026308
hour,0.026043,0.112048,0.003422,0.009852,0.054522,-0.053599,-0.019611,0.3523,-0.007499,1.0,-0.003808,-0.007561,0.001844,-0.009531


# oficial start

In [25]:
scaler = preprocessing.StandardScaler()
data3 = data2
data3.head()



Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,traffic_volume,date,hour,weekday,year,month,day
0,0.636364,0.929726,0.0,0.0,0.4,1,24,5545,0.0,0.391304,0.166667,0.0,0.818182,0.033333
1,0.636364,0.933209,0.0,0.0,0.75,1,2,4516,0.0,0.434783,0.166667,0.0,0.818182,0.033333
2,0.636364,0.933918,0.0,0.0,0.9,1,19,4767,0.0,0.478261,0.166667,0.0,0.818182,0.033333
3,0.636364,0.935692,0.0,0.0,0.9,1,19,5026,0.0,0.521739,0.166667,0.0,0.818182,0.033333
4,0.636364,0.938949,0.0,0.0,0.75,1,2,4918,0.0,0.565217,0.166667,0.0,0.818182,0.033333


In [26]:
correl = data3.corr()
correl.style.background_gradient(cmap = 'coolwarm')

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,traffic_volume,date,hour,weekday,year,month,day
holiday,1.0,-0.000473,6.6e-05,0.000432,0.007074,-0.004316,-0.002713,0.018677,-0.002872,0.026043,0.021454,-0.001426,-0.01011,0.007694
temp,-0.000473,1.0,0.00907,-0.019756,-0.101968,-0.033434,-0.049437,0.130161,0.170399,0.112048,-0.007824,0.134916,0.223943,0.022677
rain_1h,6.6e-05,0.00907,1.0,-9e-05,0.004818,0.009537,0.010777,0.004715,0.000613,0.003422,-0.00692,0.000443,0.0013,-0.002293
snow_1h,0.000432,-0.019756,-9e-05,1.0,0.027934,0.036642,0.005104,0.000736,-0.000268,0.009852,-0.014929,-0.003514,0.020422,0.015798
clouds_all,0.007074,-0.101968,0.004818,0.027934,1.0,0.50082,-0.341822,0.067138,-0.074492,0.054522,-0.039816,-0.072855,-0.009118,0.048425
weather_main,-0.004316,-0.033434,0.009537,0.036642,0.50082,1.0,-0.127635,-0.040149,0.041203,-0.053599,-0.038731,0.036449,0.026351,0.023058
weather_description,-0.002713,-0.049437,0.010777,0.005104,-0.341822,-0.127635,1.0,-0.067533,0.114941,-0.019611,0.031962,0.119352,-0.035962,-0.032775
traffic_volume,0.018677,0.130161,0.004715,0.000736,0.067138,-0.040149,-0.067533,1.0,0.004249,0.3523,-0.149551,0.004697,-0.00248,-0.00776
date,-0.002872,0.170399,0.000613,-0.000268,-0.074492,0.041203,0.114941,0.004249,1.0,-0.007499,-0.010818,0.988639,-0.00917,0.026308
hour,0.026043,0.112048,0.003422,0.009852,0.054522,-0.053599,-0.019611,0.3523,-0.007499,1.0,-0.003808,-0.007561,0.001844,-0.009531


In [27]:
data3.shape

(48187, 14)

In [28]:


data3 = data3.drop(columns=['day'])
data3 = data3.drop(columns=['month'])
data3 = data3.drop(columns=['year'])
data3 = data3.drop(columns=['snow_1h'])
data3 = data3.drop(columns=['rain_1h'])



In [29]:
data3.head()

Unnamed: 0,holiday,temp,clouds_all,weather_main,weather_description,traffic_volume,date,hour,weekday
0,0.636364,0.929726,0.4,1,24,5545,0.0,0.391304,0.166667
1,0.636364,0.933209,0.75,1,2,4516,0.0,0.434783,0.166667
2,0.636364,0.933918,0.9,1,19,4767,0.0,0.478261,0.166667
3,0.636364,0.935692,0.9,1,19,5026,0.0,0.521739,0.166667
4,0.636364,0.938949,0.75,1,2,4918,0.0,0.565217,0.166667


In [30]:
correl = data3.corr()
correl.style.background_gradient(cmap = 'coolwarm')

Unnamed: 0,holiday,temp,clouds_all,weather_main,weather_description,traffic_volume,date,hour,weekday
holiday,1.0,-0.000473,0.007074,-0.004316,-0.002713,0.018677,-0.002872,0.026043,0.021454
temp,-0.000473,1.0,-0.101968,-0.033434,-0.049437,0.130161,0.170399,0.112048,-0.007824
clouds_all,0.007074,-0.101968,1.0,0.50082,-0.341822,0.067138,-0.074492,0.054522,-0.039816
weather_main,-0.004316,-0.033434,0.50082,1.0,-0.127635,-0.040149,0.041203,-0.053599,-0.038731
weather_description,-0.002713,-0.049437,-0.341822,-0.127635,1.0,-0.067533,0.114941,-0.019611,0.031962
traffic_volume,0.018677,0.130161,0.067138,-0.040149,-0.067533,1.0,0.004249,0.3523,-0.149551
date,-0.002872,0.170399,-0.074492,0.041203,0.114941,0.004249,1.0,-0.007499,-0.010818
hour,0.026043,0.112048,0.054522,-0.053599,-0.019611,0.3523,-0.007499,1.0,-0.003808
weekday,0.021454,-0.007824,-0.039816,-0.038731,0.031962,-0.149551,-0.010818,-0.003808,1.0


In [31]:
apend = data3['traffic_volume']
data3 = data3.drop(columns=['traffic_volume'])
data3['traffic'] = apend.values


In [32]:
data3 = data3.drop(columns=['date'])


In [33]:
data3.head()



Unnamed: 0,holiday,temp,clouds_all,weather_main,weather_description,hour,weekday,traffic
0,0.636364,0.929726,0.4,1,24,0.391304,0.166667,5545
1,0.636364,0.933209,0.75,1,2,0.434783,0.166667,4516
2,0.636364,0.933918,0.9,1,19,0.478261,0.166667,4767
3,0.636364,0.935692,0.9,1,19,0.521739,0.166667,5026
4,0.636364,0.938949,0.75,1,2,0.565217,0.166667,4918


In [34]:
print (data3.shape)

(48187, 8)


In [35]:
data3 = normalize(data3, 'weather_main')
data3 = normalize(data3, 'weather_description')


In [36]:
data3.head()


Unnamed: 0,holiday,temp,clouds_all,weather_main,weather_description,hour,weekday,traffic
0,0.636364,0.929726,0.4,0.1,0.648649,0.391304,0.166667,5545
1,0.636364,0.933209,0.75,0.1,0.054054,0.434783,0.166667,4516
2,0.636364,0.933918,0.9,0.1,0.513514,0.478261,0.166667,4767
3,0.636364,0.935692,0.9,0.1,0.513514,0.521739,0.166667,5026
4,0.636364,0.938949,0.75,0.1,0.054054,0.565217,0.166667,4918


In [37]:
correl = data3.corr()
correl.style.background_gradient(cmap = 'coolwarm')

Unnamed: 0,holiday,temp,clouds_all,weather_main,weather_description,hour,weekday,traffic
holiday,1.0,-0.000473,0.007074,-0.004316,-0.002713,0.026043,0.021454,0.018677
temp,-0.000473,1.0,-0.101968,-0.033434,-0.049437,0.112048,-0.007824,0.130161
clouds_all,0.007074,-0.101968,1.0,0.50082,-0.341822,0.054522,-0.039816,0.067138
weather_main,-0.004316,-0.033434,0.50082,1.0,-0.127635,-0.053599,-0.038731,-0.040149
weather_description,-0.002713,-0.049437,-0.341822,-0.127635,1.0,-0.019611,0.031962,-0.067533
hour,0.026043,0.112048,0.054522,-0.053599,-0.019611,1.0,-0.003808,0.3523
weekday,0.021454,-0.007824,-0.039816,-0.038731,0.031962,-0.003808,1.0,-0.149551
traffic,0.018677,0.130161,0.067138,-0.040149,-0.067533,0.3523,-0.149551,1.0


In [38]:
dataNOnorm = data3

In [39]:
data3 = normalize(data3, 'traffic')


# train-test

In [40]:

x = data3.iloc[:, :7]
y = data3['traffic']

In [41]:
x.head()

Unnamed: 0,holiday,temp,clouds_all,weather_main,weather_description,hour,weekday
0,0.636364,0.929726,0.4,0.1,0.648649,0.391304,0.166667
1,0.636364,0.933209,0.75,0.1,0.054054,0.434783,0.166667
2,0.636364,0.933918,0.9,0.1,0.513514,0.478261,0.166667
3,0.636364,0.935692,0.9,0.1,0.513514,0.521739,0.166667
4,0.636364,0.938949,0.75,0.1,0.054054,0.565217,0.166667


In [42]:
y.head()

0    0.761676
1    0.620330
2    0.654808
3    0.690385
4    0.675549
Name: traffic, dtype: float64

In [43]:
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn import metrics as me


In [44]:
from sklearn.model_selection import train_test_split

In [45]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=9, shuffle = True)
x_train2, x_test2, y_train2, y_test2 = train_test_split(x, y, test_size=0.3, random_state=19, shuffle = True)

In [46]:
x_train.shape

(33730, 7)

In [47]:
x_test.shape

(14457, 7)

In [48]:
y_train.shape

(33730,)

In [49]:
y_test.shape

(14457,)

In [85]:
def metrics(target, result, model = "Unnamed"):
    print(model)
    print("MSE: %0.5f" % (me.mean_squared_error(target, result)))
    print("MAD: %0.5f" % (me.mean_absolute_error(target, result)))
    print("R2: %0.5f" % (me.r2_score(target, result)))
    print("")



As métricas de desempenho : o erro médio quadrático (MSE), erro médio absoluto (MAD) e o coeficiente de determinação (R2). 
Para os dois primeiros, valores próximos de 0 indicam um modelo com baixo erro, enquanto para o último um valor igual a 1 indica que o modelo se ajusta perfeitamente aos dados.


In [51]:
classifier = tree.DecisionTreeRegressor(random_state = 42)
classifier.fit(x_train, y_train)
predictions = classifier.predict(x_test)


classifier2 = tree.DecisionTreeRegressor(random_state = 42)
classifier2.fit(x_train2, y_train2)
predictions2 = classifier2.predict(x_test2)


In [52]:
metrics(y_test, predictions, "tree 1")
metrics(y_test2, predictions2, "tree 2")



tree 1
MSE: 0.00754
MAD: 0.04633
R2: 0.89841

tree 2
MSE: 0.00812
MAD: 0.04793
R2: 0.88997



In [53]:
from sklearn import neighbors, svm

In [54]:
x_train3, x_test3, y_train3, y_test3 = train_test_split(x, y, test_size=0.3, random_state=25, shuffle = True)
x_train4, x_test4, y_train4, y_test4 = train_test_split(x, y, test_size=0.3, random_state=123, shuffle = True)

In [55]:
k1 = 3
k2 = 5
classifier3 = neighbors.KNeighborsRegressor(n_neighbors = k1)
classifier3.fit(x_train3, y_train3)
predictions3 = classifier.predict(x_test3)


classifier4 = neighbors.KNeighborsRegressor(n_neighbors = k2)
classifier4.fit(x_train4, y_train4)
predictions4 = classifier2.predict(x_test4)


In [56]:
metrics(y_test3, predictions3, "knn  k = 3")
metrics(y_test4, predictions4, "knn k = 5 ")


knn  k = 3
MSE: 0.00237
MAD: 0.01465
R2: 0.96802

knn k = 5 
MSE: 0.00242
MAD: 0.01443
R2: 0.96758



In [57]:
classifier = svm.SVR(kernel='linear')
classifier.fit(x_train, y_train)
predictions = classifier.predict(x_test)


classifier2 = svm.SVR(kernel='linear')
classifier2.fit(x_train2, y_train2)
predictions2 = classifier2.predict(x_test2)


In [58]:
metrics(y_test, predictions, "SVM 1")
metrics(y_test2, predictions2, "SVM 2")


SVM 1
MSE: 0.06303
MAD: 0.21402
R2: 0.15115

SVM 2
MSE: 0.06407
MAD: 0.21578
R2: 0.13205



# validacao cruzada

In [59]:
from sklearn import model_selection
from sklearn.model_selection import cross_val_score


In [60]:
predicoes = data3;


num = predicoes.select_dtypes(['float64', 'int64']).columns.values
num = np.delete(num, np.argwhere(num == 'traffic'))

scaler = preprocessing.MinMaxScaler()
scaled = pd.concat([pd.DataFrame(scaler.fit_transform(predicoes[num]), columns = num), dataNOnorm['traffic']], axis=1)

scaled.head()

Unnamed: 0,holiday,temp,clouds_all,weather_main,weather_description,hour,weekday,traffic
0,0.636364,0.929726,0.4,0.1,0.648649,0.391304,0.166667,5545.0
1,0.636364,0.933209,0.75,0.1,0.054054,0.434783,0.166667,4516.0
2,0.636364,0.933918,0.9,0.1,0.513514,0.478261,0.166667,4767.0
3,0.636364,0.935692,0.9,0.1,0.513514,0.521739,0.166667,5026.0
4,0.636364,0.938949,0.75,0.1,0.054054,0.565217,0.166667,4918.0


In [61]:
kf = model_selection.KFold(n_splits=10,random_state=42, shuffle=True)

knn = neighbors.KNeighborsRegressor(n_neighbors=3)
tee = tree.DecisionTreeRegressor()

In [63]:
splits = 10
results = []
fold = model_selection.KFold(n_splits = splits, random_state = 7, shuffle = True)

x = dataNOnorm.copy(deep = True)
y = x['traffic']

x.drop(columns = ['traffic'], inplace = True)

x = x.reset_index()
y = y.reset_index()





In [88]:
msetree=[]

for train_index, test_index in kf.split(data3):
    x_train, x_test = x.loc[train_index], x.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    knn.fit(x_train,y_train)
    resultknn = knn.predict(x_test)
    
    tee.fit(x_train,y_train)
    resulttree = tee.predict(x_test)
    
    
    metrics(y_test, resultknn, "knn metrics")
 
    
    print("Validação Cruzada - KNN")
    print("MSE = %0.5f" % (me.mean_squared_error(y_test,resultknn)))
    print("MAE = %0.5f" % (me.mean_absolute_error(y_test,resultknn)))
    print("R² = %0.5f" % (me.r2_score(y_test,resultknn)))


    print("\nValidação Cruzada - Árvore de Decisão")
    print("MSE = % 0.5f" % (me.mean_squared_error(y_test,resulttree)))
    print("MAE = %0.5f" % (me.mean_absolute_error(y_test,resulttree)))
    print("R² = %0.5f" % (me.r2_score(y_test,resulttree)))

    print('\n')
    


    

tree metrics
MSE: 251803.56235
MAD: 232.20914
R2: 0.93672

Validação Cruzada - KNN
MSE = 251803.56235
MAE = 232.20914
R² = 0.93672

Validação Cruzada - Árvore de Decisão
MSE =  72196.13654
MAE = 131.69392
R² = 0.98288


tree metrics
MSE: 234196.18036
MAD: 223.64062
R2: 0.94170

Validação Cruzada - KNN
MSE = 234196.18036
MAE = 223.64062
R² = 0.94170

Validação Cruzada - Árvore de Decisão
MSE =  76003.73428
MAE = 133.94449
R² = 0.98209


tree metrics
MSE: 253672.51326
MAD: 232.46970
R2: 0.93677

Validação Cruzada - KNN
MSE = 253672.51326
MAE = 232.46970
R² = 0.93677

Validação Cruzada - Árvore de Decisão
MSE =  73005.77475
MAE = 131.83804
R² = 0.98294


tree metrics
MSE: 251549.69347
MAD: 234.12101
R2: 0.93678

Validação Cruzada - KNN
MSE = 251549.69347
MAE = 234.12101
R² = 0.93678

Validação Cruzada - Árvore de Decisão
MSE =  79316.00540
MAE = 138.75846
R² = 0.98130


tree metrics
MSE: 233577.94655
MAD: 226.78094
R2: 0.94059

Validação Cruzada - KNN
MSE = 233577.94655
MAE = 226.78094
R²

In [78]:
from sklearn.model_selection import cross_val_score

scores1 = cross_val_score(knn, x, y, cv = fold, scoring= 'neg_mean_squared_error')
scores2 = cross_val_score(tee, x, y, cv=fold, scoring='neg_mean_squared_error' )

scores3 = cross_val_score(knn, x, y, cv=fold, scoring='neg_mean_absolute_error' )
scores4 = cross_val_score(tee, x, y, cv=fold, scoring='neg_mean_absolute_error')


scores5 = cross_val_score(knn, x, y, cv=fold, scoring='r2')
scores6 = cross_val_score(tee, x, y, cv=fold,scoring='r2')









In [None]:
from sklearn import cross_validation model = RandomForestClassifier(n_estimators=100) #Simple K-Fold cross validation. 5 folds. #(Note: in older scikit-learn versions the "n_folds" argument is named "k".) cv = cross_validation.KFold(len(train), n_folds=5, indices=False) results = [] # "model" can be replaced by your model object # "Error_function" can be replaced by the error function of your analysis for traincv, testcv in cv:             probas = model.fit(train[traincv], target[traincv]).predict_proba(train[testcv])             results.append( Error_function ) #print out the mean of the cross-validated results print "Results: " + str( np.array(results).mean() )

In [79]:
print(scores1,'\n\n\n',scores2,'\n\n\n',scores3,'\n\n\n',scores4,'\n\n',scores5,'\n\n',scores6)

[-242984.68012035 -231737.18575776 -239772.208757   -253554.50379286
 -227347.56436328 -231087.91100043 -239025.83108528 -249923.12218071
 -251334.27196624 -243357.76911812] 


 [-75578.63851421 -81476.117348   -92429.95445113 -71393.29186553
 -74713.6444283  -71093.85401536 -80367.62326209 -79407.39809049
 -77994.48619759 -79454.70257368] 


 [-228.3510756  -226.95680293 -228.71812963 -233.89057204 -223.48799889
 -225.1437366  -228.04627516 -234.07409714 -231.05251142 -230.75927079] 


 [-134.22556547 -138.37964308 -140.00601785 -134.86376842 -131.60583109
 -132.98744553 -136.38368956 -137.03455791 -135.88428809 -138.67382731] 

 [0.93884709 0.94144817 0.93890534 0.93590841 0.94162591 0.94109377
 0.93984743 0.93704585 0.93667033 0.9381394 ] 

 [0.98241742 0.98025929 0.97806864 0.98284345 0.98158047 0.98239871
 0.98033527 0.98146408 0.98096741 0.9811455 ]


In [83]:
print('mean sq error \n', 'knn:', scores1.mean(),'\ntree: ',scores2.mean(),'\n\n','mean absolut eror\n', 'knn: ', scores3.mean() , '\ntree: ' , scores4.mean() , '\n\nR² \n \n' , 'knn:' , scores5.mean(), '\ntree:' , scores6.mean())

mean sq error 
 knn: -241012.50481420424 
tree:  -78390.97107463916 

 mean absolut eror
 knn:  -229.0480470196998 
tree:  -136.00446342961834 

R² 
 
 knn: 0.9389531698832323 
tree: 0.9811480229780569
