In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn
%matplotlib inline
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity

# Load the data
data = pd.read_csv("train_set_dirty.csv")

# Display the first few rows of the dataframe
print(data.head())



   traffic_volume holiday    temp  rain_1h  snow_1h  clouds_all weather_main  \
0          1493.0    None  287.15     0.00      0.0        90.0         Mist   
1             NaN    None  282.25     0.00      0.0        90.0       Clouds   
2             NaN    None     NaN     0.00      0.0        90.0      Drizzle   
3          5626.0    None  252.08     0.00      0.0        20.0         Haze   
4          5357.0    None  291.25     1.02      0.0         NaN         Rain   

  weather_description         date_time  
0                 NaN  18-09-2017 22:00  
1     overcast clouds  23-05-2013 11:00  
2             drizzle  04-06-2016 01:00  
3                haze  04-01-2017 08:00  
4       moderate rain  13-06-2013 13:00  


In [4]:
data.info()
data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30415 entries, 0 to 30414
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   traffic_volume       27411 non-null  float64
 1   holiday              27377 non-null  object 
 2   temp                 27395 non-null  float64
 3   rain_1h              27488 non-null  float64
 4   snow_1h              27404 non-null  float64
 5   clouds_all           27350 non-null  float64
 6   weather_main         27360 non-null  object 
 7   weather_description  27392 non-null  object 
 8   date_time            30414 non-null  object 
dtypes: float64(5), object(4)
memory usage: 2.1+ MB


Unnamed: 0,traffic_volume,temp,rain_1h,snow_1h,clouds_all
count,27411.0,27395.0,27488.0,27404.0,27350.0
mean,3278.871001,281.227741,0.448326,0.000147,45.626618
std,1987.279406,14.019289,59.304018,0.006633,38.809853
min,0.0,0.0,0.0,0.0,0.0
25%,1228.5,271.9,0.0,0.0,1.0
50%,3404.0,282.74,0.0,0.0,40.0
75%,4943.0,292.09,0.0,0.0,90.0
max,7280.0,308.43,9831.3,0.51,100.0


In [5]:
data.shape
data = data.drop_duplicates(subset = ['date_time'], ignore_index = True)
data.shape

(30415, 9)

In [6]:
data['weather_main'].nunique()
data['holiday'].nunique()

12

In [7]:
data['holiday'].value_counts()

None                         27346
Memorial Day                     4
Columbus Day                     4
Veterans Day                     4
New Years Day                    3
Labor Day                        3
Christmas Day                    3
State Fair                       3
Martin Luther King Jr Day        2
Thanksgiving Day                 2
Washingtons Birthday             2
Independence Day                 1
Name: holiday, dtype: int64

In [8]:
data['holiday'] = data['holiday'].apply(lambda x:0 if pd.isna(x) else 1)
data['holiday'].value_counts()

1    27377
0     3038
Name: holiday, dtype: int64

In [9]:
data.head()

Unnamed: 0,traffic_volume,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time
0,1493.0,1,287.15,0.0,0.0,90.0,Mist,,18-09-2017 22:00
1,,1,282.25,0.0,0.0,90.0,Clouds,overcast clouds,23-05-2013 11:00
2,,1,,0.0,0.0,90.0,Drizzle,drizzle,04-06-2016 01:00
3,5626.0,1,252.08,0.0,0.0,20.0,Haze,haze,04-01-2017 08:00
4,5357.0,1,291.25,1.02,0.0,,Rain,moderate rain,13-06-2013 13:00


In [10]:
data_columns = ['traffic_volume','holiday','temp','rain_1h','snow_1h','clouds_all','weather_main']

In [11]:
category_columns = ['holiday','weather_main']

In [12]:
data_wo_dates_wdesc = data[data_columns]
data_wo_dates_wdesc.shape

(30415, 7)

In [13]:
data_trn = pd.get_dummies(data_wo_dates_wdesc, columns = category_columns)
data_trn.shape
data_trn.head

<bound method NDFrame.head of        traffic_volume    temp  rain_1h  snow_1h  clouds_all  holiday_0  \
0              1493.0  287.15     0.00      0.0        90.0          0   
1                 NaN  282.25     0.00      0.0        90.0          0   
2                 NaN     NaN     0.00      0.0        90.0          0   
3              5626.0  252.08     0.00      0.0        20.0          0   
4              5357.0  291.25     1.02      0.0         NaN          0   
...               ...     ...      ...      ...         ...        ...   
30410           393.0  267.02     0.00      0.0        90.0          0   
30411          4727.0  267.59     0.00      0.0         5.0          0   
30412          3009.0  256.64     0.00      0.0         1.0          0   
30413           295.0  286.69     0.00      NaN         1.0          0   
30414           746.0  290.18     0.00      0.0         NaN          0   

       holiday_1  weather_main_Clear  weather_main_Clouds  \
0              1    

In [14]:
traffic = data_trn.iloc[:,0]
traffic.head

<bound method NDFrame.head of 0        1493.0
1           NaN
2           NaN
3        5626.0
4        5357.0
          ...  
30410     393.0
30411    4727.0
30412    3009.0
30413     295.0
30414     746.0
Name: traffic_volume, Length: 30415, dtype: float64>

In [15]:
data_train = data_trn.iloc[:,1:]
data_train.head

<bound method NDFrame.head of          temp  rain_1h  snow_1h  clouds_all  holiday_0  holiday_1  \
0      287.15     0.00      0.0        90.0          0          1   
1      282.25     0.00      0.0        90.0          0          1   
2         NaN     0.00      0.0        90.0          0          1   
3      252.08     0.00      0.0        20.0          0          1   
4      291.25     1.02      0.0         NaN          0          1   
...       ...      ...      ...         ...        ...        ...   
30410  267.02     0.00      0.0        90.0          0          1   
30411  267.59     0.00      0.0         5.0          0          1   
30412  256.64     0.00      0.0         1.0          0          1   
30413  286.69     0.00      NaN         1.0          0          1   
30414  290.18     0.00      0.0         NaN          0          1   

       weather_main_Clear  weather_main_Clouds  weather_main_Drizzle  \
0                       0                    0                     0 

In [16]:
sum(traffic.isnull())

3004

In [17]:
traffic.ffill(inplace = True)

In [18]:
regr = HistGradientBoostingRegressor(random_state = 32)

In [19]:
regr.fit(data_train, traffic)

In [20]:
y_pred = regr.predict(data_train)

In [21]:
print(f' r2_score : {r2_score(traffic, y_pred)} \n mean squared error : {mean_squared_error(traffic, y_pred)}')

 r2_score : 0.08065327264810762 
 mean squared error : 3633621.664738271


In [25]:
test_pd = pd.read_csv('test_set_nogt.csv')
test = test_pd.copy(deep = True)

In [26]:
test.shape

(9641, 8)

In [27]:
test.head

<bound method NDFrame.head of      holiday    temp  rain_1h  snow_1h  clouds_all weather_main  \
0       None  289.58     0.00      0.0          90       Clouds   
1       None  290.13     0.00      0.0          90       Clouds   
2       None  291.14     0.00      0.0          75       Clouds   
3       None  291.72     0.00      0.0           1        Clear   
4       None  281.18     0.00      0.0           1        Clear   
...      ...     ...      ...      ...         ...          ...   
9636    None  280.28     0.00      0.0          90       Clouds   
9637    None  282.18     0.00      0.0          90       Clouds   
9638    None  283.48     0.00      0.0          90         Rain   
9639    None  283.48     0.00      0.0          90      Drizzle   
9640    None  284.20     0.25      0.0          75         Rain   

          weather_description         date_time  
0             overcast clouds  02-10-2012 11:00  
1             overcast clouds  02-10-2012 12:00  
2              

In [28]:
test['weather_main'].nunique()
test['holiday'].nunique()

10

In [29]:
test['holiday'].value_counts()

None                         9625
Labor Day                       3
Independence Day                3
Washingtons Birthday            2
Thanksgiving Day                2
State Fair                      2
Christmas Day                   1
New Years Day                   1
Martin Luther King Jr Day       1
Memorial Day                    1
Name: holiday, dtype: int64

In [30]:
#data['holiday'].value_counts()

In [31]:
test['holiday'] = test['holiday'].apply(lambda x:0 if pd.isna(x) else 1)
test['holiday'].value_counts()
test.head

<bound method NDFrame.head of       holiday    temp  rain_1h  snow_1h  clouds_all weather_main  \
0           1  289.58     0.00      0.0          90       Clouds   
1           1  290.13     0.00      0.0          90       Clouds   
2           1  291.14     0.00      0.0          75       Clouds   
3           1  291.72     0.00      0.0           1        Clear   
4           1  281.18     0.00      0.0           1        Clear   
...       ...     ...      ...      ...         ...          ...   
9636        1  280.28     0.00      0.0          90       Clouds   
9637        1  282.18     0.00      0.0          90       Clouds   
9638        1  283.48     0.00      0.0          90         Rain   
9639        1  283.48     0.00      0.0          90      Drizzle   
9640        1  284.20     0.25      0.0          75         Rain   

          weather_description         date_time  
0             overcast clouds  02-10-2012 11:00  
1             overcast clouds  02-10-2012 12:00  
2  

In [32]:
test_columns = ['holiday', 'temp', 'rain_1h', 'snow_1h', 'clouds_all', 'weather_main']
test_wo_dates_wdesc = test[test_columns]
test_wo_dates_wdesc.shape

(9641, 6)

In [33]:
test_trn = pd.get_dummies(test_wo_dates_wdesc, columns = category_columns)
test_trn.shape
test_trn.head
print(test_trn)

        temp  rain_1h  snow_1h  clouds_all  holiday_1  weather_main_Clear  \
0     289.58     0.00      0.0          90          1                   0   
1     290.13     0.00      0.0          90          1                   0   
2     291.14     0.00      0.0          75          1                   0   
3     291.72     0.00      0.0           1          1                   1   
4     281.18     0.00      0.0           1          1                   1   
...      ...      ...      ...         ...        ...                 ...   
9636  280.28     0.00      0.0          90          1                   0   
9637  282.18     0.00      0.0          90          1                   0   
9638  283.48     0.00      0.0          90          1                   0   
9639  283.48     0.00      0.0          90          1                   0   
9640  284.20     0.25      0.0          75          1                   0   

      weather_main_Clouds  weather_main_Drizzle  weather_main_Fog  \
0     

In [34]:
traffic = test_trn.iloc[:,0]
traffic.head

test_trn = test_trn.iloc[:,1:]
test_trn.head

<bound method NDFrame.head of       rain_1h  snow_1h  clouds_all  holiday_1  weather_main_Clear  \
0        0.00      0.0          90          1                   0   
1        0.00      0.0          90          1                   0   
2        0.00      0.0          75          1                   0   
3        0.00      0.0           1          1                   1   
4        0.00      0.0           1          1                   1   
...       ...      ...         ...        ...                 ...   
9636     0.00      0.0          90          1                   0   
9637     0.00      0.0          90          1                   0   
9638     0.00      0.0          90          1                   0   
9639     0.00      0.0          90          1                   0   
9640     0.25      0.0          75          1                   0   

      weather_main_Clouds  weather_main_Drizzle  weather_main_Fog  \
0                       1                     0                 0   
1  

In [35]:
sum(traffic.isnull())

0

In [36]:
traffic.ffill(inplace = True)
regr = HistGradientBoostingRegressor(random_state = 32)
regr.fit(test_trn, traffic)

In [37]:
y_pred_test = regr.predict(test_trn)

In [41]:
submission = pd.DataFrame({'ID': test_trn.index, 'traffic_volume': y_pred_test})

In [42]:
submission.to_csv('basemodel_1.submission.csv', index = False)