In [19]:
%matplotlib inline
import os
import importlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

In [2]:
data = pd.read_csv(os.path.join('data', 'train.csv.bz2'))
data.loc[:, 'DateOfDeparture'] = pd.to_datetime(data.loc[:, 'DateOfDeparture'])

In [3]:
import problem
X, y = problem.get_train_data()

In [None]:
X

In [None]:
    #get dummies for evvent
    for i in range(len(external)):
        external['Rain'][i] = 1 if external['Events'] == '%Rain%' else 0
        external['Fog'][i] = 1 if external['Events'] == '%Fog%' else 0
        external['Hail'][i] = 1 if external['Events'] == '%Hail%' else 0
        external['Thunderstorm'][i] = 1 if external['Events'] == '%Thunderstorm%' else 0
        external['Tornado'][i] = 1 if external['Events'] == '%Tornado%' else 0

In [None]:
def _merge_external_data(X):
    
    from geopy.distance import geodesic

    external = pd.read_csv(r"submissions/use_external_data/external_data_mod.csv")
    external.loc[:,"Date"] = pd.to_datetime(external.loc[:,"Date"])
    
    #deal with data format
    external['Precipitationmm'].replace('T',0.0, inplace=True)
    external['Precipitationmm'] = external['Precipitationmm'].astype('float')
    for i in range(len(external['POP'])):
        external['POP'][i] = external.loc[:,'POP'][i].replace(',','')
    external['POP'] = external['POP'].astype('int')
    

    
    external.drop(columns=['Events', 'City', 'StateCodes'],inplace=True)
    
    # define the departure and arrival dataframe
    col_dep = ['d_' + name for name in list(external.columns)]
    col_arr = [w.replace('d_', 'a_') for w in col_dep]

    # adjust the column name for merge
    col_dep = [w.replace('d_AirPort', 'Departure') for w in col_dep]
    col_dep = [w.replace('d_Date', 'DateOfDeparture') for w in col_dep]
    col_arr = [w.replace('a_AirPort', 'Arrival') for w in col_arr]
    col_arr = [w.replace('a_Date', 'DateOfDeparture') for w in col_arr]

    # 
    d_external = external.copy()
    a_external = external.copy()
    
    # rename the column
    d_external.columns = col_dep
    a_external.columns = col_arr

     # merge with X_encoded
    X_encoded = X.copy()
    X_encoded.loc[:,'DateOfDeparture'] = pd.to_datetime(X_encoded.loc[:,'DateOfDeparture'])
    X_encoded = pd.merge(X_encoded, d_external, how='left', on=['DateOfDeparture', 'Departure'],
                        sort=False)
    X_encoded = pd.merge(X_encoded, a_external, how='left', on=['DateOfDeparture', 'Arrival'],
                        sort=False)

    # compute geographic distance
    X_encoded["Distance"] = X_encoded.apply(
            lambda x: geodesic((x["d_Latitude"],x["d_longitude"]),(x["a_Latitude"],x["a_longitude"])).km, axis=1)

    X_encoded = X_encoded.join(pd.get_dummies(X_encoded.loc[:,'Departure'], prefix='d'))
    X_encoded = X_encoded.join(pd.get_dummies(X_encoded.loc[:,'Arrival'], prefix='a'))

    # split year, month and etc.
    X_encoded['year'] = X_encoded.loc[:,'DateOfDeparture'].dt.year
    X_encoded['month'] = X_encoded.loc[:,'DateOfDeparture'].dt.month
    X_encoded['day'] = X_encoded.loc[:,'DateOfDeparture'].dt.day
    X_encoded['weekday'] = X_encoded.loc[:,'DateOfDeparture'].dt.weekday
    X_encoded['week'] = X_encoded.loc[:,'DateOfDeparture'].dt.week
    X_encoded['n_days'] = X_encoded.loc[:,'DateOfDeparture'].apply(lambda date: 
                                                                     (date - pd.to_datetime("1970-01-01")).days)

    X_encoded = X_encoded.join(pd.get_dummies(X_encoded['year'], prefix='y'))
    X_encoded = X_encoded.join(pd.get_dummies(X_encoded['month'], prefix='m'))
    X_encoded = X_encoded.join(pd.get_dummies(X_encoded['day'], prefix='d'))
    X_encoded = X_encoded.join(pd.get_dummies(X_encoded['weekday'], prefix='wd'))
    X_encoded = X_encoded.join(pd.get_dummies(X_encoded['week'], prefix='w'))

    # drop the original data
    X_encoded.drop(columns=['Departure','Arrival','DateOfDeparture',
                            'd_Unnamed: 0','d_Unnamed: 0.1','a_Unnamed: 0','a_Unnamed: 0.1',
                           'd_coordinates','a_coordinates','d_State','a_State',
                           'd_iso_region','a_iso_region'], inplace=True)


    
    
    return X_encoded

data_merger = FunctionTransformer(_merge_external_data)

In [None]:
X_train = data_merger.fit_transform(X)

In [None]:
X_train.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

n_estimators = 10
max_depth = 10
max_features = 10

categorical_cols = [c for c in X_train.columns
                       if X_train[c].dtype.kind not in ["i", "f",'u']]

regressor = RandomForestRegressor(
    n_estimators=n_estimators, max_depth=max_depth, max_features=max_features
)

In [None]:
for column in list(X_train.columns[X_train.isnull().sum() > 0]):
    mean_val = X_train[column].mean()
    X_train[column].fillna(mean_val, inplace=True)

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer


scores = cross_val_score(
    regressor, X_train, y, cv=5, scoring='neg_mean_squared_error'
)
rmse_scores = np.sqrt(-scores)

print(
    f"RMSE: {np.mean(rmse_scores):.4f} +/- {np.std(rmse_scores):.4f}"
)

In [9]:
import datetime as dt
import holidays

#get holidays 
Holidays_US = holidays.US()[dt.date(2011,7, 1):dt.date(2013,6, 5)] + holidays.US()[dt.date(2012,1, 1):dt.date(2012,12, 31)]

school_break = pd.read_csv('schoolholiday.csv')
school_break.loc[:, 'start'] =pd.to_datetime(school_break.loc[:, 'start']).dt.date
school_break.loc[:, 'end'] = pd.to_datetime(school_break.loc[:, 'end']).dt.date

def nextworkday(date):
    one_day = dt.timedelta(days=1)
    next_day = date + one_day 
    while next_day.weekday() in holidays.WEEKEND or next_day in Holidays_US:
        next_day += one_day 
    return (next_day - date).days
    
def lastworkday(date):
    one_day = dt.timedelta(days=1)
    last_day = date - one_day 
    while last_day.weekday() in holidays.WEEKEND or last_day in Holidays_US:
        last_day -= one_day 
    return (date - last_day).days   

def schoolbreak(date):
    one_day = dt.timedelta(days=1)
    lastschoolday = lastworkday(date)
    nextschoolday = nextworkday(date)
    for i in range(len(school_break['start'])-1):
        if date >= school_break['start'][i] and date <= school_break['end'][i]:
            lastschoolday = (date - school_break['start'][i] + one_day).days
            nextschoolday = (school_break['end'][i] + one_day - date).days   
        elif date == school_break['end'][i] + one_day:
            lastschoolday = (date - school_break['start'][i] + one_day).days
        elif date == school_break['start'][i] - one_day:
            nextschoolday = (school_break['end'][i] + one_day - date).days   
    return lastschoolday, nextschoolday



In [None]:
    X_encoded['lastschoolday'][i] = schoolbreak(X_encoded.loc[:,'DateOfDeparture'][i])[0]
    X_encoded['nextschoolday'][i] = schoolbreak(X_encoded.loc[:,'DateOfDeparture'][i])[1]
    X_encoded['lastschoolday']=0
X_encoded['nextschoolday']=0

In [59]:
def _nextworkday(date):
    one_day = dt.timedelta(days=1)
    next_day = date + one_day 
    while next_day.weekday() in holidays.WEEKEND or next_day in Holidays_US:
        next_day += one_day 
    return (next_day - date).days

def _lastworkday(date):
    one_day = dt.timedelta(days=1)
    last_day = date - one_day 
    while last_day.weekday() in holidays.WEEKEND or last_day in Holidays_US:
        last_day -= one_day 
    return (date - last_day).days   

def _schoolbreak(date):
    one_day = dt.timedelta(days=1)
    lastschoolday = lastworkday(date)
    nextschoolday = nextworkday(date)
    for i in range(len(school_break['start'])-1):
        if date >= school_break['start'][i] and date <= school_break['end'][i]:
            lastschoolday = (date - school_break['start'][i] + one_day).days
            nextschoolday = (school_break['end'][i] + one_day - date).days   
        elif date == school_break['end'][i] + one_day:
            lastschoolday = (date - school_break['start'][i] + one_day).days
        elif date == school_break['start'][i] - one_day:
            nextschoolday = (school_break['end'][i] + one_day - date).days   
    return lastschoolday, nextschoolday

def _merge_data(X):
    from geopy.distance import geodesic

    external = pd.read_csv(r"submissions/use_external_data/external_data_mod.csv")
    external.loc[:,"Date"] = pd.to_datetime(external.loc[:,"Date"])

    #deal with data format
    external['Precipitationmm'].replace('T',0.0, inplace=True)
    external['Precipitationmm'] = external['Precipitationmm'].astype('float')
    for i in range(len(external['POP'])):
        external['POP'][i] = external.loc[:,'POP'][i].replace(',','')
    external['POP'] = external['POP'].astype('int')   
    external.drop(columns=['Events', 'City', 'StateCodes'],inplace=True)

    external.drop(columns =['Year','Region'],inplace=True)

    # define the departure and arrival dataframe
    col_dep = ['d_' + name for name in list(external.columns)]
    col_arr = [w.replace('d_', 'a_') for w in col_dep]

    # adjust the column name for merge
    col_dep = [w.replace('d_AirPort', 'Departure') for w in col_dep]
    col_dep = [w.replace('d_Date', 'DateOfDeparture') for w in col_dep]
    col_arr = [w.replace('a_AirPort', 'Arrival') for w in col_arr]
    col_arr = [w.replace('a_Date', 'DateOfDeparture') for w in col_arr]

    # 
    d_external = external.copy()
    a_external = external.copy()

    # rename the column
    d_external.columns = col_dep
    a_external.columns = col_arr

     # merge with X_encoded
    X_encoded = X.copy()
    X_encoded.loc[:,'DateOfDeparture'] = pd.to_datetime(X_encoded.loc[:,'DateOfDeparture'])
    X_encoded = pd.merge(X_encoded, d_external, how='left', on=['DateOfDeparture', 'Departure'],
                        sort=False)
    X_encoded = pd.merge(X_encoded, a_external, how='left', on=['DateOfDeparture', 'Arrival'],
                        sort=False)
    #

    X_encoded['nextworkday']=0
    X_encoded['lastworkday']=0

    for i in range(len(X_encoded)):
        X_encoded['nextworkday'][i] = nextworkday(X_encoded.loc[:,'DateOfDeparture'][i])
        X_encoded['lastworkday'][i] = lastworkday(X_encoded.loc[:,'DateOfDeparture'][i])



    # compute geographic distance
    X_encoded["Distance"] = X_encoded.apply(
            lambda x: geodesic((x["d_Latitude"],x["d_longitude"]),(x["a_Latitude"],x["a_longitude"])).km, axis=1)

    # split year, month and etc.
    X_encoded['year'] = X_encoded.loc[:,'DateOfDeparture'].dt.year
    X_encoded['month'] = X_encoded.loc[:,'DateOfDeparture'].dt.month
    X_encoded['day'] = X_encoded.loc[:,'DateOfDeparture'].dt.day
    X_encoded['weekday'] = X_encoded.loc[:,'DateOfDeparture'].dt.weekday
    X_encoded['week'] = X_encoded.loc[:,'DateOfDeparture'].dt.week
    X_encoded['n_days'] = X_encoded.loc[:,'DateOfDeparture'].apply(lambda date: 
                                                                     (date - pd.to_datetime("1970-01-01")).days)

    X_encoded = X_encoded.join(pd.get_dummies(X_encoded['year'], prefix='y'))
    X_encoded = X_encoded.join(pd.get_dummies(X_encoded['month'], prefix='m'))
    X_encoded = X_encoded.join(pd.get_dummies(X_encoded['day'], prefix='d'))
    X_encoded = X_encoded.join(pd.get_dummies(X_encoded['weekday'], prefix='wd'))
    X_encoded = X_encoded.join(pd.get_dummies(X_encoded['week'], prefix='w'))

    # drop the original data
    X_encoded.drop(columns=['Departure','Arrival','DateOfDeparture',
                            'd_Unnamed: 0','d_Unnamed: 0.1','a_Unnamed: 0','a_Unnamed: 0.1',
                           'd_coordinates','a_coordinates','d_State','a_State',
                           'd_iso_region','a_iso_region'], inplace=True)



    return X_encoded

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [60]:

for column in list(X_train.columns[X_train.isnull().sum() > 0]):
    mean_val = X_train[column].mean()
    X_train[column].fillna(mean_val, inplace=True)

In [61]:
X_train

Unnamed: 0,WeeksToDeparture,std_wtd,d_Max TemperatureC,d_Mean TemperatureC,d_Min TemperatureC,d_Dew PointC,d_MeanDew PointC,d_Min DewpointC,d_Max Humidity,d_Mean Humidity,d_Min Humidity,d_Max Sea Level PressurehPa,d_Mean Sea Level PressurehPa,d_Min Sea Level PressurehPa,d_Max VisibilityKm,d_Mean VisibilityKm,d_Min VisibilitykM,d_Max Wind SpeedKm/h,d_Mean Wind SpeedKm/h,d_Max Gust SpeedKm/h,d_Precipitationmm,d_CloudCover,d_WindDirDegrees,d_Total_ops,d_Total Delays,d_Avg_delay_time,d_Total_delay_time,d_Actual Departures,d_Actual Arrivals,d_Departure Cancellations,d_Arrival Cancellations,d_Delayed Arrivals,d_Average Delay Per Delayed Arrival,d_Price,d_oil_price,d_elevation_ft,d_Division,d_Coast,d_Great Lakes,d_StPOP,d_StRBirth,d_StRDeath,d_StRMig,d_GDP,d_POP,d_RPI,d_Latitude,d_longitude,a_Max TemperatureC,a_Mean TemperatureC,a_Min TemperatureC,a_Dew PointC,a_MeanDew PointC,a_Min DewpointC,a_Max Humidity,a_Mean Humidity,a_Min Humidity,a_Max Sea Level PressurehPa,a_Mean Sea Level PressurehPa,a_Min Sea Level PressurehPa,a_Max VisibilityKm,a_Mean VisibilityKm,a_Min VisibilitykM,a_Max Wind SpeedKm/h,a_Mean Wind SpeedKm/h,a_Max Gust SpeedKm/h,a_Precipitationmm,a_CloudCover,a_WindDirDegrees,a_Total_ops,a_Total Delays,a_Avg_delay_time,a_Total_delay_time,a_Actual Departures,a_Actual Arrivals,a_Departure Cancellations,a_Arrival Cancellations,a_Delayed Arrivals,a_Average Delay Per Delayed Arrival,a_Price,a_oil_price,a_elevation_ft,a_Division,a_Coast,a_Great Lakes,a_StPOP,a_StRBirth,a_StRDeath,a_StRMig,a_GDP,a_POP,a_RPI,a_Latitude,a_longitude,nextworkday,lastworkday,Distance,year,month,day,weekday,week,n_days,y_2011,y_2012,y_2013,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,m_11,m_12,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,d_20,d_21,d_22,d_23,d_24,d_25,d_26,d_27,d_28,d_29,d_30,d_31,wd_0,wd_1,wd_2,wd_3,wd_4,wd_5,wd_6,w_1,w_2,w_3,w_4,w_5,w_6,w_7,w_8,w_9,w_10,w_11,w_12,w_13,w_14,w_15,w_16,w_17,w_18,w_19,w_20,w_21,w_22,w_23,w_24,w_25,w_26,w_27,w_28,w_29,w_30,w_31,w_32,w_33,w_34,w_35,w_36,w_37,w_38,w_39,w_40,w_41,w_42,w_43,w_44,w_45,w_46,w_47,w_48,w_49,w_50,w_51,w_52
0,12.875000,9.812647,35,31,26,20,18,17,62,51,39,1014,1012,1009,16,16,16,37,25,56.0,0.00,1,208,2686.0,2.0,28.0,56.0,933,931,9,13,143,71.16,95.140000,95.140000,672.0,3.0,0.0,1.0,12873763,12.445823,7.869352,-3.382339,579667,2719141,43033,41.978600,-87.904800,34,29,24,22,21,19,82,63,44,1012,1010,1009,16,16,16,48,29,60.0,0.00,5,161,1909.0,0.0,0.0,0.0,789,786,4,6,84,76.02,95.140000,95.140000,607.0,7.0,1.0,0.0,26094422,14.612295,6.532359,8.724163,430109,1242115,43672,32.896801,-97.038002,1,1,1290.346856,2012,6,19,1,25,15510,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,14.285714,9.466734,36,32,27,16,13,10,51,36,21,1011,1009,1005,16,15,11,32,18,50.0,0.00,5,187,1586.0,3.0,23.0,69.0,407,403,0,1,35,46.77,113.840000,113.840000,2181.0,8.0,0.0,0.0,2755245,12.680698,7.457300,8.145301,86924,593275,35572,36.080101,-115.152000,33,25,16,-2,-6,-8,21,14,7,1011,1008,1005,16,16,16,35,15,42.0,0.00,3,207,1781.0,0.0,0.0,0.0,668,667,1,1,54,56.50,113.840000,113.840000,5431.0,8.0,0.0,0.0,5191709,12.556818,6.343095,7.660864,167964,635163,46675,39.861698,-104.672997,1,3,1011.046677,2012,9,10,0,37,15593,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,10.863636,9.035883,6,2,-1,0,-2,-4,92,76,60,1027,1023,1020,16,9,1,27,15,32.0,0.00,7,357,1790.0,0.0,0.0,0.0,660,662,2,2,218,37.93,112.170000,112.170000,5431.0,8.0,0.0,0.0,5191709,12.556818,6.343095,7.660864,167964,635163,46675,39.861698,-104.672997,22,19,16,17,16,14,93,77,61,1018,1016,1014,16,13,8,24,8,29.0,0.00,5,266,1686.0,6.0,26.0,156.0,630,632,6,5,188,40.67,112.170000,112.170000,125.0,9.0,1.0,0.0,38062780,13.130815,6.326351,2.772770,806415,3851202,38192,33.942501,-118.407997,4,1,1387.023784,2012,10,5,4,40,15618,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,11.480000,7.990202,22,19,16,12,10,6,67,56,44,1026,1024,1021,16,16,16,37,26,45.0,0.00,8,70,2528.0,15.0,24.0,360.0,1068,1076,3,4,161,42.04,111.530554,111.530554,1026.0,5.0,1.0,0.0,9813201,13.584010,7.280748,3.375007,286108,437812,40064,33.636700,-84.428101,27,19,11,12,10,9,83,58,33,1028,1026,1024,16,16,16,23,6,29.0,0.00,1,93,2373.0,0.0,0.0,0.0,824,822,3,5,50,52.08,111.530554,111.530554,672.0,3.0,0.0,1.0,12858725,12.641046,7.907522,-3.182714,551983,2708114,41743,41.978600,-87.904800,2,2,974.957134,2011,10,9,6,40,15256,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,11.450000,9.517159,8,2,-4,-6,-9,-13,46,39,31,1012,1008,1004,16,16,16,61,30,77.0,0.00,5,270,1804.0,2.0,29.0,58.0,679,677,4,4,114,44.47,120.850000,120.850000,5431.0,8.0,0.0,0.0,5191709,12.556818,6.343095,7.660864,167964,635163,46675,39.861698,-104.672997,16,12,8,10,8,7,93,79,64,1027,1025,1024,16,12,3,24,8,29.0,0.00,7,300,1114.0,83.0,76.0,6308.0,427,434,18,14,129,67.81,120.850000,120.850000,13.0,9.0,1.0,0.0,38062780,13.130815,6.326351,2.772770,373546,828876,53395,37.618999,-122.375000,1,4,1556.391964,2012,2,21,1,8,15391,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8897,9.263158,7.316967,17,11,4,6,2,-2,85,56,27,1021,1020,1017,16,16,16,35,19,50.0,0.00,4,327,1183.0,0.0,0.0,0.0,242,239,1,1,31,40.90,111.530554,111.530554,645.0,3.0,0.0,1.0,9875736,11.541682,9.100641,-2.546041,206508,705118,40025,42.212399,-83.353401,20,14,8,4,2,-2,76,51,26,1020,1019,1018,16,16,16,32,16,39.0,0.00,0,329,2534.0,14.0,18.0,252.0,1079,1084,0,1,79,57.24,111.530554,111.530554,1026.0,5.0,1.0,0.0,9813201,13.584010,7.280748,3.375007,286108,437812,40064,33.636700,-84.428101,1,2,956.518345,2011,10,2,6,39,15249,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8898,12.772727,10.641034,35,29,23,19,16,11,79,52,24,1015,1013,1010,16,16,16,48,30,60.0,0.00,4,187,1838.0,0.0,0.0,0.0,755,754,24,25,166,55.47,110.770000,110.770000,607.0,7.0,1.0,0.0,26094422,14.612295,6.532359,8.724163,430109,1242115,43672,32.896801,-97.038002,25,17,9,17,11,4,93,68,43,1012,1011,1009,16,16,16,27,15,34.0,0.00,3,216,2564.0,11.0,16.0,176.0,899,897,9,11,141,54.70,110.770000,110.770000,672.0,3.0,0.0,1.0,12873763,12.445823,7.869352,-3.382339,579667,2719141,43033,41.978600,-87.904800,1,1,1290.346856,2012,9,25,1,39,15608,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8899,11.047619,7.908705,11,7,3,8,5,2,93,82,71,1025,1021,1017,16,15,8,24,10,32.0,2.03,7,125,1019.0,143.0,74.0,10582.0,402,410,42,38,231,70.05,109.540000,109.540000,13.0,9.0,1.0,0.0,38062780,13.130815,6.326351,2.772770,373546,828876,53395,37.618999,-122.375000,13,8,3,-7,-9,-12,40,31,22,1021,1016,1013,16,16,16,16,8,19.0,0.00,6,197,1352.0,1.0,22.0,22.0,361,364,10,9,83,56.19,109.540000,109.540000,2181.0,8.0,0.0,0.0,2755245,12.680698,7.457300,8.145301,86924,593275,35572,36.080101,-115.152000,1,1,666.249783,2012,1,19,3,3,15358,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8900,6.076923,4.030334,-6,-9,-13,-8,-12,-18,91,77,62,1018,1016,1014,16,9,2,34,14,42.0,1.02,7,268,2117.0,12.0,20.0,240.0,726,729,11,12,201,52.73,111.530554,111.530554,672.0,3.0,0.0,1.0,12890552,12.259127,8.001377,-2.733083,589812,2725731,43050,41.978600,-87.904800,1,-1,-3,-3,-6,-9,92,72,51,1018,1013,1010,16,13,2,40,9,58.0,0.25,7,296,1109.0,11.0,19.0,209.0,198,194,2,4,22,67.18,111.530554,111.530554,36.0,2.0,0.0,1.0,12781296,11.152840,10.004955,-0.211183,379899,1558313,45492,39.871899,-75.241096,1,2,1090.917520,2013,2,3,6,5,15739,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [82]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import xgboost
from xgboost import plot_importance

regressor = xgboost.XGBRegressor(colsample_bytree=0.7,
                     gamma=0.3,                 
                     learning_rate=0.05,
                     max_depth=6,
                     min_child_weight=3,
                     n_estimators=200,                                                                    
                     reg_alpha=0.75,
                     reg_lambda=0.5,
                     subsample=0.6,
                     seed=42) 


In [83]:
%%time
scores = cross_val_score(
    regressor, X_train, y, cv=5, scoring='neg_mean_squared_error'
)
rmse_scores = np.sqrt(-scores)

print(
    f"RMSE: {np.mean(rmse_scores):.4f} +/- {np.std(rmse_scores):.4f}"
)


RMSE: 0.4258 +/- 0.0205
CPU times: user 57.9 s, sys: 919 ms, total: 58.8 s
Wall time: 8.73 s


In [62]:
from sklearn.ensemble import GradientBoostingRegressor
regressor = GradientBoostingRegressor(learning_rate=0.09, n_estimators=160,min_samples_split=90,max_depth=4,subsample=0.8, random_state=12)
scores = cross_val_score(
    regressor, X_train, y, cv=5, scoring='neg_mean_squared_error'
)
rmse_scores = np.sqrt(-scores)

print(
    f"RMSE: {np.mean(rmse_scores):.4f} +/- {np.std(rmse_scores):.4f}"
)



RMSE: 0.3972 +/- 0.0213


In [33]:
X_encoded = X.copy()
X_encoded.loc[:,'DateOfDeparture'] = pd.to_datetime(X_encoded.loc[:,'DateOfDeparture'])
X_encoded['lastschoolday']=0
X_encoded['nextschoolday']=1

In [34]:
X_encoded['nextschoolday'][1] = schoolbreak(X_encoded.loc[:,'DateOfDeparture'][1])[1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [23]:
X_encoded['lastworkday'][i] = lastworkday(X_encoded.loc[:,'DateOfDeparture'][i])


NameError: name 'i' is not defined

In [65]:
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn import metrics

In [None]:
categorical_encoder = OrdinalEncoder()
preprocessor = make_column_transformer(
    (categorical_encoder, categorical_cols),
    remainder='passthrough',  # passthrough numerical columns as they are
)


In [64]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-1.2.1-py3-none-macosx_10_13_x86_64.macosx_10_14_x86_64.macosx_10_15_x86_64.whl (1.2 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.2.1


In [88]:
stack=['f']
a=stack.pop()