In [1]:
import pandas as pd
import numpy as np
import pylab as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn import cluster
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.linear_model import LinearRegression 
from sklearn.compose import TransformedTargetRegressor
from sklearn import pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin

%matplotlib inline

In [2]:
# Calculate average speed, and use that 
class AvgSpeed(BaseEstimator, RegressorMixin):
    def fit(self, X, y):
        if 'Trip_distance' not in X.columns:
            raise KeyError('X Dataframe needs to have column "Trip_distance"')
        self.avg = (y / X['Trip_distance']).mean()
        return self
    def predict(self, X):
        if 'Trip_distance' not in X.columns:
            raise KeyError('X Dataframe needs to have column "Trip_distance"')
        return self.avg * X['Trip_distance']

In [4]:
from math import radians, cos, sin, asin, sqrt 
def distance(lat1, lat2, lon1, lon2): 
      
    # The math module contains a function named 
    # radians which converts from degrees to radians. 
    lon1 = radians(lon1) 
    lon2 = radians(lon2) 
    lat1 = radians(lat1) 
    lat2 = radians(lat2) 
       
    # Haversine formula  
    dlon = lon2 - lon1  
    dlat = lat2 - lat1 
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
  
    c = 2 * asin(sqrt(a))  
     
    # Radius of earth in kilometers. Use 3956 for miles 
    r = 6371
       
    # calculate the result 
    return(c * r*1000) 

# Load data

In [5]:
# load training
training = pd.read_csv('Train (4).csv').set_index('ID')
training['Timestamp'] = pd.to_datetime(training['Timestamp'])
training.head()

Unnamed: 0_level_0,Timestamp,Origin_lat,Origin_lon,Destination_lat,Destination_lon,Trip_distance,ETA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
000FLWA8,2019-12-04 20:01:50+00:00,3.258,36.777,3.003,36.718,39627,2784
000RGOAM,2019-12-10 22:37:09+00:00,3.087,36.707,3.081,36.727,3918,576
001QSGIH,2019-11-23 20:36:10+00:00,3.144,36.739,3.088,36.742,7265,526
002ACV6R,2019-12-01 05:43:21+00:00,3.239,36.784,3.054,36.763,23350,3130
0039Y7A8,2019-12-17 20:30:20+00:00,2.912,36.707,3.207,36.698,36613,2138


In [None]:
training.shape

(83924, 7)

In [None]:
lat1 = np.array(training['Origin_lat'])
lat2 = np.array(training['Destination_lat'])
lon1 = np.array(training['Origin_lon'])
lon2 = np.array(training['Destination_lon'])

direct_dist = []
for j in range(len(lat1)):
  ds = distance(lat1[j],lat2[j],lon1[j],lon2[j])
  direct_dist.append(ds)

NameError: ignored

In [None]:
# sample set 
sample_set = pd.read_csv('SampleSubmission (6).csv').set_index('ID')
sample_set.head()

Unnamed: 0_level_0,ETA
ID,Unnamed: 1_level_1
000V4BQX,0
003WBC5J,0
004O4X3A,0
006CEI5B,0
009G0M2T,0


In [3]:
# testing data
testing = pd.read_csv('Test (2).csv').set_index('ID')
testing['Timestamp'] = pd.to_datetime(testing['Timestamp'])
testing

Unnamed: 0_level_0,Timestamp,Origin_lat,Origin_lon,Destination_lat,Destination_lon,Trip_distance
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
000V4BQX,2019-12-21 05:52:37+00:00,2.981,36.688,2.978,36.754,17549
003WBC5J,2019-12-25 21:38:53+00:00,3.032,36.769,3.074,36.751,7532
004O4X3A,2019-12-29 21:30:29+00:00,3.035,36.711,3.010,36.758,10194
006CEI5B,2019-12-31 22:51:57+00:00,2.902,36.738,3.208,36.698,32768
009G0M2T,2019-12-28 21:47:22+00:00,2.860,36.692,2.828,36.696,4513
...,...,...,...,...,...,...
ZZXSJW3Q,2019-12-21 04:10:59+00:00,2.947,36.748,2.949,36.747,3413
ZZYPNYYY,2019-12-30 20:31:22+00:00,3.037,36.742,2.924,36.762,14341
ZZYVPKXY,2019-12-27 20:21:38+00:00,2.993,36.723,2.983,36.747,4465
ZZZXGRIO,2019-12-29 22:00:31+00:00,2.954,36.743,3.057,36.760,13105


In [None]:
# weather data
weather_df = pd.read_csv('Weather.csv')
weather_df.head()

Unnamed: 0,date,dewpoint_2m_temperature,maximum_2m_air_temperature,mean_2m_air_temperature,mean_sea_level_pressure,minimum_2m_air_temperature,surface_pressure,total_precipitation,u_component_of_wind_10m,v_component_of_wind_10m
0,2019-11-01,290.630524,296.434662,294.125061,101853.617188,292.503998,100806.351562,0.004297,3.561323,0.941695
1,2019-11-02,289.135284,298.432404,295.551666,101225.164062,293.337921,100187.25,0.001767,5.318593,3.258237
2,2019-11-03,287.667694,296.612122,295.182831,100806.617188,293.674316,99771.414062,0.000797,8.447649,3.172982
3,2019-11-04,287.634644,297.173737,294.368134,101240.929688,292.376221,100200.84375,0.000393,5.991428,2.2367
4,2019-11-05,286.413788,294.284851,292.496979,101131.75,289.143066,100088.5,0.004658,6.96273,2.655364


In [None]:
training = training.sample(frac=1)

# Make train and OOT set

In [None]:
# make sure sorted by date
training = training.sort_values('Timestamp')
train_df = training.iloc[:70000]
oot_df = training.iloc[70000:]
oot_df.shape,train_df.shape

((13924, 7), (70000, 7))

# AVG Speed Model

In [None]:
# baseline model
split = model_selection.TimeSeriesSplit(n_splits=5)
result= model_selection.cross_validate(AvgSpeed(),train_df, train_df['ETA'],cv=split,
                               scoring='neg_root_mean_squared_error')
result, -np.mean(result['test_score']), np.std(result['test_score'])

({'fit_time': array([0.035748  , 0.0034349 , 0.00329566, 0.00370669, 0.00488329]),
  'score_time': array([0.00306201, 0.00146937, 0.00132155, 0.00131321, 0.00144005]),
  'test_score': array([-573.62735986, -587.88913103, -610.46965647, -624.62302806,
         -592.96717577])},
 597.9152702376128,
 17.80913607008961)

In [None]:
# check score on Dev set
reg = AvgSpeed()
reg.fit(train_df, train_df.ETA)
np.sqrt(metrics.mean_squared_error(oot_df.ETA, reg.predict(oot_df)))

626.3906189303457

## Make Submission

In [None]:
# fit model on all data
reg = AvgSpeed()
reg.fit(training, training.ETA)
reg.avg # time / distance

0.10475614041193526

In [None]:
# run test data through
submission = testing.copy()
submission['ETA'] = reg.predict(testing)
submission.head()

Unnamed: 0_level_0,Timestamp,Origin_lat,Origin_lon,Destination_lat,Destination_lon,Trip_distance,ETA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
000V4BQX,2019-12-21 05:52:37+00:00,2.981,36.688,2.978,36.754,17549,1838.365508
003WBC5J,2019-12-25 21:38:53+00:00,3.032,36.769,3.074,36.751,7532,789.02325
004O4X3A,2019-12-29 21:30:29+00:00,3.035,36.711,3.01,36.758,10194,1067.884095
006CEI5B,2019-12-31 22:51:57+00:00,2.902,36.738,3.208,36.698,32768,3432.649209
009G0M2T,2019-12-28 21:47:22+00:00,2.86,36.692,2.828,36.696,4513,472.764462


In [None]:
# save
submission[['ETA']].to_csv('baseline_submit_base.csv')

# Do clustering on coordinates

In [None]:
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler
preprocessor = ColumnTransformer(
        transformers=[
            ('Coordninates-depart', cluster.KMeans(n_clusters=200),['Origin_lat','Origin_lon']),
              ('Coordninates-arrive', cluster.KMeans(n_clusters=200),[
                                                           'Destination_lat','Destination_lon']),
            ('Distance', StandardScaler(), ['Trip_distance']),
        
])
    # create model
pipeline = pipeline.Pipeline(steps=[
                          ('preprocessor', preprocessor),
    
                          ('Regression', RandomForestRegressor())])   

In [None]:
training = training.sample(frac=1)

In [None]:
preprocessor.fit_transform(training.drop(columns=['Timestamp','ETA']))


array([[ 0.16971891,  0.1387157 ,  0.03959022, ...,  0.03156608,
         0.07264158, -1.0368047 ],
       [ 0.38981514,  0.2095373 ,  0.25154577, ...,  0.23927702,
         0.2940072 , -0.26018175],
       [ 0.29918001,  0.09169834,  0.15599014, ...,  0.17076265,
         0.18913668,  1.23519391],
       ...,
       [ 0.26690517,  0.21085413,  0.11529514, ...,  0.11827917,
         0.20286513, -0.09635948],
       [ 0.25351574,  0.38065651,  0.27528311, ...,  0.25032519,
         0.27944712,  3.8892652 ],
       [ 0.14608012,  0.14686096,  0.08532154, ...,  0.08700882,
         0.04603094, -0.47681868]])

In [None]:
d_train = preprocessor.transform(training.drop(columns=['Timestamp','ETA']))

In [None]:
d_test = preprocessor.transform(testing)

In [None]:
prepro_y = MinMaxScaler()
prepro_y.fit(np.array(training.ETA).reshape(-1,1))
y = prepro_y.transform(np.array(training.ETA).reshape(-1,1))
y

array([[0.21118961],
       [0.15161352],
       [0.26274585],
       ...,
       [0.15848768],
       [0.26713767],
       [0.23467634]])

In [None]:
d_test = preprocessor.transform(testing)

In [None]:
forest_model = RandomForestRegressor()
forest_model.fit(d_train , y)

  


In [None]:
for j in range(0,80000,10000):
  X = d_train[j:j+10000]
  y =  training.ETA[j:j+10000]

  forest_model.fit(X,y)
  forest_model.n_estimators += 100
  print(np.sqrt(metrics.mean_squared_error(oot_df.ETA, forest_model.predict(d_train[70000:]))))



232.65038852404592
223.03812348728613
219.64375799334945
217.4335096352256
217.09047686539898
215.69386315761744
213.97300127169868
200.3266006414932


In [None]:
split = model_selection.TimeSeriesSplit(n_splits=20)
result= model_selection.cross_validate(pipeline,train_df.drop(columns=['ETA']), train_df['ETA'],cv=split,
                               scoring='neg_root_mean_squared_error')
result, -np.mean(result['test_score']), np.std(result['test_score'])

({'fit_time': array([ 1.96954727,  2.56212234,  3.16104078,  4.00381875,  4.91647792,
          5.84028339,  6.47989941,  7.86856914,  8.87671208,  9.32742667,
         10.57050085, 10.59952807, 12.22195888, 14.2204237 , 16.50415683,
         16.71502852, 16.78873849, 20.08929944, 21.70467567, 22.34933376]),
  'score_time': array([0.20513368, 0.25644565, 0.30045629, 0.32934189, 0.38876724,
         0.39366817, 0.40257025, 0.43843651, 0.42500973, 0.44950747,
         0.45359874, 0.49203897, 0.50861835, 0.47976851, 0.48998427,
         0.54773903, 0.53048944, 0.55715632, 0.5255456 , 0.56212568]),
  'test_score': array([-320.73313431, -252.38656324, -209.25529236, -189.52446208,
         -251.0086018 , -256.93604721, -222.55438091, -193.06151848,
         -183.04017131, -194.56721594, -235.35828086, -202.02487209,
         -176.79287082, -187.03032544, -202.41832104, -202.09121428,
         -217.34373576, -206.34423727, -200.71503986, -259.99529438])},
 218.1590789717261,
 34.315517335369

In [None]:
# test on DEV set
pipeline.fit(training.drop(columns=['ETA']), training.ETA)
np.sqrt(metrics.mean_squared_error(oot_df.ETA, pipeline.predict(oot_df)))



154.40718082594822

In [None]:
# make submission
# run test data through
submission = testing.copy()
submission['ETA'] = forest_model.predict(d_test)
submission.head()

Unnamed: 0_level_0,Timestamp,Origin_lat,Origin_lon,Destination_lat,Destination_lon,Trip_distance,ETA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
000V4BQX,2019-12-21 05:52:37+00:00,2.981,36.688,2.978,36.754,17549,1440.430751
003WBC5J,2019-12-25 21:38:53+00:00,3.032,36.769,3.074,36.751,7532,938.129423
004O4X3A,2019-12-29 21:30:29+00:00,3.035,36.711,3.01,36.758,10194,1186.400455
006CEI5B,2019-12-31 22:51:57+00:00,2.902,36.738,3.208,36.698,32768,2144.383981
009G0M2T,2019-12-28 21:47:22+00:00,2.86,36.692,2.828,36.696,4513,635.802963


In [None]:
submission[['ETA']].to_csv('_rnd_mini_submit_base.csv')

In [None]:
# see what pipeline is doing
# extract feature names from pipeline
def get_column_names_from_ColumnTransformer(column_transformer):    
    col_name = []
    for transformer_in_columns in column_transformer.transformers_:#the last transformer is ColumnTransformer's 'remainder'
        raw_col_name = transformer_in_columns[2]
        methods = transformer_in_columns[0]
        if isinstance(transformer_in_columns[1],Pipeline): 
            transformer = transformer_in_columns[1].steps[-1][1]
        else:
            transformer = transformer_in_columns[1]
        try:
            names = transformer.get_feature_names()
        except AttributeError: # if no 'get_feature_names' function, use raw column name
            names = raw_col_name
        if isinstance(names,np.ndarray): # eg.
            col_name += [methods + '__'+ str(i) for i in names.tolist()]
        elif isinstance(names,list):
            col_name += [methods + '__'+ str(i) for i in names]    
        elif isinstance(names,str):
            col_name.append(methods + '__'+ names)
    return col_name



In [None]:
cols = get_column_names_from_ColumnTransformer(pipeline.named_steps['preprocessor'])
a = pipeline.named_steps['preprocessor']
a.transformers[0][2]

['Origin_lat', 'Origin_lon', 'Destination_lat', 'Destination_lon']

In [None]:
a.named_transformers_['Coordninates'].predict(train_df[a.transformers[0][2]])

array([7, 7, 2, ..., 5, 5, 0], dtype=int32)

# add weather data and try other models