In [None]:
# craizy imports
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import seaborn as sns

train = pd.read_csv('train_revised.csv')
test = pd.read_csv('test_questions.csv')
submission_format = pd.read_csv('sample_submission.csv')

In [None]:
print('Train', train.shape)
print('Test', test.shape)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
submission_format.head()

In [None]:
train.car_type.unique()
# we will convert this to categorical value

In [None]:
train.travel_from.unique() 
# probably this will be dummy encoded

In [None]:
train.travel_to.unique()
# only one variable probably will drop it

In [None]:
train.payment_method.unique()
# will convert to categorical value

In [None]:
train.seat_number.unique()

In [125]:
# appropriate features
features =  ['ride_id','travel_date', 'travel_from','travel_time','car_type','max_capacity']
training_features = train[features]
training_labels = train['seat_number']
test_features = test[features]

print('training_features', training_features.shape)
print('test_features', test_features.shape)
print('training_labels', training_labels.shape)

training_features (51645, 6)
test_features (1111, 6)
training_labels (51645,)


In [126]:
# for easy preprocessing
df_train_test = [training_features,test_features]

# Handle null values

In [None]:
training_features.isnull().any(axis = 0)

In [None]:
training_labels.isnull().any(axis = 0)

In [None]:
test_features.isnull().any(axis = 0)

# Explanatory data analysis


In [None]:
training_features.dtypes

In [127]:
# convert car type to categorical
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
for df in df_train_test:
    df['car_type'] = encoder.fit_transform(df.car_type)
training_labels = encoder.fit_transform(training_labels)

In [None]:
test_features.dtypes

In [None]:
training_features.travel_from.unique()

In [129]:
training_features = pd.get_dummies(training_features, columns= cols_to_dummy)
test_features = pd.get_dummies(test_features, columns=cols_to_dummy)

In [130]:
training_features.shape

(51645, 22)

In [131]:
training_features.head()

Unnamed: 0,ride_id,travel_date,travel_time,car_type,max_capacity,travel_from_Awendo,travel_from_Homa Bay,travel_from_Kehancha,travel_from_Kendu Bay,travel_from_Keroka,...,travel_from_Kisii,travel_from_Mbita,travel_from_Migori,travel_from_Ndhiwa,travel_from_Nyachenge,travel_from_Oyugis,travel_from_Rodi,travel_from_Rongo,travel_from_Sirare,travel_from_Sori
0,1442,17-10-17,7:15,0,49,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,5437,19-11-17,7:12,0,49,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,5710,26-11-17,7:05,0,49,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,5777,27-11-17,7:10,0,49,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5778,27-11-17,7:12,0,49,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [133]:
test_features.columns

Index(['ride_id', 'travel_date', 'travel_time', 'car_type', 'max_capacity',
       'travel_from_Awendo', 'travel_from_Homa Bay', 'travel_from_Kehancha',
       'travel_from_Keroka', 'travel_from_Kijauri', 'travel_from_Kisii',
       'travel_from_Mbita', 'travel_from_Migori', 'travel_from_Ndhiwa',
       'travel_from_Nyachenge', 'travel_from_Oyugis', 'travel_from_Rodi',
       'travel_from_Rongo', 'travel_from_Sirare', 'travel_from_Sori'],
      dtype='object')

In [134]:
training_features.columns

Index(['ride_id', 'travel_date', 'travel_time', 'car_type', 'max_capacity',
       'travel_from_Awendo', 'travel_from_Homa Bay', 'travel_from_Kehancha',
       'travel_from_Kendu Bay', 'travel_from_Keroka', 'travel_from_Keumbu',
       'travel_from_Kijauri', 'travel_from_Kisii', 'travel_from_Mbita',
       'travel_from_Migori', 'travel_from_Ndhiwa', 'travel_from_Nyachenge',
       'travel_from_Oyugis', 'travel_from_Rodi', 'travel_from_Rongo',
       'travel_from_Sirare', 'travel_from_Sori'],
      dtype='object')

In [135]:
train.travel_from.unique()

array(['Migori', 'Keroka', 'Homa Bay', 'Kisii', 'Keumbu', 'Rongo',
       'Kijauri', 'Oyugis', 'Awendo', 'Sirare', 'Nyachenge', 'Kehancha',
       'Kendu Bay', 'Sori', 'Rodi', 'Mbita', 'Ndhiwa'], dtype=object)

In [136]:
test.travel_from.unique()

array(['Kisii', 'Rongo', 'Sirare', 'Homa Bay', 'Rodi', 'Awendo', 'Migori',
       'Kijauri', 'Nyachenge', 'Kehancha', 'Keroka', 'Sori', 'Ndhiwa',
       'Mbita', 'Oyugis'], dtype=object)

In [137]:
# Keumbu and kendu bay are missing in the test set, so we are going to great their dummy encodings
test_features['travel_from_Keumbu'] = np.zeros(1111, dtype=int)
test_features['travel_from_Kendu Bay'] = np.zeros(1111, dtype=int)

In [138]:
test_features.shape

(1111, 22)

In [139]:
training_features.describe()

Unnamed: 0,ride_id,car_type,max_capacity,travel_from_Awendo,travel_from_Homa Bay,travel_from_Kehancha,travel_from_Kendu Bay,travel_from_Keroka,travel_from_Keumbu,travel_from_Kijauri,travel_from_Kisii,travel_from_Mbita,travel_from_Migori,travel_from_Ndhiwa,travel_from_Nyachenge,travel_from_Oyugis,travel_from_Rodi,travel_from_Rongo,travel_from_Sirare,travel_from_Sori
count,51645.0,51645.0,51645.0,51645.0,51645.0,51645.0,51645.0,51645.0,51645.0,51645.0,51645.0,51645.0,51645.0,51645.0,51645.0,51645.0,51645.0,51645.0,51645.0,51645.0
mean,10188.645793,0.380676,34.534321,0.034621,0.122064,0.036964,1.9e-05,0.019131,0.000426,0.019963,0.437738,0.007319,0.136064,0.00457,0.009914,9.7e-05,0.007842,0.076058,0.086146,0.001065
std,2211.295708,0.485558,18.451193,0.18282,0.327363,0.188675,0.0044,0.136985,0.020635,0.139875,0.496113,0.085239,0.342859,0.067445,0.099074,0.009839,0.088208,0.265093,0.280582,0.032617
min,1442.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8287.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,10353.0,0.0,49.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,12029.0,1.0,49.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,20117.0,1.0,49.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Feature engineering

In [112]:
import featuretools as ft

In [113]:
training_features.columns

Index(['travel_date', 'travel_time', 'car_type', 'max_capacity',
       'travel_from_Awendo', 'travel_from_Homa Bay', 'travel_from_Kehancha',
       'travel_from_Kendu Bay', 'travel_from_Keroka', 'travel_from_Keumbu',
       'travel_from_Kijauri', 'travel_from_Kisii', 'travel_from_Mbita',
       'travel_from_Migori', 'travel_from_Ndhiwa', 'travel_from_Nyachenge',
       'travel_from_Oyugis', 'travel_from_Rodi', 'travel_from_Rongo',
       'travel_from_Sirare', 'travel_from_Sori'],
      dtype='object')

In [150]:
# features to feature engineer
features = ['travel_date','travel_time','car_type','max_capacity']
combi['id'] = combi['ride_id'] + combi['max_capacity']
# combine training and test for easy feature engineering
combi = training_features.append(test_features, ignore_index = True)
combi.shape

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


(52756, 22)

In [151]:
# creating an entity set 'es'
es = ft.EntitySet(id = 'seat_number')

# adding a dataframe
es.entity_from_dataframe(entity_id = 'Nairobi_Movement', dataframe = combi, index = 'id')




Entityset: seat_number
  Entities:
    Nairobi_Movement [Rows: 52756, Columns: 23]
  Relationships:
    No relationships

In [None]:
es.normalize_entity(base_entity_id = 'Nairobi_Movement', new_entity_id ='seats', index = 'ride_id',
                   additional_variables = features)

In [144]:
feature_matrix, feature_names = ft.dfs(entityset = es, target_entity = 'Nairobi_Movement',
                                      max_depth = 2, verbose = 1, n_jobs = 3)

Built 131 features
EntitySet scattered to workers in 12.239 seconds
Elapsed: 01:03 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


In [146]:
feature_matrix.columns

Index(['ride_id', 'travel_from_Awendo', 'travel_from_Homa Bay',
       'travel_from_Kehancha', 'travel_from_Kendu Bay', 'travel_from_Keroka',
       'travel_from_Keumbu', 'travel_from_Kijauri', 'travel_from_Kisii',
       'travel_from_Mbita',
       ...
       'seats.MEAN(Nairobi_Movement.travel_from_Sori)',
       'seats.COUNT(Nairobi_Movement)', 'seats.DAY(travel_date)',
       'seats.DAY(travel_time)', 'seats.YEAR(travel_date)',
       'seats.YEAR(travel_time)', 'seats.MONTH(travel_date)',
       'seats.MONTH(travel_time)', 'seats.WEEKDAY(travel_date)',
       'seats.WEEKDAY(travel_time)'],
      dtype='object', length=131)

In [160]:
feature_matrix.drop(['ride_id'], axis = 1, inplace = True)
train= feature_matrix[:51645]
test = feature_matrix[51645:]

# Modeling 

In [184]:
from xgboost import XGBClassifier

In [213]:
from sklearn.model_selection import train_test_split

# splitting train data into training and validation set
xtrain, xvalid, ytrain, yvalid = train_test_split(train, training_labels, test_size=0.25, random_state=11)

In [185]:
# feature selection
xgb = XGBClassifier()
xgb.fit(xtrain, ytrain)
imp = pd.DataFrame(xgb.feature_importances_ ,columns = ['Importance'],index = xtrain.columns)
imp = imp.sort_values(['Importance'], ascending = False)

print(imp[100:])

                                                    Importance
seats.car_type                                        0.235630
seats.DAY(travel_date)                                0.079611
seats.MONTH(travel_date)                              0.055879
seats.COUNT(Nairobi_Movement)                         0.055592
seats.WEEKDAY(travel_date)                            0.051527
seats.SUM(Nairobi_Movement.travel_from_Kehancha)      0.040565
seats.SUM(Nairobi_Movement.travel_from_Kisii)         0.040360
seats.SUM(Nairobi_Movement.travel_from_Homa Bay)      0.039580
seats.SUM(Nairobi_Movement.travel_from_Rongo)         0.038348
seats.SUM(Nairobi_Movement.travel_from_Awendo)        0.035474
travel_from_Rongo                                     0.031450
seats.SUM(Nairobi_Movement.travel_from_Kijauri)       0.029520
travel_from_Awendo                                    0.026154
seats.SUM(Nairobi_Movement.travel_from_Migori)        0.019256
seats.SUM(Nairobi_Movement.travel_from_Rodi)          0

In [231]:
feature_importance = ['seats.car_type','seats.DAY(travel_date)','seats.MONTH(travel_date)',
'seats.COUNT(Nairobi_Movement)','seats.WEEKDAY(travel_date)',                          
'seats.SUM(Nairobi_Movement.travel_from_Kehancha)',     
'seats.SUM(Nairobi_Movement.travel_from_Kisii)',       
'seats.SUM(Nairobi_Movement.travel_from_Homa Bay)',   
'seats.SUM(Nairobi_Movement.travel_from_Rongo)',       
'seats.SUM(Nairobi_Movement.travel_from_Awendo)',    
'travel_from_Rongo',                                   
'seats.SUM(Nairobi_Movement.travel_from_Kijauri)',     
'travel_from_Awendo',                                   
'seats.SUM(Nairobi_Movement.travel_from_Migori)',       
'seats.SUM(Nairobi_Movement.travel_from_Rodi)',         
'travel_from_Kijauri',                                
'seats.SUM(Nairobi_Movement.travel_from_Ndhiwa)',       
'travel_from_Kehancha',                                 
'seats.SUM(Nairobi_Movement.travel_from_Keroka)',       
'seats.SUM(Nairobi_Movement.travel_from_Nyachenge)',    
'seats.STD(Nairobi_Movement.travel_from_Awendo)',       
'travel_from_Kisii',                                   
'seats.SUM(Nairobi_Movement.travel_from_Sirare)',      
'travel_from_Nyachenge',                                
'travel_from_Rodi',                                     
'travel_from_Sirare',                                  
'travel_from_Keroka',                                   
'travel_from_Homa Bay',                               
'seats.YEAR(travel_date)',                              
'seats.SKEW(Nairobi_Movement.travel_from_Awendo)',     
'seats.SUM(Nairobi_Movement.travel_from_Mbita)',        
'seats.SUM(Nairobi_Movement.travel_from_Sori)',        
'travel_from_Keumbu',                                  
'travel_from_Migori',                                   
'travel_from_Ndhiwa',                                   
'travel_from_Mbita',                                    
'travel_from_Sori','seats.SUM(Nairobi_Movement.travel_from_Keumbu)']      

In [233]:
train_important = train[feature_importance]
test_important = test[feature_importance]

In [234]:
train_important.head()

Unnamed: 0_level_0,seats.car_type,seats.DAY(travel_date),seats.MONTH(travel_date),seats.COUNT(Nairobi_Movement),seats.WEEKDAY(travel_date),seats.SUM(Nairobi_Movement.travel_from_Kehancha),seats.SUM(Nairobi_Movement.travel_from_Kisii),seats.SUM(Nairobi_Movement.travel_from_Homa Bay),seats.SUM(Nairobi_Movement.travel_from_Rongo),seats.SUM(Nairobi_Movement.travel_from_Awendo),...,seats.YEAR(travel_date),seats.SKEW(Nairobi_Movement.travel_from_Awendo),seats.SUM(Nairobi_Movement.travel_from_Mbita),seats.SUM(Nairobi_Movement.travel_from_Sori),travel_from_Keumbu,travel_from_Migori,travel_from_Ndhiwa,travel_from_Mbita,travel_from_Sori,seats.SUM(Nairobi_Movement.travel_from_Keumbu)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,17,10,1,1,0,0,0,0,0,...,2017,,0,0,0,1,0,0,0,0
1,0,19,11,1,6,0,0,0,0,0,...,2017,,0,0,0,1,0,0,0,0
2,0,26,11,1,6,0,0,0,0,0,...,2017,,0,0,0,0,0,0,0,0
3,0,27,11,5,0,0,0,5,0,0,...,2017,0.0,0,0,0,0,0,0,0,0
4,0,27,11,31,0,0,0,0,0,0,...,2017,0.0,0,0,0,1,0,0,0,0


In [238]:
xtrain, xvalid, ytrain, yvalid = train_test_split(train_important, training_labels, test_size=0.25, random_state=11)

In [241]:
model = XGBClassifier(learning_rate=0.01)
model.fit(xtrain,ytrain)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [242]:
from sklearn.metrics import mean_absolute_error
pred = model.predict(xvalid)
mean_absolute_error(yvalid, pred)

  if diff:


19.556691449814128