In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor

In [2]:
# Load training data, reshape, add departure time as an integer number of seconds and add day of week:
df = pd.read_csv('train_revised.csv', parse_dates=['travel_date'], dayfirst=True)
train = df.groupby(['ride_id', 'travel_date', 'travel_time', 'travel_from', 'max_capacity']).size().reset_index(name='Count') #sort=False if needed?
train["travel_time"] = train["travel_time"].str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1]))
train['day'] = train['travel_date'].dt.dayofweek

In [3]:
# The same for the test data
test = pd.read_csv('test_questions.csv', parse_dates=['travel_date'], dayfirst=True).drop(['car_type', 'travel_to'], axis=1)
test["travel_time"] = test["travel_time"].str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1]))
test['day'] = test['travel_date'].dt.dayofweek

In [4]:
# The sample submission file
sample = pd.read_csv('sample_submission.csv')

In [5]:
# Combine training and test data for now, so that we can add uber movement data all in one go
train['t'] = 0
test['t'] = 1
X = pd.concat([train, test], sort=False)

In [6]:
# Load travel times from Uber movement data ( 3 x 3month periods)
t1 = pd.read_csv('Travel_Times_Daily_1.csv',parse_dates=['Date'])
t2 = pd.read_csv('Travel_Times_Daily_2.csv',parse_dates=['Date'])
t3 = pd.read_csv('Travel_Times_Daily_3.csv',parse_dates=['Date'])
travel_times = pd.concat([t1, t2, t3], ignore_index=True)
travel_times = travel_times.fillna(method='ffill')[['Daily Mean Travel Time (Seconds)', 'Date']]
travel_times['Date'] = pd.to_datetime(travel_times['Date'])
travel_times.head(1)

Unnamed: 0,Daily Mean Travel Time (Seconds),Date
0,2926.0,2017-12-15


In [7]:
# Merge with our contest data
X['Date'] = X['travel_date']
X.set_index('travel_date', inplace=True)
X = X.merge(travel_times, how='left', on='Date')
X.head(1)

Unnamed: 0,ride_id,travel_time,travel_from,max_capacity,Count,day,t,Date,Daily Mean Travel Time (Seconds)
0,1442,435,Migori,49,1.0,1,0,2017-10-17,2698.0


In [8]:
# Create the model
model = CatBoostRegressor(iterations=200, 
                          depth=4, 
                          learning_rate=0.5, 
                          loss_function='RMSE', verbose=False)

In [9]:
in_cols = ['travel_time', 'travel_from', 'max_capacity', 'day'] #'Daily Mean Travel Time (Seconds)' as an option

In [10]:
# Train model
tr = X.loc[X.t == 0]
model.fit(tr[in_cols], tr['Count'], cat_features=['travel_from', 'max_capacity', 'day'])

<catboost.core.CatBoostRegressor at 0x7f9566ec9860>

In [11]:
# Score model
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(model.predict(tr[in_cols]), tr['Count']))

3.7743491597009817


In [12]:
# Make predictions and append to the sample submission data, and save as csv
te = X.loc[X.t == 1]
te[in_cols].head()
te = X.loc[X.t == 1]
sample['number_of_ticket'][5:] = model.predict(te[in_cols])[5:] # Ignore the warning
sample.to_csv('catboost_predictions.csv', index=False)
sample.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,ride_id,number_of_ticket
0,247,4.0
1,256,2.0
2,275,5.0
3,285,7.0
4,286,9.0
5,287,6.747785
6,288,8.711852
7,292,9.485677
8,298,8.711852
9,302,8.711852
