In [544]:
#import train data using pandas
#key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count

import pandas as pd
import numpy as np
from pyproj import Geod
from scipy import stats
import seaborn
import matplotlib.pyplot as plt
import math
from math import sqrt
from scipy.stats import zscore
from sklearn import preprocessing
import os
import csv

# Load the training set. usecols=range(1, 8) skips column 0 (`key`, per the header
# listed above) and keeps the remaining seven columns.
# `on_bad_lines='warn'` skips malformed rows while reporting them; it replaces
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in pandas 2.0.
taxi_train_df = pd.read_csv('train.csv', on_bad_lines='warn', usecols=range(1, 8))
print("Loaded %d rows" % len(taxi_train_df))

Loaded 55423856 rows


In [None]:
#clean the data



num_cols = [
    'fare_amount',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'passenger_count',
]
int_cols = ['passenger_count']
date_cols = ['pickup_datetime']

# Coerce the numeric columns: anything that cannot be parsed becomes NaN,
# and the result is stored as float32.
taxi_train_df[num_cols] = taxi_train_df[num_cols].apply(pd.to_numeric, errors='coerce').astype(np.float32)

# Parse the pickup timestamps: unparseable values become NaT.
taxi_train_df[date_cols] = taxi_train_df[date_cols].apply(pd.to_datetime, errors='coerce')

# Mark invalid values as NaN so the single dropna() below removes those rows:
#   * a latitude or longitude that is exactly 0
for coord_col in ('pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'):
    taxi_train_df.loc[taxi_train_df[coord_col] == 0, coord_col] = np.nan
#   * more than 6 passengers
taxi_train_df.loc[taxi_train_df['passenger_count'] > 6, 'passenger_count'] = np.nan
#   * a fare of exactly 0 (flagged through passenger_count; any NaN in a row drops the row)
taxi_train_df.loc[taxi_train_df['fare_amount'] == 0, 'passenger_count'] = np.nan

# Drop every row that contains at least one NaN/NaT.
taxi_train_df = taxi_train_df.dropna()

#taxi_train_df.pickup_longitude = taxi_train_df.pickup_longitude.round(3)
#taxi_train_df['pickup_latitude'] = taxi_train_df['pickup_latitude'].round(6)
'''taxi_train_df.style.format({
    'pickup_longitude': '{:,.1f}'.format,
    'dropoff_longitude': '{:,.3f}'.format,
})'''

# Show the column dtypes after coercion and NaN removal.
print(taxi_train_df.dtypes)

#(disabled) filter out rows where pick_lat==drop_lat and pick_long==drop_long
#taxi_train_df = [row for index,row in taxi_train_df.iterrows() if row['pickup_latitude'] != row['dropoff_latitude'] or row['pickup_longitude'] != row['dropoff_longitude'] or row['pickup_latitude']==0 or row['dropoff_latitude'] == 0 or row['pickup_longitude']==0 or row['dropoff_longitude']==0]



# Report how many rows survived cleaning and print the first row as a sanity check.
print(len(taxi_train_df))
print(taxi_train_df.values[0])


In [None]:
# Shared geodesic calculator on the WGS84 ellipsoid.
wgs84_geod = Geod(ellps='WGS84')

def get_euclidean_distance(row):
    """Return the pickup-to-dropoff distance for one ride.

    Despite the name, this is the geodesic distance computed by
    ``Geod.inv`` (its third return value; metres according to the pyproj
    documentation), not a planar Euclidean distance.

    row: mapping/Series with 'pickup_longitude', 'pickup_latitude',
         'dropoff_longitude' and 'dropoff_latitude'.
    """
    # inv() returns (forward azimuth, back azimuth, distance); only the distance is used.
    return wgs84_geod.inv(
        row['pickup_longitude'], row['pickup_latitude'],
        row['dropoff_longitude'], row['dropoff_latitude'],
    )[2]

#get manhattan distance between 2 lat long points
def get_manhattan_distance(row):
    """Approximate the Manhattan (two-leg) distance of one ride.

    The absolute longitude and latitude differences give the ratio of the
    two legs of a right triangle whose hypotenuse is ``row['euclidean_dist']``;
    the function returns the sum of those two legs, in the same unit as
    ``euclidean_dist``. If either difference is zero the ride is axis-aligned
    and ``euclidean_dist`` is returned unchanged.

    NOTE(review): a degree of longitude is treated as the same length as a
    degree of latitude - confirm this approximation is acceptable.
    """
    delta_lon = abs(row['pickup_longitude'] - row['dropoff_longitude'])
    delta_lat = abs(row['pickup_latitude'] - row['dropoff_latitude'])
    straight_line = row['euclidean_dist']

    if delta_lat == 0 or delta_lon == 0:
        return straight_line

    # slope = lon leg / lat leg; hypotenuse^2 = lat_leg^2 * (1 + slope^2)
    slope = delta_lon / delta_lat
    lat_leg = straight_line / sqrt(1 + (slope * slope))
    return lat_leg + slope * lat_leg
    

#get the pickup hour and minute encoded as a single integer (hour*100 + minute)
def get_hour_minute(row):
    """Encode the pickup time of day as the integer hour*100 + minute (13:45 -> 1345)."""
    pickup = row['pickup_datetime']
    hour_part = int(pickup.hour) * 100
    return hour_part + int(pickup.minute)

def get_date_month(row):
    """Encode the pickup calendar date as the integer month*100 + day (4 March -> 304)."""
    pickup = row['pickup_datetime']
    month_part = int(pickup.month) * 100
    return month_part + int(pickup.day)

def get_fare_per_km(row):
    """Return fare_amount * 1000 / manhattan_dist for one ride.

    The factor 1000 converts a per-metre rate to a per-kilometre rate,
    which presumes manhattan_dist is in metres. A manhattan_dist of zero
    is not handled here.
    """
    scaled_fare = row['fare_amount'] * 1000
    return scaled_fare / row['manhattan_dist']

def get_year_month(row):
    """Encode the pickup month as the integer year*100 + month (March 2015 -> 201503)."""
    pickup = row['pickup_datetime']
    year_part = int(pickup.year) * 100
    return year_part + int(pickup.month)

def get_time_slot(row):
    """Return the truncated value of time_of_day / 100 (the hour when time_of_day is hour*100 + minute)."""
    hundreds = row['time_of_day'] / 100
    return int(hundreds)

def get_day_of_week(row):
    """Return the weekday index of the pickup timestamp via .weekday() (Monday is 0 for datetime objects)."""
    pickup = row['pickup_datetime']
    return pickup.weekday()

def add_set1_features(taxi_train_df):
    """Add distance and pickup-time columns to the frame, in place.

    Writes 'euclidean_dist', 'manhattan_dist', 'time_of_day' and
    'day_of_year', each computed row by row. Returns None.
    """
    # Order matters: manhattan_dist reads the euclidean_dist column written just before it.
    row_features = (
        ('euclidean_dist', get_euclidean_distance),
        ('manhattan_dist', get_manhattan_distance),
        ('time_of_day', get_hour_minute),
        ('day_of_year', get_date_month),
    )
    for column, row_func in row_features:
        taxi_train_df[column] = taxi_train_df.apply(row_func, axis=1)

def add_set2_features(taxi_train_df):
    """Add calendar-bucket columns to the frame, in place, then print its first row.

    Writes 'month_of_year', 'time_slot' and 'day_of_week'. 'time_slot' is
    derived from the existing 'time_of_day' column, so that column must
    already be present. Returns None.
    """
    row_features = (
        ('month_of_year', get_year_month),
        ('time_slot', get_time_slot),
        ('day_of_week', get_day_of_week),
    )
    for column, row_func in row_features:
        taxi_train_df[column] = taxi_train_df.apply(row_func, axis=1)
    print(taxi_train_df.values[0])
    
# Derive the distance and pickup-time columns in place.
add_set1_features(taxi_train_df)

# Remove rides whose computed distance is exactly zero (this also keeps the
# fare-per-km division below away from a zero denominator).
zero_distance = taxi_train_df['euclidean_dist'] == 0
taxi_train_df.loc[zero_distance, 'euclidean_dist'] = np.nan
taxi_train_df = taxi_train_df.dropna()

taxi_train_df['fare_per_km'] = taxi_train_df.apply(get_fare_per_km, axis=1)
add_set2_features(taxi_train_df)

# Remove rides whose fare per km is above 30 or below 0.3.
implausible_rate = (taxi_train_df['fare_per_km'] > 30) | (taxi_train_df['fare_per_km'] < 0.3)
taxi_train_df.loc[implausible_rate, 'fare_per_km'] = np.nan
taxi_train_df = taxi_train_df.dropna()


In [None]:
#2.1
# Each line prints the Pearson r (element [0] of the pearsonr result) for one pair of columns.
print("Pearson correlation between fare amount and euclidean distance: %f" % stats.pearsonr(taxi_train_df.fare_amount, taxi_train_df.euclidean_dist)[0])

#2.2
print("Pearson correlation between time of day and distance travelled: %f" % stats.pearsonr(taxi_train_df.time_of_day, taxi_train_df.manhattan_dist)[0])

#2.3
# The message previously read "fare amount and manhattan distance", which did not
# match the columns actually being correlated (time_of_day and fare_amount).
print("Pearson correlation between time of day and fare amount: %f" % stats.pearsonr(taxi_train_df.time_of_day, taxi_train_df.fare_amount)[0])


In [None]:
#3.1.1 Plot between taxi fare and distance travelled
# Fare on the x-axis against the computed ride distance.
plot = taxi_train_df.plot.scatter(x='fare_amount', y='euclidean_dist')
# Encoded calendar date (month*100 + day) against fare; `plot` now refers to this second axes.
plot = taxi_train_df.plot.scatter(x='day_of_year', y='fare_amount')

In [None]:
#3.2.1 Plot between time of day and distance travelled
# Encoded pickup time (hour*100 + minute) against Manhattan distance.
plot = taxi_train_df.plot.scatter(x='time_of_day', y='manhattan_dist')

In [None]:
#3.3.1 Plot between time of day and taxi fare
# Encoded pickup time (hour*100 + minute) against fare amount.
plot = taxi_train_df.plot.scatter(x='time_of_day', y='fare_amount')

In [None]:
# Mean fare per km for each time_slot value.
taxi_train_df.groupby('time_slot')['fare_per_km'].mean()

In [None]:
# Line plot of the mean fare per km for each time_slot value.
taxi_train_df.groupby('time_slot')['fare_per_km'].mean().plot()

In [None]:


#taxi_train_df[(taxi_train_df["time_slot"] >7) & (taxi_train_df["time_slot"] < 13)].groupby('day_of_week')['fare_per_km'].aggregate('mean')

#taxi_train_df[(taxi_train_df["time_slot"] >14) & (taxi_train_df["time_slot"] < 20)].groupby('day_of_week')['fare_per_km'].aggregate('mean').plot().bar()
#plt.show()

In [None]:
# Mean fare per km by day_of_week, restricted to rides with 14 < time_slot < 20.
slot_mask = (taxi_train_df["time_slot"] > 14) & (taxi_train_df["time_slot"] < 20)
taxi_train_df[slot_mask].groupby('day_of_week')['fare_per_km'].mean()


In [None]:
#taxi_train_df.day_of_week = pd.Categorical(pd.factorize(taxi_train_df.day_of_week)[0])



#taxi_train_df.manhattan_dist = taxi_train_df.manhattan_dist/taxi_train_df.manhattan_dist.max()
#taxi_train_df.time_slot = taxi_train_df.time_slot/taxi_train_df.time_slot.max()

#taxi_train_df.day_of_week = taxi_train_df.day_of_week/taxi_train_df.day_of_week.max()

#filtered_df = taxi_train_df.filter(['A','B','D'], axis=1)


def get_input_matrix(taxi_train_df):
    """Build the baseline design matrix: one column of manhattan_dist and one column of ones.

    Returns a 2-D array with one row per row of the frame; the ones column
    acts as the intercept term in the least-squares fit.
    """
    distance = taxi_train_df.manhattan_dist
    intercept = np.ones(len(taxi_train_df))
    return np.column_stack((distance, intercept))

# Design matrix and regression target for the fit.
features_cols = get_input_matrix(taxi_train_df)
predict_cols = np.array(taxi_train_df.fare_amount)

# Print both shapes, one per line, to confirm the row counts agree.
print(features_cols.shape, predict_cols.shape, sep='\n')


In [None]:
# Least-squares fit; element [0] of the lstsq result is the solution vector
# (one weight per column of features_cols).
coefficient_weight = np.linalg.lstsq(features_cols, predict_cols, rcond=None)[0]
print(coefficient_weight)

In [None]:
# Load the test rides and parse their pickup timestamps (unparseable values become NaT).
test_data_df = pd.read_csv('test.csv')
test_data_df[date_cols] = test_data_df[date_cols].apply(pd.to_datetime, errors='coerce')

# Derive the same engineered columns that exist on the training frame.
add_set1_features(test_data_df)
add_set2_features(test_data_df)

# Apply the fitted weights and round the predictions to 2 decimals.
test_X = get_input_matrix(test_data_df)
test_y_predictions = (test_X @ coefficient_weight).round(decimals=2)

# Write the two-column (key, fare_amount) submission file.
submission = test_data_df[['key']].assign(fare_amount=test_y_predictions)
submission.to_csv('prediction-baseline.csv', index=False)


In [None]:
#taxi_train_df.day_of_week = pd.Categorical(pd.factorize(taxi_train_df.day_of_week)[0])



#taxi_train_df.manhattan_dist = taxi_train_df.manhattan_dist/taxi_train_df.manhattan_dist.max()
#taxi_train_df.time_slot = taxi_train_df.time_slot/taxi_train_df.time_slot.max()

#taxi_train_df.day_of_week = taxi_train_df.day_of_week/taxi_train_df.day_of_week.max()

#filtered_df = taxi_train_df.filter(['A','B','D'], axis=1)


def get_input_matrix(taxi_train_df):
    """Build the design matrix with columns manhattan_dist, time_slot and day_of_week.

    Redefines (shadows) any earlier get_input_matrix in the notebook.
    NOTE(review): there is no column of ones here, so the fitted model has
    no intercept term - confirm that this is intended.
    """
    feature_columns = (
        taxi_train_df.manhattan_dist,
        taxi_train_df.time_slot,
        taxi_train_df.day_of_week,
    )
    return np.column_stack(feature_columns)

# Design matrix and regression target for the fit.
features_cols = get_input_matrix(taxi_train_df)
predict_cols = np.array(taxi_train_df.fare_amount)

# Print both shapes, one per line, to confirm the row counts agree.
print(features_cols.shape, predict_cols.shape, sep='\n')


In [None]:
# Least-squares fit; element [0] of the lstsq result is the solution vector
# (one weight per column of features_cols).
coefficient_weight = np.linalg.lstsq(features_cols, predict_cols, rcond=None)[0]
print(coefficient_weight)

In [None]:
# Load the test rides and parse their pickup timestamps (unparseable values become NaT).
test_data_df = pd.read_csv('test.csv')
test_data_df[date_cols] = test_data_df[date_cols].apply(pd.to_datetime, errors='coerce')

# Derive the same engineered columns that exist on the training frame.
add_set1_features(test_data_df)
add_set2_features(test_data_df)

# Apply the fitted weights and round the predictions to 2 decimals.
test_X = get_input_matrix(test_data_df)
test_y_predictions = (test_X @ coefficient_weight).round(decimals=2)

# Write the two-column (key, fare_amount) submission file.
submission = test_data_df[['key']].assign(fare_amount=test_y_predictions)
submission.to_csv('prediction-with-extra-features.csv', index=False)





In [None]:
def get_inflation_data(file):
    """Read a CSV file into a dict mapping each row's first field to its second.

    file: path of the CSV file to read.
    Returns: dict of str -> str; values are not converted to numbers here.
             If a key occurs more than once, the last occurrence wins.
    Rows with fewer than two fields (e.g. blank lines) are skipped rather
    than raising IndexError.
    """
    # newline='' lets the csv module do its own line-ending handling, as its documentation recommends.
    with open(file, mode='r', newline='') as f:
        reader = csv.reader(f)
        return {rows[0]: rows[1] for rows in reader if len(rows) >= 2}

# Lookup table loaded from the local CSV; keys and values are kept as strings.
cpi_inflation = get_inflation_data('inflation-data.csv')

def get_inflation_adjusted_price(row):
    """Rescale one ride's fare by the ratio of the '201808' table entry to the ride month's entry.

    row: mapping/Series with 'fare_amount' and 'month_of_year'; str(month_of_year)
         must be a key of cpi_inflation (KeyError otherwise).
    """
    reference_cpi = float(cpi_inflation['201808'])
    ride_month_cpi = float(cpi_inflation[str(row['month_of_year'])])
    return (row['fare_amount'] * reference_cpi) / ride_month_cpi

taxi_train_df['inflation_adjusted_fare'] = taxi_train_df.apply(get_inflation_adjusted_price, axis=1)


In [None]:
def get_input_matrix(taxi_train_df):
    """Build the design matrix: manhattan_dist, pickup_latitude, pickup_longitude and a column of ones.

    Redefines (shadows) any earlier get_input_matrix in the notebook. The
    ones column acts as the intercept term in the least-squares fit.
    """
    feature_columns = (
        taxi_train_df.manhattan_dist,
        taxi_train_df.pickup_latitude,
        taxi_train_df.pickup_longitude,
        np.ones(len(taxi_train_df)),
    )
    return np.column_stack(feature_columns)


# Design matrix plus the inflation-adjusted fare as the regression target.
features_cols = get_input_matrix(taxi_train_df)
predict_cols = np.array(taxi_train_df.inflation_adjusted_fare)

# Least-squares fit; element [0] of the lstsq result is the solution vector.
coefficient_weight = np.linalg.lstsq(features_cols, predict_cols, rcond=None)[0]
print(coefficient_weight)


In [None]:
# Load the test rides and parse their pickup timestamps (unparseable values become NaT).
test_data_df = pd.read_csv('test.csv')
test_data_df[date_cols] = test_data_df[date_cols].apply(pd.to_datetime, errors='coerce')

# Derive the same engineered columns that exist on the training frame.
add_set1_features(test_data_df)
add_set2_features(test_data_df)

# Apply the fitted weights and round the predictions to 2 decimals.
test_X = get_input_matrix(test_data_df)
test_y_predictions = (test_X @ coefficient_weight).round(decimals=2)

def get_deflated_data(df, predictions):
    """Rescale predictions from the '201808' price level back to each ride's own month.

    Each prediction i is multiplied by cpi_inflation[str(month_of_year of row i)],
    divided by cpi_inflation['201808'], and rounded to 2 decimals.

    df: frame with a 'month_of_year' column, aligned by position with predictions.
    predictions: mutable sequence/array of predicted fares; it is modified
                 in place and also returned.
    Raises KeyError if a month is missing from cpi_inflation.

    Two defects of the previous version are fixed: it always read the month of
    row 1 instead of row i, and it read the global test_data_df instead of `df`.
    """
    base_cpi = float(cpi_inflation['201808'])
    # Positional list of months, so a non-default index on df cannot misalign rows.
    ride_months = df['month_of_year'].tolist()
    for i in range(len(predictions)):
        month_cpi = float(cpi_inflation[str(ride_months[i])])
        predictions[i] = np.round(predictions[i] * month_cpi / base_cpi, decimals=2)
    return predictions


# Rescale the predictions with get_deflated_data before writing them out.
deflated_predictions = get_deflated_data(test_data_df, test_y_predictions)

# Write the two-column (key, fare_amount) submission file.
submission = test_data_df[['key']].assign(fare_amount=deflated_predictions)
submission.to_csv('prediction-with-inflation.csv', index=False)
