In [None]:
import pandas as pd
import calendar
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the training set and discard, in a single call, every column that is
# not used further on: the two bookkeeping columns and the four raw coordinates.
# NOTE(review): nothing in this cell computes a distance. Later cells read a
# 'distance' column, so it is presumably already stored in train.csv
# (derived from the coordinates outside this notebook) — confirm.
train = pd.read_csv(r'train.csv').drop(
    columns=[
        'Unnamed: 0',
        'key',
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
    ]
)
# Parse the pickup timestamp column into datetime values.
train["pickup_datetime"] = pd.to_datetime(train["pickup_datetime"])

In [None]:
# Load the test set and drop its four raw coordinate columns.
# NOTE(review): the 'distance' column read from this frame in the next cell is
# presumably already stored in test.csv — confirm.
test = pd.read_csv(r'test.csv').drop(
    columns=[
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
    ]
)

In [None]:
# Longest trip distance present in the test set.
max_test_distance = test['distance'].max()
# Training rows whose distance exceeds that maximum.
too_long = train[train['distance'] > max_test_distance]
# Remove those rows from the training frame (in place, by index label).
train.drop(index=too_long.index, inplace=True)

In [None]:
# Derive calendar features from the pickup timestamp.
# The vectorised `.dt` accessor replaces a per-row Python lambda: it yields the
# same values without calling Python code once per row.
# Columns are created in the order hour, day, month, year, weekday.
train['hour'] = train['pickup_datetime'].dt.hour
train['day'] = train['pickup_datetime'].dt.day
train['month'] = train['pickup_datetime'].dt.month
train['year'] = train['pickup_datetime'].dt.year
# English day name ('Monday' ... 'Sunday'), independent of the process locale;
# a later cell converts these names to numbers.
train['weekday'] = train['pickup_datetime'].dt.day_name()

In [None]:
# Encode each weekday name as its 0-based position in a Sunday-first week
# (Sunday=0 ... Saturday=6).
# The earlier dict literal listed 'Wednesday' twice, so Wednesday became 4,
# Thursday-Saturday were shifted to 5-7 and no day mapped to 3. That disagreed
# with the 0-6 encoding applied to the test frame before prediction.
train['weekday'] = train['weekday'].map({
    'Sunday': 0,
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6,
})

In [None]:
# The raw timestamp is dropped once the hour/day/month/year/weekday columns
# exist, leaving only the derived columns in the frame.
train = train.drop(columns=['pickup_datetime'])

In [None]:
# Target: the fare column on its own.
y = train["fare_amount"]
# Features: every remaining column. `drop` returns a new frame here, so
# `train` itself is left untouched.
x = train.drop(columns=["fare_amount"])

In [None]:
# Split features and target into a training part and a held-out part.
# 20% of the rows are held out; random_state fixes the shuffle so the split is
# the same on every run.
# NOTE(review): the original rationale for 0.2 was to mirror the size ratio of
# the train and test .csv files — not verifiable from this notebook.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Random forest regressor with 100 trees, set explicitly.
# (Older scikit-learn releases defaulted to 10 trees; since 0.22 the default
# is already 100.)
# random_state seeds the forest so repeated runs build the same model; the
# particular value is arbitrary.
rfrmodel = RandomForestRegressor(
    random_state=1,
    n_estimators=100,
)

In [None]:
# Fit the forest on the training part of the split.
rfrmodel.fit(X=x_train, y=y_train)

In [None]:
# Predict fares for the held-out part of the split with the fitted model.
# NOTE(review): these hold-out predictions are never compared against y_test
# in the visible cells, so no validation error is reported.
rfrmodel_pred = rfrmodel.predict(X=x_test)

In [None]:
# Read the test file again so that all of its original columns are available,
# then remove the 'key' column.
test = pd.read_csv(r'test.csv').drop(columns=['key'])

In [None]:
# Parse the pickup timestamps into datetime values so that hour, day, month,
# etc. can be extracted from them.
parsed_pickups = pd.to_datetime(test["pickup_datetime"])
test["pickup_datetime"] = parsed_pickups

In [None]:
# Derive the calendar features for the test frame, using the same column names
# and creation order (hour, day, month, year, weekday) as for the training frame.
# The vectorised `.dt` accessor replaces a per-row Python lambda: it yields the
# same values without calling Python code once per row.
test['hour'] = test['pickup_datetime'].dt.hour
test['day'] = test['pickup_datetime'].dt.day
test['month'] = test['pickup_datetime'].dt.month
test['year'] = test['pickup_datetime'].dt.year
# English day name ('Monday' ... 'Sunday'), independent of the process locale;
# a later cell converts these names to numbers.
test['weekday'] = test['pickup_datetime'].dt.day_name()

In [None]:
# Encode each weekday name as its 0-based position in a Sunday-first week
# (Sunday=0 ... Saturday=6).
weekday_codes = {
    'Sunday': 0,
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6,
}
test['weekday'] = test['weekday'].map(weekday_codes)

In [None]:
# Remove the raw timestamp and the four coordinate columns in one call.
test = test.drop(
    columns=[
        "pickup_datetime",
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
    ]
)

In [None]:
# Predict a fare for every row of the test set.
# The frame is restricted to, and ordered by, the columns the model was fitted
# on (x_train.columns). The model therefore sees the same features in the same
# order even if the test frame carries extra columns or was assembled in a
# different column order. A missing feature fails here with a KeyError.
rfrmodel_prediction = rfrmodel.predict(test[x_train.columns])

In [None]:
# Wrap the test-set predictions in a DataFrame so they can be written out.
# This must be `rfrmodel_prediction` (predictions for test.csv), not
# `rfrmodel_pred`, which holds the predictions for the hold-out split of the
# training data.
df = pd.DataFrame(rfrmodel_prediction)

In [None]:
# Write the predictions to 'pred.csv' in the working directory.
# `index` is left at its default, so the row index is written as well.
df.to_csv(path_or_buf='pred.csv')