In [70]:
import pandas as pd
import numpy as np

## Import the dataset and set the index

In [71]:
df_test = pd.read_csv(r'D:\TODO\REPOSITORY_GITHUB\DataScience\SUPERVISED_LEARNING_REGRESSION\FILES\test.csv')
df_test = df_test.set_index('key')
df_train = pd.read_csv(r'D:\TODO\REPOSITORY_GITHUB\DataScience\SUPERVISED_LEARNING_REGRESSION\FILES\train.csv')
df_train = df_train.set_index('key')

## Set a sample of data train

In [72]:
# Sometimes the amount of data is quite large and therefore very heavy, so it is very helpful to take a sample of this dataset
#in order to optimize the resources of our computer.
df_train_sample = df_train.sample(100000)

# Feature Enginnering

## Separate the data, time and hour

In [73]:
df_train_sample['pickup_datetime'] = pd.to_datetime(df_train_sample['pickup_datetime'], format= "%Y-%m-%d %H:%M:%S UTC")

df_train_sample['year'] = df_train_sample.pickup_datetime.apply(lambda t: t.year)
df_train_sample['weekday'] = df_train_sample.pickup_datetime.apply(lambda t: t.weekday())
df_train_sample['hour'] = df_train_sample.pickup_datetime.apply(lambda t: t.hour)

df_test['pickup_datetime'] = pd.to_datetime(df_test['pickup_datetime'], format= "%Y-%m-%d %H:%M:%S UTC")

df_test['year'] = df_test.pickup_datetime.apply(lambda t: t.year)
df_test['weekday'] = df_test.pickup_datetime.apply(lambda t: t.weekday())
df_test['hour'] = df_test.pickup_datetime.apply(lambda t: t.hour)

## set a function to calculate the distance

In [74]:
def distance(lat1, lon1, lat2, lon2):
    
    p = 0.017453292519943295 # Pi/180
    a = 0.5 - np.cos((lat2 - lat1) * p)/2 + np.cos(lat1 * p) * np.cos(lat2 * p) * (1 - np.cos((lon2 - lon1) * p)) / 2
    return 0.6213712 * 12742 * np.arcsin(np.sqrt(a))

In [75]:
df_train_sample['distance'] = distance(df_train_sample.pickup_latitude, df_train_sample.pickup_longitude, 
                                    df_train_sample.dropoff_latitude, df_train_sample.dropoff_longitude)
df_test['distance'] = distance(df_test.pickup_latitude, df_test.pickup_longitude, 
                                    df_test.dropoff_latitude, df_test.dropoff_longitude)

# Data Cleaning

## Drop the missing values

In [76]:
df_train_sample = df_train_sample.dropna()
df_test = df_test.dropna()

### removing the rows with fare amount less than zero

In [77]:
df_train_sample = df_train_sample[df_train_sample.fare_amount > 0]

### removing the rows with distance less than zero

In [78]:
df_train_sample = df_train_sample[df_train_sample.distance > 0]
df_test = df_test[df_test.distance > 0]

### removing pickup_datetime column

In [79]:
df_train_sample = df_train_sample.drop(['pickup_datetime'], axis=1)
df_test = df_test.drop(['pickup_datetime'], axis=1)

In [80]:
df_train_sample.head()

Unnamed: 0_level_0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,weekday,hour,distance
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2014-05-10 01:44:25.0000001,10.0,-73.996943,40.714611,-73.982931,40.731077,1,2014,5,1,1.35377
2014-05-08 15:15:35.0000001,19.5,-73.948047,40.775132,-73.968212,40.796963,3,2014,3,15,1.840666
2012-06-23 10:01:00.00000035,10.1,-73.990252,40.731488,-73.97967,40.765465,5,2012,5,10,2.412044
2012-02-03 00:29:00.000000121,14.1,-74.002895,40.73345,-73.990118,40.6905,5,2012,4,0,3.042069
2012-02-25 02:59:25.0000004,6.9,-74.000426,40.718208,-73.990814,40.741296,1,2012,5,2,1.672731


In [81]:
df_test.head()

Unnamed: 0_level_0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,weekday,hour,distance
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-01-27 13:08:24.0000002,-73.97332,40.763805,-73.98143,40.743835,1,2015,1,13,1.443607
2015-01-27 13:08:24.0000003,-73.986862,40.719383,-73.998886,40.739201,1,2015,1,13,1.507044
2011-10-08 11:53:44.0000002,-73.982524,40.75126,-73.979654,40.746139,1,2011,5,11,0.384398
2012-12-01 21:12:12.0000002,-73.98116,40.767807,-73.990448,40.751635,1,2012,5,21,1.218529
2012-12-01 21:12:12.0000003,-73.966046,40.789775,-73.988565,40.744427,1,2012,5,21,3.347514


### Set the features and target

In [82]:
features = ['passenger_count','year','weekday','hour','distance']

In [83]:
X_train_sample = df_train_sample[features]
y_train_sample = df_train_sample['fare_amount']
X_test = df_test[features]

In [84]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_train_sample, y_train_sample, test_size=0.25, random_state=42)

## fit the model and predictions

In [85]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler


model_lin = Pipeline((
        ("standard_scaler", StandardScaler()),
        ("lin_reg", LinearRegression()),
    ))
model_lin.fit(X_train, y_train)

In [86]:
from sklearn.metrics import mean_absolute_error

y_valid_pred = model_lin.predict(X_valid)
mae = mean_absolute_error(y_valid, y_valid_pred)
print(f"The mean absolute error of our model is {mae}")

The mean absolute error of our model is 5.919836683521551


In [87]:
y_test = model_lin.predict(X_test)

In [88]:
X_test['fare_amount'] = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['fare_amount'] = y_test


In [89]:
X_test.head()

Unnamed: 0_level_0,passenger_count,year,weekday,hour,distance,fare_amount
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-27 13:08:24.0000002,1,2015,1,13,1.443607,13.254289
2015-01-27 13:08:24.0000003,1,2015,1,13,1.507044,13.254362
2011-10-08 11:53:44.0000002,1,2011,5,11,0.384398,10.85322
2012-12-01 21:12:12.0000002,1,2012,5,21,1.218529,11.181942
2012-12-01 21:12:12.0000003,1,2012,5,21,3.347514,11.184406


In [90]:
X_test.to_csv("submission.csv", columns=["fare_amount"])