In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Apply seaborn's default theme so the figures drawn below share one look.
sns.set()

In [None]:
# Load the trip records from a Parquet file; the path is resolved relative
# to the kernel's working directory.
df = pd.read_parquet('data-yellow-202103.parquet')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Display the frame itself (the notebook renders a truncated view).
df

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Data type of every column.
df.dtypes

In [None]:
# Discard the columns that the rest of the notebook does not use.
unused_columns = ['airport_fee', 'VendorID', 'congestion_surcharge']
df = df.drop(columns=unused_columns)

In [None]:
# Number of trips whose total amount is zero or negative.
int((df['total_amount'] <= 0).sum())

In [None]:
# Keep only trips with a positive total amount.
# The explicit copy makes `df` an independent frame, so the `.loc`
# assignments performed in later cells write to it directly instead of
# triggering SettingWithCopyWarning on a filtered slice.
df = df[df['total_amount'] > 0].copy()

In [None]:
# Number of trips whose fare amount is zero or negative.
int((df['fare_amount'] <= 0).sum())

In [None]:
# Number of trips whose recorded distance is zero or negative.
int((df['trip_distance'] <= 0).sum())

In [None]:
# Median trip distance for each rate code.
df['trip_distance'].groupby(df['RatecodeID']).median()

In [None]:
# Replace non-positive trip distances with a per-rate-code replacement value
# (presumably the medians shown by the groupby cell above -- TODO confirm).
#
# The column selector is essential: `df.loc[mask] = value` without it
# overwrites EVERY column of the matching rows (fares, timestamps,
# RatecodeID, ...) with the scalar, silently corrupting those records.
# Rows whose rate code is not listed here are left unchanged.
replacement_distance = {1.0: 2.8, 2.0: 20, 3.0: 24, 4.0: 12, 5.0: 3.1}

for ratecode, distance in replacement_distance.items():
    needs_fix = (df['trip_distance'] <= 0) & (df['RatecodeID'] == ratecode)
    df.loc[needs_fix, 'trip_distance'] = distance

In [None]:
# Make sure both timestamp columns are datetime64.
# The columns are addressed by name (the same names the duration cell below
# uses) rather than by position, so the conversion cannot silently hit the
# wrong columns if the column order changes or the drop cell is skipped.
for column in ['tpep_pickup_datetime', 'tpep_dropoff_datetime']:
    df[column] = pd.to_datetime(df[column])

In [None]:
# Trip duration in minutes, rounded to two decimals.
# Computed with the vectorised `.dt` accessor instead of a Python loop over
# every Timedelta, which is far faster on millions of rows and avoids
# building an intermediate Python list.
timedeltas = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
df['trip_duration'] = (timedeltas.dt.total_seconds() / 60).round(2)

In [None]:
# Keep only trips that lasted longer than one minute.
# `.copy()` detaches the result from the frame it was filtered from, so the
# column assignments in the following cells modify `df` itself and do not
# raise SettingWithCopyWarning.
df = df[df['trip_duration'] > 1].copy()
df.shape

In [None]:
# Frequency of each store-and-forward flag value.
df['store_and_fwd_flag'].value_counts()

In [None]:
# Treat a missing store-and-forward flag as 'N'.
# The result is assigned back to the column: calling
# `fillna(..., inplace=True)` on `df[col]` operates on an intermediate object,
# which is deprecated and leaves `df` unchanged under Copy-on-Write.
df['store_and_fwd_flag'] = df['store_and_fwd_flag'].fillna('N')

In [None]:
# Relative frequency of each payment type.
df['payment_type'].value_counts(normalize= True)

In [None]:
# Fill missing payment types from the total amount:
# above 50 -> 1.0, otherwise (<= 50) -> 2.0.
payment_missing = df['payment_type'].isna()
total = df['total_amount']

df.loc[payment_missing & total.gt(50), 'payment_type'] = 1.0
df.loc[payment_missing & total.le(50), 'payment_type'] = 2.0

In [None]:
# Fill missing rate codes from the total-amount band the trip falls in.
# The bands are disjoint, so the missing-value mask can be computed once.
ratecode_missing = df['RatecodeID'].isna()
amount = df['total_amount']

df.loc[ratecode_missing & amount.ge(184), 'RatecodeID'] = 4.0
df.loc[ratecode_missing & amount.ge(138) & amount.lt(184), 'RatecodeID'] = 3.0
df.loc[ratecode_missing & amount.ge(79) & amount.lt(138), 'RatecodeID'] = 2.0
df.loc[ratecode_missing & amount.ge(0) & amount.lt(79), 'RatecodeID'] = 1.0

In [None]:
# Passenger counts of 0, 7 and 32 are reset to a single passenger.
implausible_counts = [0, 7, 32]
df.loc[df['passenger_count'].isin(implausible_counts), 'passenger_count'] = 1.0

In [None]:
# A missing passenger count is filled with one passenger.
df['passenger_count'] = df['passenger_count'].fillna(1.0)

In [None]:
# Remaining missing values per column after the imputation steps above.
df.isna().sum()

In [None]:
# Preview the first rows of the cleaned frame.
df.head()

In [None]:
# (rows, columns) after cleaning.
df.shape

In [None]:
# Correlation heatmap of the numeric features.
# Only numeric columns are passed to `corr()`: the frame also holds string
# and datetime columns, on which `DataFrame.corr()` raises in pandas >= 2.0.
numeric_columns = df.select_dtypes(include='number')

fig, ax = plt.subplots(figsize=(18, 10))
sns.heatmap(numeric_columns.corr(), annot=True, ax=ax);



##### Busiest day

In [None]:
# Weekday name of each pickup.
pickup_times = df['tpep_pickup_datetime']
df['day_of_week'] = pickup_times.dt.day_name()

In [None]:
# Horizontal bar chart: number of trips per weekday.
fig, ax = plt.subplots(figsize=(18, 7))
sns.countplot(y='day_of_week', data=df, ax=ax)
ax.set_ylabel('');

##### Busiest hour

In [None]:
# Bar chart: number of trips per pickup hour.
pickup_hour = df['tpep_pickup_datetime'].dt.hour

fig, ax = plt.subplots(figsize=(18, 7))
sns.countplot(x=pickup_hour, data=df, color='goldenrod', ax=ax)
ax.set_ylabel('')
ax.set_xlabel('Hour of Day');

In [None]:
pip install scikit-learn

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Features and target shared by every model below.
feature_columns = ['trip_distance', 'passenger_count', 'RatecodeID', 'trip_duration']
X = df[feature_columns]
y = df['total_amount']

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Baseline: ordinary least squares.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Hold-out metrics; the RMSE is derived from the MSE.
mse = mean_squared_error(y_test, y_pred)
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("Root Mean Squared Error:", np.sqrt(mse))
print('Mean squared error:', mse)
print('R2 Score:', r2_score(y_test, y_pred))

# Predicted vs. measured, with the y = x reference line.
target_range = [y_test.min(), y_test.max()]
plt.plot(y_pred, y_test, 'o')
plt.plot(target_range, target_range, 'k--', lw=4)
plt.xlabel('Predicted')
plt.ylabel('Measured')
plt.show()



In [None]:
from sklearn.ensemble import RandomForestRegressor

# 100-tree random forest with a fixed seed, fitted on the training split.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Hold-out metrics; the RMSE is derived from the MSE.
mse = mean_squared_error(y_test, y_pred)
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("Root Mean Squared Error:", np.sqrt(mse))
print('Mean squared error:', mse)
print('R2 Score:', r2_score(y_test, y_pred))

# Predicted vs. measured, with the y = x reference line.
lo, hi = y_test.min(), y_test.max()
plt.plot(y_pred, y_test, 'o')
plt.plot([lo, hi], [lo, hi], 'k--', lw=4)
plt.xlabel('Predicted')
plt.ylabel('Measured')
plt.show()


In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees (XGBoost), 100 estimators, fixed seed.
xgb = XGBRegressor(n_estimators=100, random_state=42)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

# Hold-out metrics, printed in a fixed order.
mse = mean_squared_error(y_test, y_pred)
metric_report = [
    ("Mean Absolute Error:", mean_absolute_error(y_test, y_pred)),
    ("Root Mean Squared Error:", np.sqrt(mse)),
    ('Mean squared error:', mse),
    ('R2 Score:', r2_score(y_test, y_pred)),
]
for label, value in metric_report:
    print(label, value)

# Predicted vs. measured, with the y = x reference line.
diagonal = [y_test.min(), y_test.max()]
plt.plot(y_pred, y_test, 'o')
plt.plot(diagonal, diagonal, 'k--', lw=4)
plt.xlabel('Predicted')
plt.ylabel('Measured')
plt.show()


In [None]:
from lightgbm import LGBMRegressor

# Gradient-boosted trees (LightGBM), 100 estimators, fixed seed.
lgbm = LGBMRegressor(n_estimators=100, random_state=42)
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test)

# Hold-out metrics; the RMSE is derived from the MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", np.sqrt(mse))
print('Mean squared error:', mse)
print('R2 Score:', r2)

# Predicted vs. measured, with the y = x reference line.
y_low, y_high = y_test.min(), y_test.max()
plt.plot(y_pred, y_test, 'o')
plt.plot([y_low, y_high], [y_low, y_high], 'k--', lw=4)
plt.xlabel('Predicted')
plt.ylabel('Measured')
plt.show()




In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostRegressor

# Gradient-boosted trees (CatBoost), 100 estimators, fixed seed.
cat = CatBoostRegressor(n_estimators=100, random_state=42)
cat.fit(X_train, y_train)
y_pred = cat.predict(X_test)

# Hold-out metrics, printed in a fixed order.
mse = mean_squared_error(y_test, y_pred)
for label, value in (
    ("Mean Absolute Error:", mean_absolute_error(y_test, y_pred)),
    ("Root Mean Squared Error:", np.sqrt(mse)),
    ('Mean squared error:', mse),
    ('R2 Score:', r2_score(y_test, y_pred)),
):
    print(label, value)

# Predicted vs. measured, with the y = x reference line.
bounds = [y_test.min(), y_test.max()]
plt.plot(y_pred, y_test, 'o')
plt.plot(bounds, bounds, 'k--', lw=4)
plt.xlabel('Predicted')
plt.ylabel('Measured')
plt.show()


In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor with scikit-learn's default settings.
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Hold-out metrics; the RMSE is derived from the MSE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)
print('Mean squared error:', mse)
print('R2 Score:', r2_score(y_test, y_pred))

# Predicted vs. measured, with the y = x reference line.
reference = [y_test.min(), y_test.max()]
plt.plot(y_pred, y_test, 'o')
plt.plot(reference, reference, 'k--', lw=4)
plt.xlabel('Predicted')
plt.ylabel('Measured')
plt.show()



In [None]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree. The seed is fixed like the other seeded models in
# this notebook, so the reported metrics are reproducible across runs.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

# Hold-out metrics; the RMSE is derived from the MSE.
mse = mean_squared_error(y_test, y_pred)
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("Root Mean Squared Error:", np.sqrt(mse))
print('Mean squared error:', mse)
print('R2 Score:', r2_score(y_test, y_pred))

# Predicted vs. measured, with the y = x reference line.
# (matplotlib is already imported in the notebook's first cell.)
plt.plot(y_pred, y_test, 'o')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw= 4)
plt.xlabel('Predicted')
plt.ylabel('Measured')
plt.show()
