In [73]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load dataset

In [74]:
# Read the pre-processed taxi trips CSV from the mounted Drive folder, then preview
# the first rows to confirm the columns loaded as expected.
dataset_csv = "/content/drive/MyDrive/Colab Notebooks/DataSet/Taxis_Dataset_Seaborn_Processed_EDA.csv"
df = pd.read_csv(dataset_csv)
df.head()

Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,tolls,total,color,payment,pickup_zone,dropoff_zone,pickup_borough,dropoff_borough,trip_day,trip_month,trip_year,trip_duration_minutes,tip_percent,weekday
0,2019-03-23 20:21:09,2019-03-23 20:27:24,1,1.6,7.0,2.15,0.0,12.95,yellow,credit card,Lenox Hill West,UN/Turtle Bay South,Manhattan,Manhattan,23,3,2019,6.25,30.714286,5
1,2019-03-04 16:11:55,2019-03-04 16:19:00,1,0.79,5.0,0.0,0.0,9.3,yellow,cash,Upper West Side South,Upper West Side South,Manhattan,Manhattan,4,3,2019,7.083333,0.0,0
2,2019-03-27 17:53:01,2019-03-27 18:00:25,1,1.37,7.5,2.36,0.0,14.16,yellow,credit card,Alphabet City,West Village,Manhattan,Manhattan,27,3,2019,7.4,31.466667,2
3,2019-03-10 01:23:59,2019-03-10 01:49:51,1,7.7,27.0,6.15,0.0,36.95,yellow,credit card,Hudson Sq,Yorkville West,Manhattan,Manhattan,10,3,2019,25.866667,22.777778,6
4,2019-03-30 13:27:42,2019-03-30 13:37:14,3,2.16,9.0,1.1,0.0,13.4,yellow,credit card,Midtown East,Yorkville West,Manhattan,Manhattan,30,3,2019,9.533333,12.222222,5


# Check DataFrame

In [95]:
# Select columns
# Keep only the columns used for modelling (eight candidate features plus the
# trip-duration target) and give the result a fresh 0..n-1 index.
model_columns = [
    'passengers', 'distance', 'fare', 'tip', 'tolls', 'total',
    'trip_day', 'weekday','trip_duration_minutes',
]
df_model = df[model_columns].reset_index(drop=True)

In [96]:
# Check correlation
# Pairwise Pearson correlation (the pandas default) between every modelling column;
# the trip_duration_minutes row/column shows how each feature relates to the target.
df_model.corr(method="pearson")

Unnamed: 0,passengers,distance,fare,tip,tolls,total,trip_day,weekday,trip_duration_minutes
passengers,1.0,0.005481,-0.000912,0.017838,-0.002885,0.00838,0.006977,0.019587,-0.00772
distance,0.005481,1.0,0.947958,0.47698,0.642333,0.929224,-0.000506,-0.012428,0.822834
fare,-0.000912,0.947958,1.0,0.487101,0.617182,0.972311,-0.006831,-0.021982,0.890197
tip,0.017838,0.47698,0.487101,1.0,0.41467,0.650677,0.017689,-0.02441,0.405853
tolls,-0.002885,0.642333,0.617182,0.41467,1.0,0.690879,0.01451,-0.015226,0.459584
total,0.00838,0.929224,0.972311,0.650677,0.690879,1.0,-0.000766,-0.030639,0.852401
trip_day,0.006977,-0.000506,-0.006831,0.017689,0.01451,-0.000766,1.0,0.070425,-0.010618
weekday,0.019587,-0.012428,-0.021982,-0.02441,-0.015226,-0.030639,0.070425,1.0,-0.045149
trip_duration_minutes,-0.00772,0.822834,0.890197,0.405853,0.459584,0.852401,-0.010618,-0.045149,1.0


# Linear Regression Model 1:

In [97]:
# Select columns
# Rebuild the modelling frame from the loaded data so Model 1 works on the
# original, unscaled values, then preview it.
model_columns = [
    'passengers', 'distance', 'fare', 'tip', 'tolls', 'total',
    'trip_day', 'weekday','trip_duration_minutes',
]
df_model = df[model_columns].reset_index(drop=True)
df_model.head()

Unnamed: 0,passengers,distance,fare,tip,tolls,total,trip_day,weekday,trip_duration_minutes
0,1,1.6,7.0,2.15,0.0,12.95,23,5,6.25
1,1,0.79,5.0,0.0,0.0,9.3,4,0,7.083333
2,1,1.37,7.5,2.36,0.0,14.16,27,2,7.4
3,1,7.7,27.0,6.15,0.0,36.95,10,6,25.866667
4,3,2.16,9.0,1.1,0.0,13.4,30,5,9.533333


In [78]:
# Build model and prediction
# Feature matrix and target are taken from df_model as it currently stands
# (unscaled for this baseline model).
feature_cols = ['passengers', 'distance', 'fare', 'tip', 'tolls', 'total', 'trip_day', 'weekday']
target_col = 'trip_duration_minutes'
X = df_model[feature_cols]
y = df_model[target_col]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the fitted estimator, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# Score the model on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics followed by the fitted parameters
# (coefficients are listed in feature_cols order).
print(f"\n\nMean Squared Error : {mse}")
print(f"R-squared : {r2}")
print(f"model intercept : {model.intercept_}")
print(f"model coef : {model.coef_}")



Mean Squared Error : 29.30779182046513
R-squared : 0.7609577359651621
model intercept : 0.09036708465906074
model coef : [-0.08405789 -0.72898269  1.02615142 -0.32014416 -1.28730808  0.25972756
  0.0037296  -0.13870605]


# Linear Regression Model 2:
### Standardization

In [98]:
# Select columns
# Rebuild the modelling frame from the loaded data so the standardization below
# starts from the original values, then preview it.
model_columns = [
    'passengers', 'distance', 'fare', 'tip', 'tolls', 'total',
    'trip_day', 'weekday','trip_duration_minutes',
]
df_model = df[model_columns].reset_index(drop=True)
df_model.head()

Unnamed: 0,passengers,distance,fare,tip,tolls,total,trip_day,weekday,trip_duration_minutes
0,1,1.6,7.0,2.15,0.0,12.95,23,5,6.25
1,1,0.79,5.0,0.0,0.0,9.3,4,0,7.083333
2,1,1.37,7.5,2.36,0.0,14.16,27,2,7.4
3,1,7.7,27.0,6.15,0.0,36.95,10,6,25.866667
4,3,2.16,9.0,1.1,0.0,13.4,30,5,9.533333


In [99]:
# Standardization
# Rescale every column of df_model -- features and the target alike -- to zero mean
# and unit variance. The one scaler object is refit on each column in turn, so after
# the loop it only holds the statistics of the last column processed.
# NOTE(review): the scaler is fit on all rows before the train/test split performed
# in the next cell, so held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
for column_name in df_model.columns:
    single_column = df_model[[column_name]]  # double brackets keep it 2-D for scikit-learn
    df_model[column_name] = scaler.fit_transform(single_column)

df_model.head()

Unnamed: 0,passengers,distance,fare,tip,tolls,total,trip_day,weekday,trip_duration_minutes
0,-0.450451,-0.37578,-0.549175,0.075071,-0.229932,-0.413941,0.814763,0.959025,-0.702901
1,-0.450451,-0.593552,-0.735718,-0.835286,-0.229932,-0.695808,-1.323093,-1.652267,-0.630579
2,-0.450451,-0.437617,-0.50254,0.16399,-0.229932,-0.3205,1.264838,-0.60775,-0.603096
3,-0.450451,1.264233,1.316252,1.768759,-0.229932,1.439435,-0.647981,1.481283,0.999564
4,1.205381,-0.225221,-0.362633,-0.369522,-0.229932,-0.37919,1.602394,0.959025,-0.417951


In [100]:
# Build model and prediction
# Feature matrix and target are taken from df_model as it currently stands, so the
# error below is expressed in the (scaled) units of the target column.
feature_cols = ['passengers', 'distance', 'fare', 'tip', 'tolls', 'total', 'trip_day', 'weekday']
target_col = 'trip_duration_minutes'
X = df_model[feature_cols]
y = df_model[target_col]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the fitted estimator, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# Score the model on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics followed by the fitted parameters
# (coefficients are listed in feature_cols order).
print(f"\n\nMean Squared Error : {mse}")
print(f"R-squared : {r2}")
print(f"model intercept : {model.intercept_}")
print(f"model coef : {model.coef_}")



Mean Squared Error : 0.22074406400591287
R-squared : 0.7609577359651621
model intercept : -0.000873876123984985
model coef : [-0.00881141 -0.23531663  0.95480806 -0.06561833 -0.1529537   0.2918897
  0.00287667 -0.02304958]


# Linear Regression Model 3:
### MinMaxScaler

In [101]:
# MinMaxScaler
# Rebuild the modelling frame from the loaded data first, as the other model sections
# do. Without this the cell scales whatever an earlier cell left in df_model (e.g. the
# standardized frame), which makes the result depend on execution order.
df_model = df[['passengers', 'distance', 'fare', 'tip', 'tolls', 'total', 'trip_day', 'weekday','trip_duration_minutes']].reset_index(drop=True)

# Rescale every column -- features and the target alike -- to the [0, 1] range.
# The one scaler object is refit on each column in turn, so after the loop it only
# holds the min/max of the last column processed.
scaler = MinMaxScaler()
for col in df_model.columns:
    df_model[col] = scaler.fit_transform(df_model[[col]])

df_model.head()

Unnamed: 0,passengers,distance,fare,tip,tolls,total,trip_day,weekday,trip_duration_minutes
0,0.166667,0.043597,0.040268,0.092712,0.0,0.067139,0.733333,0.833333,0.057612
1,0.166667,0.021526,0.026846,0.0,0.0,0.046104,0.1,0.0,0.065355
2,0.166667,0.03733,0.043624,0.101768,0.0,0.074112,0.866667,0.333333,0.068298
3,0.166667,0.209809,0.174497,0.265201,0.0,0.205452,0.3,1.0,0.239895
4,0.5,0.058856,0.053691,0.047434,0.0,0.069733,0.966667,0.833333,0.088121


In [104]:
# Build model and prediction
# Feature matrix and target are taken from df_model as it currently stands, so the
# error below is expressed in the (scaled) units of the target column.
feature_cols = ['passengers', 'distance', 'fare', 'tip', 'tolls', 'total', 'trip_day', 'weekday']
target_col = 'trip_duration_minutes'
X = df_model[feature_cols]
y = df_model[target_col]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the fitted estimator, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# Score the model on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics followed by the fitted parameters
# (coefficients are listed in feature_cols order).
print(f"\n\nMean Squared Error : {mse}")
print(f"R-squared : {r2}")
print(f"model intercept : {model.intercept_}")
print(f"model coef : {model.coef_}")



Mean Squared Error : 0.002530603043070394
R-squared : 0.7609577359651623
model intercept : 0.01308248973564681
model coef : [-4.68651685e-03 -2.48601500e-01  1.42075170e+00 -6.89869254e-02
 -2.87326686e-01  4.18782031e-01  1.03969064e-03 -7.73334035e-03]


# Linear Regression Model 4:
### RobustScaler

In [106]:
# Select columns
# Rebuild the modelling frame from the loaded data so the robust scaling below
# starts from the original values rather than a previously scaled frame.
model_columns = [
    'passengers', 'distance', 'fare', 'tip', 'tolls', 'total',
    'trip_day', 'weekday','trip_duration_minutes',
]
df_model = df[model_columns].reset_index(drop=True)

In [107]:
# Robust
# Rescale every column of df_model -- features and the target alike -- with
# RobustScaler (default settings: centre on the median, scale by the interquartile
# range). The one scaler object is refit on each column in turn, so after the loop
# it only holds the statistics of the last column processed.
scaler = RobustScaler()
for column_name in df_model.columns:
    single_column = df_model[[column_name]]  # double brackets keep it 2-D for scikit-learn
    df_model[column_name] = scaler.fit_transform(single_column)

df_model.head()

Unnamed: 0,passengers,distance,fare,tip,tolls,total,trip_day,weekday,trip_duration_minutes
0,0.0,-0.022624,-0.294118,0.141844,0.0,-0.127368,0.533333,0.666667,-0.391911
1,0.0,-0.38914,-0.529412,-0.620567,0.0,-0.511579,-0.733333,-1.0,-0.322176
2,0.0,-0.126697,-0.235294,0.216312,0.0,0.0,0.8,-0.333333,-0.295676
3,0.0,2.737557,2.058824,1.560284,0.0,2.398947,-0.333333,1.0,1.249651
4,2.0,0.230769,-0.058824,-0.230496,0.0,-0.08,1.0,0.666667,-0.117155


In [109]:
# Build model and prediction
# NOTE(review): this model uses five features, whereas the other three models also
# include passengers, trip_day and weekday -- so its scores are not a like-for-like
# comparison of the scaler alone. Confirm the reduced feature set is intentional.
feature_cols = ['distance', 'fare', 'tip', 'tolls', 'total']
target_col = 'trip_duration_minutes'
X = df_model[feature_cols]
y = df_model[target_col]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the fitted estimator, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# Score the model on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics followed by the fitted parameters
# (coefficients are listed in feature_cols order).
print(f"\n\nMean Squared Error : {mse}")
print(f"R-squared : {r2}")
print(f"model intercept : {model.intercept_}")
print(f"model coef : {model.coef_}")



Mean Squared Error : 0.2056041254947887
R-squared : 0.7605255245275094
model intercept : 0.025915071597842676
model coef : [-0.13570559  0.72370538 -0.07804424 -0.10863354  0.21506448]
