# <font color = #32CD32> Part 1: Import Libraries and Dataset

### <h2> 1.1. Import Libraries</h2>

In [1]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Set the style of seaborn
sns.set_palette("muted")
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

import pickle

warnings.filterwarnings("ignore")

### <h2> 1.2. Read data from Week 3 - notebook "G2M(EDA & Hypothesis Testing).ipynb" </h2>

In [2]:
df = pd.read_csv('df.csv')
df.head()

Unnamed: 0,Transaction ID,Date of Travel,Company,City,KM Travelled,Price Charged,Cost of Trip,Customer ID,Payment_Mode,Gender,Age,Income (USD/Month),Population,Users,Year,Month,Day,Profit,Age Group,Share_of_users
0,10000011,2016-01-07,Pink Cab,ATLANTA GA,30.45,370.95,313.635,29290,Card,Male,28,10813,814885,24701,2016,1,3,57.315,20-30,3.031
1,10000012,2016-01-05,Pink Cab,ATLANTA GA,28.62,358.52,334.854,27703,Card,Male,27,9237,814885,24701,2016,1,1,23.666,20-30,3.031
2,10000013,2016-01-01,Pink Cab,ATLANTA GA,9.04,125.2,97.632,28712,Cash,Male,53,11242,814885,24701,2016,1,4,27.568,50-60,3.031
3,10000014,2016-01-06,Pink Cab,ATLANTA GA,33.17,377.4,351.602,28020,Cash,Male,23,23327,814885,24701,2016,1,2,25.798,20-30,3.031
4,10000015,2016-01-02,Pink Cab,ATLANTA GA,8.73,114.62,97.776,27182,Card,Male,33,8536,814885,24701,2016,1,5,16.844,30-40,3.031


# <font color = #32CD32> Part 2: Data preprocessing

In [3]:
# Choose features
features = df.drop(columns=['Transaction ID', 'Date of Travel', 'Price Charged', 'Cost of Trip', 'Profit', 'Share_of_users']).columns

X_data = df[features]
y_data = df['Profit']

### <h2> Train and test splitting </h2>

In [4]:
# Divide the data into training and test subsets
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=26)

### <h2> Scaling and Encoding </h2>

In [18]:
# Create columns for categorical and numeric features
categorical_features = ['City', 'Payment_Mode', 'Gender', 'Company']

numerical_features = ['KM Travelled', 'Age', 'Income (USD/Month)', 'Year', 'Month', 'Day']

In [6]:
# Create a preprocessor for columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ])

In [7]:
# Apply transformations to the train subset
X_train = preprocessor.fit_transform(X_train)
# Apply transformations to the test subset
X_test = preprocessor.transform(X_test)

In [25]:
# Get feature names after transformation
feature_names = preprocessor.get_feature_names_out()
# Print or save feature names
print(feature_names)

['num__KM Travelled' 'num__Age' 'num__Income (USD/Month)' 'num__Year'
 'num__Month' 'num__Day' 'cat__City_AUSTIN TX' 'cat__City_BOSTON MA'
 'cat__City_CHICAGO IL' 'cat__City_DALLAS TX' 'cat__City_DENVER CO'
 'cat__City_LOS ANGELES CA' 'cat__City_MIAMI FL' 'cat__City_NASHVILLE TN'
 'cat__City_NEW YORK NY' 'cat__City_ORANGE COUNTY' 'cat__City_PHOENIX AZ'
 'cat__City_PITTSBURGH PA' 'cat__City_SACRAMENTO CA'
 'cat__City_SAN DIEGO CA' 'cat__City_SEATTLE WA'
 'cat__City_SILICON VALLEY' 'cat__City_TUCSON AZ'
 'cat__City_WASHINGTON DC' 'cat__Payment_Mode_Cash' 'cat__Gender_Male'
 'cat__Company_Yellow Cab']


In [26]:
# Save the preprocessor
with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

# <font color = #32CD32> Part 3: Modeling

### <h2> Linear Regression </h2>

In [8]:
# Create a model
model_lr = LinearRegression()
# train a model
model_lr.fit(X_train, y_train)

# predictions
y_pred_lr = model_lr.predict(X_test)
# MSE
mse_lr = mean_squared_error(y_test, y_pred_lr)
print(f'MSE LR: {mse_lr}')

MSE LR: 10312.174321701985


### <h2> Lasso </h2>

In [11]:
# Create a model
model_lasso = Lasso(alpha=1)
# train a model
model_lasso.fit(X_train, y_train)

# predictions
y_pred_lasso = model_lasso.predict(X_test)
# MSE
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
print(f'MSE Lasso: {mse_lasso}')

MSE Lasso: 10507.815689910074


### <h2> Ridge </h2>

In [12]:
# Create a model
model_ridge = Ridge(alpha=1)
# train a model
model_ridge.fit(X_train, y_train)

# predictions
y_pred_ridge = model_ridge.predict(X_test)
# MSE
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
print(f'MSE Ridge: {mse_ridge}')

MSE Ridge: 10312.174314410899


### <h2> Random Forest </h2>

In [74]:
# Create a model
rf_model = RandomForestRegressor(n_estimators=100, random_state=26)
# train a model
rf_model.fit(X_train, y_train)

# predictions
y_pred_rf = rf_model.predict(X_test)
# MSE
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f'MSE RF: {mse_rf}')

MSE RF: 5151.276584713742


### <h2> XGBoost </h2>

In [22]:
# Create a model
xgb_model = XGBRegressor(n_estimators=150, random_state=26)
# train a model
xgb_model.fit(X_train, y_train)

# predictions
y_pred_xgb = xgb_model.predict(X_test)
# MSE
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f'MSE XGBoost Regressor: {mse_xgb}')

MSE XGBoost Regressor: 4781.124801964232


### <h2> XGBoost with __GridSearchCV__ </h2>

In [13]:
# Define a grid of hyperparameters for XGBoost
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [-1, 3, 6],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.9, 1.0],
    'colsample_bytree': [0.9, 1.0]
}

In [14]:
# create XGBoost model
xgb = XGBRegressor(random_state=26)

# GridSearchCV
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
# train GridSearchCV
grid_search.fit(X_train, y_train)

# Получаем лучшую модель
best_xgb = grid_search.best_estimator_

Fitting 3 folds for each of 48 candidates, totalling 144 fits


In [17]:
# predictions
y_pred_xgb_grid = best_xgb.predict(X_test)

# MSE
mse_xgb_grid = mean_squared_error(y_test, y_pred_xgb_grid)
print(f'MSE XGBoost with GridSearchCV: {mse_xgb_grid}')

MSE XGBoost with GridSearchCV: 4805.492269948905


### <h2> Save best model XGBoost </h2>

In [23]:
with open('best_xgb_model.pkl', 'wb') as file:
    pickle.dump(xgb_model, file)
    
print("Model saved successfully.")

Model saved successfully.
