In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import xgboost as xgb



# Import any additional modules and start coding below
df = pd.read_csv('rental_info.csv')

# to calculate rental length days, change datatype to datetime format
df['rental_date'] = pd.to_datetime(df['rental_date'])
df['return_date'] = pd.to_datetime(df['return_date'])

# calculate date diff
df['rental_length_days'] = (df['return_date'] - df['rental_date']).dt.days

# adding dummy variables using special features column
df['deleted_scenes'] = np.where(df['special_features'].str.contains("Deleted Scenes"),1,0)
df['behind_the_scenes'] = np.where(df['special_features'].str.contains("Behind the Scenes"),1,0)

# Separate target data from df and drop redundant columns
X = df.drop(['special_features','rental_date','return_date','rental_length_days'], axis=1)
y = df['rental_length_days']

# split data into train and test data
X_train, X_test, y_train , y_test = train_test_split(X, y, test_size= 0.2, random_state= 9)

# Performing feature selection
lasso = Lasso(alpha=0.01, random_state=9)  
lasso.fit(X_train, y_train)
lasso_coef = lasso.coef_
 
X_train_selected = X_train.loc[:, lasso_coef > 0]
X_test_selected = X_test.loc[:, lasso_coef > 0]

In [4]:
# XGBoost
dtrain = xgb.DMatrix(X_train_selected, label=y_train)
dtest = xgb.DMatrix(X_test_selected, label=y_test)

params = {
    'objective': 'reg:squarederror',  
    'learning_rate': 0.15,
    'max_depth': 5,
    'seed': 9
}

xgb_model = xgb.train(params, dtrain, num_boost_round=100)
xgb_pred = xgb_model.predict(dtest)
xgb_mse = mean_squared_error(y_test, xgb_pred)
print('XGBoost MSE:', xgb_mse)

XGBoost MSE: 3.369071296484816
