In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, f1_score 
from sklearn.svm import SVC, LinearSVR

In [2]:
# Training data: read the CSV and drop every row that has a missing value.
train = pd.read_csv('../Datasets/trainfinal.csv').dropna()
# Test data is loaded as-is (no rows are dropped here).
test = pd.read_csv('../Datasets/testfinal.csv')

In [3]:
# Target vector: the `revenue` column.
y = train['revenue']
# Feature matrix: every column except the target.
X = train.drop('revenue', axis=1)

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline - average of y

In [5]:
# Baseline value: the mean of the `revenue` target.
# NOTE(review): this is computed over all of `y` (before the train/hold-out
# split), not over y_train only.
y.mean()

75022033.10793288

# Linear Regression

In [6]:
# Ordinary least squares fitted on the training split (fit returns the estimator).
lr = LinearRegression().fit(X_train, y_train)
# Print R^2 on the training split, then on the hold-out split.
print(
    lr.score(X_train, y_train),
    lr.score(X_test, y_test),
)

0.6671440186505438 0.6433472353523075


No real overfitting: the train and test R² scores are close (≈0.67 vs. ≈0.64). The model is high-bias, so there is no acute need for regularization.

# Random Forest Regressor

In [7]:
# Learn the scaling statistics from the training split only, so nothing
# from the hold-out split leaks into the scaler.
ss = StandardScaler().fit(X_train)

# Apply the same fitted scaler to both splits (X_train / X_test are rebound
# to the scaled versions).
X_train, X_test = ss.transform(X_train), ss.transform(X_test)

In [8]:
# First, untuned random forest with default hyperparameters.
# random_state is pinned (42, the same seed used for the train/test split) so
# that the randomness used to build the forest, and therefore the scores
# below, are the same on every run of the notebook.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
# R^2 on the training split and on the hold-out split.
rf.score(X_train, y_train), rf.score(X_test, y_test)

(0.9532423039248441, 0.6915727484344484)

### Extreme overfitting: the train score is far above the test score. Let's tune hyperparameters/features to reduce overfitting of the random forest regressor model.

In [None]:
# Hyperparameter grid for the random forest search.
# The list previously read [100,302000]. A 302,000-tree forest is not a
# practical candidate, and it cannot yield the n_estimators=300 result recorded
# for this search, so the entry is taken to be a typo for 300.
rf_params = {
    'n_estimators': [100, 300],  # earlier, wider sweep: [300, 350, 400, 450]
    'max_depth': [8],            # earlier sweep: [6, 7, 8, 9]
    # TODO: also try 'min_samples_leaf' to regularize the trees further
}

# 5-fold cross-validated search over the grid using all CPU cores. With the
# default refit=True, the best combination is refit on all of X_train, so `gs`
# can be used directly for score() / predict() afterwards.
gs = GridSearchCV(rf, param_grid=rf_params, cv=5, n_jobs=-1)
gs.fit(X_train, y_train)

In [44]:
# R^2 of the refit best estimator on the training split, then on the hold-out split.
tuple(
    gs.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

(0.8782338730560593, 0.7067750164423225)

## Overfitting is reduced a little, but we need to do more

In [45]:
# Show the hyperparameter combination the grid search selected as best.
gs.best_params_

{'max_depth': 8, 'n_estimators': 300}

In [46]:
# Predictions of the tuned random forest for the hold-out split.
preds_rf = gs.predict(X_test)
# Mean squared error of those predictions against the true hold-out targets.
MSE_rf = mean_squared_error(y_true=y_test, y_pred=preds_rf)
MSE_rf

4816841466051509.0

### MSE is high and the train score is well above the test score: the model is still overfit.

# Now to generate our predictions on the test set

In [47]:
# Mean of the tuned random forest's predictions.
# NOTE(review): preds_rf holds predictions for X_test, the 20% hold-out split
# of `train`; the separately loaded `test` DataFrame is not used here.
preds_rf.mean()

72559718.4791077

# The following models are for experimentation purposes

### Support vector machine 

In [23]:
# Linear support vector regressor; fit() returns the estimator, so the fitted
# model can be bound in one statement.
svr = LinearSVR(
    max_iter=10000,
    random_state=42,
    dual='auto',
).fit(X_train, y_train)
# R^2 on the training split and on the hold-out split.
(
    svr.score(X_train, y_train),
    svr.score(X_test, y_test),
)

(-0.26847948537131705, -0.277846849348665)

In [24]:
## Both the train and test R² scores are negative, i.e. the model does worse than always predicting the mean; this model is not usable.

## Decision Tree Regressor

In [19]:
# Unconstrained decision tree as a reference point for the tuned tree below.
# random_state is pinned (42, as elsewhere in this notebook): ties between
# equally good splits are broken at random, so without a seed the fitted tree
# and its scores can differ between runs.
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train, y_train)
# R^2 on the training split and on the hold-out split.
dtr.score(X_train, y_train), dtr.score(X_test, y_test)

(1.0, 0.25451342744066385)

In [21]:
# Cross-validated search over tree-size constraints to curb overfitting:
# 4 x 4 x 6 = 96 parameter combinations, each evaluated with 5 folds.
# The base estimator gets a fixed random_state so repeated runs select the
# same tree; n_jobs=-1 runs the fits on all CPU cores, matching the random
# forest search.
grid = GridSearchCV(estimator=DecisionTreeRegressor(random_state=42),
                    param_grid={'max_depth': [3, 5, 7, 10],
                                'min_samples_split': [5, 10, 15, 20],
                                'min_samples_leaf': [2, 3, 4, 5, 6, 7]},
                    cv=5,
                    n_jobs=-1,
                    verbose=1)

In [22]:
# Run the decision-tree grid search on the training split.
grid.fit(X_train, y_train)
# R^2 of the refit best tree on the training split, then on the hold-out split.
tuple(
    grid.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


(0.7527243630992329, 0.5341858857682387)

In [29]:
# Predictions of the tuned decision tree for the hold-out split.
preds_dtr = grid.predict(X_test)
# Mean squared error of those predictions against the true hold-out targets.
MSE_dtr = mean_squared_error(y_true=y_test, y_pred=preds_dtr)
MSE_dtr

6873144575180642.0