In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score 
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

In [2]:
train = pd.read_csv('../datasets/train_dummy.csv')
train.head()

Unnamed: 0,id,budget,popularity,release_date,runtime,genre_Action,genre_Adventure,genre_Animation,genre_Comedy,genre_Crime,...,original_language_ro,original_language_ru,original_language_sr,original_language_sv,original_language_ta,original_language_te,original_language_tr,original_language_ur,original_language_zh,revenue
0,1,14000000.0,6.575393,1424412000,93.0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,False,12314651
1,2,40000000.0,8.248895,1091768400,113.0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,False,95149435
2,3,3300000.0,64.29999,1412917200,105.0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,13092000
3,4,1200000.0,3.174936,1331272800,122.0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,16000000
4,5,,1.14807,1233813600,118.0,1,0,0,0,0,...,False,False,False,False,False,False,False,False,False,3923970


In [3]:
test = pd.read_csv('../datasets/test_dummy.csv')
test.head()

Unnamed: 0,id,budget,popularity,release_date,runtime,genre_Action,genre_Adventure,genre_Animation,genre_Comedy,genre_Crime,...,original_language_pt,original_language_ro,original_language_ru,original_language_sr,original_language_sv,original_language_ta,original_language_te,original_language_tr,original_language_ur,original_language_zh
0,3001,,3.851534,1184389000.0,90.0,0,1,1,0,0,...,False,False,False,False,False,False,False,False,False,False
1,3002,88000.0,3.559789,2789010000.0,65.0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
2,3003,,8.085194,864363600.0,100.0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,False,False
3,3004,6800000.0,8.596012,1283576000.0,130.0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
4,3005,2000000.0,3.21768,1108102000.0,92.0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False


### Train Test split

In [5]:
X = train.drop(columns=["id","revenue"])
y = train['revenue']

In [6]:
X_train,X_test, y_train, y_test = train_test_split(X,y,random_state=42)

### Linear Regression

In [7]:
lr = Pipeline([
    ("it_imp", IterativeImputer(estimator=LinearRegression(),max_iter=100,)),
    ("lr", LinearRegression())]
)

In [8]:
lr.fit(X_train,y_train)

In [9]:
lr.score(X_train, y_train), lr.score(X_test,y_test)

(0.6158711704743762, 0.5433868748776461)

In [10]:
# apply of test data

In [11]:
test.head()

Unnamed: 0,id,budget,popularity,release_date,runtime,genre_Action,genre_Adventure,genre_Animation,genre_Comedy,genre_Crime,...,original_language_pt,original_language_ro,original_language_ru,original_language_sr,original_language_sv,original_language_ta,original_language_te,original_language_tr,original_language_ur,original_language_zh
0,3001,,3.851534,1184389000.0,90.0,0,1,1,0,0,...,False,False,False,False,False,False,False,False,False,False
1,3002,88000.0,3.559789,2789010000.0,65.0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
2,3003,,8.085194,864363600.0,100.0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,False,False
3,3004,6800000.0,8.596012,1283576000.0,130.0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
4,3005,2000000.0,3.21768,1108102000.0,92.0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False


In [12]:
test_preds_lr = lr.predict(test.drop(columns='id'))

In [13]:
test_preds_lr = pd.DataFrame({
    'id': test['id'],
    'revenue': test_preds_lr,
})

test_preds_lr.head()

Unnamed: 0,id,revenue
0,3001,137994900.0
1,3002,-13856020.0
2,3003,63182340.0
3,3004,-9535291.0
4,3005,-26459470.0


In [14]:
test_preds_lr.to_csv('../datasets/lr_model.csv', index=False)

### Randomforest

In [18]:
rf = Pipeline([
    ("it_imp", IterativeImputer(max_iter=100)),
    ("rf", RandomForestRegressor())])

In [22]:
pipe_params = {
    'rf__max_depth' : [8,9,10,11],
    'rf__min_samples_leaf': [5,7,9,11],
    'rf__min_samples_split': [5,7,9,11]}

In [23]:
rf_gs = GridSearchCV(rf, pipe_params) #cv=5)

In [24]:
rf_gs.fit(X_train,y_train)

In [36]:
rf_gs.best_params_

{'rf__max_depth': 10, 'rf__min_samples_leaf': 5, 'rf__min_samples_split': 11}

In [26]:
rf_gs.score(X_train, y_train), rf_gs.score(X_test,y_test)

(0.7969772692095698, 0.6625373662025711)

In [28]:
test_preds_rf = rf_gs.predict(test.drop(columns='id'))

In [30]:
test_preds_rf = pd.DataFrame({
    'id': test['id'],
    'revenue': test_preds_rf,
})

test_preds_rf.head()

Unnamed: 0,id,revenue
0,3001,28263590.0
1,3002,7286749.0
2,3003,28983300.0
3,3004,23729470.0
4,3005,4483525.0


In [31]:
test_preds_rf.to_csv('../datasets/rf_model.csv', index=False)

### ExtraTreesRegressor

In [44]:
etr = Pipeline([
    ('it_imp', IterativeImputer(max_iter=100)),
    ('etr', ExtraTreesRegressor())
])

In [45]:
pipe_params_2 = {
    'etr__max_depth' : [8,9,10,11],
    'etr__min_samples_leaf': [5,7,9,11],
    'etr__min_samples_split': [7,9,11,13]}

In [46]:
etr_gs = GridSearchCV(etr, pipe_params_2)

In [47]:
etr_gs.fit(X_train,y_train)

In [48]:
etr_gs.best_params_

{'etr__max_depth': 11,
 'etr__min_samples_leaf': 5,
 'etr__min_samples_split': 13}

In [49]:
etr_gs.score(X_train, y_train), etr_gs.score(X_test,y_test)

(0.7576253805500509, 0.6407024463213726)

In [50]:
test_preds_etr = etr_gs.predict(test.drop(columns='id'))

In [51]:
test_preds_etr = pd.DataFrame({
    'id': test['id'],
    'revenue': test_preds_etr,
})

test_preds_etr.head()

Unnamed: 0,id,revenue
0,3001,31277180.0
1,3002,9466931.0
2,3003,41568970.0
3,3004,13753300.0
4,3005,5850198.0


In [52]:
test_preds_etr.to_csv('../datasets/etr_model.csv', index=False)

### Conclusion

* Random Forest Regressor is the best performance model for this dataset.
* The train scores are higher than test scores, those models are overfit.
* The test scores are low, improving the test score by applying more feature, different model and different imputing method.