In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostRegressor

import warnings
warnings.simplefilter('ignore')

In [2]:
movies = pd.read_csv("tmdb_5000_movies.csv")
X = movies[['budget','popularity']]
y = movies['revenue']

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)

# AdaBoostRegressor - Linear Regression

Instantiate a normalized linear regression model

In [4]:
reg_lm = LinearRegression(normalize = True)

Build and fit an AdaBoost regressor

In [5]:
reg_ada = AdaBoostRegressor(
    base_estimator = reg_lm, 
    n_estimators = 12, 
    random_state=500)

reg_ada.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True),
         learning_rate=1.0, loss='linear', n_estimators=12,
         random_state=500)

In [6]:
# Calculate the predictions on the test set
pred = reg_ada.predict(X_test)

In [11]:
# Evaluate the performance using the RMSE
rmse = np.sqrt(mean_squared_error(y_test, pred))
print('RMSE: {:.3f}'.format(rmse))

RMSE: 113338632.337


# AdaBoostRegressor - Tree

There's no need to instantiate the decision tree as it is the base estimator by default.

In [16]:
# Build and fit a tree-based AdaBoost regressor
reg_ada = AdaBoostRegressor(
    n_estimators = 100, 
    learning_rate=0.01, 
    random_state=500)
reg_ada.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=None, learning_rate=0.01, loss='linear',
         n_estimators=100, random_state=500)

In [17]:
# Calculate the predictions on the test set
pred = reg_ada.predict(X_test)

In [18]:
# Evaluate the performance using the RMSE
rmse = np.sqrt(mean_squared_error(y_test, pred))
print('RMSE: {:.3f}'.format(rmse))

RMSE: 95964199.394
