In [1]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm
from math import floor, ceil
import os, pickle

from sklearn.metrics import *

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [20]:
# Project directories, relative to the notebook's working directory.
# Built with os.path.join so the separator matches the host OS; the original
# hard-coded '\\' separators only resolve to real directories on Windows.
DATA_DIR = os.path.join('..', 'data', 'raw')
INTERIM_DIR = os.path.join('..', 'data', 'interim')
PROCESSED_DIR = os.path.join('..', 'data', 'processed')

# Destination directory for pickled models.
MODELS_DIR = os.path.join('..', 'models')

### Read data files

In [4]:
# Read the train and test tables from their CSV files under PROCESSED_DIR.
train_set, test_set = (
    pd.read_csv(os.path.join(PROCESSED_DIR, csv_name))
    for csv_name in ('train_set.csv', 'test_set.csv')
)

In [5]:
# Display the column index of the loaded training frame.
train_set.columns

Index(['isAdult', 'startYear', 'runtimeMinutes', 'averageRating', 'numVotes',
       'pi_mean_mean', 'pi_mean_std', 'pi_med_mean', 'pi_med_std',
       'pi_std_mean', 'pi_std_std', 'numVersions', 'Action', 'Adult',
       'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary',
       'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show', 'History',
       'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV',
       'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War',
       'Western'],
      dtype='object')

In [6]:
# Extract the regression target as an (n, 1) column vector. pop() returns the
# column and removes it from the frame in the same step.
train_y = train_set.pop('averageRating').values.reshape(-1, 1)
test_y = test_set.pop('averageRating').values.reshape(-1, 1)

# numVotes is excluded from the features: it would not be available for
# future (not yet released) movies, so a model should not rely on it.
del train_set['numVotes'], test_set['numVotes']

# Everything left in the frames is used as a feature.
train_X, test_X = train_set.values, test_set.values

# Sanity check: print the shape of every array handed to the models.
for array in (train_X, test_X, train_y, test_y):
    print('Shape of array is : {}'.format(array.shape))

Shape of array is : (63391, 38)
Shape of array is : (27173, 38)
Shape of array is : (63391, 1)
Shape of array is : (27173, 1)


#### Helper functions and baseline regressors (XGBoost, random forest)

In [None]:
def fit_model(model, train_X, train_y):
    '''
    Trains a model having a scikit-learn interface and returns it.

    Parameters
    ----------
    model : estimator object exposing ``fit(X, y)``.
    train_X : training features, passed to ``fit`` unchanged.
    train_y : training targets, passed to ``fit`` unchanged.

    Returns
    -------
    The same ``model`` object, after ``fit`` has been called on it.
    '''
    # The original body was a bare `model.train` attribute access, which
    # never fitted anything and returned None.
    model.fit(train_X, train_y)
    return model

In [18]:
def evaluate_regression_model(model,
                              X_train=None, y_train=None,
                              X_test=None, y_test=None):
    '''
    Calculates and prints the R2 score of a fitted model on the train and
    test sets.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)``.
    X_train, y_train : training features and targets. When left as None the
        notebook-level ``train_X`` / ``train_y`` are used.
    X_test, y_test : test features and targets. When left as None the
        notebook-level ``test_X`` / ``test_y`` are used.

    Returns
    -------
    None. The scores are only printed.
    '''
    # Resolve the defaults at call time. Binding the arrays in the signature
    # would capture whatever they pointed to when this cell ran, so re-creating
    # train_X/test_X later would silently evaluate against stale data.
    if X_train is None:
        X_train = train_X
    if y_train is None:
        y_train = train_y
    if X_test is None:
        X_test = test_X
    if y_test is None:
        y_test = test_y

    print("#"*50+"Train set Results"+"#"*50)

    train_score = r2_score(y_train, model.predict(X_train))
    print("R2 score: {}".format(train_score))

    print("\n"+"#"*50+"Test set Results"+"#"*50)

    test_score = r2_score(y_test, model.predict(X_test))
    print("R2 score: {}".format(test_score))

    print("#"*100)
    

In [21]:
def save_model(model, filename):
    '''
    Saves a trained model to the Models directory for later use.

    Parameters
    ----------
    model : object to serialise with pickle.
    filename : file name (relative to MODELS_DIR) to write the pickle to.

    Returns
    -------
    None.
    '''
    # Create the target directory if it does not exist yet, so that saving
    # does not fail with FileNotFoundError after the model has been trained.
    os.makedirs(MODELS_DIR, exist_ok=True)

    # NOTE: pickle files must only be loaded back from trusted sources.
    with open(os.path.join(MODELS_DIR, filename), 'wb') as f:
        pickle.dump(model, f)

In [25]:
# Baseline XGBoost regressor: only n_jobs is set, every other hyperparameter
# is left at the library default. fit() returns the fitted estimator.
xgb_model = XGBRegressor(n_jobs=-1).fit(train_X, train_y)

# Print R2 on the train and test sets.
evaluate_regression_model(xgb_model)

##################################################Train set Results##################################################


  "because it will generate extra copies and increase " +


R2 score: 0.75050169105837

##################################################Test set Results##################################################


  "because it will generate extra copies and increase " +


R2 score: 0.6613990485952934
####################################################################################################


In [22]:
# Pickle the fitted XGBoost model into MODELS_DIR as 'vanilla_xgb.pkl'.
save_model(xgb_model, 'vanilla_xgb.pkl')

In [26]:
# Baseline random forest. random_state is fixed so that re-running the cell
# reproduces the same forest and scores; without it every run differs
# slightly (so will any scores recorded from an earlier unseeded run).
rf_model = RandomForestRegressor(n_jobs=-1, random_state=42)

# train_y is an (n, 1) column vector; scikit-learn expects a 1-D target here
# and otherwise emits a DataConversionWarning before flattening it itself.
rf_model.fit(train_X, train_y.ravel())

# Print R2 on the train and test sets.
evaluate_regression_model(rf_model)

  


##################################################Train set Results##################################################
R2 score: 0.9299453034431845

##################################################Test set Results##################################################
R2 score: 0.614395011084041
####################################################################################################
