In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn import mixture
from sklearn.neural_network import MLPRegressor
from sklearn import ensemble

In [36]:
all_data = pd.read_csv("data/training_data_with_features.csv")
df = all_data[["norm_likes", "norm_views", "norm_comments", "user_avg_views", "user_avg_comments",
               "country_avg_views", "country_avg_comments", "category_avg_views", "category_avg_comments", 
               "cat1", "cat2", "cat3", "cat4", "cat5", "cat6", "cat7", "cat8", "cat9", "count1", "count2", 
               "count3", "count4", "count5", "count6", "count7", "count8", "count9", "count10", "count11", 
               "count12", "count13", "count14", "count15"]]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 369920 entries, 0 to 369919
Data columns (total 33 columns):
norm_likes               369920 non-null float64
norm_views               369920 non-null float64
norm_comments            369920 non-null float64
user_avg_views           369920 non-null float64
user_avg_comments        369920 non-null float64
country_avg_views        369920 non-null float64
country_avg_comments     369920 non-null float64
category_avg_views       369920 non-null float64
category_avg_comments    369920 non-null float64
cat1                     369920 non-null float64
cat2                     369920 non-null float64
cat3                     369920 non-null float64
cat4                     369920 non-null float64
cat5                     369920 non-null float64
cat6                     369920 non-null float64
cat7                     369920 non-null float64
cat8                     369920 non-null float64
cat9                     369920 non-null float64
count1 

In [37]:
corrMatrix = df.corr()
corrMatrix

Unnamed: 0,norm_likes,norm_views,norm_comments,user_avg_views,user_avg_comments,country_avg_views,country_avg_comments,category_avg_views,category_avg_comments,cat1,...,count6,count7,count8,count9,count10,count11,count12,count13,count14,count15
norm_likes,1.0,0.69655,0.796899,0.595414,0.730856,0.022556,0.026466,0.203681,0.214465,0.000386,...,-0.014923,-0.004728,-0.009089,0.012283,-0.011157,0.005563,-0.007898,-0.006249,0.00017,0.023148
norm_views,0.69655,1.0,0.491115,0.772156,0.406799,0.072709,0.060474,0.199067,0.182939,-0.044912,...,-0.017495,0.00066,0.000306,0.028117,-0.008319,-0.012711,0.006782,0.006274,0.003276,0.029926
norm_comments,0.796899,0.491115,1.0,0.400587,0.760364,0.032449,0.039014,0.109967,0.119661,-0.002967,...,-0.013678,0.004101,0.001907,0.001671,-0.009619,-0.000673,0.003654,-0.006036,0.009219,0.022843
user_avg_views,0.595414,0.772156,0.400587,1.0,0.526835,0.094994,0.079526,0.246069,0.22423,-0.057025,...,-0.02583,0.008089,-0.00069,0.045805,-0.015947,-0.01381,0.00611,0.005481,0.012316,0.035591
user_avg_comments,0.730856,0.406799,0.760364,0.526835,1.0,0.044424,0.053561,0.128605,0.140571,-0.000582,...,-0.018301,0.009104,0.001875,0.006288,-0.014786,2.1e-05,0.003992,-0.010362,0.014876,0.029741
country_avg_views,0.022556,0.072709,0.032449,0.094994,0.044424,1.0,0.831717,-0.008102,-0.018379,0.017794,...,-0.240617,0.009081,0.004209,0.38671,-0.114408,-0.174823,0.093278,0.086284,0.045057,0.411581
country_avg_comments,0.026466,0.060474,0.039014,0.079526,0.053561,0.831717,1.0,-0.016384,-0.01319,0.03847,...,-0.350592,0.10511,0.048889,0.042837,-0.246553,-0.017257,0.093662,-0.154709,0.236287,0.585511
category_avg_views,0.203681,0.199067,0.109967,0.246069,0.128605,-0.008102,-0.016384,1.0,0.918981,-0.22561,...,0.000888,-0.005085,0.016766,0.035846,0.008237,0.00995,-0.021173,-0.010832,0.022621,-0.017429
category_avg_comments,0.214465,0.182939,0.119661,0.22423,0.140571,-0.018379,-0.01319,0.918981,1.0,-0.024798,...,0.01096,-0.014449,0.026105,0.021844,0.011333,0.019334,-0.010916,-0.031923,0.009129,0.001464
cat1,0.000386,-0.044912,-0.002967,-0.057025,-0.000582,0.017794,0.03847,-0.22561,-0.024798,1.0,...,0.013354,-0.009969,0.004602,-0.052085,-0.012075,0.003941,0.024759,-0.007131,-0.021783,0.043657


## Data Splitting into Training and Testing set
We split data in 80-20 proportion

In [38]:
df = df.sample(frac=1).reset_index(drop=True)
data = df.to_numpy()
X, y = data[:,1:], data[:,0]

def get_train_test_data(X, y, k):
    if k == -1:
        return X, y
    else:
        test = SelectKBest(score_func=f_classif, k=k)
        fit = test.fit(X, y)
        X_new = fit.transform(X)
        return X_new, y

## Model Evaluation

In [39]:
def model_testing(model, X, y, iterations):
    result_train = []
    result_test = []
    for i in range(iterations):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        model.fit(X_train, y_train)
        
        y_pred_test = model.predict(X_test)
        result_test.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
        
        y_pred_train = model.predict(X_train)
        result_train.append(np.sqrt(mean_squared_error(y_train, y_pred_train)))
    return np.mean(result_train)*100, np.mean(result_test)*100

# Linear Models

In [40]:
# X_new, y_new = get_train_test_data(X, y, 20)
X_new, y_new = get_train_test_data(X, y, -1)

## Linear Model - Linear Regression

In [41]:
model_linear = linear_model.LinearRegression()
train, test = model_testing(model_linear, X_new, y_new, 10)
print("training data: ", train)
print("testing data: ", test)

training data:  0.6270885456952023
testing data:  0.6502721409366095


## Linear Model - Ridge

In [42]:
ridge_params = {'alpha':[0.4, 0.3, 0.25, 0.22, 0.2, 0.18, 0.15, 0.1]}
model_ridge = GridSearchCV(linear_model.Ridge(), param_grid = ridge_params, cv = 5).fit(X_new, y_new)

print(model_ridge.best_params_)

train, test = model_testing(model_ridge, X_new, y_new, 10)
print("training data: ", train)
print("testing data: ", test)

{'alpha': 0.2}
training data:  0.6271068080850473
testing data:  0.6505486799653135


# Multilayer Perceptron

### Baseline Hyperparameters

In [48]:
mlp_params = {
    'max_iter': 1000,
    'hidden_layer_sizes': (30,),
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.001,
    'learning_rate_init': 0.0009,
    'learning_rate': 'constant'
}

model_mlp_base = MLPRegressor(**mlp_params)
# train, test = model_testing(model_mlp, X[:150000,:], y[:150000,], 1)
train, test = model_testing(model_mlp_base, X, y, 1)
print("training data: ", train)
print("testing data: ", test)

training data:  0.6169007956159719
testing data:  0.6634405697645084


### Applying Grid Search to find best Hyperparameters

In [49]:
mlp_space1 = {
    'hidden_layer_sizes': [(25,), (30,), (35,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],
    'learning_rate': ['constant'],
    'batch_size': ['auto'],
}

model_mlp_grid1 = GridSearchCV(MLPRegressor(max_iter=1000), mlp_space1, cv = 3).fit(X[:100000,:], y[:100000,])
print(model_mlp_grid1.best_params_)

{'activation': 'relu', 'batch_size': 'auto', 'hidden_layer_sizes': (35,), 'learning_rate': 'constant', 'solver': 'adam'}


In [52]:
mlp_space2 = {
    'hidden_layer_sizes': [(35,), (50,)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.001, 0.005],
    'learning_rate': ['constant'],
    'learning_rate_init': [0.0005, 0.0001],
    'batch_size': ['auto'],
}

model_mlp_grid2 = GridSearchCV(MLPRegressor(max_iter=1000), mlp_space2, cv = 3).fit(X[:100000,:], y[:100000,])
print(model_mlp_grid2.best_params_)

{'activation': 'relu', 'alpha': 0.001, 'batch_size': 'auto', 'hidden_layer_sizes': (50,), 'learning_rate': 'constant', 'learning_rate_init': 0.0005, 'solver': 'adam'}


In [53]:
mlp_space2 = {
    'hidden_layer_sizes': [(35,), (50,), (80,), (100,)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.001],
    'learning_rate': ['constant'],
    'learning_rate_init': [0.0005],
    'batch_size': ['auto'],
}

model_mlp_grid2 = GridSearchCV(MLPRegressor(max_iter=1000), mlp_space2, cv = 3).fit(X[:100000,:], y[:100000,])
print(model_mlp_grid2.best_params_)

{'activation': 'relu', 'alpha': 0.001, 'batch_size': 'auto', 'hidden_layer_sizes': (80,), 'learning_rate': 'constant', 'learning_rate_init': 0.0005, 'solver': 'adam'}


### Results with Final Hyperparameters

In [None]:
mlp_final_params = {
    'max_iter': 1000,
    'hidden_layer_sizes': (80,),
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.001,
    'learning_rate_init': 0.0005,
    'learning_rate': 'constant'
}

model_mlp_base = MLPRegressor(**mlp_params)
train, test = model_testing(model_mlp_base, X, y, 10)
print("training data: ", train)
print("testing data: ", test)

# Random Forest

### Baseline Hyperparameters

In [43]:
rand_params = {
    "n_estimators" : 10,
    "max_features" : 'auto',
    "max_depth" : 30,
    "min_samples_split" : 3,
    "min_samples_leaf" : 1
}

model_random1 = RandomForestRegressor(**rand_params)
# train, test = model_testing(model_random, X[:150000,:], y[:150000,], 1)
train, test = model_testing(model_random1, X, y, 1)
print("training data: ", train)
print("testing data: ", test)

training data:  0.11991251236725797
testing data:  0.27683596326297016


### Finding Hyperparameters using Grid Search

In [45]:
random_params = {
    'n_estimators': [5, 10, 15],
    'max_features': ['sqrt', 'auto'],
    'max_depth': [30, 40, 50],
}

model_random2 = GridSearchCV(RandomForestRegressor(), random_params, cv = 5).fit(X[:100000,:], y[:100000,])
print(model_random2.best_params_)

{'max_depth': 40, 'max_features': 'auto', 'n_estimators': 15}


In [46]:
random_params = {
    'n_estimators': [15, 20],
    'max_features': ['auto'],
    'max_depth': [40],
    'min_samples_split': [3, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

model_random2 = GridSearchCV(RandomForestRegressor(), random_params, cv = 5).fit(X[:100000,:], y[:100000,])
print(model_random2.best_params_)

{'max_depth': 40, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 15}


### Results with Final Hyperparameters

In [47]:
rand_params = {
    "n_estimators" : 15,
    "max_features" : 'auto',
    "max_depth" : 40,
    "min_samples_split" : 5,
    "min_samples_leaf" : 2
}

model_random3 = RandomForestRegressor(**rand_params)
train, test = model_testing(model_random3, X, y, 10)
print("training data: ", train)
print("testing data: ", test)

training data:  0.14329873827811637
testing data:  0.2882599974775974


# Gradient Boosting Models

## XGBoost

In [None]:
params = {'n_estimators': 500,
          'max_depth': 4,
          'min_samples_split': 5,
          'learning_rate': 0.01,
          'loss': 'ls'}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
err = mean_squared_error(y_test, y_pred)
print(np.sqrt(err)*100)