In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn import mixture
from sklearn.neural_network import MLPRegressor
from sklearn import ensemble

In [2]:
all_data = pd.read_csv("data/all_features.csv")
df = all_data[["norm_likes", "norm_views", "norm_comments", "user_avg_views", "user_avg_comments",
               "country_avg_views", "country_avg_comments", "category_avg_views", "category_avg_comments", 
               "cat1", "cat2", "cat3", "cat4", "cat5", "cat6", "cat7", "cat8", "cat9", "count1", "count2", 
               "count3", "count4", "count5", "count6", "count7", "count8", "count9", "count10", "count11", 
               "count12", "count13", "count14", "count15"]]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 359274 entries, 0 to 359273
Data columns (total 33 columns):
norm_likes               359274 non-null float64
norm_views               359274 non-null float64
norm_comments            359274 non-null float64
user_avg_views           359274 non-null float64
user_avg_comments        359274 non-null float64
country_avg_views        359274 non-null float64
country_avg_comments     359274 non-null float64
category_avg_views       359274 non-null float64
category_avg_comments    359274 non-null float64
cat1                     359274 non-null float64
cat2                     359274 non-null float64
cat3                     359274 non-null float64
cat4                     359274 non-null float64
cat5                     359274 non-null float64
cat6                     359274 non-null float64
cat7                     359274 non-null float64
cat8                     359274 non-null float64
cat9                     359274 non-null float64
count1 

In [3]:
corrMatrix = df.corr()
corrMatrix

Unnamed: 0,norm_likes,norm_views,norm_comments,user_avg_views,user_avg_comments,country_avg_views,country_avg_comments,category_avg_views,category_avg_comments,cat1,...,count6,count7,count8,count9,count10,count11,count12,count13,count14,count15
norm_likes,1.0,0.700859,0.798142,0.598726,0.732289,0.022014,0.026023,0.20498,0.214873,1.6e-05,...,-0.01486,-0.004435,-0.009409,0.011932,-0.011115,0.005653,-0.008045,-0.005955,-2.6e-05,0.022713
norm_views,0.700859,1.0,0.49629,0.773421,0.411255,0.070405,0.059472,0.200809,0.185431,-0.044072,...,-0.016592,0.00135,0.000458,0.026686,-0.008069,-0.012627,0.006794,0.003948,0.003496,0.03003
norm_comments,0.798142,0.49629,1.0,0.40378,0.759363,0.031928,0.037797,0.110652,0.119828,-0.003223,...,-0.01348,0.003905,0.001884,0.001461,-0.009556,-0.000557,0.003294,-0.005465,0.008889,0.022095
user_avg_views,0.598726,0.773421,0.40378,1.0,0.531735,0.091941,0.078182,0.247687,0.227191,-0.055904,...,-0.024615,0.008647,-0.000567,0.044137,-0.015501,-0.013616,0.006186,0.002467,0.012771,0.035882
user_avg_comments,0.732289,0.411255,0.759363,0.531735,1.0,0.043925,0.052027,0.12956,0.140893,-0.000894,...,-0.018068,0.008934,0.001841,0.006083,-0.014751,0.000186,0.00351,-0.00968,0.014509,0.028789
country_avg_views,0.022014,0.070405,0.031928,0.091941,0.043925,1.0,0.844716,-0.010838,-0.017743,0.021034,...,-0.23566,0.019171,0.006507,0.379031,-0.114608,-0.17935,0.096505,0.056078,0.049653,0.42654
country_avg_comments,0.026023,0.059472,0.037797,0.078182,0.052027,0.844716,1.0,-0.017437,-0.013953,0.040572,...,-0.35665,0.103321,0.049847,0.038649,-0.252834,-0.014736,0.087146,-0.1446,0.235173,0.584562
category_avg_views,0.20498,0.200809,0.110652,0.247687,0.12956,-0.010838,-0.017437,1.0,0.923423,-0.219474,...,0.001531,-0.005823,0.016903,0.034106,0.008018,0.011241,-0.020659,-0.012423,0.023072,-0.017398
category_avg_comments,0.214873,0.185431,0.119828,0.227191,0.140893,-0.017743,-0.013953,0.923423,1.0,-0.026895,...,0.010853,-0.014881,0.026104,0.022507,0.011398,0.019653,-0.01138,-0.031901,0.009416,0.000762
cat1,1.6e-05,-0.044072,-0.003223,-0.055904,-0.000894,0.021034,0.040572,-0.219474,-0.026895,1.0,...,0.012966,-0.008935,0.003678,-0.052842,-0.013418,0.002293,0.024695,-0.005688,-0.021589,0.044847


## Data Splitting into Training and Testing set
We split data in 80-20 proportion

In [4]:
df = df.sample(frac=1).reset_index(drop=True)
data = df.to_numpy()
X, y = data[:,1:], data[:,0]

def get_train_test_data(X, y, k):
    if k == -1:
        return X, y
    else:
        test = SelectKBest(score_func=f_classif, k=k)
        fit = test.fit(X, y)
        X_new = fit.transform(X)
        return X_new, y

## Model Evaluation

In [5]:
def model_testing(model, X, y):
    iterations = 10
    result_train = []
    result_test = []
    for i in range(iterations):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        model.fit(X_train, y_train)
        
        y_pred_test = model.predict(X_test)
        result_test.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
        
        y_pred_train = model.predict(X_train)
        result_train.append(np.sqrt(mean_squared_error(y_train, y_pred_train)))
    return np.mean(result_train)*100, np.mean(result_test)*100

# Linear Models

In [12]:
# X_new, y_new = get_train_test_data(X, y, 20)
X_new, y_new = get_train_test_data(X, y, -1)

## Linear Model - Linear Regression

In [13]:
model_linear = linear_model.LinearRegression()
train, test = model_testing(model_linear, X_new, y_new)
print("training data: ", train)
print("testing data: ", test)

training data:  0.647325883264826
testing data:  0.5869584894461177


## Linear Model - Ridge

In [14]:
ridge_params = {'alpha':[0.4, 0.3, 0.25, 0.22, 0.2, 0.18, 0.15, 0.1]}
model_ridge = GridSearchCV(linear_model.Ridge(), param_grid = ridge_params, cv = 5).fit(X_new, y_new)

print("best_param", model_ridge.best_estimator_.alpha)

train, test = model_testing(model_ridge, X_new, y_new)
print("training data: ", train)
print("testing data: ", test)

best_param 0.1
training data:  0.6473787841709007
testing data:  0.5866203371904697


# Gradient Boosting Models

## XGBoost

In [6]:
params = {'n_estimators': 500,
          'max_depth': 4,
          'min_samples_split': 5,
          'learning_rate': 0.01,
          'loss': 'ls'}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
err = mean_squared_error(y_test, y_pred)
print(np.sqrt(err)*100)

0.3642259216104246


## Random Forest

In [7]:
rand_forest = RandomForestRegressor()
rand_forest.fit(X_train, y_train)
y_pred = rand_forest.predict(X_test)
err = mean_squared_error(y_test, y_pred)
print(np.sqrt(err))



0.0022299178081743275


## GMM

In [8]:
gmm = mixture.GaussianMixture(n_components=1)
gmm.fit(X_train, y_train)
y_pred = gmm.predict(X_test)
err = mean_squared_error(y_test, y_pred)
print(np.sqrt(err)*100)

1.4236819594306325


## Multilayer Perceptron

In [9]:
regr = MLPRegressor(random_state=1, max_iter=500)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
err = mean_squared_error(y_test, y_pred)
print(np.sqrt(err)*100)

0.6360948754101373
