In [63]:
import pandas as pd
import numpy as np

import pickle

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn import mixture
from sklearn.neural_network import MLPRegressor
from sklearn import ensemble

In [64]:
training_data = pd.read_csv("data/training_data_with_features.csv")
df = training_data[["norm_likes", "norm_views", "norm_comments", "norm_user_views", "norm_user_comments",
               "norm_country_views", "norm_country_comments", "norm_category_views", "norm_category_comments", 
               "cat1", "cat2", "cat3", "cat4", "cat5", "cat6", "cat7", "cat8", "cat9", "count1", "count2", 
               "count3", "count4", "count5", "count6", "count7", "count8", "count9", "count10", "count11", 
               "count12", "count13", "count14", "count15"]]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365610 entries, 0 to 365609
Data columns (total 33 columns):
norm_likes                365610 non-null float64
norm_views                365610 non-null float64
norm_comments             365610 non-null float64
norm_user_views           365610 non-null float64
norm_user_comments        365610 non-null float64
norm_country_views        365610 non-null float64
norm_country_comments     365610 non-null float64
norm_category_views       365610 non-null float64
norm_category_comments    365610 non-null float64
cat1                      365610 non-null float64
cat2                      365610 non-null float64
cat3                      365610 non-null float64
cat4                      365610 non-null float64
cat5                      365610 non-null float64
cat6                      365610 non-null float64
cat7                      365610 non-null float64
cat8                      365610 non-null float64
cat9                      365610 non-nu

In [65]:
corrMatrix = df.corr()
corrMatrix

Unnamed: 0,norm_likes,norm_views,norm_comments,norm_user_views,norm_user_comments,norm_country_views,norm_country_comments,norm_category_views,norm_category_comments,cat1,...,count6,count7,count8,count9,count10,count11,count12,count13,count14,count15
norm_likes,1.0,0.655375,0.734362,0.606205,0.705128,0.037816,0.056353,0.148829,0.196442,0.038968,...,-0.032142,-0.005806,-0.024928,0.0033,-0.037125,0.018637,-0.016722,-0.014527,0.004662,0.043862
norm_views,0.655375,1.0,0.544742,0.80442,0.498912,0.148362,0.127913,0.190202,0.115225,-0.050575,...,-0.041717,0.011229,0.000502,0.029482,-0.039342,-0.018762,0.019102,0.007152,0.017962,0.061235
norm_comments,0.734362,0.544742,1.0,0.516704,0.833106,0.111656,0.129506,0.095424,0.157516,0.028484,...,-0.042124,0.023211,-0.012426,-0.012594,-0.045726,-0.000979,0.010621,-0.01757,0.038941,0.069015
norm_user_views,0.606205,0.80442,0.516704,1.0,0.620214,0.183697,0.161532,0.223578,0.129783,-0.059416,...,-0.053814,0.024338,-0.000661,0.042473,-0.05121,-0.023597,0.016879,0.008019,0.0315,0.082779
norm_user_comments,0.705128,0.498912,0.833106,0.620214,1.0,0.133896,0.15754,0.101224,0.175487,0.042145,...,-0.051103,0.033624,-0.015017,-0.012939,-0.056414,-0.000387,0.010927,-0.022553,0.051486,0.085118
norm_country_views,0.037816,0.148362,0.111656,0.183697,0.133896,1.0,0.862168,0.02739,0.012864,0.029435,...,-0.281184,0.075685,0.003387,0.198719,-0.265177,-0.126462,0.12875,0.048208,0.121072,0.41274
norm_country_comments,0.056353,0.127913,0.129506,0.161532,0.15754,0.862168,1.0,-0.006312,0.002475,0.041745,...,-0.325265,0.179229,-0.095952,-0.097245,-0.353077,-0.007563,0.082012,-0.135667,0.300688,0.532907
norm_category_views,0.148829,0.190202,0.095424,0.223578,0.101224,0.02739,-0.006312,1.0,0.605805,-0.265902,...,-0.012638,-0.011808,0.019065,0.069049,0.0085,-0.006177,-0.022412,0.011928,0.005227,-0.002703
norm_category_comments,0.196442,0.115225,0.157516,0.129783,0.175487,0.012864,0.002475,0.605805,1.0,0.180833,...,0.007802,-0.030183,0.038959,0.03936,0.010018,0.012251,0.002394,-0.034735,-0.020915,0.039006
cat1,0.038968,-0.050575,0.028484,-0.059416,0.042145,0.029435,0.041745,-0.265902,0.180833,1.0,...,0.013491,-0.010167,0.004187,-0.052171,-0.011687,0.003551,0.024837,-0.006892,-0.021975,0.044485


## Data Splitting into Training and Testing set
We split data in 80-20 proportion

In [82]:
def min_max_normalisation(arr):
    min_val = np.min(arr)
    max_val = np.max(arr)
    return (arr - min_val)/(max_val - min_val)

In [83]:
df = df.sample(frac=1).reset_index(drop=True)
data = df.to_numpy()
X, y = data[:,1:], data[:,0]

y_norm = min_max_normalisation(y)

In [84]:
def save_model(model, X, y, file_name):
    model.fit(X, y)
    pickle.dump(model, open(file_name, 'wb'))
    return True

## Model Evaluation

In [85]:
def model_testing(model, X, y, iterations):
    result_train = []
    result_test = []
    for i in range(iterations):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        model.fit(X_train, y_train)
        
        y_pred_test = model.predict(X_test)
        result_test.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
        
        y_pred_train = model.predict(X_train)
        result_train.append(np.sqrt(mean_squared_error(y_train, y_pred_train)))
    return np.mean(result_train)*100, np.mean(result_test)*100

# Linear Models

In [94]:
# X_new, y_new = get_train_test_data(X, y, 20)
X_new, y_new = X, y

## Linear Model - Linear Regression

In [95]:
model_linear = linear_model.LinearRegression()
train, test = model_testing(model_linear, X_new, y_new, 10)
print("training data: ", train)
print("testing data: ", test)

training data:  57.10554708501569
testing data:  57.52430589435302


In [90]:
model = linear_model.LinearRegression()
file_name = "trained_models/lin_reg.pickle"
save_model(model, X, y, file_name)

True

## Linear Model - Ridge

In [98]:
ridge_params = {'alpha':[100, 95, 90, 85, 80, 70]}
model_ridge = GridSearchCV(linear_model.Ridge(), param_grid = ridge_params, cv = 5).fit(X_new, y_new)
print(model_ridge.best_params_)

{'alpha': 90}


In [100]:
final_ridge_params = {
    'alpha': 90
}
model_ridge_final = linear_model.Ridge(**final_ridge_params)
train, test = model_testing(model_ridge, X_new, y_new, 10)

file_name = "trained_models/lin_ridge.pickle"
save_model(model, X, y, file_name)

print("training data: ", train)
print("testing data: ", test)

training data:  57.10556883045832
testing data:  57.5245007818703


# Multilayer Perceptron

### Baseline Hyperparameters

In [80]:
mlp_params = {
    'max_iter': 1000,
    'hidden_layer_sizes': (30,),
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.001,
    'learning_rate_init': 0.0009,
    'learning_rate': 'constant'
}

model_mlp_base = MLPRegressor(**mlp_params)
train, test = model_testing(model_mlp_base, X[:150000,:], y[:150000,], 1)
# train, test = model_testing(model_mlp_base, X, y, 1)
print("training data: ", train)
print("testing data: ", test)

training data:  45.936444532369705
testing data:  47.18400908325359


### Applying Grid Search to find best Hyperparameters

In [81]:
mlp_space1 = {
    'hidden_layer_sizes': [(40,), (20,), (30,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
#     'learning_rate': ['constant'],
#     'batch_size': ['auto'],
}

model_mlp_grid1 = GridSearchCV(MLPRegressor(max_iter=1000), mlp_space1, cv = 3).fit(X[:100000,:], y[:100000,])
print(model_mlp_grid1.best_params_)



{'activation': 'relu', 'hidden_layer_sizes': (30,), 'solver': 'adam'}




In [49]:
# mlp_space1 = {
#     'hidden_layer_sizes': [(25,), (30,), (35,)],
#     'activation': ['relu', 'tanh'],
#     'solver': ['adam'],
#     'learning_rate': ['constant'],
#     'batch_size': ['auto'],
# }

# model_mlp_grid1 = GridSearchCV(MLPRegressor(max_iter=1000), mlp_space1, cv = 3).fit(X[:100000,:], y[:100000,])
# print(model_mlp_grid1.best_params_)

{'activation': 'relu', 'batch_size': 'auto', 'hidden_layer_sizes': (35,), 'learning_rate': 'constant', 'solver': 'adam'}


In [52]:
mlp_space2 = {
    'hidden_layer_sizes': [(35,), (50,)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.001, 0.005],
    'learning_rate': ['constant'],
    'learning_rate_init': [0.0005, 0.0001],
    'batch_size': ['auto'],
}

model_mlp_grid2 = GridSearchCV(MLPRegressor(max_iter=1000), mlp_space2, cv = 3).fit(X[:100000,:], y[:100000,])
print(model_mlp_grid2.best_params_)

{'activation': 'relu', 'alpha': 0.001, 'batch_size': 'auto', 'hidden_layer_sizes': (50,), 'learning_rate': 'constant', 'learning_rate_init': 0.0005, 'solver': 'adam'}


In [53]:
mlp_space2 = {
    'hidden_layer_sizes': [(35,), (50,), (80,), (100,)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.001],
    'learning_rate': ['constant'],
    'learning_rate_init': [0.0005],
    'batch_size': ['auto'],
}

model_mlp_grid2 = GridSearchCV(MLPRegressor(max_iter=1000), mlp_space2, cv = 3).fit(X[:100000,:], y[:100000,])
print(model_mlp_grid2.best_params_)

{'activation': 'relu', 'alpha': 0.001, 'batch_size': 'auto', 'hidden_layer_sizes': (80,), 'learning_rate': 'constant', 'learning_rate_init': 0.0005, 'solver': 'adam'}


### Results with Final Hyperparameters

In [None]:
mlp_final_params = {
    'max_iter': 1000,
    'hidden_layer_sizes': (80,),
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.001,
    'learning_rate_init': 0.0005,
    'learning_rate': 'constant'
}

model_mlp_base = MLPRegressor(**mlp_params)
train, test = model_testing(model_mlp_base, X, y, 10)
print("training data: ", train)
print("testing data: ", test)

# Random Forest

### Baseline Hyperparameters

In [111]:
rand_params = {
    "n_estimators" : 40,
    "max_features" : 'auto',
    "max_depth" : 30,
    "min_samples_split" : 3,
    "min_samples_leaf" : 1
}

model_random1 = RandomForestRegressor(**rand_params)
train, test = model_testing(model_random1, X[:100000,:], y[:100000,], 1)
# train, test = model_testing(model_random1, X, y, 1)
print("training data: ", train)
print("testing data: ", test)

training data:  15.79908888301264
testing data:  36.89598287355623


### Finding Hyperparameters using Grid Search

In [103]:
random_params = {
    'n_estimators': [15],
#     'max_features': ['sqrt', 'auto'],
#     'max_depth': [30, 40, 50],
}

model_random2 = GridSearchCV(RandomForestRegressor(), random_params, cv = 5).fit(X[:100000,:], y[:100000,])
print(model_random2.best_params_)

{'n_estimators': 15}


In [46]:
random_params = {
    'n_estimators': [15, 20],
    'max_features': ['auto'],
    'max_depth': [40],
    'min_samples_split': [3, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

model_random2 = GridSearchCV(RandomForestRegressor(), random_params, cv = 5).fit(X[:100000,:], y[:100000,])
print(model_random2.best_params_)

{'max_depth': 40, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 15}


### Results with Final Hyperparameters

In [47]:
rand_params = {
    "n_estimators" : 15,
    "max_features" : 'auto',
    "max_depth" : 40,
    "min_samples_split" : 5,
    "min_samples_leaf" : 2
}

model_random3 = RandomForestRegressor(**rand_params)
train, test = model_testing(model_random3, X, y, 10)
print("training data: ", train)
print("testing data: ", test)

training data:  0.14329873827811637
testing data:  0.2882599974775974


# Gradient Boosting Models

## XGBoost

In [None]:
params = {'n_estimators': 500,
          'max_depth': 4,
          'min_samples_split': 5,
          'learning_rate': 0.01,
          'loss': 'ls'}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
err = mean_squared_error(y_test, y_pred)
print(np.sqrt(err)*100)

# Prediiction of test data results

In [45]:
final_model = pickle.load(open("trained_models/lin_reg.pickle", 'rb'))

In [56]:
training_data = pd.read_csv("data/training_data_with_features.csv")

In [53]:
test_data = pd.read_csv("data/testing_data_with_features.csv")
X_predict = test_data[["norm_views", "norm_comments", "norm_user_views", "norm_user_comments",
               "norm_country_views", "norm_country_comments", "norm_category_views", "norm_category_comments", 
               "cat1", "cat2", "cat3", "cat4", "cat5", "cat6", "cat7", "cat8", "cat9", "count1", "count2", 
               "count3", "count4", "count5", "count6", "count7", "count8", "count9", "count10", "count11", 
               "count12", "count13", "count14", "count15"]].values

In [54]:
y_predict = final_model.predict(X_predict)

In [57]:
y_mean = np.mean(training_data["#likes"].to_numpy())
y_std = np.std(training_data["#likes"].to_numpy())

In [59]:
test_data["#likes"] = np.abs(y_std*y_predict + y_mean)

In [60]:
req_data = test_data[["post_id", "user_id", "country", "category", "#views", "#comments", "#likes"]]

In [61]:
req_data.describe()

Unnamed: 0,post_id,user_id,country,#views,#comments,#likes
count,158541.0,158541.0,158541.0,158541.0,158541.0,158541.0
mean,264527.946128,20965.922172,6.703704,512843.1,2033.359,24129.65
std,152467.591423,12096.910208,4.425441,1716672.0,11958.8,75985.7
min,5.0,0.0,0.0,14.0,0.0,0.0579067
25%,132600.0,10474.0,3.0,62280.0,203.0,3469.572
50%,264590.0,21108.0,6.0,160629.0,570.0,9537.753
75%,396413.0,31457.0,11.0,434204.0,1534.0,22979.21
max,528455.0,41772.0,14.0,175155100.0,1219528.0,5932230.0


In [62]:
req_data.to_csv("final_likes_prediction.csv")