In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
column_names = ['Page Popularity', 'Page Checkins', 'Page Talking About', 'Page Category', 
           'Derived_1', 'Derived_2', 'Derived_3', 'Derived_4', 'Derived_5', 
           'Derived_6', 'Derived_7', 'Derived_8', 'Derived_9', 'Derived_10',
           'Derived_11', 'Derived_12', 'Derived_13', 'Derived_14', 'Derived_15', 
           'Derived_16', 'Derived_17', 'Derived_18', 'Derived_19', 'Derived_20', 
           'Derived_21', 'Derived_22', 'Derived_23', 'Derived_24', 'Derived_25', 
           'CC1', 'CC2', 'CC3', 'CC4', 'CC5',
           'Base Time', 'Post Length', 'Post Share Count', 'Post Promotion Status', 'Local Hours',
           'Sunday Published', 'Monday Published', 'Tuesday Published', 'Wednesday Published', 'Thursday Published', 'Friday Published', 'Saturday Published', 
           'Sunday Base', 'Monday Base', 'Tuesday Base', 'Wednesday Base', 'Thursday Base', 'Friday Base', 'Saturday Base', 
           'target']

In [3]:
data = pd.read_csv('facebook_comment.csv', header=None, names=column_names)

In [4]:
data.head()

Unnamed: 0,Page Popularity,Page Checkins,Page Talking About,Page Category,Derived_1,Derived_2,Derived_3,Derived_4,Derived_5,Derived_6,...,Friday Published,Saturday Published,Sunday Base,Monday Base,Tuesday Base,Wednesday Base,Thursday Base,Friday Base,Saturday Base,target
0,634995,0,463,1,0.0,806.0,11.291045,1.0,70.495138,0.0,...,0,0,0,0,0,0,0,0,1,0
1,634995,0,463,1,0.0,806.0,11.291045,1.0,70.495138,0.0,...,0,0,0,0,0,0,0,1,0,0
2,634995,0,463,1,0.0,806.0,11.291045,1.0,70.495138,0.0,...,1,0,0,0,0,0,0,0,1,0
3,634995,0,463,1,0.0,806.0,11.291045,1.0,70.495138,0.0,...,1,0,0,1,0,0,0,0,0,0
4,634995,0,463,1,0.0,806.0,11.291045,1.0,70.495138,0.0,...,0,0,0,0,0,1,0,0,0,0


## Normalization

In [5]:
from sklearn.preprocessing import StandardScaler

Заменим все NaN на 0.

In [6]:
if data.isna().any().sum() != 0:
    data.fillna(value=0, inplace=True)

Удалим все столбцы, в которых у всех элементов одинаковые значения.

In [7]:
drop_columns = []

In [8]:
print('Data Size: ', data.shape)

for column in data.columns:
    size = len(np.unique(data[column]))
    if size == 1:
        drop_columns += [column]

print('Drop Columns: ', drop_columns)

Data Size:  (40949, 54)
Drop Columns:  ['Post Promotion Status']


In [9]:
data = data.drop(columns=drop_columns)

In [10]:
data.dtypes

Page Popularity          int64
Page Checkins            int64
Page Talking About       int64
Page Category            int64
Derived_1              float64
Derived_2              float64
Derived_3              float64
Derived_4              float64
Derived_5              float64
Derived_6              float64
Derived_7              float64
Derived_8              float64
Derived_9              float64
Derived_10             float64
Derived_11             float64
Derived_12             float64
Derived_13             float64
Derived_14             float64
Derived_15             float64
Derived_16             float64
Derived_17             float64
Derived_18             float64
Derived_19             float64
Derived_20             float64
Derived_21             float64
Derived_22             float64
Derived_23             float64
Derived_24             float64
Derived_25             float64
CC1                      int64
CC2                      int64
CC3                      int64
CC4     

Нормализуем каждую из фич

In [11]:
old_columns = data.drop(columns=['Local Hours', 'target']).columns
new_columns = []

In [12]:
for column in old_columns:
    new_column = column + ' Norm'
    new_columns += [new_column]
    data[new_column] = StandardScaler().fit_transform(np.array(data[column]).reshape(-1,1))

In [13]:
data.shape

(40949, 104)

Преобразуем время

In [14]:
print(np.unique(data['Local Hours']))

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24]


In [15]:
def make_harmonic_features(value, period=24):
    value *= 2 * np.pi / period
    return np.cos(value), np.sin(value)

data['Harmonic Cos Local Hours'] = make_harmonic_features(data['Local Hours'], 24)[0]
data['Harmonic Sin Local Hours'] = make_harmonic_features(data['Local Hours'], 24)[1]

old_columns += ['Local Hours']
new_columns += ['Harmonic Cos Local Hours', 'Harmonic Sin Local Hours']

In [16]:
norm_data = data[new_columns]
norm_data.head()

Unnamed: 0,Page Popularity Norm,Page Checkins Norm,Page Talking About Norm,Page Category Norm,Derived_1 Norm,Derived_2 Norm,Derived_3 Norm,Derived_4 Norm,Derived_5 Norm,Derived_6 Norm,...,Saturday Published Norm,Sunday Base Norm,Monday Base Norm,Tuesday Base Norm,Wednesday Base Norm,Thursday Base Norm,Friday Base Norm,Saturday Base Norm,Harmonic Cos Local Hours,Harmonic Sin Local Hours
0,-0.100037,-0.227075,-0.399678,-1.165633,-0.076435,0.730167,-0.511079,-0.495224,0.037159,-0.021827,...,-0.398309,-0.406216,-0.390827,-0.399501,-0.417774,-0.421477,-0.409962,2.429384,1.0,0.997253
1,-0.100037,-0.227075,-0.399678,-1.165633,-0.076435,0.730167,-0.511079,-0.495224,0.037159,-0.021827,...,-0.398309,-0.406216,-0.390827,-0.399501,-0.417774,-0.421477,2.439248,-0.411627,1.0,0.997253
2,-0.100037,-0.227075,-0.399678,-1.165633,-0.076435,0.730167,-0.511079,-0.495224,0.037159,-0.021827,...,-0.398309,-0.406216,-0.390827,-0.399501,-0.417774,-0.421477,-0.409962,2.429384,1.0,0.997253
3,-0.100037,-0.227075,-0.399678,-1.165633,-0.076435,0.730167,-0.511079,-0.495224,0.037159,-0.021827,...,-0.398309,-0.406216,2.558674,-0.399501,-0.417774,-0.421477,-0.409962,-0.411627,1.0,0.997253
4,-0.100037,-0.227075,-0.399678,-1.165633,-0.076435,0.730167,-0.511079,-0.495224,0.037159,-0.021827,...,-0.398309,-0.406216,-0.390827,-0.399501,2.393637,-0.421477,-0.409962,-0.411627,1.0,0.997253


## GradientBoostingRegressor

In [17]:
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2

In [18]:
def RMSE(y_test, y_pred):
    return np.sqrt(MSE(y_test, y_pred))

In [19]:
X = np.array(norm_data)
y = np.array(data['target'])

X.shape, y.shape

((40949, 53), (40949,))

In [20]:
model_lr = GradientBoostingRegressor()
model_coefs = []
model_stats = []

kf = KFold(n_splits=5)
kf.get_n_splits(X)
for train_index, test_index in tqdm(kf.split(X)):
    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]
    
    model_lr.fit(X_train, y_train)
    y_pred = model_lr.predict(X_test)
    
    model_coefs += [model_lr.feature_importances_]
    model_stats += [[np.average(y_test-y_pred), np.std(y_test-y_pred), RMSE(y_test, y_pred), MSE(y_test, y_pred), R2(y_test, y_pred)]]   

5it [00:53, 10.68s/it]


In [21]:
pd.DataFrame(model_coefs)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,43,44,45,46,47,48,49,50,51,52
0,0.005388,0.004389,0.000938,0.00071,6.78487e-07,0.000966,0.002491,0.000222,0.011144,0.0,...,0.0,0.0,0.0,0.000189,0.002213,4.6e-05,0.0,0.0,0.0,0.000278
1,0.004918,0.007381,0.00589,0.004446,0.0001326193,0.000369,0.006334,0.007883,0.001915,0.001143,...,0.0,0.000416,0.0,0.000173,0.0,8e-06,0.0,0.0,5e-06,7.6e-05
2,0.001552,0.00725,0.004855,0.0,0.004593516,0.000516,0.011068,0.000334,0.006733,0.0,...,6e-06,0.0,0.0,0.000102,0.01262,0.0,0.0,7.3e-05,0.0,5e-06
3,0.004486,0.005215,0.002251,0.006924,0.0,0.002312,0.000183,0.001005,0.004244,0.000133,...,0.0,0.0,0.0,0.0,0.002916,0.0,0.0,0.0,2e-05,9.4e-05
4,0.003103,0.006669,0.009523,0.001332,8.827173e-05,0.003746,0.001138,0.001169,0.005691,0.0,...,0.0,0.0,0.0,0.000153,0.00774,0.0,0.0,0.0,3.2e-05,3.5e-05


In [22]:
pd.DataFrame(model_stats, columns = ['E', 'STD', 'RMSE', 'MSE', 'R2'])

Unnamed: 0,E,STD,RMSE,MSE,R2
0,0.705401,21.51289,21.524451,463.302006,0.705519
1,0.531771,23.616758,23.622744,558.034034,0.540829
2,0.145453,20.464792,20.465309,418.828857,0.713762
3,-1.044603,26.043865,26.064806,679.374105,0.467025
4,-0.790837,17.629802,17.647531,311.435358,0.592913


## GradientDescent

In [23]:
class GradientDescent:

    def __init__(self,
                 learning_rate=1e-4, epochs=1e4, min_weight_dist=1e-4,
                 weight=None, c=0):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.min_weight_dist = min_weight_dist
        self.weight = weight
        self.c = c


    def predict(self, X, w=None, c=None):
        if w is None:
            w = self.weight
        if c is None:
            c = self.c

        if len(X.shape) == 1:
            X = X.reshape(-1, 1)

        return np.dot(X, w) + c

    def fit(self, X, y):
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)

        self.weight = np.zeros(X.shape[1])
        self.c = 0
        i = 0
        while (i < self.epochs):
            y_pred = self.predict(X)

            D_weight = (2 * self.learning_rate / len(y)) * np.dot(X.T, y - y_pred)
            D_c = (2 * self.learning_rate / len(y)) * sum(y - y_pred)

            self.weight += D_weight
            self.c += D_c

            if D_weight.sum() + D_c < self.min_weight_dist:
                return

            i += 1

    @property
    def feature_importances_(self):
        return np.append([self.c], self.weight)

In [24]:
model_descent = GradientDescent()
model_coefs = []
model_stats = []

kf = KFold(n_splits=5)
kf.get_n_splits(X)
for train_index, test_index in tqdm(kf.split(X)):
    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]
    
    model_descent.fit(X_train, y_train)
    y_pred = model_descent.predict(X_test)
    
    model_coefs += [model_descent.feature_importances_]
    model_stats += [[np.average(y_test-y_pred), np.std(y_test-y_pred), RMSE(y_test, y_pred), MSE(y_test, y_pred), R2(y_test, y_pred)]]   

5it [09:47, 121.03s/it]


In [25]:
pd.DataFrame(model_coefs)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
0,2.401503,-0.391127,-0.702441,-0.510843,0.034823,-0.144732,0.104243,0.520074,0.556833,0.604582,...,-0.186904,-0.098611,0.182967,-0.08318,0.237693,-0.093086,-0.034688,-0.108096,2.489703,2.454422
1,2.34396,-0.140797,-0.609703,-0.370822,0.081393,0.095103,0.069445,0.529124,0.730995,0.40128,...,-0.108899,-0.160534,0.187174,-0.129089,0.079676,0.091383,0.006047,-0.074377,2.407923,2.383851
2,2.371047,-0.105937,-0.590682,-0.381081,-0.057905,-0.086587,0.030448,0.562988,0.579286,0.493823,...,-0.095451,-0.195514,0.11497,-0.088784,0.299727,-0.037894,-0.004103,-0.090426,2.436039,2.407095
3,2.506442,0.104888,-0.889387,-0.090727,-0.027219,-0.221557,-0.022319,0.564328,0.392563,0.631927,...,-0.097769,-0.05152,0.029827,-0.086398,0.387564,-0.045359,-0.036387,-0.202372,2.603331,2.553721
4,2.444726,-0.114523,-0.357001,-0.562574,0.028568,0.066666,0.07031,0.522061,0.577709,0.482703,...,-0.211011,-0.162242,0.143106,-0.058426,0.2495,-0.078694,0.128363,-0.220044,2.530515,2.490387


In [26]:
pd.DataFrame(model_stats, columns = ['E', 'STD', 'RMSE', 'MSE', 'R2'])

Unnamed: 0,E,STD,RMSE,MSE,R2
0,0.25686,31.42311,31.42416,987.477826,0.372346
1,0.812682,29.627624,29.638768,878.456556,0.277174
2,0.576458,31.784752,31.789979,1010.602756,0.309328
3,-1.346139,32.18664,32.214778,1037.791915,0.185843
4,-0.457654,22.666763,22.671382,513.991577,0.328145
