In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.utils import shuffle
import random

In [2]:
BinaryFeatures = ['Post Promotion Status','published_weekday_0','published_weekday_1','published_weekday_2',
                      'published_weekday_3', 'published_weekday_4','published_weekday_5','published_weekday_6',
                     'base_weekday_0','base_weekday_1','base_weekday_2','base_weekday_3','base_weekday_4',
                      'base_weekday_5','base_weekday_6']
DecimalFeatures = ['Page Popularity','Page Checkins','Page talking about', 'Page Category',
                       'extra_0','extra_1','extra_2','extra_3','extra_4','extra_5',
                       'extra_6','extra_7','extra_8','extra_9','extra_10',
                       'extra_11','extra_12','extra_13','extra_14','extra_15','extra_16',
                       'extra_17','extra_18','extra_19','extra_20','extra_21','extra_22','extra_23','extra_24',
                       'CC1','CC2','CC3','CC4','CC5','Base Time','Post Length','Post Share Count','H Local']

In [3]:
def normalizeF(X):
    nrows = X.shape[0]
    BF = X[BinaryFeatures] 
    Y = X['Target']
        
    df = X[DecimalFeatures]
    df = (df - df.mean()) / df.std()

    result = pd.concat([ df, BF, Y], axis=1) 

    return result

In [4]:
def update_W(x,y,w,step, nfeat):
    n = x.shape[0]   
    z = np.dot(x, w)
    yy = y - z
    jj = np.dot(x.T, yy)
    grad = (2/n) * jj

    new_w = w +  step * grad.reshape(nfeat,1)
    return new_w

In [5]:
def RMSE(y_pred, y_true):
    loss = np.subtract(y_true, y_pred)
    mse = np.sum((loss)**2) / len(y_true)
    rmse = math.sqrt(mse)
    return rmse

def R2(y_pred, y_true):
    loss = np.subtract( y_true, y_pred)

    SS_res = np.sum((loss)**2)
    SS_tot = np.sum((y_true - np.mean(y_true))**2)
    rez = 1 - SS_res/SS_tot
    return rez

In [6]:
def splitData(data, parts, i):
    data = shuffle(data)
    Y = data[['Target']]
    X = data[data.columns[:53]]
    
    step = round(len(data) / parts)
    
    new_sizestart = i * step
    if (i + 1 == parts):
        new_sizeend = round(len(data))
    else:
        new_sizeend = (i+1) * step
    
    x_train = X.drop(X.index[new_sizestart:new_sizeend])
    y_train = Y.drop(Y.index[new_sizestart:new_sizeend])
    
    x_test  = X[new_sizestart:new_sizeend]
    y_test = Y[new_sizestart:new_sizeend] 
    
    return x_train, y_train, x_test, y_test

In [7]:
df = pd.read_csv('facebook.csv', index_col='Index')
normalized_df = df.dropna()
data = normalizeF(normalized_df)


In [15]:
e = 10e-4
coef = 0.04
nfolds = 5
table = pd.DataFrame({})
nfeatures = data.shape[1] - 1
features = list(data.columns.values[:-1])

for i in range(nfolds):
    Xtrain, Ytrain, Xtest, Ytest = splitData(data, nfolds, i) 
    W = np.ones(nfeatures).reshape(nfeatures,1)    

    for j in range(1, 1000): 
        step = coef / np.sqrt(j)     
        Wnew = update_W(Xtrain, Ytrain, W, step, nfeatures)
        if np.max(abs(W - Wnew)) < e:
            break
        W = Wnew
    
    
    Xtrain = np.dot(Xtrain, Wnew)
    R2train = R2(Xtrain,Ytrain)
    RMSEtrain = RMSE(Xtrain,Ytrain)

    Xtest = np.dot(Xtest, Wnew)
    R2test = R2(Xtest,Ytest)
    RMSEtest = RMSE(Xtest,Ytest) 
    
    rez = [R2train[0], RMSEtrain, R2test[0], RMSEtest]
    print(rez)
    
    for feature1 in W.tolist():
        rez.append(feature1[0])
        
    
    col = pd.DataFrame({'%d'%(i+1): rez })
    table =  pd.concat([table, col], axis = 1)
    

table.index=[['R2_train', 'RMSE_train', 'R2_test', 'RMSE_test'] + features ]
table =  pd.concat([table, pd.DataFrame({'mean': table.mean(axis = 1) })],axis = 1)
table =  pd.concat([table, pd.DataFrame({'std': table.std(axis = 1) })],axis = 1)
   

[0.3074135571889417, 30.099899559666515, 0.34108120639428174, 26.50702690443571]
[0.33488865914562005, 27.966844063304933, 0.25731577661622296, 34.42085114891004]
[0.31403536852380465, 29.447963266165413, 0.3232755210987932, 28.99632023054935]
[0.31412540264600297, 29.10088288472882, 0.31756436914251174, 30.467480930788014]
[0.31488843253189247, 29.910869487582406, 0.31940296767740417, 27.055447503013024]


In [16]:

table

Unnamed: 0,1,2,3,4,5,mean,std
R2_train,0.307414,0.334889,0.314035,0.314125,0.314888,0.31707,0.00931
RMSE_train,30.0999,27.966844,29.447963,29.100883,29.910869,29.305292,0.755209
R2_test,0.341081,0.257316,0.323276,0.317564,0.319403,0.311728,0.028456
RMSE_test,26.507027,34.420851,28.99632,30.467481,27.055448,29.489425,2.84034
Page Popularity,-0.044792,0.143385,0.155691,-0.036998,-0.313881,-0.019319,0.170193
Page Checkins,-0.542762,-0.622796,-0.582393,-0.646585,-0.67534,-0.613975,0.046852
Page talking about,-1.303251,-1.433319,-1.269544,-0.933099,-1.16171,-1.220185,0.16769
Page Category,-0.084676,-0.004979,-0.070425,0.022924,0.001457,-0.02714,0.042425
extra_0,-0.201348,-0.065895,-0.174274,-0.34156,0.049835,-0.146648,0.131812
extra_1,0.501333,0.295393,0.478941,0.352154,0.305685,0.386701,0.086883
