# Sample Models

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.metrics import mean_squared_error

# Seed passed as `random_state` to the Ridge and Lasso estimators below.
RANDOM_SEED = 1



In [22]:
# Load the Facebook post data; the CSV uses ';' as its field separator.
csv_path = 'dataset_Facebook.csv'
df = pd.read_csv(csv_path, sep=';')

In [23]:
# List the column names of the freshly loaded frame.
df.columns

Index(['Page total likes', 'Type', 'Category', 'Post Month', 'Post Weekday',
       'Post Hour', 'Paid', 'Lifetime Post Total Reach',
       'Lifetime Post Total Impressions', 'Lifetime Engaged Users',
       'Lifetime Post Consumers', 'Lifetime Post Consumptions',
       'Lifetime Post Impressions by people who have liked your Page',
       'Lifetime Post reach by people who like your Page',
       'Lifetime People who have liked your Page and engaged with your post',
       'comment', 'like', 'share', 'Total Interactions'],
      dtype='object')

In [24]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions
0,139441,Photo,2,12,4,3,0.0,2752,5091,178,109,159,3078,1640,119,4,79.0,17.0,100
1,139441,Status,2,12,3,10,0.0,10460,19057,1457,1361,1674,11710,6112,1108,5,130.0,29.0,164
2,139441,Photo,3,12,3,3,0.0,2413,4373,177,113,154,2812,1503,132,0,66.0,14.0,80
3,139441,Photo,2,12,2,10,1.0,50128,87991,2211,790,1119,61027,32048,1386,58,1572.0,147.0,1777
4,139441,Photo,2,12,2,3,0.0,7244,13594,671,410,580,6228,3200,396,19,325.0,49.0,393


In [25]:
# One-hot encode the post 'Type' and append the indicator columns to the frame.
type_dummies = pd.get_dummies(df['Type'])
df = df.join(type_dummies)

[Link to paper](http://www.math-evry.cnrs.fr/_media/members/aguilloux/enseignements/m1mint/moro2016.pdf) — the dataset distinguishes three post categories; see Table 1 of the paper (and possibly Fig. 11).


In [26]:
# Category codes (as strings) and their labels: action, product, inspiration.
categories = {"1": "Action", "2": "Product", "3": "Inspiration"}

# Swap each numeric code for its label; a code missing from `categories`
# raises KeyError.
# NOTE: this overwrites 'Category' in place, so the cell is not re-runnable
# without reloading `df`.
category_codes = df['Category'].astype(str)
df['Category'] = category_codes.apply(lambda code: categories[code])

# Append one indicator column per category label.
category_dummies = pd.get_dummies(df['Category'])
df = df.join(category_dummies)

In [28]:
# The original categorical columns are now represented by their indicator
# columns, so remove them.
encoded_source_cols = ['Type', 'Category']
df = df.drop(encoded_source_cols, axis=1)

In [29]:
# Re-list the columns after adding the indicator columns and dropping 'Type' / 'Category'.
df.columns

Index(['Page total likes', 'Post Month', 'Post Weekday', 'Post Hour', 'Paid',
       'Lifetime Post Total Reach', 'Lifetime Post Total Impressions',
       'Lifetime Engaged Users', 'Lifetime Post Consumers',
       'Lifetime Post Consumptions',
       'Lifetime Post Impressions by people who have liked your Page',
       'Lifetime Post reach by people who like your Page',
       'Lifetime People who have liked your Page and engaged with your post',
       'comment', 'like', 'share', 'Total Interactions', 'Link', 'Photo',
       'Status', 'Video', 'Action', 'Inspiration', 'Product'],
      dtype='object')

In [40]:
# Print the name of every column that contains at least one missing value.
# `isna().any()` tests the missing-value mask itself rather than the
# truthiness of the missing values, so None / pd.NA entries are handled the
# same way as float NaN.
nan_mask = df.isna().any()
for column in nan_mask[nan_mask].index:
    print(column)

Paid
like
share


In [45]:
# Discard every row that is missing a value in any of the three columns
# reported by the NaN check.
nan_columns = ['Paid', 'like', 'share']
df = df.dropna(subset=nan_columns)

In [59]:
# Predictors: page size, posting time, the paid flag, and the indicator
# columns for post type and category.
# NOTE(review): every indicator level is included (all four types, all three
# categories), so each group sums to 1 and is collinear with the intercept.
# Consider dropping one level per group — confirm before changing, as it
# would alter the recorded Ridge/Lasso results.
X_cols = ['Page total likes', 'Post Month', 'Post Weekday', 'Post Hour', 'Paid',
          'Link', 'Photo', 'Status', 'Video', 'Action', 'Inspiration', 'Product']

# Regression target.
y_col = 'Total Interactions'

# Hold out 40% of the rows for testing. The split is seeded with the
# notebook-wide RANDOM_SEED (instead of a separate literal) so that all
# randomness is controlled from one place.
X_train, X_test, y_train, y_test = train_test_split(
    df[X_cols],
    df[y_col],
    test_size=.4,
    random_state=RANDOM_SEED
)

## Linear Regression

In [60]:
# Ordinary least squares baseline fitted on the training split.
lm = LinearRegression().fit(X_train, y_train)
lm

LinearRegression()

In [61]:
# Mean squared error of the linear model's predictions on the test split.
lm_test_pred = lm.predict(X_test)
mean_squared_error(y_test, lm_test_pred)

78612.77628356002

In [62]:
# Regressor `score` is the coefficient of determination (R^2), here on the test split.
lm.score(X=X_test, y=y_test)

0.002974511885412867

## Ridge

In [63]:
# Ridge regression with default settings, fitted on the training split.
# NOTE(review): per the scikit-learn docs, `random_state` only affects the
# 'sag'/'saga' solvers — confirm whether it matters with the default solver.
ridge = Ridge(random_state=RANDOM_SEED).fit(X_train, y_train)
ridge

Ridge(random_state=1)

In [64]:
# Mean squared error of the ridge model's predictions on the test split.
ridge_test_pred = ridge.predict(X_test)
mean_squared_error(y_test, ridge_test_pred)

78486.4952081143

In [65]:
# Regressor `score` is the coefficient of determination (R^2), here on the test split.
ridge.score(X=X_test, y=y_test)

0.004576102070089294

## Lasso

In [66]:
# Lasso regression with a raised iteration cap, fitted on the training split.
# NOTE(review): per the scikit-learn docs, `random_state` is only used when
# selection='random' — confirm whether it matters with the default selection.
lasso = Lasso(
    max_iter=10000,
    fit_intercept=True,
    random_state=RANDOM_SEED,
).fit(X_train, y_train)
lasso

Lasso(max_iter=10000, random_state=1)

In [67]:
# Mean squared error of the lasso model's predictions on the test split.
lasso_test_pred = lasso.predict(X_test)
mean_squared_error(y_test, lasso_test_pred)

78155.66746658705

In [68]:
# Regressor `score` is the coefficient of determination (R^2), here on the test split.
lasso.score(X=X_test, y=y_test)

0.008771904661875163