In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LassoCV
from sklearn.preprocessing import StandardScaler

In [2]:
reddit = pd.read_csv('../data/reddit_data_cleaned.csv')
df_tvec_10 = pd.read_csv('../data/title_tfidf_10.csv')
df_tvec_25 = pd.read_csv('../data/title_tfidf_25.csv')
df_tvec_50 = pd.read_csv('../data/title_tfidf_50.csv')
df_tvec_100 = pd.read_csv('../data/title_tfidf_100.csv')

In [3]:
#Setting up a few dummy variables

reddit = pd.get_dummies(reddit, columns=['posted_hour'])
reddit = pd.get_dummies(reddit, columns=['posted_weekday'])
# reddit = pd.get_dummies(reddit, columns=['subreddit'])

In [4]:
#merge the dataframes
reddit_tfidf_10 = reddit.merge(df_tvec_10, on='post_id', how='left')
#drop redundant columns
reddit_tfidf_10.drop(columns = ['stemmed_title'], inplace = True)

#merge the dataframes
reddit_tfidf_25 = reddit.merge(df_tvec_25, on='post_id', how='left')
#drop redundant columns
reddit_tfidf_25.drop(columns = ['stemmed_title'], inplace = True)

#merge the dataframes
reddit_tfidf_50 = reddit.merge(df_tvec_50, on='post_id', how='left')
#drop redundant columns
reddit_tfidf_50.drop(columns = ['stemmed_title'], inplace = True)

#merge the dataframes
reddit_tfidf_100 = reddit.merge(df_tvec_100, on='post_id', how='left')
#drop redundant columns
reddit_tfidf_100.drop(columns = ['stemmed_title'], inplace = True)

# Model Baseline Accuracy

In [5]:
len(reddit[reddit['target'] == 1])/len(reddit)

0.5059046815689583

The baseline accuracy for my model is 50.14% (subject to change as I collect more data), if we predict that all posts are successful since there are slightly more successful posts that failed posts. For context, succesful is defined as getting above the median number of comments.

# RF Model Without TFIDF Terms

In [6]:
#columns to drop for modeling
drop = ['title', 'hours_ago', 'scraped_time', 'target', 'posted_time', 
        'post_id', 'comments', 'stemmed_title','subreddit']

#creating our X and y
X = reddit.drop(columns = drop)
y = reddit['target']

#random forest model with 100 estimators
my_forest = RandomForestClassifier(class_weight='balanced',
                                  n_estimators=100, 
                                  max_depth=10,
                                  min_samples_split=50,                        
                                  n_jobs=-1,
                                  random_state = 48)

score = cross_val_score(my_forest, X, y)
print(np.mean(score)+2*np.std(score), np.mean(score)-2*np.std(score))
print(np.mean(score))

0.7320479028656166 0.6904186551994136
0.7112332790325151


After playing around with the parameters, I settled on a model that I believe will be less sususceptible to overfitting, with a relatively large number of min_samples_split and limiting max depth to 10. 

This model has a score of 0.71, with a 95% CI of between 0.73 and 0.68

However, if we are asking the question: 'how do you make a successful reddit post' then it feels disingenuous to include upvotes as an independent variable, because as a content creator you have no control over the number of upvotes that your post is receiving. 

In [7]:
#RF model excluding 'upvotes' from X

#columns to drop for modeling
drop = ['title', 'hours_ago', 'scraped_time', 'target', 'posted_time', 
        'post_id', 'comments', 'stemmed_title','subreddit', 'upvotes']

#creating our X and y
X = reddit.drop(columns = drop)
y = reddit['target']

#random forest model with 100 estimators
my_forest = RandomForestClassifier(class_weight='balanced',
                                  n_estimators=100, 
                                  max_depth=10,
                                  min_samples_split=50,                        
                                  n_jobs=-1,
                                  random_state = 48)

score = cross_val_score(my_forest, X, y)
print(np.mean(score)+2*np.std(score), np.mean(score)-2*np.std(score))
print(np.mean(score))

0.6100832293692197 0.5590420252044613
0.5845626272868405


Unsuprisingly, our models performance decreases significantly. We are now looking at a score of 0.57 with a 95% CI of between 0.53 and 0.62

# RF Model Including TFIDF Terms

In [8]:
#RF model with top 10 TFIDF terms 

#columns to drop for modeling
drop = ['title', 'hours_ago', 'scraped_time', 'target', 'posted_time', 
        'post_id', 'comments', 'subreddit', 'upvotes']

#creating our X and y
X = reddit_tfidf_10.drop(columns = drop)
y = reddit_tfidf_10['target']

my_forest = RandomForestClassifier(class_weight='balanced',
                                  n_estimators=100, 
                                  max_depth=10,
                                  min_samples_split=50,                        
                                  n_jobs=-1,
                                  random_state=48)

score = cross_val_score(my_forest, X, y)
print(np.mean(score)+2*np.std(score), np.mean(score)-2*np.std(score))
print(np.mean(score))

0.6070745341954478 0.5541786414143866
0.5806265878049172


Here we see that including the TFIDF terms does not really change the performance of our model. The CI narrows ever so slightly. Next we will check what happens when including more TFIDF terms

In [9]:
#RF model with top 100 TFIDF terms

#columns to drop for modeling
drop = ['title', 'hours_ago', 'scraped_time', 'target', 'posted_time', 
        'post_id', 'comments', 'subreddit', 'upvotes']

#creating our X and y
X = reddit_tfidf_100.drop(columns = drop)
y = reddit_tfidf_100['target']

my_forest = RandomForestClassifier(class_weight='balanced',
                                  n_estimators=100, 
                                  max_depth=10,
                                  min_samples_split=50,                        
                                  n_jobs=-1,
                                  random_state=48)

score = cross_val_score(my_forest, X, y)
print(np.mean(score)+2*np.std(score), np.mean(score)-2*np.std(score))
print(np.mean(score))

0.6094494395890014 0.5560216622071247
0.5827355508980631


Again, we see that including more TFIDF terms does not improve the performance of our model. I believe this to be due to the fact that there are so many different subreddits, with so many different focuses, and titles are very short. On average, the titles in our data set are only 10 words long. For this reason, I will be choosing to exclude titles from my random forest model. 

# Logistic Regression Models

In [10]:
#logistic Regression without TFIDF terms

#columns to drop for modeling
drop = ['title', 'hours_ago', 'scraped_time', 'target', 'posted_time', 
        'post_id', 'comments', 'stemmed_title', 'subreddit', 'upvotes']

#creating our X and y
X = reddit.drop(columns = drop)
y = reddit['target']

logreg = LogisticRegression(n_jobs=-1)
score = cross_val_score(logreg, X, y)
print(np.mean(score)+2*np.std(score), np.mean(score)-2*np.std(score))
print(np.mean(score))

0.6141428798201969 0.5475313473811785
0.5808371136006877


In [11]:
#logistic regression with TFIDF terms

#columns to drop for modeling
drop = ['title', 'hours_ago', 'scraped_time', 'target', 'posted_time', 
        'post_id', 'comments', 'subreddit', 'upvotes']

#creating our X and y
X = reddit_tfidf_10.drop(columns = drop)
y = reddit_tfidf_10['target']

logreg = LogisticRegression(n_jobs=-1)
score = cross_val_score(logreg, X, y)
print(np.mean(score)+2*np.std(score), np.mean(score)-2*np.std(score))
print(np.mean(score))

0.6149191899516084 0.5439425924408468
0.5794308911962276


In [12]:
#logistic regression with TFIDF terms

#columns to drop for modeling
drop = ['title', 'hours_ago', 'scraped_time', 'target', 'posted_time', 
        'post_id', 'comments', 'subreddit', 'upvotes']

#creating our X and y
X = reddit_tfidf_100.drop(columns = drop)
y = reddit_tfidf_100['target']

logreg = LogisticRegression(n_jobs=-1)
score = cross_val_score(logreg, X, y)
print(np.mean(score)+2*np.std(score), np.mean(score)-2*np.std(score))
print(np.mean(score))

0.6052458297820817 0.5463083386805465
0.5757770842313141


Our logistic model has a score of 0.57 with a 95% CI of between 0.52 and 0.62.

Similar to our random forest model, we see that our model's performance does not change significantly with the top 10 TFIDF terms, and worsens slightly with the top 100. I see no reason to include them given the current state of our model. This indicates that, given our data set, there are not any specific key words to include that will improve or hurt your chances of having a successful post. I believe this would change if we looked at data across a wider time frame, as posts related to current events are more likely to succeed, and all of our data was scraped over the course of a few weeks around the winter holidays. 

# Lasso Regression (Logistic)

In [13]:
# Lasso Regression with standard scaled variables

#columns to drop for modeling
drop = ['title', 'hours_ago', 'scraped_time', 'target', 'posted_time', 
        'post_id', 'comments', 'stemmed_title', 'subreddit', 'upvotes']

#creating our X and y
X = reddit.drop(columns = drop)
y = reddit['target']

#scaling the data
sc = StandardScaler()
Z = sc.fit_transform(X)

#creating a list of possible alphas
l_alphas = np.logspace(-10, 0, 50)

#testing for the optimal alpha
lasso_cv = LassoCV(alphas = l_alphas, cv = 5, max_iter = 50000)

lasso_cv.fit(Z, y)

LassoCV(alphas=array([1.00000000e-10, 1.59985872e-10, 2.55954792e-10, 4.09491506e-10,
       6.55128557e-10, 1.04811313e-09, 1.67683294e-09, 2.68269580e-09,
       4.29193426e-09, 6.86648845e-09, 1.09854114e-08, 1.75751062e-08,
       2.81176870e-08, 4.49843267e-08, 7.19685673e-08, 1.15139540e-07,
       1.84206997e-07, 2.94705170e-07, 4.71486636e-07, 7.54312006e-07,
       1.20679264e-06, 1.93069773e-0...
       5.17947468e-05, 8.28642773e-05, 1.32571137e-04, 2.12095089e-04,
       3.39322177e-04, 5.42867544e-04, 8.68511374e-04, 1.38949549e-03,
       2.22299648e-03, 3.55648031e-03, 5.68986603e-03, 9.10298178e-03,
       1.45634848e-02, 2.32995181e-02, 3.72759372e-02, 5.96362332e-02,
       9.54095476e-02, 1.52641797e-01, 2.44205309e-01, 3.90693994e-01,
       6.25055193e-01, 1.00000000e+00]),
        cv=5, max_iter=50000)

In [14]:
var_coefs = list(zip(X.columns, list(lasso_cv.coef_)))

#creating a list of variables to drop
to_drop = []
for item_pair in var_coefs:
    if item_pair[1] == 0:
        to_drop.append(item_pair[0])


In [15]:
to_drop

['contains_video',
 'posted_hour_7',
 'posted_hour_8',
 'posted_hour_12',
 'posted_hour_13',
 'posted_hour_14',
 'posted_hour_17',
 'posted_hour_21',
 'posted_hour_22',
 'posted_weekday_Monday',
 'posted_weekday_Thursday',
 'posted_weekday_Wednesday']

Our lasso regression model zeros out several variables that do not have a significant impact on a post's chances at suceeding. Namely we see that whether or not a post contains a video does not have a significant impact, a list of hours that do not have a significant impact, and that monday and wednesday do not have a significant impact. 

# Final Logistic Model

In [16]:
#creating a regression with the reduced dataframe

#columns to drop for modeling
drop = ['title', 'hours_ago', 'scraped_time', 'target', 'posted_time', 
        'post_id', 'comments', 'stemmed_title', 'subreddit', 'upvotes']

#creating our X and y

X = X.drop(columns = to_drop)
sc = StandardScaler()
Z = sc.fit_transform(X)

logreg = LogisticRegression()
score = cross_val_score(logreg, Z, y)

score = cross_val_score(logreg, Z, y)
print(np.mean(score)+2*np.std(score), np.mean(score)-2*np.std(score))
print(np.mean(score))

0.6254402508496129 0.5560582294273337
0.5907492401384733


Our final model has a score of 0.57 with a 95% CI of between 0.53 and 0.62

In [17]:
cross_validate(logreg, Z, y)

{'fit_time': array([0.01196837, 0.00997353, 0.01396251, 0.00897574, 0.01097083]),
 'score_time': array([0.00099683, 0.00099707, 0.0009985 , 0.00099683, 0.        ]),
 'test_score': array([0.59205903, 0.55852373, 0.60351494, 0.59156415, 0.60808436])}

In [18]:
#selecting a model to examine the coefficients
#we select the third model as it has the score closest to the mean
final_results = cross_validate(logreg, Z, y, return_estimator=True)
final_logreg = final_results['estimator'][2]

In [19]:
coefficient_list = list(zip(X.columns, final_logreg.coef_[0]))
coefficient_df = pd.DataFrame(zip(X.columns, final_logreg.coef_[0]))
coefficient_df = coefficient_df.rename(columns = {0:'Variable',1:'Coefficient'})
coefficient_df.sort_values('Coefficient', ascending = False).head(20)

Unnamed: 0,Variable,Coefficient
1,text_only,0.311264
2,title_len,0.165071
15,posted_hour_18,0.045607
11,posted_hour_10,0.041421
12,posted_hour_11,0.03897
17,posted_hour_20,0.038668
16,posted_hour_19,0.036068
14,posted_hour_16,0.024141
10,posted_hour_9,0.02253
22,posted_weekday_Sunday,0.009033


Examining our coefficients, we can determine that to give our post the best chance of succeeding, it should be a text only post, posted on a thursday, around 10-11 am. The worst things we could do for our chances of having a successful post would be posting at an image on a friday between 11pm and midnight.

It should be noted that our final model does not have a lot of predictive power, but ultimately what we are trying to predict is human behaviour. We are trying to predict what people like, and how they will interact with a post. When the scope of our analysis is the front page, it is very difficult to determine what will succeed. I don't even have a strong understanding of the algorithm used to determine what ends up on the front page. This project would likely be much more successful if we took a deep dive into a single subreddit, and tried to determine what attributes helped a post become successful within the boundaries of that subreddit. 

If we were to narrow the scope of our analysis in this way, I believe that the TFIDF terms would become much more powerful predictors. For example, in the anti-work subreddit, I would imagine that posts with much more negative titles would tend to do better, as the people that are members of that subreddit love to see people talking trash about their bosses and explaining their unpleasant working conditions. 