In [124]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from nba_creation_date import CreationDate, date_string_to_datetime

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import feature_selection
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [248]:
# load data: read the training spreadsheet and shorten the follower-count column name
df = pd.read_excel('data/training_set.xlsx').rename(
    columns={'Followers at Posting': 'Followers'}
)
# video posts only (rows whose 'Type' equals 'Video')
is_video = df['Type'].eq('Video')
df_vid = df.loc[is_video]
df.head()

Unnamed: 0,Engagements,Followers,Created,Type,Description
0,502093,36984682,2019-05-21 23:30:51 EDT,Video,The @raptors bench trio of @sergeibaka @norman...
1,603380,36984682,2019-05-21 22:53:33 EDT,Video,@kyle_lowry7 pulls from deep for the @raptors ...
2,603380,36984682,2019-05-21 22:19:58 EDT,Video,@k_mid22 with some english on the @bucks dime!
3,725100,36984682,2019-05-21 22:02:41 EDT,Video,Kawhi punches it home with the left on TNT!
4,661446,36984682,2019-05-21 20:47:49 EDT,Video,@giannis_an34 goes baseline early to rock the ...


In [249]:
# Function to clean word data--lowercases, strips the '@' character, removes English
# stopwords; digits and underscores are kept (e.g. '@kyle_lowry7' -> 'kyle_lowry7')
def clean_words(sentences):
    """Normalise raw text entries into space-separated, lowercase, stopword-free tokens.

    Parameters
    ----------
    sentences : sized iterable
        Raw text entries (e.g. a pandas Series of descriptions). Each entry is
        converted with ``str``; missing entries (``None`` or float NaN) are
        treated as empty text instead of becoming the literal token 'nan'.

    Returns
    -------
    numpy.ndarray
        Object array with ``len(sentences)`` elements; element ``i`` is the cleaned
        string for the ``i``-th entry (``''`` when nothing survives cleaning).
    """
    # Build the stopword lookup once per call; set membership avoids scanning a list per token.
    stop_words = set(stopwords.words('english'))
    words_clean = np.full(len(sentences), None)
    for i, words in enumerate(sentences):
        if words is None or (isinstance(words, float) and np.isnan(words)):
            words_clean[i] = ''
            continue
        # Drop '@' first so handles survive as plain tokens, then split on runs of non-word chars.
        word_list = re.split(r'\W+', str(words).replace('@', ''))
        lowered = (word.lower() for word in word_list)
        # Splitting can yield empty strings at the edges; skip them along with stopwords.
        kept = [word for word in lowered if word and word not in stop_words]
        words_clean[i] = ' '.join(kept)
    return words_clean

In [241]:
# Add the cleaned descriptions, then rebuild the video-only subset so that it carries the
# new 'clean_words' column: a subset sliced from df before this assignment is a separate
# frame and would not have the column.
df['clean_words'] = clean_words(df['Description'])
df_vid = df[df['Type'] == 'Video'].copy()
df.head()

Unnamed: 0,Engagements,Followers,Created,Type,Description,clean_words
0,502093,36984682,2019-05-21 23:30:51 EDT,Video,The @raptors bench trio of @sergeibaka @norman...,raptors bench trio sergeibaka normanpowell4 fr...
1,603380,36984682,2019-05-21 22:53:33 EDT,Video,@kyle_lowry7 pulls from deep for the @raptors ...,kyle_lowry7 pulls deep raptors 4th tnt
2,603380,36984682,2019-05-21 22:19:58 EDT,Video,@k_mid22 with some english on the @bucks dime!,k_mid22 english bucks dime
3,725100,36984682,2019-05-21 22:02:41 EDT,Video,Kawhi punches it home with the left on TNT!,kawhi punches home left tnt
4,661446,36984682,2019-05-21 20:47:49 EDT,Video,@giannis_an34 goes baseline early to rock the ...,giannis_an34 goes baseline early rock rim tnt


In [242]:
# calculate MAPE for model predictions
def calculate_mape(true, predicted):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Computes ``100 * mean(|(t - p) / t|)`` over positionally paired values.

    Parameters
    ----------
    true : array-like of numbers
        Actual values; none may be zero because each error is divided by it.
    predicted : array-like of numbers
        Predicted values, paired with ``true`` by position.

    Returns
    -------
    float
        The MAPE as a percentage (10.0 means 10%).

    Raises
    ------
    ValueError
        If the inputs differ in shape, or if any actual value is zero.
    """
    actual = np.asarray(true, dtype=float)
    estimate = np.asarray(predicted, dtype=float)
    # Fail loudly on mismatched inputs rather than scoring only the overlapping prefix.
    if actual.shape != estimate.shape:
        raise ValueError(
            'true and predicted must have the same shape, got {} and {}'.format(
                actual.shape, estimate.shape))
    # A zero actual makes the percentage error undefined (division by zero).
    if np.any(actual == 0):
        raise ValueError('MAPE is undefined when a true value is 0')
    return 100 * np.mean(np.abs((actual - estimate) / actual))

In [243]:
# model 1: unigrams only (ngram_range=(1, 1)), min_df = 1, max_features = 100
# NOTE: MultinomialNB is a classifier, so every distinct Engagements value is treated as
# its own class label; predictions are always one of the values seen in training.
# Fixed seed: without it every run draws a different 75/25 split, so the printed MAPE is
# not reproducible. Use the same seed for every model being compared so they are scored
# on identical test rows.
X_train, X_test, y_train, y_test = train_test_split(
    df_vid['clean_words'], df_vid['Engagements'], test_size=0.25, random_state=42)
nb_model = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 1), min_df=1, max_features=100)),
    #('feat', feature_selection.SelectKBest(k=20)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
nb_model.fit(X_train, y_train)
preds = nb_model.predict(X_test)
print(calculate_mape(y_test, preds))

10.982961621561275


In [244]:
# model 2: unigrams + bigrams (ngram_range=(1, 2)), min_df = 1, max_features = 100
# NOTE: MultinomialNB is a classifier, so every distinct Engagements value is treated as
# its own class label; predictions are always one of the values seen in training.
vect = CountVectorizer(ngram_range=(1, 2), min_df=1, max_features=100)
clf = MultinomialNB()
# Fixed seed: without it every run draws a different 75/25 split, so the printed MAPE is
# not reproducible. Use the same seed for every model being compared so they are scored
# on identical test rows.
X_train, X_test, y_train, y_test = train_test_split(
    df_vid['clean_words'], df_vid['Engagements'], test_size=0.25, random_state=42)
nb_model = Pipeline([
    ('vect', vect),
    #('feat', feature_selection.SelectKBest(k=20)),
    ('tfidf', TfidfTransformer()),
    ('clf', clf),
])
nb_model.fit(X_train, y_train)
preds = nb_model.predict(X_test)
print(calculate_mape(y_test, preds))

10.538381271725594


Takeaways: the models here are not an improvement over the baseline linear regression model in nba_instagram_models.R. There is promise, though: I didn't tinker with them much, and they are only slightly worse than the baseline. There is potential to combine the two approaches somehow (or perhaps use the prediction from this model as an X variable in the baseline model). I haven't really debugged this model yet (what are the most important features? What are the characteristics of the posts it misses badly on?).