In [1]:
import pandas as pd

## Reading in the Data

In [2]:
df = pd.read_csv('./datasets/training.csv')
df.head()

Unnamed: 0,Engagements,Followers at Posting,Created,Type,Description
0,502093,36984682,2019-05-21 23:30:51 EDT,Video,The @raptors bench trio of @sergeibaka @norman...
1,603380,36984682,2019-05-21 22:53:33 EDT,Video,@kyle_lowry7 pulls from deep for the @raptors ...
2,603380,36984682,2019-05-21 22:19:58 EDT,Video,@k_mid22 with some english on the @bucks dime!
3,725100,36984682,2019-05-21 22:02:41 EDT,Video,Kawhi punches it home with the left on TNT!
4,661446,36984682,2019-05-21 20:47:49 EDT,Video,@giannis_an34 goes baseline early to rock the ...


In [3]:
df.isnull().sum()

Engagements              0
Followers at Posting     0
Created                  0
Type                     0
Description             14
dtype: int64

In [4]:
df.shape

(7766, 5)

## Dropping 14 Instagram Posts with No Description 
- For my modeling, I want to execute NLP; therefore, I'm going to drop the 14 instagrams posts that don't have descriptions. 

In [5]:
#dropping permanently 
df.dropna(subset= ['Description'], axis = 0, inplace = True)

In [6]:
df.shape

(7752, 5)

In [7]:
df.isnull().sum().sum()

0

In [8]:
df.dtypes

Engagements              int64
Followers at Posting     int64
Created                 object
Type                    object
Description             object
dtype: object

In [9]:
#want to see the frequency of different types 
df['Type'].value_counts()

Video    5451
Photo    1588
Album     713
Name: Type, dtype: int64

## Getting Dummy Columns for Type 

In [10]:
#note how i dropped one of the dummy columns to prevent multicollinearity 
dataframe = pd.get_dummies(df, columns = ['Type'], drop_first= True )

In [11]:
dataframe.head()

Unnamed: 0,Engagements,Followers at Posting,Created,Description,Type_Photo,Type_Video
0,502093,36984682,2019-05-21 23:30:51 EDT,The @raptors bench trio of @sergeibaka @norman...,0,1
1,603380,36984682,2019-05-21 22:53:33 EDT,@kyle_lowry7 pulls from deep for the @raptors ...,0,1
2,603380,36984682,2019-05-21 22:19:58 EDT,@k_mid22 with some english on the @bucks dime!,0,1
3,725100,36984682,2019-05-21 22:02:41 EDT,Kawhi punches it home with the left on TNT!,0,1
4,661446,36984682,2019-05-21 20:47:49 EDT,@giannis_an34 goes baseline early to rock the ...,0,1


In [12]:
import seaborn as sns

In [13]:
from sklearn.model_selection import train_test_split

**The following code shows the following features I want to keep (note after the train test split, Im going to separate `Description` for both train and test in order to execute nlp to ultimately vectorize).**



**My target variable is `Engagements`**

In [14]:
features = ['Followers at Posting', 'Description', 'Type_Photo', 'Type_Video']
X = dataframe[features]
y = dataframe['Engagements']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

**Separating `Description` from other features in order to clean text data for vectorization**

In [16]:
X_train_nlp = X_train[['Description']]
X_test_nlp = X_test[['Description']]

# Cleaning Up Train Data
- At this point, I have two text dataframes (train and test)
- The following code will demonstrate my process for cleaning each dataframe. 

In [17]:
#sanity check
X_train_nlp.head()

Unnamed: 0,Description
6709,A new career-high & @pelicansnba franchise-hig...
1615,Swipe through to recap Wednesday’s seven games!
980,@dloading breaks out an array of moves!
3759,No. 2??...@giannis_an34 ELECTRIFIES The Garden...
6991,@dwighthoward takes it coast to coast for the ...


In [18]:
#sanity check
X_train_nlp['Description'][6709]

'A new career-high & @pelicansnba franchise-high 25 assists for @rajonrondo tonight!'

In [19]:
import string 

## Step 1: Removing Punctuations and Numbers

In [20]:
#viewing all punctuations 
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [21]:
#creating a list of all punctuations
remove = list(string.punctuation)

In [22]:
#adding all digits to the remove list 
remove.extend(['0','1','2','3','4','5','6','7','8','9'])

In [23]:
#creating a function that will remove all punctuations and numbers 
from string import punctuation
def strip_punctuation(s):
    return ''.join(c for c in s if c not in remove)

In [24]:
#apply the function to both test and train data 
X_train_nlp['no punctuation and numbers'] = X_train_nlp['Description'].apply(strip_punctuation)
X_test_nlp['no punctuation and numbers'] = X_test_nlp['Description'].apply(strip_punctuation)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [25]:
#sanity check
X_train_nlp.head()

Unnamed: 0,Description,no punctuation and numbers
6709,A new career-high & @pelicansnba franchise-hig...,A new careerhigh pelicansnba franchisehigh a...
1615,Swipe through to recap Wednesday’s seven games!,Swipe through to recap Wednesday’s seven games
980,@dloading breaks out an array of moves!,dloading breaks out an array of moves
3759,No. 2??...@giannis_an34 ELECTRIFIES The Garden...,No giannisan ELECTRIFIES The Garden BESTofNBA
6991,@dwighthoward takes it coast to coast for the ...,dwighthoward takes it coast to coast for the N...


In [26]:
#sanity check
X_test_nlp.head()

Unnamed: 0,Description,no punctuation and numbers
5087,?? “Ain’t no time better than this” ...listen ...,“Ain’t no time better than this” listen in to...
5603,How’d @spidadmitchell do that!?,How’d spidadmitchell do that
1626,@giannis_an34 finds @malcolmbrogdon for the @b...,giannisan finds malcolmbrogdon for the bucks t...
6832,@pushat305 in traffic!,pushat in traffic
3896,10-year-old Phoenix Johnson shows off CRAZY ha...,yearold Phoenix Johnson shows off CRAZY handle...


## Step 2: Adding Vader Polarity Scores

In [27]:
#import and instantiate vader
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [28]:
#applying vader scores to the text without punctuations and numbers 
X_train_nlp['vader'] = [analyser.polarity_scores(text) for text in X_train_nlp['no punctuation and numbers']]
X_test_nlp['vader'] = [analyser.polarity_scores(text) for text in X_test_nlp['no punctuation and numbers']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [29]:
#sanity check
X_train_nlp.head()

Unnamed: 0,Description,no punctuation and numbers,vader
6709,A new career-high & @pelicansnba franchise-hig...,A new careerhigh pelicansnba franchisehigh a...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
1615,Swipe through to recap Wednesday’s seven games!,Swipe through to recap Wednesday’s seven games,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
980,@dloading breaks out an array of moves!,dloading breaks out an array of moves,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
3759,No. 2??...@giannis_an34 ELECTRIFIES The Garden...,No giannisan ELECTRIFIES The Garden BESTofNBA,"{'neg': 0.306, 'neu': 0.694, 'pos': 0.0, 'comp..."
6991,@dwighthoward takes it coast to coast for the ...,dwighthoward takes it coast to coast for the N...,"{'neg': 0.27, 'neu': 0.73, 'pos': 0.0, 'compou..."


In [30]:
#sanity check
X_test_nlp.head()

Unnamed: 0,Description,no punctuation and numbers,vader
5087,?? “Ain’t no time better than this” ...listen ...,“Ain’t no time better than this” listen in to...,"{'neg': 0.078, 'neu': 0.642, 'pos': 0.279, 'co..."
5603,How’d @spidadmitchell do that!?,How’d spidadmitchell do that,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
1626,@giannis_an34 finds @malcolmbrogdon for the @b...,giannisan finds malcolmbrogdon for the bucks t...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
6832,@pushat305 in traffic!,pushat in traffic,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
3896,10-year-old Phoenix Johnson shows off CRAZY ha...,yearold Phoenix Johnson shows off CRAZY handle...,"{'neg': 0.18, 'neu': 0.688, 'pos': 0.132, 'com..."


## Step 3: Pull out all Vader Scores

In [31]:
#this code creates new columns of vader scores for train
X_train_nlp['neg'] = X_train_nlp['vader'].apply(lambda x: x.get('neg'))
X_train_nlp['neu'] = X_train_nlp['vader'].apply(lambda x: x.get('neu'))
X_train_nlp['pos'] = X_train_nlp['vader'].apply(lambda x: x.get('pos'))
X_train_nlp['compound'] = X_train_nlp['vader'].apply(lambda x: x.get('compound'))

In [32]:
#this code creates new columns of vader scores for test
X_test_nlp['neg'] = X_test_nlp['vader'].apply(lambda x: x.get('neg'))
X_test_nlp['neu'] = X_test_nlp['vader'].apply(lambda x: x.get('neu'))
X_test_nlp['pos'] = X_test_nlp['vader'].apply(lambda x: x.get('pos'))
X_test_nlp['compound'] = X_test_nlp['vader'].apply(lambda x: x.get('compound'))

In [33]:
#sanity check
X_train_nlp.head()

Unnamed: 0,Description,no punctuation and numbers,vader,neg,neu,pos,compound
6709,A new career-high & @pelicansnba franchise-hig...,A new careerhigh pelicansnba franchisehigh a...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0
1615,Swipe through to recap Wednesday’s seven games!,Swipe through to recap Wednesday’s seven games,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0
980,@dloading breaks out an array of moves!,dloading breaks out an array of moves,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0
3759,No. 2??...@giannis_an34 ELECTRIFIES The Garden...,No giannisan ELECTRIFIES The Garden BESTofNBA,"{'neg': 0.306, 'neu': 0.694, 'pos': 0.0, 'comp...",0.306,0.694,0.0,-0.296
6991,@dwighthoward takes it coast to coast for the ...,dwighthoward takes it coast to coast for the N...,"{'neg': 0.27, 'neu': 0.73, 'pos': 0.0, 'compou...",0.27,0.73,0.0,-0.516


In [34]:
#sanity check
X_test_nlp.head()

Unnamed: 0,Description,no punctuation and numbers,vader,neg,neu,pos,compound
5087,?? “Ain’t no time better than this” ...listen ...,“Ain’t no time better than this” listen in to...,"{'neg': 0.078, 'neu': 0.642, 'pos': 0.279, 'co...",0.078,0.642,0.279,0.7672
5603,How’d @spidadmitchell do that!?,How’d spidadmitchell do that,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0
1626,@giannis_an34 finds @malcolmbrogdon for the @b...,giannisan finds malcolmbrogdon for the bucks t...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0
6832,@pushat305 in traffic!,pushat in traffic,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0
3896,10-year-old Phoenix Johnson shows off CRAZY ha...,yearold Phoenix Johnson shows off CRAZY handle...,"{'neg': 0.18, 'neu': 0.688, 'pos': 0.132, 'com...",0.18,0.688,0.132,-0.2103


## Step 4: Tokenize then Lemmatize 
- I decided to proceed with lemmatizing since it spits out the base or dictionary form of the word. 

In [35]:
import nltk

In [36]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

In [37]:
#instntiate tokenizer
tokenizer = RegexpTokenizer(r'\w+')
lem = WordNetLemmatizer()

In [38]:
#creating a lemmatize function that tokenizes then lemmatize the test
def lemmatize(text):
    return [lem.lemmatize(w.lower()) for w in tokenizer.tokenize(text)]  



In [39]:
#let's apply that function to both train and test 
X_train_nlp['lemmatized'] = X_train_nlp['no punctuation and numbers'].apply(lemmatize)
X_test_nlp['lemmatized'] = X_test_nlp['no punctuation and numbers'].apply(lemmatize)

In [40]:
#sanity check
X_train_nlp.head()

Unnamed: 0,Description,no punctuation and numbers,vader,neg,neu,pos,compound,lemmatized
6709,A new career-high & @pelicansnba franchise-hig...,A new careerhigh pelicansnba franchisehigh a...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,"[a, new, careerhigh, pelicansnba, franchisehig..."
1615,Swipe through to recap Wednesday’s seven games!,Swipe through to recap Wednesday’s seven games,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,"[swipe, through, to, recap, wednesday, s, seve..."
980,@dloading breaks out an array of moves!,dloading breaks out an array of moves,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,"[dloading, break, out, an, array, of, move]"
3759,No. 2??...@giannis_an34 ELECTRIFIES The Garden...,No giannisan ELECTRIFIES The Garden BESTofNBA,"{'neg': 0.306, 'neu': 0.694, 'pos': 0.0, 'comp...",0.306,0.694,0.0,-0.296,"[no, giannisan, electrifies, the, garden, best..."
6991,@dwighthoward takes it coast to coast for the ...,dwighthoward takes it coast to coast for the N...,"{'neg': 0.27, 'neu': 0.73, 'pos': 0.0, 'compou...",0.27,0.73,0.0,-0.516,"[dwighthoward, take, it, coast, to, coast, for..."


In [41]:
#sanity check
X_test_nlp.head()

Unnamed: 0,Description,no punctuation and numbers,vader,neg,neu,pos,compound,lemmatized
5087,?? “Ain’t no time better than this” ...listen ...,“Ain’t no time better than this” listen in to...,"{'neg': 0.078, 'neu': 0.642, 'pos': 0.279, 'co...",0.078,0.642,0.279,0.7672,"[ain, t, no, time, better, than, this, listen,..."
5603,How’d @spidadmitchell do that!?,How’d spidadmitchell do that,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,"[how, d, spidadmitchell, do, that]"
1626,@giannis_an34 finds @malcolmbrogdon for the @b...,giannisan finds malcolmbrogdon for the bucks t...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,"[giannisan, find, malcolmbrogdon, for, the, bu..."
6832,@pushat305 in traffic!,pushat in traffic,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,"[pushat, in, traffic]"
3896,10-year-old Phoenix Johnson shows off CRAZY ha...,yearold Phoenix Johnson shows off CRAZY handle...,"{'neg': 0.18, 'neu': 0.688, 'pos': 0.132, 'com...",0.18,0.688,0.132,-0.2103,"[yearold, phoenix, johnson, show, off, crazy, ..."


**As you can see above, the lemmatize function returns a list of words separated by commas.**

**In order to vectorize properly, we must change the lemmatize column to a string without any brackets or commas.** 

### The following code will execute the transformation to string for both test and train. 
- The final string we will use to vectorize will be called `stringOG`

In [42]:

X_train_nlp['texty'] = [[' '.join(i)] for i in X_train_nlp['lemmatized']]
X_train_nlp['string'] = [str(i) for i in X_train_nlp['texty']]
X_train_nlp['stringOG'] = [i.replace('[', '').replace(']', '').replace("'", '') for i in X_train_nlp['string']]

X_test_nlp['texty'] = [[' '.join(i)] for i in X_test_nlp['lemmatized']]
X_test_nlp['string'] = [str(i) for i in X_test_nlp['texty']]
X_test_nlp['stringOG'] = [i.replace('[', '').replace(']', '').replace("'", '') for i in X_test_nlp['string']]


In [43]:
#sanity check
X_train_nlp.head()

Unnamed: 0,Description,no punctuation and numbers,vader,neg,neu,pos,compound,lemmatized,texty,string,stringOG
6709,A new career-high & @pelicansnba franchise-hig...,A new careerhigh pelicansnba franchisehigh a...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,"[a, new, careerhigh, pelicansnba, franchisehig...",[a new careerhigh pelicansnba franchisehigh as...,['a new careerhigh pelicansnba franchisehigh a...,a new careerhigh pelicansnba franchisehigh ass...
1615,Swipe through to recap Wednesday’s seven games!,Swipe through to recap Wednesday’s seven games,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,"[swipe, through, to, recap, wednesday, s, seve...",[swipe through to recap wednesday s seven game],['swipe through to recap wednesday s seven game'],swipe through to recap wednesday s seven game
980,@dloading breaks out an array of moves!,dloading breaks out an array of moves,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,"[dloading, break, out, an, array, of, move]",[dloading break out an array of move],['dloading break out an array of move'],dloading break out an array of move
3759,No. 2??...@giannis_an34 ELECTRIFIES The Garden...,No giannisan ELECTRIFIES The Garden BESTofNBA,"{'neg': 0.306, 'neu': 0.694, 'pos': 0.0, 'comp...",0.306,0.694,0.0,-0.296,"[no, giannisan, electrifies, the, garden, best...",[no giannisan electrifies the garden bestofnba],['no giannisan electrifies the garden bestofnba'],no giannisan electrifies the garden bestofnba
6991,@dwighthoward takes it coast to coast for the ...,dwighthoward takes it coast to coast for the N...,"{'neg': 0.27, 'neu': 0.73, 'pos': 0.0, 'compou...",0.27,0.73,0.0,-0.516,"[dwighthoward, take, it, coast, to, coast, for...",[dwighthoward take it coast to coast for the n...,['dwighthoward take it coast to coast for the ...,dwighthoward take it coast to coast for the nb...


In [44]:
#sanity check
X_test_nlp.head()

Unnamed: 0,Description,no punctuation and numbers,vader,neg,neu,pos,compound,lemmatized,texty,string,stringOG
5087,?? “Ain’t no time better than this” ...listen ...,“Ain’t no time better than this” listen in to...,"{'neg': 0.078, 'neu': 0.642, 'pos': 0.279, 'co...",0.078,0.642,0.279,0.7672,"[ain, t, no, time, better, than, this, listen,...",[ain t no time better than this listen in to t...,['ain t no time better than this listen in to ...,ain t no time better than this listen in to th...
5603,How’d @spidadmitchell do that!?,How’d spidadmitchell do that,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,"[how, d, spidadmitchell, do, that]",[how d spidadmitchell do that],['how d spidadmitchell do that'],how d spidadmitchell do that
1626,@giannis_an34 finds @malcolmbrogdon for the @b...,giannisan finds malcolmbrogdon for the bucks t...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,"[giannisan, find, malcolmbrogdon, for, the, bu...",[giannisan find malcolmbrogdon for the buck tr...,['giannisan find malcolmbrogdon for the buck t...,giannisan find malcolmbrogdon for the buck tri...
6832,@pushat305 in traffic!,pushat in traffic,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,"[pushat, in, traffic]",[pushat in traffic],['pushat in traffic'],pushat in traffic
3896,10-year-old Phoenix Johnson shows off CRAZY ha...,yearold Phoenix Johnson shows off CRAZY handle...,"{'neg': 0.18, 'neu': 0.688, 'pos': 0.132, 'com...",0.18,0.688,0.132,-0.2103,"[yearold, phoenix, johnson, show, off, crazy, ...",[yearold phoenix johnson show off crazy handle...,['yearold phoenix johnson show off crazy handl...,yearold phoenix johnson show off crazy handle ...


## Step 5: Vectorizer Time 

Let's only get `stringOG` from both test and train in order to vectorize. 

**Note:** I decided to use tfidf vectorizer and remove english stopwords

In [45]:

X_train_vec = X_train_nlp['stringOG']
X_test_vec = X_test_nlp['stringOG']

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [47]:
#instantiate then fit tfidf
tfidf = TfidfVectorizer(stop_words= 'english')
tfidf.fit(X_train_vec)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

## Transformation Time!
- Note how I transformed my test data based on .fit(train) in order to prevent data leakage from occuring 

In [48]:
# transoforming 
train_transform = tfidf.transform(X_train_vec)
test_transform = tfidf.transform(X_test_vec)

In [49]:
#creating df of transformed data
train_transform_df = pd.DataFrame(train_transform.toarray(),
                   columns= tfidf.get_feature_names())

test_transform_df = pd.DataFrame(test_transform.toarray(),
                   columns= tfidf.get_feature_names())

In [50]:
#sanity check
train_transform_df.shape

(5193, 3400)

## We finally made our train and test!
- Now let's append all the important features together
- The main features for both train and test are: vectorized df, vader sentiment scores, `Followers at Posting`, `Type_Photo`, and `Type_Video`. 

In [51]:
vader = ['neg', 'neu', 'pos', 'compound']
X_train_df = pd.concat(objs = [train_transform_df,
                       X_train_nlp.reset_index()[vader],
                       X_train.reset_index()[['Followers at Posting', 'Type_Photo', 'Type_Video']]], 
                       axis = 1)

In [52]:
#sanity check
X_train_df.isnull().sum().sum()

0

In [53]:
vader = ['neg', 'neu', 'pos', 'compound']
X_test_df = pd.concat(objs = [test_transform_df,
                       X_test_nlp.reset_index()[vader],
                       X_test.reset_index()[['Followers at Posting', 'Type_Photo', 'Type_Video']]], 
                       axis = 1)

In [57]:
X_train_df.index.is_monotonic

True

In [54]:
#sanity check
X_test_df.isnull().sum().sum()

0

# Notice!
- At this point, I duplicated my notebook (called `ApplyingTEST`) in order to transform my holdout data the exact same way as the train and test data. 
- The holdout data should have 3407 columns (just like train and test). 
- Eventually after I find the best model, I will bring in the holdout data in order to predict `Engagements`

# Modeling Time!
- We finally have our dataframes for train and test
- The following is a list of models I will perform on my train and test data

## Attempting to model using multiple regressions
- Linear Regression
- Lasso
- Ridge
- Elastic
- Random Forest
- Gradient Booster
- Bernoulli
- Gaussian 
- SVR

In [55]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import cross_val_score

In [56]:
lr = LinearRegression()
lasso = Lasso()
ridge = Ridge()
elastic = ElasticNet()

**I am computing the cross val scores of each model to in order to pick the best performing models to ultimately perform a GridSearch.**

In [57]:
print(cross_val_score(lr, X_train_df, y_train, cv = 5).mean()) #linear regression
print(cross_val_score(lasso, X_train_df, y_train, cv = 5).mean()) #lasso
print(cross_val_score(ridge, X_train_df, y_train, cv = 5).mean()) #ridge
print(cross_val_score(elastic, X_train_df, y_train, cv = 5).mean()) #elastic

-12786610.161508564




0.8612494302840688


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number3.093155e-18
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number3.062102e-18
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number3.084544e-18
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number3.133011e-18
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number3.019713e-18
  overwrite_a=True).T


0.8929670880868088
0.4524421300197375


In [67]:
#sanity check
X_test_df.shape

(2559, 3407)

In [58]:
#sanity check
X_train_df.head()

Unnamed: 0,aarontaosunitedmasters,abc,abcespn,ability,able,academy,acfresh,achievement,acknowledges,acl,...,zhuri,zmane,zo,neg,neu,pos,compound,Followers at Posting,Type_Photo,Type_Video
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,26083495,0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,34492470,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,35454842,0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.306,0.694,0.0,-0.296,30442523,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.27,0.73,0.0,-0.516,25826105,0,1


In [59]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.naive_bayes import BernoulliNB, GaussianNB

In [62]:
rf = RandomForestRegressor()
gb = GradientBoostingRegressor()
b = BernoulliNB()
g = GaussianNB()

In [63]:
print(cross_val_score(rf, X_train_df, y_train, cv = 5).mean())  #random forest
print(cross_val_score(gb, X_train_df, y_train, cv = 5).mean())   #gradient boost
print(cross_val_score(b, X_train_df, y_train, cv = 5).mean())  #bernoulli
print(cross_val_score(g, X_train_df, y_train, cv = 5).mean())   #gaussian 



0.9028712934507697
0.8833657286643053




0.03139350042523398




0.6240993283219586


In [64]:
from sklearn.svm import SVR

In [65]:
svr = SVR()

In [66]:
cross_val_score(svr, X_train_df, y_train, cv = 5).mean() #support vector regression 



-0.0738795111886684

## The following models that I picked based on the cross val score are: 
|Model|Cross Val Score|
|--|--|
|Lasso|.8612|
|Ridge|.8929|
|Random Forest|.9028|
|Gradient Boosting|.8833|

In [68]:
l = Lasso()
r = Ridge()
rf = RandomForestRegressor()
gb = GradientBoostingRegressor()

## Since this competition is based on the MAPE metric, I will create a function that computes MAPE

In [69]:
#creating a mape function 
import numpy as np

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

## The following tests are modeled based on the default parameters

### Lasso

In [70]:
l.fit(X_train_df, y_train)
print(l.score(X_train_df, y_train))
print(l.score(X_test_df, y_test))

0.9638518751396787
0.8736641179300437


In [71]:
l_pred = l.predict(X_test_df)
#.predict spits out pandas series 

In [72]:
mean_absolute_percentage_error(y_test, l_pred)

8.74918008539785

### Ridge

In [73]:
r.fit(X_train_df, y_train)
print(r.score(X_train_df, y_train))
print(r.score(X_test_df, y_test))

0.938406705501203
0.8949144143922124


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number2.593397e-18
  overwrite_a=True).T


In [74]:
r_pred = r.predict(X_test_df)

In [76]:
mean_absolute_percentage_error(y_test, r_pred)

8.08113255732593

### Random Forest 

In [77]:
rf.fit(X_train_df, y_train)
print(rf.score(X_train_df, y_train))
print(rf.score(X_test_df, y_test))



0.9834637964981385
0.9074461074815124


In [78]:
rf_pred = rf.predict(X_test_df)

In [79]:
mean_absolute_percentage_error(y_test, rf_pred)

7.292803202696748

### Gradient Boosting

In [80]:
gb.fit(X_train_df, y_train)
print(gb.score(X_train_df, y_train))
print(gb.score(X_test_df, y_test))

0.8982712487876601
0.8829584577782535


In [81]:
gb_pred = gb.predict(X_test_df)

In [82]:
mean_absolute_percentage_error(y_test, gb_pred)

8.719080657753869

**The following table shows each models train, test, and MAPE score (based on default parameters)**

|Model|Train Score|Test Score|Mape Score|
|--|--|--|--|
|Lasso|.9638|.8736|8.749|
|Ridge|.9384|.8949|8.08|
|Random Forest|.983|.907|**7.292**|
|Gradient Boosting|.898|.882|8.719|

**Note:** My best MAPE score is 7.292

## The following code will perform GridSearch Pipeline to determine if we can get a better MAPE score. 
- Note: I didn't GridSearch Gradient Boosting since it took to long to execute. Therefore, I added ElasticNet for GridSearching. 

**The following models I performed on GridSearch are:**
- Random Forest
- Lasso
- Ridge 
- ELastic Net 

## GridSearch Pipeline Random Forest

In [84]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [86]:
pipe = Pipeline([
    ('rf', RandomForestRegressor())
])
pipe_params = {
    'rf__n_estimators': [50, 200, 500],
    'rf__max_depth': [None, 3, 6],
    'rf__max_features': ['auto', None]
    #can add C parameter for logisitic regression
}


gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)


gs.fit(X_train_df, y_train)


print(gs.best_score_)
gs.best_params_

0.90653167433752


{'rf__max_depth': None, 'rf__max_features': 'auto', 'rf__n_estimators': 500}

In [87]:
gs.score(X_train_df, y_train)

0.9880105979019126

In [88]:
gs.score(X_test_df, y_test)

0.9127106312293013

In [118]:
rf_pred = gs.predict(X_test_df)

In [119]:
mean_absolute_percentage_error(y_test, rf_pred)

7.0428018597336255

## GridSearch Pipeline LASSO

In [91]:
pipe = Pipeline([
    ('l', Lasso())
])
pipe_params = {
    'l__alpha': [0.2, 0.5, 1],
    'l__max_iter': [200, 500, 1200],
    #'rf__max_features': ['auto', None]
    #can add C parameter for logisitic regression
}


gs2 = GridSearchCV(pipe, param_grid=pipe_params, cv=3)


gs2.fit(X_train_df, y_train)


print(gs2.best_score_)
gs2.best_params_



0.8478451658566838




{'l__alpha': 1, 'l__max_iter': 200}

In [92]:
gs2.score(X_train_df, y_train)

0.963851872830397

In [93]:
gs2.score(X_test_df, y_test)

0.873623677335399

In [94]:
lasso_pred = gs2.predict(X_test_df)

In [95]:

mean_absolute_percentage_error(y_test, lasso_pred)

8.749733472602898

## GridSearch Pipeline Ridge 

In [96]:
pipe = Pipeline([
    ('r', Ridge())
])
pipe_params = {
    'r__alpha': [0.2, 0.5, 1],
    'r__max_iter': [200, 500, 1200],
    #'rf__max_features': ['auto', None]
    #can add C parameter for logisitic regression
}


gs3 = GridSearchCV(pipe, param_grid=pipe_params, cv=3)


gs3.fit(X_train_df, y_train)


print(gs3.best_score_)
gs3.best_params_

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number8.839340e-19
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number8.915988e-19
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number9.000149e-19
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number8.839340e-19
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number8.915988e-19
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number9.000149e-19
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number8.839340e-19
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not g

0.8898703452847614


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number2.593397e-18
  overwrite_a=True).T


{'r__alpha': 1, 'r__max_iter': 200}

In [97]:
gs3.score(X_train_df, y_train)

0.938406705501203

In [98]:
gs3.score(X_test_df, y_test)

0.8949144143922124

In [99]:
ridge_pred = gs3.predict(X_test_df)
mean_absolute_percentage_error(y_test, ridge_pred)

8.08113255732593

## GridSearch ElasticNet


In [114]:
pipe = Pipeline([
    ('e', ElasticNet())
])
pipe_params = {
    #'gb__learning_rate': [0.1],
    'e__l1_ratio': [0.9, 0.95, 1],
    #'gb__n_estimators': [100, 200, 500]
    #can add C parameter for logisitic regression
}


gs4 = GridSearchCV(pipe, param_grid=pipe_params, cv=3, n_jobs = 3)


gs4.fit(X_train_df, y_train)


print(gs4.best_score_)
gs4.best_params_

0.8473055069722082


{'e__l1_ratio': 1}

In [115]:
gs4.score(X_train_df, y_train)

0.9638518751396787

In [116]:
gs4.score(X_test_df, y_test)

0.8736641179300437

In [117]:
e_pred = gs4.predict(X_test_df)
mean_absolute_percentage_error(y_test, e_pred)

8.74918008539785

## Conclusion
- Random Forest had the best overall train, test, and MAPE score when GridSearching for optimal paramters. 

Here is a table of our results when GridSearching for optimal parameters:

|Model|Train Score|Test Score|Mape Score|GridSearch Name|
|--|--|--|--|--|
|Lasso|.9638|.8736|8.749|gs2|
|Ridge|.9384|.8949|8.08|gs3|
|Random Forest|.988|.9127|**7.0428**|gs|
|Elastic Net|.963|.873|8.749|gs4|

**Let's bring in our test data to predict the values engagement using gs.predict. Note how the random forest is the first gridsearch; therefore, we will use gs.**

In [122]:
#bringing in our holdout data
holdout = pd.read_csv('./datasets/X_holdout_df.csv').drop(columns = 'Unnamed: 0')
holdout.head()

Unnamed: 0,aarontaosunitedmasters,abc,abcespn,ability,able,academy,acfresh,achievement,acknowledges,acl,...,zhuri,zmane,zo,neg,neu,pos,compound,Followers at Posting,Type_Photo,Type_Video
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,36984682,1,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,36984682,1,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.729,0.271,0.3818,36984682,0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,36955156,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,36955156,0,1


In [123]:
holdout.shape

(1000, 3407)

In [124]:
engagement_pred = gs.predict(holdout)

In [125]:
answer = pd.DataFrame({
    'engagements': engagement_pred
})

In [126]:
answer.head()

Unnamed: 0,engagements
0,397319.322
1,394746.192
2,657470.742
3,596612.56
4,612162.51


In [127]:
answer.shape

(1000, 1)

In [129]:
answer.join(holdout).head()

Unnamed: 0,engagements,aarontaosunitedmasters,abc,abcespn,ability,able,academy,acfresh,achievement,acknowledges,...,zhuri,zmane,zo,neg,neu,pos,compound,Followers at Posting,Type_Photo,Type_Video
0,397319.322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,36984682,1,0
1,394746.192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,36984682,1,0
2,657470.742,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.729,0.271,0.3818,36984682,0,1
3,596612.56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,36955156,0,1
4,612162.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,36955156,0,1


In [131]:
holdout_set_erinhwang = answer.join(holdout)
holdout_set_erinhwang.head()

Unnamed: 0,engagements,aarontaosunitedmasters,abc,abcespn,ability,able,academy,acfresh,achievement,acknowledges,...,zhuri,zmane,zo,neg,neu,pos,compound,Followers at Posting,Type_Photo,Type_Video
0,397319.322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,36984682,1,0
1,394746.192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,36984682,1,0
2,657470.742,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.729,0.271,0.3818,36984682,0,1
3,596612.56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,36955156,0,1
4,612162.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,36955156,0,1


In [132]:
holdout_set_erinhwang.shape

(1000, 3408)

In [134]:
holdout_set_erinhwang.isnull().sum().sum()

0

In [133]:
holdout_set_erinhwang.to_csv('./datasets/holdout_set_erinhwang.csv')