# Objective
- Add VADER sentiment scores as features
- Vectorize the post text with TfidfVectorizer

In [1]:
import nltk

In [2]:
from nltk.sentiment import vader

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

In [5]:
# Load the dataset from disk and preview its first five rows.
# NOTE(review): the preview shows a leftover 'Unnamed: 0' column — presumably an
# index written out by an earlier to_csv call; confirm whether it is still needed.
df = pd.read_csv(filepath_or_buffer='./datasets/snow.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,subreddit,stringOG
0,0,1,we know that septemb 10 was world suicid preve...
1,1,1,welcom to r depress check in post a place to t...
2,2,1,i m go to the movi i m so nervous i m will lea...
3,3,1,now i can save so i can get myself out of this...
4,4,1,i alway do this i ll stay up until the wee hou...


In [6]:
# (rows, columns) of the loaded frame
df.shape

(1891, 3)

In [7]:
# Number of rows per distinct value of the `subreddit` column
df['subreddit'].value_counts()

0    969
1    922
Name: subreddit, dtype: int64

# Let's Add VADER Sentiment Analysis

In [8]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [9]:
# Instantiate the VADER sentiment analyser
analyser = SentimentIntensityAnalyzer()

In [10]:
# Score every post in `stringOG`; each cell of the new `vader` column holds the
# dict returned by polarity_scores (four scores per post).
df['vader'] = df['stringOG'].apply(analyser.polarity_scores)

In [11]:
# Preview the first five rows, now including the `vader` column
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,subreddit,stringOG,vader
0,0,1,we know that septemb 10 was world suicid preve...,"{'neg': 0.054, 'neu': 0.75, 'pos': 0.196, 'com..."
1,1,1,welcom to r depress check in post a place to t...,"{'neg': 0.062, 'neu': 0.779, 'pos': 0.159, 'co..."
2,2,1,i m go to the movi i m so nervous i m will lea...,"{'neg': 0.146, 'neu': 0.854, 'pos': 0.0, 'comp..."
3,3,1,now i can save so i can get myself out of this...,"{'neg': 0.0, 'neu': 0.669, 'pos': 0.331, 'comp..."
4,4,1,i alway do this i ll stay up until the wee hou...,"{'neg': 0.103, 'neu': 0.809, 'pos': 0.088, 'co..."


In [12]:
# Inspect the score dict stored for the row labelled 0
df.loc[0, 'vader']

{'neg': 0.054, 'neu': 0.75, 'pos': 0.196, 'compound': 0.7506}

In [13]:
# Unpack each VADER score into its own column, named after its dict key.
for score_key in ('neg', 'neu', 'pos', 'compound'):
    df[score_key] = df['vader'].map(lambda scores, key=score_key: scores.get(key))

In [14]:
# Preview the first five rows with the four unpacked score columns
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,subreddit,stringOG,vader,neg,neu,pos,compound
0,0,1,we know that septemb 10 was world suicid preve...,"{'neg': 0.054, 'neu': 0.75, 'pos': 0.196, 'com...",0.054,0.75,0.196,0.7506
1,1,1,welcom to r depress check in post a place to t...,"{'neg': 0.062, 'neu': 0.779, 'pos': 0.159, 'co...",0.062,0.779,0.159,0.9965
2,2,1,i m go to the movi i m so nervous i m will lea...,"{'neg': 0.146, 'neu': 0.854, 'pos': 0.0, 'comp...",0.146,0.854,0.0,-0.4101
3,3,1,now i can save so i can get myself out of this...,"{'neg': 0.0, 'neu': 0.669, 'pos': 0.331, 'comp...",0.0,0.669,0.331,0.836
4,4,1,i alway do this i ll stay up until the wee hou...,"{'neg': 0.103, 'neu': 0.809, 'pos': 0.088, 'co...",0.103,0.809,0.088,-0.0258


In [15]:
# Persist the frame together with the VADER columns.
# The row index is written as well (pandas' default, made explicit here), so it
# comes back as an extra unnamed first column when the file is re-read.
df.to_csv('./datasets/df_vader.csv', index=True)

# Train Test Split

In [16]:
# Stage one of the split: pick the predictors (raw text plus the four VADER
# scores) and the target column.
features = ['stringOG', 'neg', 'neu', 'pos', 'compound']
X, y = df[features], df['subreddit']

In [17]:
# Stratified hold-out split with a fixed seed; no test_size is passed, so
# scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [18]:
# Preview the training features; show only the first rows instead of rendering
# the whole frame.
X_train.head()

Unnamed: 0,stringOG,neg,neu,pos,compound
232,late i ve been dread ani event whether it be h...,0.182,0.659,0.159,-0.5346
441,there is someth veri wrong with me deep insid ...,0.172,0.699,0.129,-0.8987
1469,i have had a lot of issu with therapi througho...,0.074,0.895,0.031,-0.7472
1672,i was so nervous and anxious befor that appoin...,0.144,0.727,0.129,-0.3008
446,i have a life that on the surfac mani would en...,0.090,0.749,0.161,0.9925
1133,hello there go to be a tournament in pari in d...,0.103,0.873,0.024,-0.8462
759,so i was on my xbox with my friend die laugh a...,0.168,0.683,0.149,-0.3774
1516,basic in my french exam last week i had a mini...,0.155,0.819,0.025,-0.9186
964,hi everyon excus my format error yesterday i w...,0.063,0.898,0.039,-0.0644
23,even tho now ive seen a psychiatrist i still c...,0.235,0.689,0.076,-0.9338


# TF-IDF on the Text Column

In [19]:
# Pull the raw text out of each split on its own: only `stringOG` goes through
# TfidfVectorizer; the VADER scores are appended to the result afterwards.
X_train_string, X_test_string = X_train['stringOG'], X_test['stringOG']

In [20]:
# Fit the vectorizer on the training text only (default settings)
tfidf = TfidfVectorizer()
tfidf.fit(raw_documents=X_train_string)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [21]:
# Transform both splits with the vectorizer fitted on the training text
train_transform, test_transform = (
    tfidf.transform(texts) for texts in (X_train_string, X_test_string)
)

In [22]:
# Wrap the TF-IDF matrices in dense DataFrames, one column per vocabulary token.
# No index is passed, so both frames get a fresh 0..n-1 row index.
#
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out()`. Use the new accessor when it exists and
# fall back to the old one on older releases, so the cell runs on either.
if hasattr(tfidf, 'get_feature_names_out'):
    tfidf_columns = tfidf.get_feature_names_out()
else:
    tfidf_columns = tfidf.get_feature_names()

train_transform_df = pd.DataFrame(train_transform.toarray(), columns=tfidf_columns)
test_transform_df = pd.DataFrame(test_transform.toarray(), columns=tfidf_columns)

In [23]:
# (rows, TF-IDF columns) of the dense training frame
train_transform_df.shape

(1418, 6752)

In [24]:
# Preview the first five rows of the dense TF-IDF training frame
train_transform_df.head(n=5)

Unnamed: 0,00,000,000000001,00am,00pm,03,05,06,07,10,...,yuck,yummi,zap,zeebo,zero,zinn,zoloft,zombi,zone,zx
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Join the four VADER scores onto the TF-IDF training features, column-wise.
# X_train's index is reset (and dropped) so the scores get a fresh 0..n-1 index
# before being concatenated with train_transform_df.
# NOTE: the name `vader` is also bound by `from nltk.sentiment import vader` in an
# earlier cell; this assignment rebinds it to the list below.
vader = ['neg', 'neu', 'pos', 'compound']
train_vader_scores = X_train[vader].reset_index(drop=True)
X_train_df = pd.concat([train_transform_df, train_vader_scores], axis='columns')

In [26]:
# Total number of missing cells after the join
X_train_df.isna().values.sum()

0

In [27]:
# Preview the first five rows of the combined training features
X_train_df.head(n=5)

Unnamed: 0,00,000,000000001,00am,00pm,03,05,06,07,10,...,zero,zinn,zoloft,zombi,zone,zx,neg,neu,pos,compound
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.182,0.659,0.159,-0.5346
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.172,0.699,0.129,-0.8987
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.074,0.895,0.031,-0.7472
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.144,0.727,0.129,-0.3008
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.09,0.749,0.161,0.9925


In [28]:
# Join the VADER scores onto the TF-IDF test features, column-wise.
# X_test's index is reset (and dropped) so the scores get a fresh 0..n-1 index
# before being concatenated with test_transform_df.
test_vader_scores = X_test[vader].reset_index(drop=True)
X_test_df = pd.concat([test_transform_df, test_vader_scores], axis='columns')

In [29]:
# Preview the first five rows of the combined test features
X_test_df.head(n=5)

Unnamed: 0,00,000,000000001,00am,00pm,03,05,06,07,10,...,zero,zinn,zoloft,zombi,zone,zx,neg,neu,pos,compound
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.133,0.718,0.149,0.3031
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.163,0.748,0.089,-0.9936
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.133,0.681,0.186,0.7812
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.179585,0.0,0.0,0.0,0.169,0.801,0.03,-0.9182
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.828,0.032,-0.8176


In [30]:
# Total number of missing cells after the join
X_test_df.isna().values.sum()

0

## Append Target Variable for both train and test data

In [31]:
# Attach the target labels as a `subreddit` column on the training frame.
# y_train's index is reset (and dropped) so the labels are assigned on a fresh
# 0..n-1 index.
# NOTE(review): if X_train_df already has a column called 'subreddit' (e.g. a
# vocabulary token of that name), this assignment overwrites it instead of
# adding a new column — worth verifying.
X_train_df['subreddit'] = y_train.reset_index(drop=True)

In [32]:
# Preview the first five rows of the training frame after adding the target
X_train_df.head(n=5)

Unnamed: 0,00,000,000000001,00am,00pm,03,05,06,07,10,...,zero,zinn,zoloft,zombi,zone,zx,neg,neu,pos,compound
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.182,0.659,0.159,-0.5346
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.172,0.699,0.129,-0.8987
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.074,0.895,0.031,-0.7472
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.144,0.727,0.129,-0.3008
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.09,0.749,0.161,0.9925


In [33]:
# Total number of missing cells after adding the target column
X_train_df.isna().values.sum()

0

In [34]:
# Attach the target labels as a `subreddit` column on the test frame.
# y_test's index is reset (and dropped) so the labels are assigned on a fresh
# 0..n-1 index.
# NOTE(review): if X_test_df already has a column called 'subreddit' (e.g. a
# vocabulary token of that name), this assignment overwrites it instead of
# adding a new column — worth verifying.
X_test_df['subreddit'] = y_test.reset_index(drop=True)

In [35]:
# Preview the first five rows of the test frame after adding the target
X_test_df.head(n=5)

Unnamed: 0,00,000,000000001,00am,00pm,03,05,06,07,10,...,zero,zinn,zoloft,zombi,zone,zx,neg,neu,pos,compound
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.133,0.718,0.149,0.3031
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.163,0.748,0.089,-0.9936
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.133,0.681,0.186,0.7812
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.179585,0.0,0.0,0.0,0.169,0.801,0.03,-0.9182
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.828,0.032,-0.8176


In [36]:
# Total number of missing cells after adding the target column
X_test_df.isna().values.sum()

0

# Let's save both frames to CSV files

In [37]:
# Write the training frame (features plus `subreddit`) to disk.
# The row index is written too (pandas' default, made explicit here), so the
# file gains an unnamed leading column.
X_train_df.to_csv('./datasets/train_snow.csv', index=True)

In [38]:
# Write the test frame (features plus `subreddit`) to disk.
# The row index is written too (pandas' default, made explicit here), so the
# file gains an unnamed leading column.
X_test_df.to_csv('./datasets/test_snow.csv', index=True)