In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydotplus
import seaborn as sns
from PIL import Image
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud

%matplotlib inline

# Import `CSV` that was scraped earlier

In [None]:
# Read the previously scraped CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='scraped_2018-09-05 15:43:57.539375')

In [None]:
# Peek at the last five rows of the loaded data.
df.tail(n=5)

In [None]:
# (rows, columns) of the loaded DataFrame.
df.shape

# Change `subreddit` categories to numerical values
> `The_Donald == 1`, every other subreddit (`politics`) `== 0`

In [None]:
# Encode the target: 1 for 'The_Donald', 0 for every other subreddit.
# Guarded so that re-running the cell is a no-op: once the column is numeric,
# comparing it to the string label again would silently turn every row into 0.
if not pd.api.types.is_numeric_dtype(df['subreddit']):
    df['subreddit'] = (df['subreddit'] == 'The_Donald').astype(int)

In [None]:
# Inspect the single row at integer position 1987.
df.iloc[1987]

# Train / Test Split

In [None]:
# Features are the post titles; the target is the subreddit column.
X, y = df['title'], df['subreddit']

In [None]:
# Stratify on y so both splits keep the same class balance, and pin the seed
# so the split (and every score computed from it) is reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [None]:
# Relative class frequencies in the training labels.
y_train.value_counts(normalize=True)

In [None]:
# Relative class frequencies in the test labels.
y_test.value_counts(normalize=True)

In [None]:
# Number of training samples.
y_train.shape

# Create Word Vectors for Model and EDA

In [None]:
# Bag-of-words vectorizer: English stop words are removed and a term must
# appear in at least 3 documents to enter the vocabulary.
cv = CountVectorizer(stop_words='english', min_df=3)

# Learn the vocabulary from the training titles only.
cv.fit(X_train)

In [None]:
# Encode the training titles as a document-term count matrix using the
# vocabulary fitted on X_train.
X_train_cv = cv.transform(X_train)

In [None]:
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; prefer the new name and fall back on old installs.
vocab = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)

# `.toarray()` yields a plain ndarray (`.todense()` returns an np.matrix).
train_set = pd.DataFrame(X_train_cv.toarray(), columns=vocab)

In [None]:
# Preview the first rows of the word-count frame.
train_set.head()

In [None]:
# Attach the labels; `.values` drops y_train's index so the labels are assigned
# by position rather than aligned on index labels.
train_set['is_donald'] = y_train.values

In [None]:
# (rows, columns) of the word-count frame, including the `is_donald` column.
train_set.shape

# Initial Model, Check Score Against Baseline

In [None]:
# Count-vectorized titles fed straight into a multinomial naive Bayes model.
pipe = Pipeline(steps=[
    ('cv', CountVectorizer(min_df=3, stop_words='english')),
    ('nb', MultinomialNB()),
])
pipe.fit(X_train, y_train)

# (train score, test score)
(pipe.score(X_train, y_train), pipe.score(X_test, y_test))

In [None]:
# Word counts plus two hand-built features, classified with naive Bayes.
# NOTE(review): AverageWordLength and CountHashes are not defined in the
# visible part of this notebook - confirm they are defined before this cell.
pipe = Pipeline(steps=[
    ('fu', FeatureUnion(transformer_list=[
        ('cv', CountVectorizer()),
        ('awl', AverageWordLength()),
        ('ch', CountHashes()),
    ])),
    ('nb', MultinomialNB()),
])
pipe.fit(X_train, y_train)

# (train score, test score)
(pipe.score(X_train, y_train), pipe.score(X_test, y_test))

# include compound score from VADER in pipe

In [None]:
class CompoundScore(BaseEstimator, TransformerMixin):
    """Transformer that yields VADER's ``compound`` score for each document.

    ``fit`` learns nothing; ``transform`` returns a single feature column.
    """

    # One analyzer shared by all instances, created when the class is defined.
    sia = SentimentIntensityAnalyzer()

    def __init__(self):
        pass

    def get_compound(self, line):
        """Return the ``compound`` polarity score of the text ``line``."""
        # Go through ``self`` so the class-level analyzer is used; the bare
        # name ``sia`` only resolves if a global ``sia`` happens to exist.
        return self.sia.polarity_scores(line)['compound']

    def transform(self, df):
        """Score every entry of ``df`` and return an ``(n, 1)`` NumPy array.

        ``df`` must provide ``.apply`` (e.g. a pandas Series of strings).
        """
        return np.array(df.apply(self.get_compound)).reshape(-1, 1)

    def fit(self, df, y=None):
        """No-op fit; returns ``self``."""
        return self

In [None]:
# Word counts, hand-built features and the VADER compound score, classified
# with logistic regression. The final step is named 'lr' (it was mislabelled
# 'nb' although it holds a LogisticRegression).
pipe = Pipeline([
    ('fu', FeatureUnion([
        ('cv', CountVectorizer()),
        ('awl', AverageWordLength()),
        ('ch', CountHashes()),
        ('cs', CompoundScore())
    ])),
    ('lr', LogisticRegression())
])

pipe.fit(X_train, y_train)

# (train score, test score)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

# add neg score from VADER to pipe

In [None]:
class NegScore(BaseEstimator, TransformerMixin):
    """Transformer that yields VADER's ``neg`` score for each document.

    ``fit`` learns nothing; ``transform`` returns a single feature column.
    """

    # One analyzer shared by all instances, created when the class is defined.
    sia = SentimentIntensityAnalyzer()

    def __init__(self):
        pass

    def get_neg(self, line):
        """Return the ``neg`` polarity score of the text ``line``."""
        # Go through ``self`` so the class-level analyzer is used; the bare
        # name ``sia`` only resolves if a global ``sia`` happens to exist.
        return self.sia.polarity_scores(line)['neg']

    def transform(self, df):
        """Score every entry of ``df`` and return an ``(n, 1)`` NumPy array.

        ``df`` must provide ``.apply`` (e.g. a pandas Series of strings).
        """
        return np.array(df.apply(self.get_neg)).reshape(-1, 1)

    def fit(self, df, y=None):
        """No-op fit; returns ``self``."""
        return self

In [None]:
# Previous feature set plus the VADER negative score, classified with
# logistic regression. The final step is named 'lr' (it was mislabelled 'nb'
# although it holds a LogisticRegression).
pipe = Pipeline([
    ('fu', FeatureUnion([
        ('cv', CountVectorizer()),
        ('awl', AverageWordLength()),
        ('ch', CountHashes()),
        ('cs', CompoundScore()),
        ('ns', NegScore())
    ])),
    ('lr', LogisticRegression())
])

pipe.fit(X_train, y_train)

# (train score, test score)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

# add pos score from VADER to pipe

In [None]:
class PosScore(BaseEstimator, TransformerMixin):
    """Transformer that yields VADER's ``pos`` score for each document.

    ``fit`` learns nothing; ``transform`` returns a single feature column.
    """

    # One analyzer shared by all instances, created when the class is defined.
    sia = SentimentIntensityAnalyzer()

    def __init__(self):
        pass

    def get_pos(self, line):
        """Return the ``pos`` polarity score of the text ``line``."""
        # Go through ``self`` so the class-level analyzer is used; the bare
        # name ``sia`` only resolves if a global ``sia`` happens to exist.
        return self.sia.polarity_scores(line)['pos']

    def transform(self, df):
        """Score every entry of ``df`` and return an ``(n, 1)`` NumPy array.

        ``df`` must provide ``.apply`` (e.g. a pandas Series of strings).
        """
        return np.array(df.apply(self.get_pos)).reshape(-1, 1)

    def fit(self, df, y=None):
        """No-op fit; returns ``self``."""
        return self

In [None]:
# Previous feature set plus the VADER positive score, classified with
# logistic regression. The final step is named 'lr' (it was mislabelled 'nb'
# although it holds a LogisticRegression).
pipe = Pipeline([
    ('fu', FeatureUnion([
        ('cv', CountVectorizer()),
        ('awl', AverageWordLength()),
        ('ch', CountHashes()),
        ('cs', CompoundScore()),
        ('ns', NegScore()),
        ('ps', PosScore())
    ])),
    ('lr', LogisticRegression())
])

pipe.fit(X_train, y_train)

# (train score, test score)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

# Add Neutral score from VADER to pipeline

In [None]:
class NeuScore(BaseEstimator, TransformerMixin):
    """Transformer that yields VADER's ``neu`` score for each document.

    ``fit`` learns nothing; ``transform`` returns a single feature column.
    """

    # One analyzer shared by all instances, created when the class is defined.
    sia = SentimentIntensityAnalyzer()

    def __init__(self):
        pass

    def get_neu(self, line):
        """Return the ``neu`` polarity score of the text ``line``."""
        # Go through ``self`` so the class-level analyzer is used; the bare
        # name ``sia`` only resolves if a global ``sia`` happens to exist.
        return self.sia.polarity_scores(line)['neu']

    def transform(self, df):
        """Score every entry of ``df`` and return an ``(n, 1)`` NumPy array.

        ``df`` must provide ``.apply`` (e.g. a pandas Series of strings).
        """
        return np.array(df.apply(self.get_neu)).reshape(-1, 1)

    def fit(self, df, y=None):
        """No-op fit; returns ``self``."""
        return self

In [None]:
# Full feature set (word counts, hand-built features, all four VADER scores)
# classified with cross-validated logistic regression.
pipe = Pipeline(steps=[
    ('fu', FeatureUnion(transformer_list=[
        ('cv', CountVectorizer()),
        ('awl', AverageWordLength()),
        ('ch', CountHashes()),
        ('cs', CompoundScore()),
        ('ns', NegScore()),
        ('ps', PosScore()),
        ('ne', NeuScore()),
    ])),
    ('lcv', LogisticRegressionCV()),
])
pipe.fit(X_train, y_train)

# (train score, test score)
(pipe.score(X_train, y_train), pipe.score(X_test, y_test))

# `KNN` Model 

In [None]:
# Full feature set classified with k-nearest neighbours (default settings).
pipe = Pipeline(steps=[
    ('fu', FeatureUnion(transformer_list=[
        ('cv', CountVectorizer()),
        ('awl', AverageWordLength()),
        ('ch', CountHashes()),
        ('cs', CompoundScore()),
        ('ns', NegScore()),
        ('ps', PosScore()),
        ('ne', NeuScore()),
    ])),
    ('knn', KNeighborsClassifier()),
])
pipe.fit(X_train, y_train)

# (train score, test score)
(pipe.score(X_train, y_train), pipe.score(X_test, y_test))

# Trees and Forests
> random forest

In [None]:
# Full feature set classified with a 500-tree random forest.
# `random_state` is pinned so the forest, and therefore the reported scores,
# are reproducible from run to run.
pipe = Pipeline([
    ('fu', FeatureUnion([
        ('cv', CountVectorizer()),
        ('awl', AverageWordLength()),
        ('ch', CountHashes()),
        ('cs', CompoundScore()),
        ('ns', NegScore()),
        ('ps', PosScore()),
        ('ne', NeuScore())
    ])),
    ('rf', RandomForestClassifier(n_estimators=500, random_state=42))
])

pipe.fit(X_train, y_train)

# (train score, test score)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

> decision tree

In [None]:
# Full feature set classified with a single decision tree.
# `random_state` is pinned so tie-breaking between equally good splits, and
# therefore the reported scores, are reproducible from run to run.
pipe = Pipeline([
    ('fu', FeatureUnion([
        ('cv', CountVectorizer()),
        ('awl', AverageWordLength()),
        ('ch', CountHashes()),
        ('cs', CompoundScore()),
        ('ns', NegScore()),
        ('ps', PosScore()),
        ('ne', NeuScore())
    ])),
    ('dc', DecisionTreeClassifier(random_state=42))
])

pipe.fit(X_train, y_train)

# (train score, test score)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

> extra trees

In [None]:
# Full feature set classified with extremely randomized trees.
# `random_state` is pinned so the randomized splits, and therefore the
# reported scores, are reproducible from run to run.
pipe = Pipeline([
    ('fu', FeatureUnion([
        ('cv', CountVectorizer()),
        ('awl', AverageWordLength()),
        ('ch', CountHashes()),
        ('cs', CompoundScore()),
        ('ns', NegScore()),
        ('ps', PosScore()),
        ('ne', NeuScore())
    ])),
    ('et', ExtraTreesClassifier(random_state=42))
])

pipe.fit(X_train, y_train)

# (train score, test score)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

# support vector

In [None]:
# Full feature set classified with a support vector machine (C=100).
pipe = Pipeline(steps=[
    ('fu', FeatureUnion(transformer_list=[
        ('cv', CountVectorizer()),
        ('awl', AverageWordLength()),
        ('ch', CountHashes()),
        ('cs', CompoundScore()),
        ('ns', NegScore()),
        ('ps', PosScore()),
        ('ne', NeuScore()),
    ])),
    ('sv', SVC(C=100)),
])
pipe.fit(X_train, y_train)

# (train score, test score)
(pipe.score(X_train, y_train), pipe.score(X_test, y_test))

# find the 15 most used words in the training titles (after stop-word removal) and cast to list

In [None]:
# Total each word column (label column excluded), rank the totals in
# descending order, and keep the 15 largest as a plain list of words.
top_words = (
    train_set
    .drop(columns=['is_donald'])
    .sum()
    .sort_values(ascending=False)
    .head(15)
    .index
    .tolist()
)

# stole this function from your EDA walkthrough

In [None]:
def plot_word_from_groupby(word, df=train_set, by='is_donald'):
    """Plot a horizontal bar chart of the total of column ``word`` per group.

    Parameters
    ----------
    word : str
        Name of the column in ``df`` to total; also used in the plot title.
    df : pandas.DataFrame, optional
        Frame containing the ``word`` and ``by`` columns. Defaults to the
        ``train_set`` object that existed when this function was defined.
    by : str, optional
        Column to group on.
    """
    # Select the column before summing so only that column is aggregated,
    # instead of summing every column and then discarding all but one.
    df.groupby(by)[word].sum().plot(kind='barh')
    plt.title(f'Occurences of {word.title()}')
    plt.show()

In [None]:
# One bar chart per top word, split by the `is_donald` groups.
for top_word in top_words:
    plot_word_from_groupby(word=top_word)

# Try to make a word cloud
> top words in `r/the_donald` 

In [None]:
# Keep only the rows whose subreddit label is 1.
only_don = df.loc[df['subreddit'].eq(1)]

In [None]:
# Collect the selected titles into a plain Python list.
lst = only_don['title'].tolist()

In [None]:
# Build one text blob for the word cloud by joining the titles with spaces.
# `str(lst)` would embed the list's repr (brackets, quotes, commas and escape
# sequences) in the text. The isinstance guard keeps the cell safe to re-run:
# joining an already-joined string would put a space between every character.
if not isinstance(lst, str):
    lst = ' '.join(map(str, lst))

In [None]:
# Load the mask image; the context manager closes the file handle once the
# pixels have been copied into the array.
with Image.open('./Images/trump3.jpg') as mask_image:
    wave_mask = np.array(mask_image)

# Make the figure
wordcloud = WordCloud(collocations=True, mask=wave_mask).generate(lst)
plt.figure(figsize=(9, 16))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.margins(x=0, y=0)
plt.show()

# let's see what this vader stuff is all about

In [None]:
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Module-level VADER analyzer used to score the titles.
sia = SentimentIntensityAnalyzer()

> Visualize Sentiments

In [None]:
# One record per title: the VADER polarity scores plus the title itself.
dicts = [
    {**sia.polarity_scores(post), 'title': post}
    for post in df['title']
]
df_vader = pd.DataFrame(dicts)
# Duplicate titles are deliberately kept: the `is_donald` labels are attached
# by position from df, so dropping rows here would misalign them.

In [None]:
# List the columns of the VADER score frame.
df_vader.columns

In [None]:
# Attach the labels by position; `.values` strips df's index, so this relies on
# df_vader having exactly one row per row of df, in the same order.
df_vader['is_donald'] = df['subreddit'].values

In [None]:
# Size of the "top half", derived from the data instead of a hard-coded
# row count so it stays correct if the scraped dataset changes.
half = len(df_vader) // 2

# Sum of `is_donald` among the `half` rows with the highest 'pos' score.
df_vader.sort_values('pos', ascending=False)[:half]['is_donald'].sum()

In [None]:
# Share of `is_donald == 1` among the `half` rows with the highest 'pos' score,
# computed from the data rather than from hand-typed counts.
df_vader.sort_values('pos', ascending=False)[:half]['is_donald'].mean()

> 62.55% of the top 50% most "positive" posts are from `r/the_donald`

# It seems that `r/the_donald` has a higher rate of positivity overall
> this is probably because `r/the_donald` serves as an echo chamber for staunch Trump supporters - they have less to complain about
# and `r/politics`  shows up a lot more often in the top 50% of negative sentiments
> in the exact opposite fashion of `r/the_donald`, `r/politics` has A LOT to complain about

In [None]:
# Sum of `is_donald` among the `half` rows with the highest 'neg' score.
df_vader.sort_values(by='neg', ascending=False).head(half)['is_donald'].sum()