# Are you feeling happy or sad?
#### Enter some text, an email, song lyrics, a note from your lover, or just some random thoughts you are mulling over.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import requests, time, re
import datetime as dt

from psaw import PushshiftAPI
from textblob import TextBlob

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer

from ipywidgets import widgets, interact, interact_manual, fixed
from IPython.display import display, clear_output, HTML

import warnings
warnings.filterwarnings("ignore")
pd.options.display.max_columns = 4000

In [2]:
# Bordered output widget, displayed as a divider above every submitted message
line_divider = widgets.Output(layout={'border': '1px solid black'})


def on_text_submit(incoming):
    """Classify the text submitted through a widget and print the verdict.

    incoming: the widget that fired the submit event; its ``value``
        attribute holds the text to classify.

    Relies on the notebook globals ``model`` and ``X_features`` and on the
    helpers ``reddit_prepare_features``, ``reddit_equalize_features`` and
    ``reddit_sample_score``, so the modelling cell must be run before a
    message is submitted.
    """
    display(line_divider)
    message = incoming.value
    print("Message:")
    print(message)

    # Nothing to vectorize in a blank message, so do not call the model
    if not message or not message.strip():
        print("\nResult:")
        print("Please enter some text so I can read its sentiment.")
        return

    try:
        features = reddit_prepare_features(pd.Series([message]))
    except ValueError:
        # Most likely the vectorizer found no usable words (for example the
        # message held only stop words or digits); report it rather than
        # letting the callback fail without feedback
        print("\nResult:")
        print("I could not find any meaningful words in that message. Please try a longer one.")
        return

    # Keep only the features the model was trained on, in the same order
    features = reddit_equalize_features(X_features, features)
    predictions = reddit_sample_score(model, features, showcase=True)

    print("\nResult:")
    for label in predictions:
        if label == 0:
            print("That message has a sad sentiment to it.\nConsider watching some furry animal videos while listening to 'Somewhere Over the Rainbow.'")
        else:
            print("That message has a happy sentiment to it.\nThat makes me happy as well :-)")

In [3]:
# Build the text box, attach the submit handler, then render the widget
input_text = widgets.Text()
input_text.on_submit(on_text_submit)
display(input_text)

Text(value='')

In [4]:
def reddit_prepare_features(X, lowercase=True, max_features=5000):
    """Vectorize raw text into a dataframe of unigram and bigram counts.

    X: pd.Series of text documents to turn into a feature set
    lowercase: forwarded to CountVectorizer (convert text to lowercase)
    max_features: forwarded to CountVectorizer (cap on the number of terms)

    Returns a DataFrame with one row per document and one column per
    retained term. The vectorizer is fit on ``X`` itself, so the columns
    depend on the input; use ``reddit_equalize_features`` to align the
    result with the features of an existing model.

    Spelling correction with TextBlob is intentionally left out because it
    is a compute-heavy operation.
    """
    # Word tokens exclude digits; English stop words are removed
    vectorizer = CountVectorizer(
        analyzer="word",
        token_pattern=r'\b[^\d\W]+\b',
        ngram_range=(1, 2),
        stop_words='english',
        lowercase=lowercase,
        max_features=max_features,
    )
    counts = vectorizer.fit_transform(X)

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); support both so the notebook keeps running
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()

    return pd.DataFrame(counts.toarray(), columns=feature_names)

def reddit_model_fit_score(model, X, y, verbose=False):
    """Fit the requested classifier on a train split and return it.

    model: 'lr' for LogisticRegression or 'rf' for RandomForestClassifier
    X: feature dataframe
    y: target values aligned row-for-row with X
    verbose: when True, also compute and print the cross-validated,
        training and testing accuracy. Off by default, so no scoring work
        is done when nothing would be reported.

    Returns the fitted estimator.
    Raises ValueError for an unknown model key.
    """
    # Validate first so a typo fails fast instead of returning the string
    if model not in ('lr', 'rf'):
        raise ValueError(
            "Unknown model {0!r}; expected 'lr' or 'rf'".format(model)
        )

    # Split dataset into training and testing subsets (stratified on y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

    estimator = LogisticRegression() if model == 'lr' else RandomForestClassifier()
    estimator.fit(X_train, y_train)

    if verbose:
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        score_cv_train = cross_val_score(estimator, X_train, y_train, cv=kf).mean()
        score_train = estimator.score(X_train, y_train)
        score_test = estimator.score(X_test, y_test)
        print("Using a {0} model we get the following scores:".format(type(estimator).__name__))
        print(" Accuracy (cross_validated) on training data: {0:.2f}".format(score_cv_train))
        print(" Accuracy on training data: {0:.2f}".format(score_train))
        print(" Accuracy on testing data: {0:.2f}".format(score_test))
        print("")

    return estimator

def reddit_equalize_features(model_features, new_df):
    """Align a feature dataframe with the features a model was trained on.

    model_features: list of feature (column) names used by the model
    new_df: dataframe of features computed for new data

    Returns a new dataframe whose columns are exactly ``model_features``,
    in that order: features missing from ``new_df`` are added and filled
    with 0, and columns the model does not know are dropped. ``new_df``
    itself is left unmodified.
    """
    # A single reindex replaces per-column insertion, which is slow for
    # thousands of features and altered the caller's dataframe in place
    return new_df.reindex(columns=list(model_features), fill_value=0)

def reddit_sample_score(model, X, showcase=False):
    """Return the model's predictions for a set of sample features.

    model: fitted model object exposing ``predict``
    X: dataframe of sample features
    showcase: default False. Intended to silence printed output when True;
        the printing it guarded is disabled, so it has no effect.
    """
    return model.predict(X)

# Which subreddit does a post belong to?
#  Gather data from reddit and using NLP on the title/text classify post as either 1 or 0
#  In this case, 1 = happy (from a subreddit called happy), and 0 = sad (from a subreddit called sad)

# Read csv files containing the 'happy' and 'sad' datasets from reddit
df_happy = pd.read_csv('../data/happy_sc_201611_20181219_reddit.csv')
df_sad = pd.read_csv('../data/sad_sc_201011_20181219_reddit.csv')

# Concatenate datasets
df = pd.concat([df_happy, df_sad], ignore_index=True)

# Basic dimensions, columns, and datatypes
#df.info()

# Header of dataset
#df.head()

# Remove unneeded columns, rows with nulls, and any duplicate posts if present
df = df.drop(['Unnamed: 0', 'created', 'created_utc'], axis=1)
df = df.drop_duplicates('id')
df = df.dropna()

# Only include submissions. Take an explicit copy so the column assignment
# below writes to an independent frame rather than to a slice of the old one
mask = (df['type'] == 'submission')
df = df.loc[mask, :].copy()

# Ensure relative balance of classes
#df['subreddit'].value_counts()

# Convert target variable 'subreddit' to 1 = happy and 0 = sad
df['subreddit'] = df['subreddit'].map(lambda x: 1 if x == 'happy' else 0)

# Prepare feature set for model using text vectorization among other types of pre-processing (see function for details)
X = reddit_prepare_features(df['body'], lowercase=True, max_features=10000)
X_features = X.columns.tolist()
#print("Number of features in model: {0:,}".format(len(X_features)))

# Set target variable (the features are already in X)
y = df['subreddit']

# Fit and score model 'lr': LogisticRegression
model_lr = reddit_model_fit_score('lr', X, y)

# # Fit and score model 'rf': RandomForestClassifier
# model_rf = reddit_model_fit_score('rf', X, y)

# Set model to LogisticRegression since more generalized model and with higher accuracy than with a RandomForestClassifier
model = model_lr

# Input new posts and let model ascertain classification
new_posts = ["I am so sad about how things are coming along",
           "I am so happy about how things are in my life"]

# Process new posts to ascertain classification
new_posts_featurized = reddit_prepare_features(pd.Series(new_posts)) # Featurize new posts
new_posts_featurized = reddit_equalize_features(X_features, new_posts_featurized) # Only utilize features actually modeled
predictions = reddit_sample_score(model, new_posts_featurized) # Generate predictions

# Convert predictions to dataframe
df_predictions = pd.DataFrame(columns=['classification', 'post'])
df_predictions['classification'] = predictions
df_predictions['post'] = new_posts
#print(df_predictions)

In [5]:
# Render a "Toggle code" link. The embedded script hides every `div.input`
# element once the document is ready, and each click on the link flips them
# between hidden and shown.
# NOTE(review): the script relies on `$` (jQuery) being available in the page - confirm for the target frontend.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script><a href="javascript:code_toggle()">Toggle</a> code.''')