# Subreddit NLP- How to differentiate between AmItheAsshole and Legal Advice

Imports:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
import nltk
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier


# EDA

## Clean data:

In [2]:
def preprocessing(csv):
    dataframe = pd.read_csv(csv)
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    
    #remove null values
    dataframe = dataframe[dataframe['selftext'].notnull()]
    
    #drop "removed" posts
    dataframe = dataframe[dataframe['selftext'] != '[removed]']
    # combine post and title text
    dataframe['fulltext'] = dataframe['selftext'] + dataframe['title']
    # clean data
    dataframe['fulltext'] =[str(words).lower() for words in dataframe['fulltext']]
    dataframe['fulltext'] =[str(words).replace("'", "").replace(".","").replace("(", "").replace(")", "") for words in dataframe['fulltext']]
    dataframe['fulltext'] =[str(words).replace(":", "").replace("*","").replace('"', "") for words in dataframe['fulltext']]
    
    #lemmatize data: (with help from https://stackoverflow.com/questions/47557563/lemmatization-of-all-pandas-cells)
    
    dataframe['fulltext'] = [w_tokenizer.tokenize(string) for string in dataframe['fulltext']]
    for words in dataframe['fulltext']:
        words = [lemmatizer.lemmatize(word) for word in words]
    dataframe['fulltext'] = [' '.join(words) for words in dataframe['fulltext']]
    
    #return dataframe
    return dataframe
    #convert all text to lowercase
    # remove punctuation
    #stem or lemmatize each word of the text


In [3]:
# Run both subreddits of data through the set and return  
aita = preprocessing('./data/AITA_raw.csv')
legal_advice = preprocessing('./data/legal_advice_raw.csv')
combined = legal_advice.append(aita)
# remove AITA to make predictions more accurate
#(aka remove a fake word that will be in every post of one subreddit)
combined['fulltext'] =[str(words).replace("aita", "") for words in combined['fulltext']]

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [4]:
# Turn the Y into a 0/1 column!
combined['subreddit'] = combined['subreddit'].map({'legaladvice': 1, "AmItheAsshole": 0})

## Data Exploration:

In [5]:
# Code modified from https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d

def get_top_n_words(corpus, n = 20):
    bag_of_words = cvec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in cvec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

cvec = CountVectorizer(stop_words='english')
aita['fulltext']
la_words = get_top_n_words(legal_advice['fulltext'])
aita_words = get_top_n_words(aita['fulltext'])

Using the above code, I looked at the most frequent words in each subreddit. It was interesting how many more words there were, period, in AITA vs Legal Advice, as well as the fact that some casual speaking mannerisms (using Just, Like, and Really, for example) were much common in one thread than the other, suggesting that n-grams increases would better my model.

In [6]:
la_words

[('just', 3047),
 ('im', 2602),
 ('time', 2531),
 ('know', 2281),
 ('said', 2093),
 ('work', 2067),
 ('told', 2062),
 ('amp', 2058),
 ('like', 1971),
 ('car', 1839),
 ('pay', 1761),
 ('x200b', 1712),
 ('got', 1682),
 ('company', 1657),
 ('want', 1640),
 ('going', 1540),
 ('dont', 1516),
 ('did', 1507),
 ('house', 1420),
 ('legal', 1420)]

In [7]:
aita_words

[('just', 6432),
 ('like', 5509),
 ('im', 4286),
 ('time', 4025),
 ('aita', 3979),
 ('said', 3895),
 ('told', 3710),
 ('really', 3378),
 ('friend', 3259),
 ('know', 3164),
 ('want', 3145),
 ('friends', 3083),
 ('dont', 2825),
 ('got', 2741),
 ('feel', 2570),
 ('going', 2442),
 ('didnt', 2352),
 ('day', 2090),
 ('people', 2078),
 ('work', 2023)]

# Preproccessing and Modeling

### establish variables

In [8]:
X = combined['fulltext']
y = combined['subreddit']

### Train/Test split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11, stratify= y)

## Pipeline

### Logestic Regression

Baseline logistic regression, with no parameters:

In [10]:
cvec = CountVectorizer()
cvec_X_train = cvec.fit_transform(X_train)
cvec_X_test = cvec.transform(X_test)

lr = LogisticRegression()
lr.fit(cvec_X_train, y_train)
lr.score(cvec_X_train, y_train)



0.9998346013893483

In [11]:
lr.score(cvec_X_test, y_test)

0.9603174603174603

I ran several dozen variations of the pipeline and gridsearch outlined below in order ot find my model's best parameters. For everyone's sake, I've currently left these not running, but I tried the following parameters as in attempts to improve my baseline logistic regression score:

For CVEC:

- max features ranging from 3,000 to none (447,000 with ngram range (1,2))
- n-gram ranges (1,1), (1,2) and (1,3) (which did not increase my score enough to justify the computational time.)
- stopwords 'english' and none (none performed better every time)

For Logistic Regression:

- Penalties l1 and l2 (lasso consistently outperformed ridge, while ridge rarely improved upon the baseline)
- Alphas ranging from 1 to 1000 (the two you see below were the best performing in logspace(1, 5, 10)

Additionally, I attempted to run TF-IDF instead of Count Vectorizer, expecting it would perform better because one of my subreddits (AITA) was just wordier than the other subreddits, and I thought it might better account for that fact, but it performed about .5% worse on average.

No variations on my logistic regression model were able to reduce the amount my model was overfit to less than #% (100% vs 97%.) All efforts I made to lower my training score impacted my testing score quite negatively--evening them out wasn't possible. I think further data collection would be the next step in improving this logistic regression model beyond what I have currently tried.

In [12]:
# Best Model:
cvec = CountVectorizer(max_features = None, ngram_range=(1,2))
cvec_X_train = cvec.fit_transform(X_train)
cvec_X_test = cvec.transform(X_test)

lr = LogisticRegression(penalty='l1', C = 166.81)
lr.fit(cvec_X_train, y_train)
lr.score(cvec_X_train, y_train)


1.0

In [13]:
lr.score(cvec_X_test, y_test)

0.9672619047619048

### Bayes Classifier

The Naive Bayes classifier I tested never performed better than the logistic regression model. It was less overfit, though!

In [14]:
pipe = Pipeline(
    [
        ('cvec', CountVectorizer()),
        ('nb', MultinomialNB(alpha=0))
         ])

In [15]:
pipe_params = {'cvec__max_features':[None, 200000, 250000],
               'cvec__ngram_range': [(1, 2)],
               'cvec__stop_words': [None]
              }

In [16]:
gs= GridSearchCV(pipe, param_grid=pipe_params, cv=3)

In [17]:
gs.fit(X_train, y_train)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'cvec__max_features': [None, 200000, 250000], 'cvec__ngram_range': [(1, 2)], 'cvec__stop_words': [None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [18]:
gs.best_params_

{'cvec__max_features': 250000,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None}

In [19]:
gs.best_score_

0.9199470724445915

In [20]:
gs.score(X_train, y_train)

0.9996692027786966

In [21]:
gs.score(X_test, y_test)

0.9255952380952381

## Random Forrest

I also tried a Random Forrest model to see if I could manipulate its parameters enough to reduce my variance in my model, but it actually included significantly more variance.

In [22]:
cvec = CountVectorizer()
cvec_X_train = cvec.fit_transform(X_train)
cvec_X_test = cvec.transform(X_test)

In [23]:
rf = RandomForestClassifier()

In [24]:
rf.fit(cvec_X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [25]:
rf.score(cvec_X_train, y_train)

0.9960304333443599

In [26]:
rf.score(cvec_X_test, y_test)

0.8526785714285714

In [27]:
pipe = Pipeline(
    [
        ('cvec', CountVectorizer(ngram_range=(1, 2), max_features=300_000)),
        ('rf',RandomForestClassifier())
         ])

In [28]:
# pipe_params = {'cvec__max_features':[None, 15000, 20000],
#                'cvec__ngram_range': [(1, 2), (1,3)],
#                'cvec__stop_words': [None, 'english']
#               }

rf_params = {
    'rf__n_estimators': [200, 400, 600],
    'rf__max_depth': [None, 5000],
    'rf__min_samples_split': [5, 10]
}

In [29]:
gs= GridSearchCV(pipe, param_grid=rf_params, cv=3)

In [30]:
gs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=300000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        str...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'rf__n_estimators': [200, 400, 600], 'rf__max_depth': [None, 5000], 'rf__min_samples_split': [5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [31]:
gs.best_params_

{'rf__max_depth': 5000, 'rf__min_samples_split': 5, 'rf__n_estimators': 400}

In [32]:
gs.score(X_train, y_train)

1.0

In [33]:
gs.score(X_test, y_test)

0.9191468253968254

### Modelling Conclusions:

The best model, by far, for this data set is a logistic regression. With 97% accuracy, and only 3% variance between my training and testing data, my final model performed wonderfully.

# Model Evaluation

To analyze my model, and understand why it made the decisions it made, I first examined my confusion matrix, and then wrote several functions to help me look at the most prominent features of the model: 

In [34]:
cvec = CountVectorizer(ngram_range=(1,2))
cvec_X_train = cvec.fit_transform(X_train)
cvec_X_test = cvec.transform(X_test)

lr = LogisticRegression(penalty='l1', C = 166.81)
lr.fit(cvec_X_train, y_train)



LogisticRegression(C=166.81, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l1', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [35]:
predictions = lr.predict(cvec_X_test)

In [36]:
from sklearn.metrics import confusion_matrix

In [37]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

In [38]:
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)


True Negatives: 897
False Positives: 39
False Negatives: 22
True Positives: 1058


In [39]:
predict_chart = pd.DataFrame()
predict_chart['text'] = X_test
predict_chart['true_values'] = y_test
predict_chart['predicted_values'] = predictions
predict_chart.reset_index(drop=True, inplace=True)
false_predictions = predict_chart[predict_chart['predicted_values'] != predict_chart['true_values']]

As evidenced below, the number of features involved in this dataset is much smaller than the number of max features that are optimized using gridsearch! This is because many of the strong predictors are not used many times, but do impact the model.

In [40]:
coefficients = pd.DataFrame(columns=['coefficient', 'variable'])
coefficients['coefficient'] = lr.coef_.tolist()[0]
coefficients['variable'] = cvec.vocabulary_.items()
non_zero_coefficients = coefficients[coefficients['coefficient'] != 0]
non_zero_coefficients.reset_index(drop=True, inplace=True)
non_zero_coefficients.shape

(2458, 2)