# Am I The Asshole? - Modelling

In [160]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from sklearn.preprocessing import Normalizer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [161]:
def gridsearchit(pipe, pipe_params, X, y, cv=5):
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,random_state=42)
    gs = GridSearchCV(pipe, param_grid=pipe_params, cv=cv)
    gs.fit(X_train, y_train)
    print_scores(gs)
    return gs.best_estimator_

In [37]:
def print_scores(gs):
    print(f'best score: {gs.best_score_}')
    #print(f'best estimator: {gs.best_estimator_}')
    print(f'train score: {gs.score(X_train, y_train)}')
    print(f'test score: {gs.score(X_test, y_test)}')

### Read in the Data

In [123]:
df = pd.read_csv('./datasets/final_4.csv')


In [39]:
df.shape
df.isnull().sum()

id                0
title             0
selftext          0
total_comments    0
vote              0
dtype: int64

In [40]:
df.head()

Unnamed: 0,id,title,selftext,total_comments,vote
0,euqrvj,AITA for expecting my husband's help as soon a...,I spend all day home with kids age 5 and 7. Ru...,21.0,1
1,euqlxk,AITA for holding a grudge for 10 years with my...,This story may be mild compared to what you ca...,20.0,1
2,euqhqt,AITA for not letting my son go to a non-vegan ...,My son (Eddie) recently started Grade 1 last S...,57.0,1
3,euq28u,AITA for forcing my son to work out and go out...,My son is the laziest kid I've ever met and re...,16.0,1
4,euq0ce,AITA for stealing my moms money to buy clothes?,Am I the asshole for stealing my moms money to...,25.0,1


In [41]:
stop_words = set(stopwords.words('english')) 

### Create a df with Balanced Classes

In [42]:
df_all_assholes = df.loc[df['vote'] == 1]

In [43]:
num_assholes = df_all_assholes.vote.count()

In [44]:
df_no_assholes = df.loc[df['vote'] == 0]

In [45]:
df_no_assholes_sample = df_no_assholes.sample(n=num_assholes, random_state=1)

In [46]:
balanced_df = pd.concat([df_all_assholes, df_no_assholes_sample], axis=0)

In [47]:
balanced_df.vote.value_counts()

1    857
0    857
Name: vote, dtype: int64

In [48]:
balanced_df.reset_index(inplace=True)

In [49]:
balanced_df.shape

(1714, 6)

In [50]:
balanced_df.isnull().sum()

index             0
id                0
title             0
selftext          0
total_comments    0
vote              0
dtype: int64

### Tokenize, Lemmatize and Stem

In [51]:
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [52]:
def make_list_of_tokenized_colvals(col, sw=stop_words, t=tokenizer):
    tokenized_col = [t.tokenize(txt) for txt in col]
    tokenized_col_no_stop = []
    if sw:
        for i in tokenized_col:
            no_stop = [word for word in i if word not in sw]
            tokenized_col_no_stop.append(no_stop)
        return tokenized_col_no_stop
    return tokenized_col

In [53]:
#takes list of tokens lists
def make_lemma_stems_df(tokenized_texts_list, prefix):
    lemmatized = []
    stemmed = []
    for i in tokenized_texts_list:
        lemma_tokens = {}
        lemmas = [lemmatizer.lemmatize(word) for word in i]
        lemma_tokens['lemmas_'+ prefix] = ' '.join(lemmas)
        lemmatized.append(lemma_tokens)

        stem_tokens = {}
        stems = [stemmer.stem(word) for word in i]
        stem_tokens['stems_'+ prefix] = ' '.join(stems)
        stemmed.append(stem_tokens)
    stem_lemma_df = pd.concat([pd.DataFrame(stemmed), pd.DataFrame(lemmatized)], axis=1)
    return stem_lemma_df

In [54]:
title_tokens = make_list_of_tokenized_colvals(balanced_df['title'])
title_lemma_stems_df = make_lemma_stems_df(title_tokens, 'title')

In [93]:
title_tokens_with_sws = make_list_of_tokenized_colvals(balanced_df['title'], sw=False)
title_lemma_stems_df_no_sws = make_lemma_stems_df(title_tokens_with_sws, 'title_with_sw')

In [94]:
balanced_df = pd.concat([balanced_df, title_lemma_stems_df_no_sws], axis=1)

In [55]:
selftext_tokens = make_list_of_tokenized_colvals(balanced_df['selftext'])
selftext_lemma_stems_df = make_lemma_stems_df(selftext_tokens, 'selftext')

In [56]:
balanced_df = pd.concat([balanced_df, title_lemma_stems_df], axis=1)

In [57]:
balanced_df = pd.concat([balanced_df, selftext_lemma_stems_df], axis=1)

#### Make column that combines title and selftext data

In [126]:
balanced_df['title_selftext_stems'] = balanced_df['stems_selftext'] + balanced_df['stems_title']
balanced_df['title_selftext_lemmas'] = balanced_df['lemmas_selftext'] + balanced_df['lemmas_title']


In [127]:
balanced_df.head()

Unnamed: 0,index,id,title,selftext,total_comments,vote,stems_title,lemmas_title,stems_selftext,lemmas_selftext,stems_title_with_sw,lemmas_title_with_sw,title_selftext_stems,title_selftext_lemmas
0,0,euqrvj,AITA for expecting my husband's help as soon a...,I spend all day home with kids age 5 and 7. Ru...,21.0,1,aita expect husband help soon get home,AITA expecting husband help soon get home,I spend day home kid age 5 7 run around ton ex...,I spend day home kid age 5 7 Running around to...,aita for expect my husband s help as soon as h...,AITA for expecting my husband s help a soon a ...,I spend day home kid age 5 7 run around ton ex...,I spend day home kid age 5 7 Running around to...
1,1,euqlxk,AITA for holding a grudge for 10 years with my...,This story may be mild compared to what you ca...,20.0,1,aita hold grudg 10 year S O,AITA holding grudge 10 year S O,thi stori may mild compar get aita post but ye...,This story may mild compared get AITA post But...,aita for hold a grudg for 10 year with my S O,AITA for holding a grudge for 10 year with my S O,thi stori may mild compar get aita post but ye...,This story may mild compared get AITA post But...
2,2,euqhqt,AITA for not letting my son go to a non-vegan ...,My son (Eddie) recently started Grade 1 last S...,57.0,1,aita let son go non vegan birthday parti,AITA letting son go non vegan birthday party,My son eddi recent start grade 1 last septemb ...,My son Eddie recently started Grade 1 last Sep...,aita for not let my son go to a non vegan birt...,AITA for not letting my son go to a non vegan ...,My son eddi recent start grade 1 last septemb ...,My son Eddie recently started Grade 1 last Sep...
3,3,euq28u,AITA for forcing my son to work out and go out...,My son is the laziest kid I've ever met and re...,16.0,1,aita forc son work go outsid,AITA forcing son work go outside,My son laziest kid I ever met recent I enough ...,My son laziest kid I ever met recently I enoug...,aita for forc my son to work out and go outsid,AITA for forcing my son to work out and go out...,My son laziest kid I ever met recent I enough ...,My son laziest kid I ever met recently I enoug...
4,4,euq0ce,AITA for stealing my moms money to buy clothes?,Am I the asshole for stealing my moms money to...,25.0,1,aita steal mom money buy cloth,AITA stealing mom money buy clothes,Am I asshol steal mom money get cloth I live c...,Am I asshole stealing mom money get clothes I ...,aita for steal my mom money to buy cloth,AITA for stealing my mom money to buy clothes,Am I asshol steal mom money get cloth I live c...,Am I asshole stealing mom money get clothes I ...


### Model 1: Logistic Regression with TfidfVectorizer and CountVectorizer

#### Set Features, CountVectorize and TfidVectorize

In [153]:
X = balanced_df['stems_selftext']
y = balanced_df['vote']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,random_state=42)

#### Set TfidfVec Pipline, Run Gridsearch

In [149]:
pipe = Pipeline([
    ('tf', TfidfVectorizer()),
    ('lr', LogisticRegression(solver = 'liblinear'))
])
pipe_params = {
    'tf__max_features': [800, 1000, 1500, 2000],
    'tf__min_df': [1, 2, 3],
    'tf__max_df': [.6, .7, .9],
    'tf__stop_words': [None, stop_words],
    'tf__ngram_range': [(1,2), (1,3)],
    'lr__C': [.03, .1, .2]
}

In [150]:
gridsearchit(pipe, pipe_params, X, y, cv=3)

best score: 0.5859922178988327
best estimator: Pipeline(memory=None,
         steps=[('tf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.7, max_features=1500,
                                 min_df=3, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'a...
                                 strip_accents=None, sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                      

#### Set Cvec Pipline, Run Gridsearch

In [156]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(solver = 'liblinear'))
])
pipe_params = {
    'cvec__max_features': [500, 900, 1200],
    'cvec__min_df': [1, 2, 3],
    'cvec__max_df': [.6, .7, .9],
    'cvec__stop_words': [None, stop_words],
    'cvec__ngram_range': [(1,2)],
    'lr__C': [.03, .1, .2]
}

In [157]:
gridsearchit(pipe, pipe_params, X, y, cv=3)

best score: 0.5595330739299611
best estimator: Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.6,
                                 max_features=500, min_df=2, ngram_range=(1, 2),
                                 preprocessor=None,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             '...
                                             'couldn', "couldn't", ...},
                                 strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                 

### Model 2: Random Forest with CountVectorizer

In [168]:
X = balanced_df['stems_title']
y = balanced_df['vote']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,random_state=42)

In [169]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier(n_estimators=100))
])

In [170]:
pipe_params = {
    'rf__n_estimators': [100, 150, 200],
    'rf__max_depth': [None, 1, 3, 5],
    'rf__max_features' : [None, 2, 3, 4] # none is bagging, a number can take a random subset of features
}

In [171]:
gridsearchit(pipe, pipe_params, X, y, cv=3)

best score: 0.5805447470817121
best estimator: Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('rf',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features=3, max_leaf_nodes=None,
                                        

Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('rf',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features=3, max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                    

### Use Column Transformer with TfidfVectorizer and Logit

In [None]:
ct = ColumnTransformer(
        [("cvec", CountVectorizer())
         #("norm2", Normalizer(norm='l1')
          ]
)

In [118]:
balanced_df.isnull().sum()

index                   0
id                      0
title                   0
selftext                0
total_comments          0
vote                    0
stems_title             0
lemmas_title            0
stems_selftext          0
lemmas_selftext         0
stems_title_with_sw     0
lemmas_title_with_sw    0
dtype: int64

In [125]:
X = balanced_df[['stems_title', 'total_comments']]
y = balanced_df['vote']

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['total_comments']
# numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median'))])

categorical_features = ['stems_title']
categorical_transformer = Pipeline(steps=[
    ('cvec', CountVectorizer())])

ct = ColumnTransformer(
    transformers=[('tf', TfidfVectorizer(), 'stems_title')])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('ct', ct),
                      ('classifier', LogisticRegression())])

grid_search_params = {'ct__tf__ngram_range': [],
                      'classifier__solver':['liblinear']}

X_train, X_test, y_train, y_test = train_test_split(X, y)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.562




### Use Column Transformer with TfidfVectorizer and RandomForest

In [None]:
ct = ColumnTransformer(
        [("cvec", CountVectorizer())
         #("norm2", Normalizer(norm='l1')
          ]
)

In [118]:
balanced_df.isnull().sum()

index                   0
id                      0
title                   0
selftext                0
total_comments          0
vote                    0
stems_title             0
lemmas_title            0
stems_selftext          0
lemmas_selftext         0
stems_title_with_sw     0
lemmas_title_with_sw    0
dtype: int64

In [125]:
X = balanced_df[['stems_title', 'total_comments']]
y = balanced_df['vote']

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['total_comments']
# numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median'))])

categorical_features = ['stems_title']
categorical_transformer = Pipeline(steps=[
    ('cvec', CountVectorizer())])

ct = ColumnTransformer(
    transformers=[('tf', TfidfVectorizer(), 'stems_title')])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('ct', ct),
                      ('classifier', LogisticRegression())])

grid_search_params = {'ct__tf__ngram_range': [],
                      'classifier__solver':['liblinear']}

X_train, X_test, y_train, y_test = train_test_split(X, y)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.562


