In [1]:
#This is the modeling notebook. Here we start with clean datasets, with LPT/ULPT removed, but containing
#the target class in the 'subreddit' column.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [5]:
#This package is how to download the stopwords. This way we can use these in the same pipeline as our model, to make sure we remove the transformed version from our text.
import nltk
nltk.download('stopwords')
nltk_stopwords = stopwords.words('english')
nltk_stopwords[:3]

[nltk_data] Downloading package stopwords to /Users/Keith/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my']

In [1]:
import warnings
warnings.filterwarnings('ignore')

---

In [1]:
from IPython.display import HTML
HTML('''<script>
var code_show_err = false; 
var code_toggle_err = function() {
 var stderrNodes = document.querySelectorAll('[data-mime-type="application/vnd.jupyter.stderr"]')
 var stderr = Array.from(stderrNodes)
 if (code_show_err){
     stderr.forEach(ele => ele.style.display = 'block');
 } else {
     stderr.forEach(ele => ele.style.display = 'none');
 }
 code_show_err = !code_show_err
} 
document.addEventListener('DOMContentLoaded', code_toggle_err);
</script>
To toggle on/off output_stderr, click <a onclick="javascript:code_toggle_err()">here</a>.''')

Read in the data. Our first models will use the dataframes I built that are just the title, text and subreddit. Later models will attempt to use the scores and sentiments I derived in the EDA notebook.

In [6]:
#Read in of separate datasets, for analysis/comparison:
eth_df = pd.read_csv('./data/lpt_cleaned.csv')
uneth_df = pd.read_csv('./data/ulpt_cleaned.csv')

In [7]:
#Let's start by reading in the dataset:
full_raw = pd.read_csv('./data/both_cleaned.csv', index_col=0)
full_raw.head()

Unnamed: 0,title,selftext,subreddit
0,": when packing for a move, use your clothes to...","most people will unpack the kitchen early on, ...",LifeProTips
1,: to avoid giving clicks/views to clickbaity n...,most of the time it will give you the name of ...,LifeProTips
2,: kindness is not weakness.,"before i go on, i hope this doesn’t get taken ...",LifeProTips
3,: bought online and shipping delayed? check gu...,for example- i ordered several times from newe...,LifeProTips
4,": guys, if you are on a date with a girl, shut...",(if physical attraction and all that is there ...,LifeProTips


In [8]:
full_raw['subreddit'].unique()

array(['LifeProTips', 'UnethicalLifeProTips'], dtype=object)

In [9]:
#Encode the target as 0/1, where 1 marks UnethicalLifeProTips: the class we want to be able to
#detect is the unethical text.
is_unethical = full_raw['subreddit'] == 'UnethicalLifeProTips'
full_raw['subreddit'] = is_unethical.astype(int)

In [10]:
full_raw['subreddit'].unique()

array([0, 1])

In [11]:
#Ok great, we have our target encoded. Let's play with a copy of the raw so that we can save typing:
df = full_raw.copy()
df.head()

Unnamed: 0,title,selftext,subreddit
0,": when packing for a move, use your clothes to...","most people will unpack the kitchen early on, ...",0
1,: to avoid giving clicks/views to clickbaity n...,most of the time it will give you the name of ...,0
2,: kindness is not weakness.,"before i go on, i hope this doesn’t get taken ...",0
3,: bought online and shipping delayed? check gu...,for example- i ordered several times from newe...,0
4,": guys, if you are on a date with a girl, shut...",(if physical attraction and all that is there ...,0


In [12]:
df.dropna(inplace=True)

In [13]:
df.isna().sum()

title        0
selftext     0
subreddit    0
dtype: int64

### For prediction, our null model is as follows:
### Whichever class is most common, that will be our baseline guess. 
Accuracy will be our metric. We may evaluate on other metrics later if it is determined that the model is lacking a certain type of missed classification.

In [15]:
#Comparing the ethical (0) with unethical (1)
len(df[df['subreddit']==1]) > len(df[df['subreddit']==0])

False

In [16]:
#Null (baseline) accuracy: the share of posts we get right if we guess ethical (0) every time,
#i.e. one minus the unethical (1) share of the data.
unethical_count = int((df['subreddit'] == 1).sum())
null_performance = 1 - unethical_count / len(df)

In [17]:
null_performance

0.5085236843878445

In [18]:
#almost a 50/50 shot, based on how much data we got.

In [19]:
#Shouldn't be hard to beat, but let's see:

Defining our X and y for our basic model:

In [20]:
#Let's create an X and y to test and model our data, on titles, text, and both:
X_title = df['title']
X_text = df['selftext']
X_both = df[['title', 'selftext']]
y = df['subreddit']

In [21]:
#This LemmaTokenizer function will both lemmatize and tokenize our text: (borrowed kindly from NLP II lesson)
class LemmaTokenizer:
    """Callable that splits a document into tokens and lemmatizes each one.

    An instance can be called with a single document string and returns a
    list of lemmatized tokens (it is offered as a `tokenizer` option in the
    grid-search parameter grids of this notebook).
    """

    def __init__(self):
        # One WordNetLemmatizer is created per tokenizer and reused for every call.
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmatized form of every token that word_tokenize finds in `doc`."""
        tokens = word_tokenize(doc)
        return list(map(self.lemmatizer.lemmatize, tokens))

Defining our pipelines:

In [22]:
#Start with a model on just the titles, and see what we get.
#Two pipelines share identical vectorizer settings: pipe_1 uses a CountVectorizer, pipe_2 a TfidfVectorizer.
#strip_accents='unicode' is the most generic way to handle the weird characters we will probably
#encounter in 7000 reddit posts.
title_vectorizer_kwargs = dict(stop_words='english',
                               max_features=500,
                               strip_accents='unicode')

pipe_1 = make_pipeline(CountVectorizer(**title_vectorizer_kwargs), MultinomialNB())

pipe_2 = make_pipeline(TfidfVectorizer(**title_vectorizer_kwargs), MultinomialNB())

In [23]:
#Do my train test splits:
#Title:
X_ti_tr, X_ti_te, y_train, y_test = train_test_split(X_title, y, stratify = y, random_state=42)
#Text:
X_tx_tr, X_tx_te, y_tx_train, y_tx_test = train_test_split(X_text, y, stratify = y, random_state=42)
#Both:
X_bo_tr, X_bo_te, y_bo_train, y_bo_test = train_test_split(X_both, y, stratify = y, random_state=42)

In [24]:
#Fitting astype unicode. Was encountering errors due to encoding before.
#https://stackoverflow.com/questions/39303912/tfidfvectorizer-in-scikit-learn-valueerror-np-nan-is-an-invalid-document
pipe_1.fit(X_ti_tr.values.astype('U'), y_train)
pipe_2.fit(X_ti_tr.values.astype('U'), y_train)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_features=500, stop_words='english',
                                 strip_accents='unicode')),
                ('multinomialnb', MultinomialNB())])

In [25]:
pipe_1.score(X_ti_te.values.astype('U'), y_test)

0.7537052007545136

In [26]:
pipe_2.score(X_ti_te.values.astype('U'), y_test)

0.7502021018593371

Hey, right off the bat, that is roughly a 25-percentage-point increase in classification accuracy over the null model (about 75% vs. 51%). Let's see if we can improve that:

In [27]:
#Hmm not bad. Let's see what a random forest can do...

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
pipe_forest = make_pipeline(CountVectorizer(stop_words='english',
                                       max_features=15_000,
                                       strip_accents='unicode'),
                      RandomForestClassifier(
                                             n_jobs = -1,
                                             ccp_alpha = .001
                      ))

In [30]:

pipe_forest.fit(X_ti_tr.values.astype('U'), y_train)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=15000, stop_words='english',
                                 strip_accents='unicode')),
                ('randomforestclassifier',
                 RandomForestClassifier(ccp_alpha=0.001, n_jobs=-1))])

In [31]:
pipe_forest.score(X_ti_tr.values.astype('U'), y_train)

0.7690026954177898

In [32]:
pipe_forest.score(X_ti_te.values.astype('U'), y_test)

0.7537052007545136

Looks like the random forest only matches the Naive Bayes model on the test set (both about 0.754), so there is no real improvement in our prediction yet.

## On Post Text:

In [33]:
pipe_3 = make_pipeline(CountVectorizer(stop_words='english',
                                       max_features=500,
                                       strip_accents='unicode'),
                      MultinomialNB())

In [34]:
pipe_3.fit(X_tx_tr.values.astype('U'), y_tx_train)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=500, stop_words='english',
                                 strip_accents='unicode')),
                ('multinomialnb', MultinomialNB())])

In [35]:
pipe_3.score(X_tx_te, y_tx_test)

0.7256804095931015

Looks like titles are more effective than post text at classifying the posts (about 0.754 vs. 0.726). Let's check a random forest:

In [36]:
forest_pipe_2 = make_pipeline(CountVectorizer(stop_words='english',
                                       max_features=100_000,
                                       strip_accents='unicode'),
                      RandomForestClassifier(
                                             n_jobs = -1,
                                             ccp_alpha = .001
                      ))

In [37]:
forest_pipe_2.fit(X_tx_tr.values.astype('U'), y_tx_train)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=100000, stop_words='english',
                                 strip_accents='unicode')),
                ('randomforestclassifier',
                 RandomForestClassifier(ccp_alpha=0.001, n_jobs=-1))])

In [38]:
forest_pipe_2.score(X_tx_te, y_tx_test)

0.7510105092966856

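On the post text the random forest does somewhat better than Naive Bayes (about 0.751 vs. 0.726), but it still does not beat the title models. Maybe I take a grid search approach, before moving on to some new types of analysis: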

In [39]:
##WARNING: I COMMENTED THE BELOW CODE BECAUSE THE GRID SEARCH TAKES A LONG TIME. MY BEST OUTPUT IS SAVED BELOW.###

In [169]:
# #Define the param grid for my best grid searches:
# params_1 = {
#     'countvectorizer__stop_words':['english', None],
#     'countvectorizer__max_features':[100, 500, 1000, 10_000, 100_000],
#     'countvectorizer__ngram_range':[(1,1), (2,2), (3,3)],
#     'countvectorizer__strip_accents':['unicode'],
#     'countvectorizer__tokenizer':[LemmaTokenizer(), None],
#     'multinomialnb__alpha':[0, 0.1, 0.5, 1, 10]
# }

# params_forest = {
#     'countvectorizer__stop_words':['english', None],
#     'countvectorizer__max_features':[100, 500, 1000, 10_000, 100_000],
#     'countvectorizer__ngram_range':[(1,1), (2,2), (3,3)],
#     'countvectorizer__strip_accents':['unicode'],
#     'countvectorizer__tokenizer':[LemmaTokenizer(), None],
#     'randomforestclassifier__n_estimators':[100, 500, 1000],
#     'randomforestclassifier__max_depth':[2, 3, 4, 5, 7],
#     'randomforestclassifier__n_jobs':[-1],
#     'randomforestclassifier__ccp_alpha':[0, 0.01, 0.1, 1, 10]
# }

In [164]:
#grid_1 = GridSearchCV(pipe_1, param_grid = params_1)

In [165]:
#grid_1.fit(X_ti_tr.values.astype('U'), y_train)



GridSearchCV(estimator=Pipeline(steps=[('countvectorizer',
                                        CountVectorizer(max_features=500,
                                                        stop_words='english',
                                                        strip_accents='unicode')),
                                       ('multinomialnb', MultinomialNB())]),
             param_grid={'countvectorizer__max_features': [100, 500, 1000,
                                                           10000, 100000],
                         'countvectorizer__ngram_range': [(1, 1), (2, 2),
                                                          (3, 3)],
                         'countvectorizer__stop_words': ['english', None],
                         'countvectorizer__strip_accents': ['unicode'],
                         'countvectorizer__tokenizer': [<__main__.LemmaTokenizer object at 0x7fc56052e6a0>,
                                                        None],
                    

In [166]:
#grid_1.best_params_

{'countvectorizer__max_features': 100000,
 'countvectorizer__ngram_range': (1, 1),
 'countvectorizer__stop_words': None,
 'countvectorizer__strip_accents': 'unicode',
 'countvectorizer__tokenizer': None,
 'multinomialnb__alpha': 0.5}

In [167]:
#grid_1.best_score_

0.8261766024204393

In [172]:
#grid_1.score(X_ti_te.values.astype('U'), y_test)

0.8346329658510352

This cell was the best output for my gridsearching. I have put the best model in and will show the results of that:

In [None]:
#Wahoo! 83% accuracy. Here are the params, saving so these cells don't run and take forever:
# {'countvectorizer__max_features': 100000,
#  'countvectorizer__ngram_range': (1, 1),
#  'countvectorizer__stop_words': None,
#  'countvectorizer__strip_accents': 'unicode',
#  'countvectorizer__tokenizer': None,
#  'multinomialnb__alpha': 0.5}

In [42]:
params_best = {'countvectorizer__max_features': [100000],
  'countvectorizer__ngram_range': [(1, 1)],
  'countvectorizer__stop_words': [None],
  'countvectorizer__strip_accents': ['unicode'],
  'countvectorizer__tokenizer': [None],
  'multinomialnb__alpha': [0.5]}

In [43]:
grid_best = GridSearchCV(pipe_1, param_grid = params_best)
grid_best.fit(X_ti_tr.values.astype('U'), y_train)

GridSearchCV(estimator=Pipeline(steps=[('countvectorizer',
                                        CountVectorizer(max_features=500,
                                                        stop_words='english',
                                                        strip_accents='unicode')),
                                       ('multinomialnb', MultinomialNB())]),
             param_grid={'countvectorizer__max_features': [100000],
                         'countvectorizer__ngram_range': [(1, 1)],
                         'countvectorizer__stop_words': [None],
                         'countvectorizer__strip_accents': ['unicode'],
                         'countvectorizer__tokenizer': [None],
                         'multinomialnb__alpha': [0.5]})

In [44]:
grid_best.best_score_

0.8290206648697216

In [45]:
grid_best.score(X_ti_te.values.astype('U'), y_test)

0.8296954998652655

I have achieved nearly 83% accuracy, with very little if any overfitting. It looks like I have a good model.

Next I will try a Random Forest model, to see if I can gain any accuracy:

In [170]:
#grid_forest = GridSearchCV(forest_pipe_2, param_grid = params_forest)

---

Next I tried a random forest grid search. It took a long time, so I will save the best params as above and comment it out.  
This will save time when running the notebook.

In [175]:
#%%time
#grid_forest.fit(X_ti_tr.values.astype('U'), y_train)



CPU times: user 14h 26min 59s, sys: 32min 25s, total: 14h 59min 25s
Wall time: 17h 4min 5s


GridSearchCV(estimator=Pipeline(steps=[('countvectorizer',
                                        CountVectorizer(max_features=100000,
                                                        stop_words='english',
                                                        strip_accents='unicode')),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(ccp_alpha=0.001,
                                                               n_jobs=-1))]),
             param_grid={'countvectorizer__max_features': [100, 500, 1000,
                                                           10000, 100000],
                         'countvectorizer__ngram_range': [(1, 1), (2, 2),
                                                          (3, 3)],
                         'countv...p_words': ['english', None],
                         'countvectorizer__strip_accents': ['unicode'],
                         'countvectorizer__tokenizer

In [177]:
#grid_forest.best_params_

{'countvectorizer__max_features': 10000,
 'countvectorizer__ngram_range': (1, 1),
 'countvectorizer__stop_words': 'english',
 'countvectorizer__strip_accents': 'unicode',
 'countvectorizer__tokenizer': <__main__.LemmaTokenizer at 0x7fc574f38d90>,
 'randomforestclassifier__ccp_alpha': 0,
 'randomforestclassifier__max_depth': 7,
 'randomforestclassifier__n_estimators': 1000,
 'randomforestclassifier__n_jobs': -1}

In [179]:
#best_params = grid_forest.best_params_

The best parameters that came out of this run (17 hours long) were below:

In [180]:
#Saving these params for later:
# saved_params = {'countvectorizer__max_features': 10000,
#  'countvectorizer__ngram_range': (1, 1),
#  'countvectorizer__stop_words': 'english',
#  'countvectorizer__strip_accents': 'unicode',
#  'countvectorizer__tokenizer': LemmaTokenizer(),
#  'randomforestclassifier__ccp_alpha': 0,
#  'randomforestclassifier__max_depth': 7,
#  'randomforestclassifier__n_estimators': 1000,
#  'randomforestclassifier__n_jobs': -1}

In [47]:
#grid_forest.best_score_
#Best score was .7646

In [46]:
#grid_forest.score(X_ti_te.values.astype('U'), y_test)
#Score on the test set was .77

In [1]:
#Best score on the random forest model is .77 accuracy. Decent, but not the best I have seen.

### Including Our Score Values

Next, I tried to use the sentiment, polarity, and subjectivity scores to train my model. This means I cannot use Naive Bayes, as it does not accept negative values:

In [49]:
#This code was pasted in from my scratch notebook, so it is not as clean.
#Build a dense word-count frame of the titles, one column per vocabulary term.
cvect = CountVectorizer(max_features = 15_000)

title_vect = cvect.fit_transform(full_raw['title'].dropna())

#Column names come from the fitted vocabulary_ (term -> column index), ordered by column index.
#This replaces get_feature_names(), which was removed in scikit-learn 1.2, and works on older versions too.
title_terms = sorted(cvect.vocabulary_, key=cvect.vocabulary_.get)
vect_df = pd.DataFrame(title_vect.toarray(), columns = title_terms)
#Drop the word column 'subreddit' so it cannot be confused with the label column.
#errors='ignore' keeps the cell from raising a KeyError if that word is not in the vocabulary.
vect_df = vect_df.drop(columns='subreddit', errors='ignore')
#NOTE(review): vect_df gets a fresh 0..n-1 index, so any rows removed by dropna() above shift the
#row positions relative to full_raw -- confirm the rows still line up before concatenating by index.
vect_df.head()


Unnamed: 0,00,000,00pm,00s,013,05,07,10,100,1000,...,zips,zits,zombies,zone,zoned,zoom,zoomed,zu,état,ānmén
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
vect_df['target']

0        0
1        0
2        0
3        0
4        0
        ..
14836    0
14837    0
14838    0
14839    0
14840    0
Name: target, Length: 14841, dtype: int64

In [75]:
#adding the vectorized title to the scores I found in my EDA:
scores = pd.read_csv('./data/full_raw_scores.csv')
#Have to rename the target column so it is not repeated. Both subreddit and target may be words in the title of our posts.
scores['subreddit_target'] = scores['target']
full_df = pd.concat([vect_df, scores.drop(['title', 'selftext', 'text_scores', 'title_scores'], axis=1)], axis=1)
full_df.head()


Unnamed: 0,00,000,00pm,00s,013,05,07,10,100,1000,...,title_neg,title_neu,title_pos,title_comp,text_neg,text_neu,text_pos,text_comp,target,subreddit_target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.06,0.874,0.066,0.0516,0.06,0.874,0.066,0.0516,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.36,0.64,0.6522,0.0,0.36,0.64,0.6522,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.101,0.899,0.0,-0.2263,0.101,0.899,0.0,-0.2263,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.892,0.108,0.5574,0.0,0.892,0.108,0.5574,0,0


In [91]:
#This means my model will lose some accuracy as the word 'target' will no longer be kept. I should go back and work around this.
full_df.dropna(inplace = True)
X_2 = full_df.drop(['target', 'subreddit_target', 'Unnamed: 0'], axis=1)
y_2 = full_df['subreddit_target']

In [92]:
X_2.columns

Index(['00', '000', '00pm', '00s', '013', '05', '07', '10', '100', '1000',
       ...
       'polarity', 'subjectivity', 'title_neg', 'title_neu', 'title_pos',
       'title_comp', 'text_neg', 'text_neu', 'text_pos', 'text_comp'],
      dtype='object', length=14697)

In [93]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, stratify=y_2, random_state=42)

In [94]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=10_000, n_jobs=-1)

lr.fit(X_train_2, y_train_2)

lr.score(X_train_2, y_train_2)

0.9490566037735849

In [95]:
lr.score(X_test_2, y_test_2)

0.8323901913230936

In [87]:
X_2.head()

Unnamed: 0,00,000,00pm,00s,013,05,07,10,100,1000,...,polarity,subjectivity,title_neg,title_neu,title_pos,title_comp,text_neg,text_neu,text_pos,text_comp
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.22,0.46,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.25,0.25,0.06,0.874,0.066,0.0516,0.06,0.874,0.066,0.0516
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.246654,0.662843,0.0,0.36,0.64,0.6522,0.0,0.36,0.64,0.6522
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.185714,0.548413,0.101,0.899,0.0,-0.2263,0.101,0.899,0.0,-0.2263
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.142857,0.0,0.892,0.108,0.5574,0.0,0.892,0.108,0.5574


In [88]:
X_2.columns[14670:]

Index(['ziploc', 'ziplock', 'ziplocs', 'zipper', 'zippo', 'zips', 'zits',
       'zombies', 'zone', 'zoned', 'zoom', 'zoomed', 'zu', 'état', 'ānmén',
       'Unnamed: 0', 'title_word_count', 'text_word_count', 'polarity',
       'subjectivity', 'title_neg', 'title_neu', 'title_pos', 'title_comp',
       'text_neg', 'text_neu', 'text_pos', 'text_comp'],
      dtype='object')

One last try: Let's make a new count vectorized version with fewer features:

In [98]:
#Smaller vocabulary this time: at most 1,000 terms, with English stop words removed.
cvect_2 = CountVectorizer(max_features = 1_000, stop_words='english')

title_vect_2 = cvect_2.fit_transform(full_raw['title'].dropna())
#Column names come from the fitted vocabulary_ (term -> column index), ordered by column index.
#This replaces get_feature_names(), which was removed in scikit-learn 1.2, and works on older versions too.
title_terms_2 = sorted(cvect_2.vocabulary_, key=cvect_2.vocabulary_.get)
vect_df_2 = pd.DataFrame(title_vect_2.toarray(), columns = title_terms_2)
vect_df_2.head()

Unnamed: 0,10,100,12,15,20,25,30,50,able,accept,...,write,writing,wrong,xbox,year,years,yo,young,youtube,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [100]:
#adding the vectorized title to the scores I found in my EDA:
scores_2 = pd.read_csv('./data/full_raw_scores.csv', index_col=0)
#Have to rename the target column so it is not repeated. Both subreddit and target may be words in the title of our posts.
scores_2['subreddit_target'] = scores_2['target']
full_df_2 = pd.concat([vect_df_2, scores_2.drop(['title', 'selftext', 'text_scores', 'title_scores', 'target'], axis=1)], axis=1)
full_df_2.head()


Unnamed: 0,10,100,12,15,20,25,30,50,able,accept,...,subjectivity,title_neg,title_neu,title_pos,title_comp,text_neg,text_neu,text_pos,text_comp,subreddit_target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.46,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.25,0.06,0.874,0.066,0.0516,0.06,0.874,0.066,0.0516,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.662843,0.0,0.36,0.64,0.6522,0.0,0.36,0.64,0.6522,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.548413,0.101,0.899,0.0,-0.2263,0.101,0.899,0.0,-0.2263,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.142857,0.0,0.892,0.108,0.5574,0.0,0.892,0.108,0.5574,0


In [103]:
full_df_2.columns[990:]

Index(['write', 'writing', 'wrong', 'xbox', 'year', 'years', 'yo', 'young',
       'youtube', 'zoom', 'title_word_count', 'text_word_count', 'polarity',
       'subjectivity', 'title_neg', 'title_neu', 'title_pos', 'title_comp',
       'text_neg', 'text_neu', 'text_pos', 'text_comp', 'subreddit_target'],
      dtype='object')

In [104]:
full_df_2.dropna(inplace = True)
X_3 = full_df_2.drop(['subreddit_target'], axis=1)
y_3 = full_df_2['subreddit_target']

In [105]:
X_tr_3, X_te_3, y_tr_3, y_te_3 = train_test_split(X_3, y_3, stratify=y_3, random_state=42)

In [106]:
lr_2 = LogisticRegression(max_iter=10_000, n_jobs=-1)

lr_2.fit(X_tr_3, y_tr_3)

lr_2.score(X_tr_3, y_tr_3)

0.8417789757412399

In [107]:
lr_2.score(X_te_3, y_te_3)

0.7933171651845864

OK, so it appears the best I can do is with a Naive Bayes classifier, which assigns around 83% of the posts to the correct subreddit.

Let's try one last thing: let's drop the request column, to see if that helps or hurts our classification ability:

In [111]:
full_df.dropna(inplace = True)
X_4 = full_df.drop(['target', 'subreddit_target', 'Unnamed: 0', 'request'], axis=1)
y_4 = full_df['subreddit_target']

In [112]:
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(X_4, y_4, stratify=y_4, random_state=42)

In [115]:
X_train_4.columns[14680:]

Index(['zoomed', 'zu', 'état', 'ānmén', 'title_word_count', 'text_word_count',
       'polarity', 'subjectivity', 'title_neg', 'title_neu', 'title_pos',
       'title_comp', 'text_neg', 'text_neu', 'text_pos', 'text_comp'],
      dtype='object')

In [113]:
#Just going to go with 4 to keep my numbers straight
lr_4 = LogisticRegression(max_iter=10_000, n_jobs=-1)

lr.fit(X_train_4, y_train_4)

lr.score(X_train_4, y_train_4)

0.9463611859838275

In [116]:
lr.score(X_test_4, y_test_4)

0.8256534626785234

The Logistic Regression is starting to do better. Maybe gridsearching can optimize the amount of regularization?

In [122]:
lr_params = {
    'C':[.01, .1, 1, 10],
    'n_jobs':[-1],
    'max_iter':[1000, 10_000]
}

In [125]:
lr_grid = GridSearchCV(LogisticRegression(), lr_params, n_jobs=-1)

In [126]:
lr_grid.fit(X_train_4, y_train_4)

GridSearchCV(estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10], 'max_iter': [1000, 10000],
                         'n_jobs': [-1]})

In [127]:
lr_grid.best_params_

{'C': 1, 'max_iter': 10000, 'n_jobs': -1}

In [128]:
lr_grid.score(X_train_4, y_train_4)

0.9463611859838275

In [129]:
lr_grid.score(X_test_4, y_test_4)

0.8256534626785234

In [136]:
#Save our model coefficients and intercept:
lr_coefs = lr_grid.best_estimator_.coef_

In [198]:
lr_intercept = lr_grid.best_estimator_.intercept_
lr_intercept

array([-3.87591929])

In [142]:
#Number of coefficients should match up with our X_train columns:
len(lr_coefs[0])

14696

In [143]:
X_train_4.shape

(11130, 14696)

In [144]:
lr_features = X_train_4.columns
lr_features[:3]

Index(['00', '000', '00pm'], dtype='object')

In [165]:
lr_coefs_dict = dict(zip(lr_features, lr_coefs[0]))
lr_coefs_dict['00']


0.2277498232013268

These coefficients and intercept can be used to interpret any vectorized text. Let's write a function to give a prediction on a single body of text:

In [318]:
#This function takes in a single body of text, and returns a prediction value to classify it as either a 1 or 0 (unethical or ethical):
#It uses the logistic function to make a prediction:
def predict_single_lr(text, coefs_dict, intercept):
    """Estimate the probability that a body of text is unethical (class 1).

    The log-odds start at ``intercept``; for every token of ``text`` that has
    an entry in ``coefs_dict``, the token's total count times its coefficient
    is added. The logistic function of the result is returned. Entries of
    ``coefs_dict`` that never occur as a token in ``text`` (for example the
    sentiment/score features) contribute nothing.

    Parameters
    ----------
    text : str or iterable of str
        The text to score. A bare string is treated as a single document;
        an iterable of strings is scored as one combined body of text.
    coefs_dict : dict
        Maps a feature name (word) to its logistic-regression coefficient.
    intercept : array-like or float
        The model intercept, e.g. ``LogisticRegression.intercept_``.
        It is copied, never modified.

    Returns
    -------
    numpy.ndarray or numpy.float64
        Probability in [0, 1] with the same shape as ``intercept``.
        Values > 0.5 indicate unethical, values < 0.5 ethical.
    """
    # CountVectorizer rejects a bare string, so wrap it as a one-document corpus.
    documents = [text] if isinstance(text, str) else list(text)

    # Work on a float copy so the caller's intercept array is left untouched.
    log_odds = np.array(intercept, dtype=float)

    cvect = CountVectorizer(max_features=15_000)
    try:
        counts = cvect.fit_transform(documents)
    except ValueError:
        # Raised when the text yields no tokens at all (empty vocabulary);
        # with nothing to add, the prediction falls back to the intercept alone.
        counts = None

    if counts is not None:
        # Total occurrences of each vocabulary term across all documents.
        totals = np.asarray(counts.sum(axis=0)).ravel()
        # vocabulary_ maps term -> column index; walking the text's (small) vocabulary
        # avoids scanning every coefficient and the removed get_feature_names() API.
        for word, column in cvect.vocabulary_.items():
            if word in coefs_dict:
                log_odds += totals[column] * coefs_dict[word]

    ###Make a vader sentiment and add it to the model coeff:
    ###TBA

    # Logistic function written so the exponent is never positive: exp() cannot
    # overflow, so very large log-odds give 1.0 instead of nan.
    decay = np.exp(-np.abs(log_odds))
    return np.where(log_odds >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

In [250]:
#Top 20 most influential words towards unethical:
import operator
sorted(lr_coefs_dict.items(), key=operator.itemgetter(1), reverse=True)[:20]

[('refund', 2.816598607533285),
 ('unethical', 2.2594941893336795),
 ('lie', 2.2396875167484445),
 ('return', 2.1509934149353063),
 ('doctor', 2.1351370892599033),
 ('roommate', 2.1135224680341937),
 ('caught', 2.099624095514548),
 ('sell', 2.0019448837937457),
 ('reques', 1.975189864356173),
 ('bathroom', 1.8960315197063429),
 ('legally', 1.8741265467144599),
 ('fake', 1.8689759244146276),
 ('exam', 1.78386496409944),
 ('neighbor', 1.7601666323574623),
 ('pretend', 1.7491131124284078),
 ('illegal', 1.712220445308074),
 ('coronavirus', 1.676452760128427),
 ('drug', 1.611236711377302),
 ('paywalls', 1.5956856171582026),
 ('shit', 1.5648542690560574)]

In [221]:
#Top 20 most influential words towards ethical:
import operator
sorted(lr_coefs_dict.items(), key=operator.itemgetter(1), reverse=False)[:20]

[('consider', -2.0003462270803567),
 ('thoughts', -1.796589720980724),
 ('learn', -1.7420257125267606),
 ('gifts', -1.6879599304070898),
 ('awards', -1.6755211831549244),
 ('pressure', -1.573760699202852),
 ('mean', -1.54485106893298),
 ('habit', -1.5235511784648352),
 ('cook', -1.5166744230260645),
 ('remember', -1.5056100679464575),
 ('winter', -1.4907706743549367),
 ('shower', -1.4666573617047047),
 ('focus', -1.4584053492386329),
 ('matter', -1.4113993943660466),
 ('santa', -1.3938570519355693),
 ('whether', -1.391657253576208),
 ('honest', -1.3894368389345153),
 ('also', -1.387936630130401),
 ('stuffs', -1.3568606970470058),
 ('treat', -1.343932965413788)]

In [253]:
predict_single_lr(['Lie about a fake coronavirus diagnosis to get time off from work, and get free unemployment.'], lr_coefs_dict, lr_intercept)

array([0.99864168])

Hey pretty neat. This could be used in the real world, on random text. Values > 0.5 indicate unethical, whereas <.5 are ethical.

However, it appears to be very narrowly defined by what redditors appear to want or desire that is unethical. I will need more training data to generalize it to other mediums.

In [226]:
#Operator borrowed from: https://stackoverflow.com/questions/613183/how-do-i-sort-a-dictionary-by-value

In [270]:
#The top coefficients showed that 'unethical' was still in our text. Remove it from the feature set,
#together with 'reques', which is a shortened/misspelled 'request':
leaky_words = ['unethical', 'reques']
X_5 = X_4.drop(columns=leaky_words)

In [271]:
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(X_5, y_4, stratify=y_4, random_state=42)

In [272]:
lr_5 = LogisticRegression(max_iter=10_000, n_jobs=-1)

lr_5.fit(X_train_5, y_train_5)

lr_5.score(X_train_5, y_train_5)

0.945193171608266

In [273]:
lr_5.score(X_test_5, y_test_5)

0.8248450552411749

In [274]:
#The model is still overfit, but I would like to generalize these coefficients as above.

In [275]:
#Save our model coefficients and intercept:
lr_coefs_2 = lr_5.coef_
lr_intercept_2 = lr_5.intercept_
lr_features_2 = X_train_5.columns
lr_coefs_dict_2 = dict(zip(lr_features_2, lr_coefs_2[0]))

In [276]:
#Top 20 contributors to being unethical text:
sorted(lr_coefs_dict_2.items(), key=operator.itemgetter(1), reverse=True)[:20]

[('refund', 2.7779270836009045),
 ('lie', 2.209543692475753),
 ('return', 2.1395129920835303),
 ('doctor', 2.1328231199206047),
 ('caught', 2.095268465710384),
 ('roommate', 2.0922761880215917),
 ('illegal', 2.032611318440779),
 ('sell', 1.990955358093733),
 ('bathroom', 1.8971584139916058),
 ('legally', 1.8730700586788043),
 ('fake', 1.8166190909609465),
 ('exam', 1.7610609771721664),
 ('pretend', 1.7514665176189932),
 ('neighbor', 1.7236792325221257),
 ('coronavirus', 1.6949960892766613),
 ('drug', 1.587292821814307),
 ('paywalls', 1.586048982616131),
 ('shit', 1.5588739760927868),
 ('police', 1.5146780381710985),
 ('fart', 1.465527832012307)]

In [277]:
#Top 20 contributors to being ethical text:
sorted(lr_coefs_dict_2.items(), key=operator.itemgetter(1), reverse=False)[:20]

[('consider', -1.981627711278092),
 ('thoughts', -1.7983212160895168),
 ('learn', -1.7646077559582198),
 ('gifts', -1.6948616563140317),
 ('awards', -1.688148056548761),
 ('pressure', -1.5708073781947598),
 ('mean', -1.5676006547407135),
 ('remember', -1.5138989179072235),
 ('habit', -1.511236738809651),
 ('winter', -1.500556621909785),
 ('cook', -1.5001608714209713),
 ('shower', -1.4892236486179895),
 ('focus', -1.4787573495298925),
 ('honest', -1.4106407711387272),
 ('santa', -1.409310229737807),
 ('matter', -1.4041667455079108),
 ('whether', -1.395949969077398),
 ('also', -1.3710880307205153),
 ('stuffs', -1.3598626099506226),
 ('treat', -1.347673028201269)]

## Testing/Playground

Testing a definitely unethical piece of advice:

In [302]:
predict_single_lr(['Lie about a fake coronavirus diagnosis to get time off from work, and get free unemployment.'], lr_coefs_dict_2, lr_intercept_2)

array([0.99998843])

In [327]:
predict_single_lr(['Free coronavirus vaccines now! On sale! Super cheap!'], lr_coefs_dict_2, lr_intercept_2)

array([0.98664763])

In [279]:
#Interesting. Removing the unethical and reques columns actually made the individual predictor stronger.

Testing ethical advice:

In [303]:
predict_single_lr(['Consider the thoughts of others when you are out driving. Remember, getting angry doesnt make traffic go any faster. '], lr_coefs_dict_2, lr_intercept_2)

array([0.00131547])

More ethical advice:

In [304]:
predict_single_lr(['Be helpful when you see someone in trouble whether you know them or not. They probably need it, and you will feel better helping them.'], lr_coefs_dict_2, lr_intercept_2)

array([0.00568813])

Two attempts at unethical advice:

In [305]:
predict_single_lr(['Steal from your sibling only things that can be hidden or eaten easily. That way you can get away with it easier.'], lr_coefs_dict_2, lr_intercept_2)

array([0.30219783])

In [306]:
predict_single_lr(['Steal from your brother/sister only things that can be hidden or eaten easily. That way you can get away with it easier.'], lr_coefs_dict_2, lr_intercept_2)

array([0.72597546])

In [314]:
predict_single_lr(['Steal from your brother.'], lr_coefs_dict_2, lr_intercept_2)

array([0.82578585])

In [293]:
#Classified incorrectly, when using sibling. The words brother/sister appear to carry more weight with unethical acts.
#Which makes sense - who gives advice about doing nice things for your siblings? People mostly advise how to deal with poor spirited siblings or get away with
#things with respect to their siblings. Maybe people more often refer to them as siblings when it is positive (on reddit) and brother or sister when asking for 
#how to deal with them or get away with something. Shortening the advice cues in on the length affecting the classification.

And finally, the golden rule:

In [319]:
predict_single_lr(['Do unto others as you would have them do unto you.'], lr_coefs_dict_2, lr_intercept_2)

array([0.65060762])

In [316]:
#Uh oh. 

In [322]:
predict_single_lr(['Treat other people in the way that you think they want to be treated.'], lr_coefs_dict_2, lr_intercept_2)

array([0.732224])

Looking at the code of ethics from:
https://www.asha.org/code-of-ethics/

In [320]:
text = 'Individuals shall honor their responsibility to hold paramount the welfare of persons they serve professionally or who are participants in research and scholarly activities, and they shall treat animals involved in research in a humane manner.'

In [321]:
predict_single_lr([text], lr_coefs_dict_2, lr_intercept_2)

array([0.07534966])

In [296]:
X_5.columns[14680:]

Index(['état', 'ānmén', 'title_word_count', 'text_word_count', 'polarity',
       'subjectivity', 'title_neg', 'title_neu', 'title_pos', 'title_comp',
       'text_neg', 'text_neu', 'text_pos', 'text_comp'],
      dtype='object')

In [290]:
lr_coefs_dict_2['title_neg']

0.3963099018427896

In [291]:
lr_coefs_dict_2['title_pos']

0.0548566390030068

In [292]:
#The negative sentiment of the title contributes much more to the probability of being unethical.

In [297]:
lr_coefs_dict_2['title_word_count']

-0.03965973431086903

In [298]:
lr_coefs_dict_2['polarity']

-0.6578721962646711

In [None]:
#Positive polarity contributes very strongly to ethical text classification. 