# Visualizing with Tableau
---

In [None]:
# libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, ElasticNetCV, RidgeCV, LassoCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score ## ** GridSearchCV
from sklearn.metrics import confusion_matrix as cm

In [None]:
# data
data = pd.read_csv('data/bootstrapped_gp.csv', low_memory=False)
data.head()

### Vectorize

In [5]:
cvec = CountVectorizer(max_features=10000, ngram_range=(1,3))

df_cv = cvec.fit_transform(all_scrape['title'])
df_cv = pd.DataFrame(df_cv.todense(), columns=cvec.get_feature_names())

gp_cv = df_cv.loc[1000:]
ss_cv = df_cv.loc[:999]

### Word frequencies

In [6]:
# get all words by frequency
ss_freq = ss_cv.sum().sort_values(ascending=False)
gp_freq = gp_cv.sum().sort_values(ascending=False)

### Get word frequencies DF

In [7]:
n=1500
# list top n words from each subreddit by frequency
ss_top = list(ss_freq[:n].index)
gp_top = list(gp_freq[:n].index)
top_all = set(ss_top+gp_top)

# make df of top words by frequncy
df_freq = pd.DataFrame([{word: ss_freq[word] for word in top_all if word in ss_freq.index},
                        {word: gp_freq[word] for word in top_all if word in gp_freq.index}
                       ]).T.fillna(0).reset_index().rename(columns={'index' : 'word',
                                                                    0:'SubredditSimulator',
                                                                    1:'SubSimulatorGPT2',})
df_freq[df_freq._get_numeric_data().columns] = df_freq._get_numeric_data().astype('int')
df_freq.to_csv('data/df_freq.csv')
df_freq.head(3)
len(df_freq)

2381

In [8]:
df_freq.head(3)

Unnamed: 0,word,SubredditSimulator,SubSimulatorGPT2
0,shoot,4,1
1,dank maymays,2,0
2,my dog,2,4


### Define stopwords

In [9]:
nltk_stopwords = set(stopwords.words('english'))

def check_stopwords(x):
    grams = x.split(' ')
    stop = 0
    for gram in grams:
        if gram in nltk_stopwords:
            stop += 1
    if stop==len(grams):
        return 'stopword'
    elif stop>0:
        return 'part-stopword'
    else:
        return 'non-stopword'

In [10]:
df_freq['stopword'] = df_freq['word'].apply(check_stopwords)
df_freq.head(4)

Unnamed: 0,word,SubredditSimulator,SubSimulatorGPT2,stopword
0,shoot,4,1,non-stopword
1,dank maymays,2,0,non-stopword
2,my dog,2,4,part-stopword
3,but the,4,2,stopword


In [11]:
df_freq['stopword'].value_counts()

non-stopword     1266
part-stopword     681
stopword          434
Name: stopword, dtype: int64

### Define ngrams

In [12]:
def ngram_checker(word):
    ngram = 1
    for char in word:
        if char == ' ': ngram+=1
    return ngram

In [13]:
df_freq['ngrams'] = df_freq['word'].apply(ngram_checker)
df_freq.head(3)

Unnamed: 0,word,SubredditSimulator,SubSimulatorGPT2,stopword,ngrams
0,shoot,4,1,non-stopword,1
1,dank maymays,2,0,non-stopword,2
2,my dog,2,4,part-stopword,2


### Add mean TF-IDF scores
Average of scores across corpus when `tf-idf > 0.1`

In [14]:
tfidf_df = pd.read_csv('data/top_tfidf.csv').set_index('Unnamed: 0')

In [15]:
def get_tfidf_mean(word):
    if word in tfidf_df.index:
        return tfidf_df.loc[word].mean()

In [16]:
df_freq['tf-idf mean'] = df_freq['word'].apply(get_tfidf_mean)
df_freq.head(3)

Unnamed: 0,word,SubredditSimulator,SubSimulatorGPT2,stopword,ngrams,tf-idf mean
0,shoot,4,1,non-stopword,1,0.190601
1,dank maymays,2,0,non-stopword,2,
2,my dog,2,4,part-stopword,2,0.329556


### Get MnNB coefs

In [17]:
# combine dfs, define X, y
both_scrape = pd.concat([ss_scrape, gpt2_scrape], sort=False)
X = both_scrape['title']
y = both_scrape['sr']

# tfidf transform
tfid = TfidfVectorizer(max_features=10000, ngram_range=(1,3),)
X = tfid.fit_transform(X)
X = pd.DataFrame(X.toarray(), columns=tfid.get_feature_names())

# keep only cols from df_freq
Xf = X[[w for w in df_freq['word'] if w in X.columns]]

In [20]:
mnb = MultinomialNB(alpha=1*np.e**-20) # approaching 0 appears to be the best alpha
mnb.fit(Xf, y)

df_freq['mnb coef'] = mnb.coef_[0]
df_freq.head()

Unnamed: 0,word,SubredditSimulator,SubSimulatorGPT2,stopword,ngrams,tf-idf mean,mnb coef
0,shoot,4,1,non-stopword,1,0.190601,-9.711637
1,dank maymays,2,0,non-stopword,2,,-27.76819
2,my dog,2,4,part-stopword,2,0.329556,-7.24268
3,but the,4,2,stopword,2,0.17204,-8.825712
4,found this,4,1,part-stopword,2,0.250828,-8.8484


### Get Ridge coefs

In [21]:
lr = LogisticRegressionCV(Cs=np.logspace(.1, 1, 50), penalty='l2', solver='lbfgs', max_iter=5000, cv=3)
lr.fit(Xf, y)
df_freq['ridge tf-idf coefficient'] = lr.coef_[0] # new col for coef
print(cross_val_score(lr, Xf, y, cv=5).mean())
df_freq.head(3)

0.723


Unnamed: 0,word,SubredditSimulator,SubSimulatorGPT2,stopword,ngrams,tf-idf mean,mnb coef,ridge tf-idf coefficient
0,shoot,4,1,non-stopword,1,0.190601,-9.711637,-0.80455
1,dank maymays,2,0,non-stopword,2,,-27.76819,-0.516721
2,my dog,2,4,part-stopword,2,0.329556,-7.24268,1.072602


#### Add `Total Count`,  `Ridge β (direction)` and `Ridge β (abs)`

In [22]:
df_freq['Total Count'] = df_freq['SubredditSimulator'] + df_freq['SubSimulatorGPT2']

def sway(x):
    if x<0: return 'β < 0 (ss)'
    else: return 'β > 0 (gpt2)'  
df_freq['Ridge β'] = df_freq['ridge tf-idf coefficient'].apply(sway)
df_freq['Ridge β (abs)'] = df_freq['ridge tf-idf coefficient'].abs()
df_freq

Unnamed: 0,word,SubredditSimulator,SubSimulatorGPT2,stopword,ngrams,tf-idf mean,mnb coef,ridge tf-idf coefficient,Total Count,Ridge β,Ridge β (abs)
0,shoot,4,1,non-stopword,1,0.190601,-9.711637,-0.804550,5,β < 0 (ss),0.804550
1,dank maymays,2,0,non-stopword,2,,-27.768190,-0.516721,2,β < 0 (ss),0.516721
2,my dog,2,4,part-stopword,2,0.329556,-7.242680,1.072602,6,β > 0 (gpt2),1.072602
3,but the,4,2,stopword,2,0.172040,-8.825712,-0.644483,6,β < 0 (ss),0.644483
4,found this,4,1,part-stopword,2,0.250828,-8.848400,-0.114577,5,β < 0 (ss),0.114577
...,...,...,...,...,...,...,...,...,...,...,...
2376,of all,2,4,stopword,2,0.224201,-7.810621,0.328704,6,β > 0 (gpt2),0.328704
2377,box stuck in,0,14,part-stopword,3,,-8.796794,0.414690,14,β > 0 (gpt2),0.414690
2378,when see,3,1,part-stopword,2,,-9.680254,-0.341991,4,β < 0 (ss),0.341991
2379,don even,2,0,part-stopword,2,,-27.768190,-0.496973,2,β < 0 (ss),0.496973


### Get doc (appearance) counts:

In [24]:
doc_count = pd.DataFrame(df_freq['word'])

for i in doc_count.index:
    word = doc_count.loc[i, 'word']
    try:
        doc_count.loc[i, 'ss_doc_count'] = len(ss_cv[ss_cv[word]!=0])
    except:
        print(i)

In [26]:
for i in doc_count.index:
    word = doc_count.loc[i, 'word']
    try:
        doc_count.loc[i, 'gp_doc_count'] = len(gp_cv[gp_cv[word]!=0])
    except:
        print(i)
doc_count.head(3)

Unnamed: 0,word,ss_doc_count,gp_doc_count
0,shoot,4.0,1.0
1,dank maymays,2.0,0.0
2,my dog,2.0,4.0


In [27]:
df_freq = pd.concat([df_freq, doc_count.drop(columns='word')], axis=1)
df_freq.head(3)

Unnamed: 0,word,SubredditSimulator,SubSimulatorGPT2,stopword,ngrams,tf-idf mean,mnb coef,ridge tf-idf coefficient,Total Count,Ridge β,Ridge β (abs),ss_doc_count,gp_doc_count
0,shoot,4,1,non-stopword,1,0.190601,-9.711637,-0.80455,5,β < 0 (ss),0.80455,4.0,1.0
1,dank maymays,2,0,non-stopword,2,,-27.76819,-0.516721,2,β < 0 (ss),0.516721,2.0,0.0
2,my dog,2,4,part-stopword,2,0.329556,-7.24268,1.072602,6,β > 0 (gpt2),1.072602,2.0,4.0


## Export

In [28]:
from better_profanity import profanity
profanity.load_censor_words()

In [29]:
def limit_profanity(x):
    grams = x.split(' ')
    res = []
    for g in grams:
        if profanity.contains_profanity(g):
            if 'ing' not in g: res.append("".join([g[0]]+['*']*(len(g)-2)+[g[-1]]))
            else: res.append("".join([g[0]]+['*']*(len(g)-4)+[g[-3]]))
        else:
            res.append(g)
    return " ".join(res)

In [31]:
df_freq.loc[np.where([profanity.contains_profanity(word) for word in df_freq['word']])].head(3)

Unnamed: 0,word,SubredditSimulator,SubSimulatorGPT2,stopword,ngrams,tf-idf mean,mnb coef,ridge tf-idf coefficient,Total Count,Ridge β,Ridge β (abs),ss_doc_count,gp_doc_count
6,of shit you,0,3,part-stopword,3,0.20033,-9.32221,0.089508,3,β > 0 (gpt2),0.089508,0.0,1.0
56,gay and,0,3,part-stopword,2,0.199115,-8.002887,0.650739,3,β > 0 (gpt2),0.650739,0.0,3.0
75,ass tittyfuck bitch,3,0,non-stopword,3,0.26973,-27.76819,-0.364826,3,β < 0 (ss),0.364826,1.0,0.0


In [32]:
df_freq['word'] = df_freq['word'].apply(limit_profanity)

In [34]:
df_freq.to_csv('df_freq.csv', index=False)