In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
# Apply seaborn's default theme to the figures drawn in this notebook.
sns.set()
# IPython magic (not plain Python): sets the inline backend's figure format to 'retina'.
%config InlineBackend.figure_format = 'retina'

In [2]:
# Load every CSV found directly under DATA and stack them into one DataFrame.
DATA = 'data'
# glob.glob() returns paths in arbitrary, filesystem-dependent order; sorting
# keeps the row order of `data` (and everything derived from it) reproducible.
allFiles = sorted(glob.glob(DATA + "/*.csv"))
if not allFiles:
    # Without this guard pd.concat([]) fails with an opaque
    # "No objects to concatenate" that does not mention the directory.
    raise FileNotFoundError("no CSV files found under " + repr(DATA))
list_ = [pd.read_csv(file_) for file_ in allFiles]
# The per-file row indices are kept as-is, so index labels repeat across files.
data = pd.concat(list_)

In [320]:
# Keep only the rows whose language is 'English' and that are not retweets.
is_english = data.language == 'English'
is_original = data.retweet == 0
data = data.loc[is_english & is_original]

In [106]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import words
import re
from tokenizer import tokenizer

In [199]:
# Shared NLP resources used by the preprocessing helpers.
# Tokenizer flags: handles and hashes are preserved; case and URLs are not.
tokenizer_options = dict(
    preserve_handles=True,
    preserve_hashes=True,
    preserve_case=False,
    preserve_url=False,
    regularize=True,
)
T = tokenizer.TweetTokenizer(**tokenizer_options)
# NLTK's English stopwords plus the retweet marker 'rt'.
en_stop = set(stopwords.words('english')) | {'rt'}
p_stemmer = PorterStemmer()
# Lowercased NLTK word list, as a set for fast membership tests.
en_words = {w.lower() for w in words.words()}

In [347]:
def lower(x):
    """Return the lowercased form of ``x`` (delegates to ``x.lower()``)."""
    lowered = x.lower()
    return lowered
def delspecial(x):
    """Return ``x`` with every occurrence of 'â' removed.

    NOTE(review): 'â' looks like residue of mis-decoded UTF-8 punctuation;
    confirm against the raw data.
    """
    # Splitting on the character and re-joining the pieces drops each occurrence.
    pieces = x.split('â')
    return ''.join(pieces)
def tokenize(x):
    """Tokenize ``x`` with the module-level tokenizer ``T`` and return the result."""
    tokens = T.tokenize(x)
    return tokens
def rm_stop(x):
    """Return the items of ``x`` that are not in ``en_stop``, in their original order."""
    kept = []
    for token in x:
        if token not in en_stop:
            kept.append(token)
    return kept
def stem(x):
    """Return a list with ``p_stemmer.stem`` applied to each item of ``x``, in order."""
    stems = []
    for token in x:
        stems.append(p_stemmer.stem(token))
    return stems
def english(x):
    """Return the items of ``x`` that are members of ``en_words``, order preserved."""
    return [token for token in x if token in en_words]
def rm_noalphanum(x):
    """Return the tokens of ``x`` that contain at least one ASCII letter.

    Despite the name, digits do not count: the pattern is ``[A-Za-z]``, so a
    token made only of digits and/or punctuation is dropped.
    """
    return [token for token in x if re.search(r'[A-Za-z]', token)]

In [348]:
def preprocess(text, transformations=None):
    """Run ``text`` through a sequence of one-argument transformations.

    Parameters
    ----------
    text
        Input handed to the first transformation.
    transformations : iterable of callables, optional
        Applied in order; each receives the previous one's result. When
        omitted (``None``), the pipeline is
        ``lower -> delspecial -> tokenize -> rm_stop -> rm_noalphanum``.

    Returns
    -------
    The result of the last transformation, or ``text`` unchanged when the
    sequence is empty.
    """
    if transformations is None:
        # Looked up at call time instead of being baked into the signature:
        # a default list is bound once, when this cell runs, so re-defining a
        # helper in another cell would otherwise leave the pipeline calling
        # the stale version. It also avoids a shared mutable default.
        transformations = (lower, delspecial, tokenize, rm_stop, rm_noalphanum)
    for t in transformations:
        text = t(text)
    return text

### On The Right

In [349]:
# Select the `content` of rows labelled 'RightTroll' and preprocess each entry
# with the default pipeline.
is_right_troll = data.account_category == 'RightTroll'
text = data.loc[is_right_troll, 'content']
docs = list(map(preprocess, text.values))

KeyboardInterrupt: 

In [336]:
from gensim.corpora import Dictionary
# Build the vocabulary from the preprocessed documents and report its size
# before and after pruning.
dictionary = Dictionary(docs)
vocab_size_before = len(dictionary)
print('Before:', vocab_size_before)
# Prune by document frequency: no_below is an absolute document count,
# no_above a fraction of the corpus.
dictionary.filter_extremes(no_above=.20, no_below=10)
vocab_size_after = len(dictionary)
print('After:', vocab_size_after)

Before: 124445
After: 20196


In [337]:
# Convert each document in `docs` to its bag-of-words form via the dictionary.
corpus = list(map(dictionary.doc2bow, docs))

In [338]:
from gensim.models import LdaModel
# Fit an 8-topic LDA model on the bag-of-words corpus.
# LDA training is stochastic; without a fixed random_state the topics (and every
# downstream table or plot) change on each Restart & Run All.
model = LdaModel(corpus=corpus,
                 id2word=dictionary,
                 num_topics=8,
                 iterations=100,
                 random_state=42)

In [261]:
# One row per document in `corpus`, one column per topic of `model`.
# Sized from the objects themselves: a hard-coded (100000, 8) raises IndexError
# when the corpus is larger and leaves all-zero phantom rows when it is smaller.
topicdocs = np.zeros((len(corpus), model.num_topics))

In [262]:
# Copy each (topic id, weight) pair the model yields for document `idoc` into
# row `idoc` of `topicdocs`; cells with no pair keep their initial value.
for idoc, doc_topics in enumerate(model[corpus]):
    for topic_id, weight in doc_topics:
        topicdocs[idoc, topic_id] = weight

In [339]:
import pyLDAvis.gensim
import warnings
# Turn on pyLDAvis notebook rendering, then ignore FutureWarning from here on.
pyLDAvis.enable_notebook()
warnings.simplefilter("ignore", category=FutureWarning)

In [340]:
# Prepare the pyLDAvis data for the fitted model, its corpus and its dictionary.
pdata = pyLDAvis.gensim.prepare(topic_model=model,
                                corpus=corpus,
                                dictionary=dictionary)

In [341]:
# Bare last expression: the notebook displays `pdata` as this cell's output.
pdata