In [2]:
#imports
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline

In [3]:
#grab csv as dataframe
df = pd.read_csv("redditcons", sep = "\t", index_col = "time")
df = df.drop("Unnamed: 0", axis = 1)
df.head()

Unnamed: 0_level_0,posts,text,urls,author,crossposts
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1970-01-18 13:38:23.242,House panel clears release of Democrats’ rebut...,,https://www.washingtonpost.com/world/national-...,hurtsdonut_,0
1970-01-18 13:38:22.359,Old School r/conspiracy. What this sub thought...,,https://np.reddit.com/r/conspiracy/comments/5r...,naturalproducer,0
1970-01-18 13:38:21.695,Non-partisan take on FISA memo - Ron Paul,,http://www.unz.com/rpaul/what-the-fbifisa-memo...,stainless_hardened3,0
1970-01-18 13:38:21.036,Trump’s ‘State of the Union’ raised trade war ...,,http://www.cogwriter.com/news/prophecy/trumps-...,Jigglycheesepuff,0
1970-01-18 13:38:21.035,Looks Like Carter Page (inside the Trump Campa...,https://theconservativetreehouse.com/2018/02/0...,https://www.reddit.com/r/conspiracy/comments/7...,Patreut,0


In [4]:
df["text"] = df.text.astype(str)


In [5]:
#Clean up the posts text
from nltk.corpus import stopwords
import re

stop = stopwords.words("english")
stop2 = ["get", "r", "like", "us"]
def stopwords(x):
    x = re.sub("[^a-z\s]", " ", x.lower())
    x = [w for w in x.split() if w not in set(stop) and w not in stop2]
    return " ".join(x)

df["posts_cleaned"] = df["posts"].apply(stopwords)
df["text_cleaned"] = df["text"].apply(stopwords)

df.head()

Unnamed: 0_level_0,posts,text,urls,author,crossposts,posts_cleaned,text_cleaned
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1970-01-18 13:38:23.242,House panel clears release of Democrats’ rebut...,,https://www.washingtonpost.com/world/national-...,hurtsdonut_,0,house panel clears release democrats rebuttal ...,
1970-01-18 13:38:22.359,Old School r/conspiracy. What this sub thought...,,https://np.reddit.com/r/conspiracy/comments/5r...,naturalproducer,0,old school conspiracy sub thought false left r...,
1970-01-18 13:38:21.695,Non-partisan take on FISA memo - Ron Paul,,http://www.unz.com/rpaul/what-the-fbifisa-memo...,stainless_hardened3,0,non partisan take fisa memo ron paul,
1970-01-18 13:38:21.036,Trump’s ‘State of the Union’ raised trade war ...,,http://www.cogwriter.com/news/prophecy/trumps-...,Jigglycheesepuff,0,trump state union raised trade war concerns sh...,
1970-01-18 13:38:21.035,Looks Like Carter Page (inside the Trump Campa...,https://theconservativetreehouse.com/2018/02/0...,https://www.reddit.com/r/conspiracy/comments/7...,Patreut,0,looks carter page inside trump campaign accident,https theconservativetreehouse com march carte...


In [6]:
#Tokenize + tag the posts
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag, pos_tag_sents
from textblob import TextBlob

df["posts_stokenized"] = df["posts"].apply(sent_tokenize)
df["text_stokenized"] = df["text"].apply(sent_tokenize)
df["posts_wtokenized"] = df["posts"].apply(word_tokenize)
df["text_wtokenized"] =  df["text"].apply(word_tokenize)
df["posts_tagged"] = pos_tag_sents(df["posts"].apply(word_tokenize).tolist())
df["text_tagged"] =  pos_tag_sents(df["text"].apply(word_tokenize).tolist()) 
df["posts_nouns"] = df["posts"].astype(str).apply(lambda x: TextBlob(x).noun_phrases)

df.head()

Unnamed: 0_level_0,posts,text,urls,author,crossposts,posts_cleaned,text_cleaned,posts_stokenized,text_stokenized,posts_wtokenized,text_wtokenized,posts_tagged,text_tagged,posts_nouns
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1970-01-18 13:38:23.242,House panel clears release of Democrats’ rebut...,,https://www.washingtonpost.com/world/national-...,hurtsdonut_,0,house panel clears release democrats rebuttal ...,,[House panel clears release of Democrats’ rebu...,[nan],"[House, panel, clears, release, of, Democrats,...",[nan],"[(House, NNP), (panel, NN), (clears, VBZ), (re...","[(nan, NN)]","[house panel, ’ rebuttal, gop memo, trump]"
1970-01-18 13:38:22.359,Old School r/conspiracy. What this sub thought...,,https://np.reddit.com/r/conspiracy/comments/5r...,naturalproducer,0,old school conspiracy sub thought false left r...,,"[Old School r/conspiracy., What this sub thoug...",[nan],"[Old, School, r/conspiracy, ., What, this, sub...",[nan],"[(Old, NNP), (School, NNP), (r/conspiracy, NN)...","[(nan, NN)]","[old school r/conspiracy, false left-right par..."
1970-01-18 13:38:21.695,Non-partisan take on FISA memo - Ron Paul,,http://www.unz.com/rpaul/what-the-fbifisa-memo...,stainless_hardened3,0,non partisan take fisa memo ron paul,,[Non-partisan take on FISA memo - Ron Paul],[nan],"[Non-partisan, take, on, FISA, memo, -, Ron, P...",[nan],"[(Non-partisan, JJ), (take, NN), (on, IN), (FI...","[(nan, NN)]","[non-partisan, fisa, ron paul]"
1970-01-18 13:38:21.036,Trump’s ‘State of the Union’ raised trade war ...,,http://www.cogwriter.com/news/prophecy/trumps-...,Jigglycheesepuff,0,trump state union raised trade war concerns sh...,,[Trump’s ‘State of the Union’ raised trade war...,[nan],"[Trump, ’, s, ‘, State, of, the, Union, ’, rai...",[nan],"[(Trump, NNP), (’, NNP), (s, VBD), (‘, NNP), (...","[(nan, NN)]","[trump, ’ s ‘ state, union ’, trade war, 'worl..."
1970-01-18 13:38:21.035,Looks Like Carter Page (inside the Trump Campa...,https://theconservativetreehouse.com/2018/02/0...,https://www.reddit.com/r/conspiracy/comments/7...,Patreut,0,looks carter page inside trump campaign accident,https theconservativetreehouse com march carte...,[Looks Like Carter Page (inside the Trump Camp...,[https://theconservativetreehouse.com/2018/02/...,"[Looks, Like, Carter, Page, (, inside, the, Tr...","[https, :, //theconservativetreehouse.com/2018...","[(Looks, NNS), (Like, IN), (Carter, NNP), (Pag...","[(https, NN), (:, :), (//theconservativetreeho...","[looks, carter page, trump campaign, accident]"


In [7]:
from nltk import BigramCollocationFinder

#most common words from posts: 
words = pd.Series("".join(df["posts_cleaned"]).lower().split()).value_counts()[:1000]
conspiracy_words = []
for word in words.keys():
    conspiracy_words.append(word)

conspiracy_words

['trump',
 'conspiracy',
 'new',
 'people',
 'fbi',
 'world',
 'government',
 'one',
 'state',
 'news',
 'israel',
 'memo',
 'clinton',
 'says',
 'media',
 'vegas',
 'would',
 'war',
 'russia',
 'secret',
 'time',
 'fake',
 'year',
 'post',
 'russian',
 'cia',
 'false',
 'video',
 'going',
 'real',
 'could',
 'make',
 'even',
 'iran',
 'missile',
 'think',
 'last',
 'see',
 'theory',
 'american',
 'white',
 'u',
 'house',
 'know',
 'fisa',
 'control',
 'president',
 'years',
 'first',
 'twitter',
 'truth',
 'las',
 'hawaii',
 'right',
 'back',
 'israeli',
 'police',
 'really',
 'human',
 'intelligence',
 'report',
 'anti',
 'anyone',
 'jewish',
 'found',
 'old',
 'history',
 'alert',
 'military',
 'today',
 'reddit',
 'every',
 'earth',
 'deep',
 'also',
 'never',
 'used',
 'made',
 'else',
 'political',
 'hillary',
 'go',
 'man',
 'evidence',
 'actually',
 'want',
 'child',
 'america',
 'release',
 'states',
 'use',
 'claims',
 'public',
 'top',
 'around',
 'social',
 'obama',
 'inves

In [11]:
#write that list to a file
f = open("conspiracy1000words.txt", "w")
for item in conspiracy_words:
    f.write(item)
f.close()
print("all done")

all done


In [10]:
#get the bigrams
vectorizer = CountVectorizer(ngram_range = (2, 2), analyzer = "word")
sparse_matrix = vectorizer.fit_transform(df["posts_cleaned"])
frequencies = sum(sparse_matrix).toarray()[0]
dfConspiracy = pd.DataFrame(frequencies, index = vectorizer.get_feature_names(), columns = ["frequency"])
dfConspiracy.sort_values(by=["frequency"], ascending = False, inplace = True)
dfConspiracy.head(100)

Unnamed: 0,frequency
las vegas,84
fake news,65
deep state,54
donald trump,46
anyone else,41
vegas shooting,39
fisa memo,36
conspiracy theory,36
white house,36
fusion gps,35


In [11]:
dfConspiracy.index.values.tofile("conspiracy100bigrams.txt", "w")
print("all done")

all done


In [12]:
#and the trigrams
vectorizer = CountVectorizer(ngram_range = (3, 3), analyzer = "word")
sparse_matrix = vectorizer.fit_transform(df["posts_cleaned"])
frequencies = sum(sparse_matrix).toarray()[0]
dfConspiracy3 = pd.DataFrame(frequencies, index = vectorizer.get_feature_names(), columns = ["frequency"])
dfConspiracy3.sort_values(by=["frequency"], ascending = False, inplace = True)
dfConspiracy3.head(100)

Unnamed: 0,frequency
las vegas shooting,28
martin luther king,12
false missile alert,12
las vegas massacre,11
hawaii missile alert,10
fake news awards,10
new york times,9
new world order,8
year old palestinian,8
world trade center,7


In [1]:
dfConspiracy3.index.values.tofile("conspiracy100trigrams.txt", "w")
print("all done")

NameError: name 'dfConspiracy3' is not defined

In [15]:
#dataframe to csv for eda
df.to_csv("RedditConspiracy2018", sep = "\t", encoding = "utf-8")
print("done")

done
