In [None]:
#import
import pandas as pd
import numpy as np
from textblob import TextBlob, Word
import nltk
import seaborn as sns

#magic
%matplotlib inline

# Get data from reddit

In [None]:
#Authentication: credentials are read from environment variables so that real
#secrets never have to be typed into (or committed with) the notebook. The
#redacted placeholders remain as fallbacks when a variable is not set.
import os

import praw

rdt = praw.Reddit(
    client_id = os.environ.get("REDDIT_CLIENT_ID", "xxxx"),
    client_secret = os.environ.get("REDDIT_CLIENT_SECRET", "xxx"),
    password = os.environ.get("REDDIT_PASSWORD", "xxx"),
    username = os.environ.get("REDDIT_USERNAME", "xxx"),
    user_agent = os.environ.get("REDDIT_USER_AGENT", "xxx"),
)

In [None]:
#grab data from reddit
#One list per field; every list must receive exactly one entry per submission
#so the columns line up when the frame is built.
posts = list()
source = list()
text = list()
author = list()
crossposts = list()
time = list()
sub_r = rdt.subreddit("conspiracy")
#The two numbers bound the query window (presumably epoch seconds - confirm).
for sub in sub_r.submissions(1514764800, 1517875200):
    if sub is None:
        #Keep the row as an all-missing placeholder. Previously only three of
        #the six lists were extended here, leaving them with unequal lengths
        #and breaking the DataFrame construction below.
        for field in (posts, source, text, author, crossposts, time):
            field.append(None)
        continue
    posts.append(sub.title)
    source.append(sub)
    text.append(sub.selftext)
    author.append(sub.author)
    crossposts.append(sub.num_crossposts)
    time.append(sub.created)

#Build from a dict of columns instead of stacking everything into a single
#NumPy array, which forces one common dtype onto all columns.
conspiracy_reddit = pd.DataFrame({"time": time, "posts": posts, "text": text,
                                  "author": author, "source": source},
                                 columns = ["time", "posts", "text", "author", "source"])

In [None]:
#convert the raw timestamps to datetimes (the index itself is left unchanged)
#Reddit's `created` value is Unix time in seconds, so the unit must be "s";
#with "ms" every post collapses into January 1970.
conspiracy_reddit["time"] = pd.to_datetime(conspiracy_reddit["time"], unit = "s")


In [None]:
#Persist the scraped frame as tab-separated, UTF-8 encoded text.
conspiracy_reddit.to_csv(
    "redditcons",
    sep='\t',
    encoding='utf-8',
)

# Analyze Reddit

In [None]:
#grab csv as dataframe
df = pd.read_csv("redditconspiracy/redditcons", sep = "\t", index_col = "time")
#"Unnamed: 0" is the unnamed index column present in the saved file.
df = df.drop("Unnamed: 0", axis = 1)
#Empty self-texts are read back as NaN; blank them first so astype(str) does
#not turn them into the literal word "nan", which would pollute the tokens.
df["text"] = df["text"].fillna("").astype(str)

In [None]:
#Clean up the posts text
from nltk.corpus import stopwords
import re

#Stopword vocabulary: NLTK's English list plus a few extra words.
stop = stopwords.words("english")
stop2 = ["get", "r", "like", "us"]
#Combined once into a set; previously set(stop) was rebuilt for every word.
_stop_all = set(stop).union(stop2)

def stopwords(x):
    """Lower-case ``x``, replace non-letters with spaces and drop stopwords.

    Note: this function reuses the name of the imported NLTK ``stopwords``
    corpus, so the import must be re-run before ``stopwords.words`` is
    called again.

    Parameters
    ----------
    x : str
        Raw text.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    #Raw string so that \s is a regex class, not an invalid string escape.
    x = re.sub(r"[^a-z\s]", " ", x.lower())
    return " ".join(w for w in x.split() if w not in _stop_all)

df["posts_cleaned"] = df["posts"].apply(stopwords)
df["text_cleaned"] = df["text"].apply(stopwords)

df.head()

In [None]:
#Tokenize + tag the posts
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag, pos_tag_sents
from textblob import TextBlob

#Sentence-level and word-level tokens for the titles ("posts") and bodies ("text").
df["posts_stokenized"] = df["posts"].apply(sent_tokenize)
df["text_stokenized"] = df["text"].apply(sent_tokenize)
df["posts_wtokenized"] = df["posts"].apply(word_tokenize)
df["text_wtokenized"] = df["text"].apply(word_tokenize)
#Tag the token lists computed just above instead of tokenizing every document
#a second time; the tagger input is identical.
df["posts_tagged"] = pos_tag_sents(df["posts_wtokenized"].tolist())
df["text_tagged"] = pos_tag_sents(df["text_wtokenized"].tolist())
#Noun phrases as extracted by TextBlob from each title.
df["posts_nouns"] = df["posts"].astype(str).apply(lambda x: TextBlob(x).noun_phrases)


In [None]:
from nltk import BigramCollocationFinder

#most common words from posts:
#Join the cleaned titles with a space. Joining on "" fused the last word of one
#title onto the first word of the next and produced words that never occurred.
words = pd.Series(" ".join(df["posts_cleaned"]).lower().split()).value_counts()[:1000]
#The index of the counts holds the words, most frequent first.
conspiracy_words = list(words.index)

In [None]:
#write that list to a file, one word per line
#The with-block closes the file on exit, so no explicit close() is needed.
with open("conspiracy1000words.txt", "w") as f:
    f.writelines(item + "\n" for item in conspiracy_words)

print("all done")

In [None]:
#get the bigrams
#Imported here because the only other import of CountVectorizer sits in a much
#later cell, which makes this cell fail on a freshly started kernel.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range = (2, 2), analyzer = "word")
sparse_matrix = vectorizer.fit_transform(df["posts_cleaned"])
#Column totals computed by the sparse matrix itself rather than by Python's
#built-in sum() adding the rows one at a time.
frequencies = np.asarray(sparse_matrix.sum(axis = 0)).ravel()
#Newer scikit-learn releases removed get_feature_names(); use its replacement
#when it exists and fall back to the old name otherwise.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
dfConspiracy = pd.DataFrame(frequencies, index = feature_names, columns = ["frequency"])
dfConspiracy = dfConspiracy.sort_values(by = ["frequency"], ascending = False)
dfConspiracy.head(100)

# Cable News Chyrons
Chyron files covering Nov. 2017 – Feb. 2018, sampled every 15 minutes, taken from the Television Archives.


In [None]:
#Collect every .tsv file in the chyron folder and stack them into one frame.
import glob

path = r"C:/Users/bpeake/dropbox/1_DataSci/data/Cheyron2018"
allFiles = glob.glob(path + "/*.tsv")
#The files are read without a header row; columns are named in a later cell.
list_ = [
    pd.read_csv(file, index_col = None, header = None, sep = "\t")
    for file in allFiles
]

df = pd.concat(list_)

In [None]:
#Name the five columns, index the frame by timestamp and lower-case the text.
df.columns = ["timestamp", "channel", "retweets", "show", "text"]
df = df.set_index("timestamp")
df["text"] = df["text"].str.lower()

In [None]:
#Create conspiracy corpus from r/conspiracy

#These are the files I want to read in
files = ["redditconspiracy/conspiracy1000words.txt", 
         "redditconspiracy/conspiracy100bigrams.txt",
         "redditconspiracy/conspiracy100trigrams.txt"]

#One list of lines per file, kept in the same order as `files`
conspiracies = []
for file in files:
    with open(file, "r") as f:
        lines = f.read().splitlines()
    conspiracies.append(lines)

#Flatten the second and third files (bigrams, trigrams) into a single list;
#the first file is loaded but does not feed into conspiracy_list.
conspiracy_items = [conspiracies[1], conspiracies[2]]
conspiracy_list = [stuff for thing in conspiracy_items for stuff in thing]

In [None]:
#Measure conspiracy against the dataframe, export to csv
#Build one alternation pattern from the phrases. Blank entries are skipped
#because an empty alternative would match every row.
conspiracy = "|".join(phrase for phrase in conspiracy_list if phrase)
df["conspiracy_talk"] = df["text"].str.contains(conspiracy)
#Saved with the .tsv suffix: the exploratory section reads
#"conspiracy_news.tsv", so the bare name "conspiracy_news" was never found.
df.to_csv("conspiracy_news.tsv", sep = "\t", encoding = "utf-8")

#map the boolean to numbers
df["conspiracy_talk"] = df["conspiracy_talk"].map({True: 1, False: 0})

In [None]:
#Split the chyron frame into one subset per channel code.
fox = df.loc[df["channel"] == "FOXNEWSW"]
bbc = df.loc[df["channel"] == "BBCNEWS"]
cnn = df.loc[df["channel"] == "CNNW"]
msnbc = df.loc[df["channel"] == "MSNBCW"]

In [None]:
#Print a labelled describe() summary of conspiracy_talk for each channel,
#with a " " line between consecutive channels.
channel_frames = [("FOX", fox), ("CNN", cnn), ("MSNBC", msnbc), ("BBC", bbc)]
for position, (label, frame) in enumerate(channel_frames):
    if position > 0:
        print(" ")
    print(label)
    print(frame.conspiracy_talk.describe())

In [None]:
#dependencies
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag, pos_tag_sents
from textblob import TextBlob

#create dataframe for news with conspiracy
#.copy() makes this an independent frame, so the column assignments below do
#not raise SettingWithCopyWarning or risk being applied to a temporary slice.
news_w_conspiracy = df[df.conspiracy_talk == True].copy()

#tagging conspiracy language
news_w_conspiracy["text_stokenized"] = news_w_conspiracy["text"].apply(sent_tokenize)
news_w_conspiracy["text_wtokenized"] = news_w_conspiracy["text"].apply(word_tokenize)
#Tag the tokens computed on the previous line instead of tokenizing twice.
news_w_conspiracy["text_tagged"] = pos_tag_sents(news_w_conspiracy["text_wtokenized"].tolist())
#Noun phrases as extracted by TextBlob from each chyron text.
news_w_conspiracy["posts_nouns"] = news_w_conspiracy["text"].astype(str).apply(lambda x: TextBlob(x).noun_phrases)


In [None]:
#dependencies
from nltk.corpus import stopwords
import re

# build stopwords function
stop = stopwords.words("english")
stop2 = ["get", "r", "like", "us"]
#Combined once into a set; previously set(stop) was rebuilt for every word.
_stop_all = set(stop).union(stop2)

def stopwords(x):
    """Lower-case ``x``, replace non-letters with spaces and drop stopwords.

    Note: this function reuses the name of the imported NLTK ``stopwords``
    corpus, so the import must be re-run before ``stopwords.words`` is
    called again.

    Parameters
    ----------
    x : str
        Raw text.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    #Raw string so that \s is a regex class, not an invalid string escape.
    x = re.sub(r"[^a-z\s]", " ", x.lower())
    return " ".join(w for w in x.split() if w not in _stop_all)

#clean text column
#assign() returns a new frame, which avoids writing into what may be a
#filtered slice of the full chyron frame (SettingWithCopyWarning).
news_w_conspiracy = news_w_conspiracy.assign(
    text_cleaned = news_w_conspiracy["text"].apply(stopwords)
)

In [None]:
#build conspiracy news bigrams/trigrams that simulate headlines
#dependencies
from sklearn.feature_extraction.text import CountVectorizer
from nltk import BigramCollocationFinder

def _ngram_frequencies(texts, n):
    """Count word n-grams of size ``n`` in ``texts``.

    Returns a one-column ("frequency") DataFrame indexed by n-gram and sorted
    from most to least frequent.
    """
    vectorizer = CountVectorizer(ngram_range = (n, n), analyzer = "word")
    sparse_matrix = vectorizer.fit_transform(texts)
    #Column totals computed by the sparse matrix itself rather than by
    #Python's built-in sum() adding the rows one at a time.
    frequencies = np.asarray(sparse_matrix.sum(axis = 0)).ravel()
    #Newer scikit-learn releases removed get_feature_names(); use its
    #replacement when it exists and fall back to the old name otherwise.
    if hasattr(vectorizer, "get_feature_names_out"):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    table = pd.DataFrame(frequencies, index = names, columns = ["frequency"])
    return table.sort_values(by = ["frequency"], ascending = False)


def _write_lines(path, items):
    """Write each item of ``items`` to ``path`` on its own line."""
    #The with-block closes the file; no separate close() call is required.
    with open(path, "w") as f:
        f.writelines(item + "\n" for item in items)


#NOTE(review): the file names say "100", but every n-gram in the index is
#written, not just the 100 most frequent - confirm which one is intended.

#bigrams
dfConspiracy = _ngram_frequencies(news_w_conspiracy["text_cleaned"], 2)
_write_lines("cons_news100bigrams.txt", dfConspiracy.index)

#trigrams
dfConspiracy3 = _ngram_frequencies(news_w_conspiracy["text_cleaned"], 3)
_write_lines("cons_news100trigrams.txt", dfConspiracy3.index)

# Twitter analysis
Tweets from the past three months of Donald Trump's Twitter account, taken from a previously built dataset.

In [None]:
#Load the tweet dataset (comma-separated) and index it by creation time.
df = pd.read_csv(
    "TrumpTweets112017.txt",
    sep = ",",
    encoding = "utf-8",
    index_col = "created_at",
)

In [None]:
#read in the conspiracy news dictionary
files = ["cons_news100bigrams.txt",
         "cons_news100trigrams.txt"]
#One list of lines per file, in the same order as `files`
conspiracy_news_list = []
for file in files:
    with open(file, "r") as f:
        conspiracy_news_list.append(f.read().splitlines())

#Flatten the per-file lists into a single list of phrases
conspiracy_news_items = [stuff for thing in conspiracy_news_list for stuff in thing]

#make the list comprehensible to pandas .str.contains()
conspiracy_news = "|".join(conspiracy_news_items)

#flag the tweets that contain any conspiracy-news phrase
#The original line had the drop() call fused onto the end of this assignment
#(a syntax error) and then dropped "id_str" a second time on the next line.
df["conspiracy_news"] = df["text"].str.contains(conspiracy_news)
#The exploratory cells read this flag under the name "conspiracy_reference",
#so it is stored under that name as well.
df["conspiracy_reference"] = df["conspiracy_news"]
df.drop("id_str", axis = 1, inplace = True)

In [None]:
#Create conspiracy dictionary

#These are the files I want to read in
files = ["redditconspiracy/conspiracy1000words.txt", 
         "redditconspiracy/conspiracy100bigrams.txt",
         "redditconspiracy/conspiracy100trigrams.txt"]

#Read each file into its own list of lines
conspiracies = []
for file in files:
    with open(file, "r") as f:
        lines = f.read().splitlines()
    conspiracies.append(lines)

#Merge the second and third files into one flat list; the first file is
#loaded but does not feed into the pattern.
conspiracy_items = [conspiracies[1], conspiracies[2]]
conspiracy_list = []
for thing in conspiracy_items:
    conspiracy_list.extend(thing)

#Alternation pattern for pandas .str.contains()
conspiracy = "|".join(conspiracy_list)

In [None]:
#Flag the tweets whose text matches the conspiracy pattern, then save as TSV.
df["conspiracy_talk"] = df["text"].str.contains(conspiracy)
df.to_csv(
    "trump_cons_twitter.tsv",
    sep = "\t",
    encoding = "utf-8",
)

# Exploratory Data Analysis

In [None]:
#Load the three saved tab-separated datasets, each indexed by its time column.
news = pd.read_csv(
    "conspiracy_news.tsv",
    sep = "\t",
    index_col = "timestamp",
    encoding = "utf-8",
)
trump = pd.read_csv(
    "trump_cons_twitter.tsv",
    sep = "\t",
    index_col = "created_at",
    encoding = "utf-8",
)
conspiracy = pd.read_csv(
    "redditconspiracy/RedditConspiracy2018.csv",
    sep = "\t",
    index_col = "time",
    encoding = "utf-8",
)

In [None]:
#relationship between favorites and retweets by conspiracy news
#drop() returns a new frame here; dropping in place on a filtered slice
#triggers SettingWithCopyWarning.
trump_consp_news = trump[trump.conspiracy_reference == True].drop("text", axis = 1)
trump_no_consp_news = trump[trump.conspiracy_reference == False]
#Titles make the two otherwise identical-looking scatter plots distinguishable.
trump_consp_news.plot(kind = "scatter", x = "retweet_count", y = "favorite_count",
                      alpha = 0.3, title = "Tweets with a conspiracy news reference")
trump_no_consp_news.plot(kind = "scatter", x = "retweet_count", y = "favorite_count",
                         alpha = 0.3, title = "Tweets without a conspiracy news reference")

In [None]:
#Scatter favorites against retweets, separately for tweets with and without
#conspiracy talk.
trump_consp_talk = trump[trump.conspiracy_talk == True]
trump_no_consp_talk = trump[trump.conspiracy_talk == False]
trump_consp_talk.plot.scatter(x = "retweet_count", y = "favorite_count", alpha = 0.3)
trump_no_consp_talk.plot.scatter(x = "retweet_count", y = "favorite_count", alpha = 0.3)

In [None]:
#Boxplots: retweet counts grouped by conspiracy_reference, then favorite
#counts grouped by conspiracy_talk.
trump.boxplot(
    column = "retweet_count",
    by = "conspiracy_reference",
)
trump.boxplot(
    column = "favorite_count",
    by = "conspiracy_talk",
)

In [None]:
#Histograms of retweet_count, one panel per conspiracy_talk value.
trump.hist(
    column = "retweet_count",
    by = "conspiracy_talk",
)

In [None]:
#Histograms of favorite_count, one panel per conspiracy_talk value.
trump.hist(
    column = "favorite_count",
    by = "conspiracy_talk",
)

In [None]:
#Histograms of retweet_count, one panel per conspiracy_reference value.
trump.hist(
    column = "retweet_count",
    by = "conspiracy_reference",
)

In [None]:
#Histograms of favorite_count, one panel per conspiracy_reference value.
trump.hist(
    column = "favorite_count",
    by = "conspiracy_reference",
)

In [None]:
#create cleaning function for topic modeling news sites
from gensim import corpora, models, similarities
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

#Lookup sets and lemmatizer shared by the cleaning function below.
#Note: `stopwords` is rebound from the NLTK corpus object to a plain set here.
stopwords = set(stopwords.words("english"))
punctuation = set(string.punctuation)
lemmatize = WordNetLemmatizer()

def cleaning(article):
    """Normalise one document for topic modelling.

    Steps, in order: lower-case and split on whitespace, discard tokens found
    in ``stopwords``, delete every punctuation character, then lemmatize each
    remaining token. Returns the tokens joined by single spaces.
    """
    kept_tokens = [token for token in article.lower().split() if token not in stopwords]
    without_punctuation = "".join(
        char for char in " ".join(kept_tokens) if char not in punctuation
    )
    lemmas = [lemmatize.lemmatize(token) for token in without_punctuation.split()]
    return " ".join(lemmas)

In [None]:
#Keep only the chyrons flagged as conspiracy talk, clean their text, and split
#each cleaned document into a list of tokens.
conspiracy_news = news.loc[news["conspiracy_talk"] == True]
words = conspiracy_news["text"].apply(cleaning)
word_list = [document.split() for document in words]
#Number of documents going into the topic model
len(word_list)

In [None]:
#Map every token in the news documents to an integer id, save the mapping to
#disk, and print its summary.
dictionary = corpora.Dictionary(documents = word_list)
dictionary.save("dictionary.dict")
print(dictionary)

In [None]:
#build corpus - doing manually with gensim more accurate than NLTK
#Each document becomes a bag-of-words list of (token id, count) pairs.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in word_list]
corpora.MmCorpus.serialize("corpus.mm", doc_term_matrix)

print(len(doc_term_matrix))
#Show one sample document. Index 100 is used when it exists; otherwise the
#last document is shown, so a small corpus no longer raises IndexError.
if doc_term_matrix:
    print(doc_term_matrix[min(100, len(doc_term_matrix) - 1)])

In [None]:
#create a way to monitor the passes by gensim
from time import time
import logging

#Send INFO-level log records to running.log, overwriting the file
#(filemode 'w').
logging.basicConfig(
    filename='running.log',
    filemode='w',
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [None]:
#LDA for topic model of news
import gensim
start = time()

#Fixed seed: LDA training is randomly initialised, so without it every run
#produces different topics and the results cannot be reproduced.
LDA_SEED = 42

#create the object with gensim
Lda = gensim.models.ldamodel.LdaModel

#training the LDA model
ldamodel = Lda(doc_term_matrix, num_topics = 50, id2word = dictionary,
               passes = 5, random_state = LDA_SEED)

#Save the trained model and report the elapsed wall-clock time.
ldamodel.save("topic.model")
print('used: {:.2f}s'.format(time()-start))

In [None]:
#create interactive topic model using the LDAvis import from R
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

#Reload the saved model, corpus and dictionary from disk.
lda = gensim.models.LdaModel.load("topic.model")
c = gensim.corpora.MmCorpus("corpus.mm")
d = gensim.corpora.Dictionary.load("dictionary.dict")

#Prepare the visualisation; leaving it as the last expression renders it.
data = pyLDAvis.gensim.prepare(topic_model = lda, corpus = c, dictionary = d)
data

#ignore the warnings

In [None]:
#create topic model for trump conspiracy talk
#clean tweets
import re

def twitter_clean(x):
    """Lower-case ``x``, replace non-letters with spaces and drop stopwords.

    Relies on the module-level ``stopwords`` collection defined in the
    topic-modelling setup cell.

    Parameters
    ----------
    x : str
        Raw tweet text.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    #Raw string so that \s is a regex class, not an invalid string escape.
    x = re.sub(r"[^a-z\s]", " ", x.lower())
    #Membership is tested against `stopwords` directly; wrapping it in set()
    #for every single word only repeated the same work.
    return " ".join(w for w in x.split() if w not in stopwords)

#create dataframe
#.copy() makes this an independent frame, so adding a column below does not
#raise SettingWithCopyWarning.
trump_conspiracy = trump[trump.conspiracy_talk == True].copy()

#clean it
trump_conspiracy["cleaned_text"] = trump_conspiracy["text"].apply(twitter_clean)

In [None]:
#trump model
#Split each cleaned tweet into its tokens.
words = trump_conspiracy["cleaned_text"]
word_list = [tweet.split() for tweet in words]
#Number of tweets going into the topic model
len(word_list)

In [None]:
#Map every token in the tweets to an integer id, save the mapping to disk
#(same file name as the news dictionary), and print its summary.
dictionary = corpora.Dictionary(documents = word_list)
dictionary.save("dictionary.dict")
print(dictionary)

In [None]:
#build corpus
#Each tweet becomes a bag-of-words list of (token id, count) pairs.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in word_list]
corpora.MmCorpus.serialize("corpus.mm", doc_term_matrix)

print(len(doc_term_matrix))
#Show one sample document. Index 100 is used when it exists; otherwise the
#last document is shown, so a small set of tweets no longer raises IndexError.
if doc_term_matrix:
    print(doc_term_matrix[min(100, len(doc_term_matrix) - 1)])

In [None]:
import gensim
start = time()

#Fixed seed: LDA training is randomly initialised, so without it every run
#produces different topics and the results cannot be reproduced.
LDA_SEED = 42

#LDA model
#create the object with gensim
Lda = gensim.models.ldamodel.LdaModel

#training the LDA model
ldamodel = Lda(doc_term_matrix, num_topics = 50, id2word = dictionary,
               passes = 5, random_state = LDA_SEED)

#Save the trained model and report the elapsed wall-clock time.
ldamodel.save("topic.model")
print('used: {:.2f}s'.format(time()-start))

In [None]:
#create the interactive topic model
#Reload the saved model, corpus and dictionary from disk.
lda2 = gensim.models.LdaModel.load("topic.model")
c2 = gensim.corpora.MmCorpus("corpus.mm")
d2 = gensim.corpora.Dictionary.load("dictionary.dict")

#Prepare the visualisation; leaving it as the last expression renders it.
data2 = pyLDAvis.gensim.prepare(topic_model = lda2, corpus = c2, dictionary = d2)
data2