In [2]:
import json
import nltk
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
## Check to see if data contains are repeats since I'm not sure
## this was taken care of
def check_dups(data):
    doc_ids = []
    for entry in data:
        doc_ids.append(entry['doc_id'])
    if len(doc_ids) != len(set(doc_ids)):
        print('there is a dup!')
    else:
        print('no dup!')

In [4]:
# Split the data into the different areas which we want to test on
def split_data(data):
    vaccine = []
    bigfoot = []
    flat = []
    pizza = []
    climate = []
    for entry in data:
        if entry['seeds'].__contains__('big.foot'):
            bigfoot.append(entry)
        if entry['seeds'].__contains__('vaccine'):
            vaccine.append(entry)
        if entry['seeds'].__contains__('flat.earth'):
            flat.append(entry)
        if entry['seeds'].__contains__('pizzagate'):
            pizza.append(entry)
        if entry['seeds'].__contains__('climate'):
            climate.append(entry)

    return bigfoot, vaccine, flat, pizza, climate

In [5]:
# Find out how many elements in the corpus are conspiratorial
def polarity_analysis(topic):
    tmp_true = 0
    tmp_false = 0

    for i in range(len(topic)):
        if topic[i]['subcorpus'] == 'conspiracy' \
            and topic[i]['conspiracy_representative']:
            tmp_true += 1
        else:
            tmp_false += 1
    return tmp_false, tmp_true

In [6]:
# does what it says on the can
def avg_text_length(topic):
    total = 0
    for i in range(len(topic)):
        total += topic[i]['txt_nwords']

    return float(total) / len(topic)

In [7]:
# does what it says on the can
def avg_sent_num(topic):
    total = 0
    for i in range(len(topic)):
        total += topic[i]['txt_nsentences']

    return float(total) / len(topic)

In [8]:
def avg_share_comment_react(topic):
    share = 0
    comment = 0
    react = 0
    for i in range(len(topic)):
        share += topic[i]['FB_shares']
        comment += topic[i]['FB_comments']
        react += topic[i]['FB_reactions']
    
    return (float(share) / len(topic), \
            float(comment) / len(topic), \
            float(react) / len(topic))

***Total Dataset Analysis***

In [9]:
with open('../data/LOCO_partition.json') as f:
        data = json.load(f)

In [10]:
check_dups(data)

no dup!


***Check labels available***

In [11]:
for key, _ in data[0].items():
    print(key)

doc_id
URL
website
seeds
subcorpus
title
txt
txt_nwords
txt_nsentences
txt_nparagraphs
topic_k100
topic_k200
topic_k300
mention_conspiracy
conspiracy_representative
cosine_similarity
FB_shares
FB_comments
FB_reactions


***Check sample data given***

In [12]:
for key, value in data[0].items():
    print(key, '=', value, '\n')

doc_id = C00007 

URL = https://awarenessact.com/scientist-shares-important-tips-on-wearing-gloves-are-you-using-them-correctly/ 

website = awarenessact.com 

seeds = climate.change; coronavirus 

subcorpus = conspiracy 

title = Scientist Shares Important Tips On Wearing Gloves – Are You Using Them Correctly? 

txt = While a lot of people are trying to do their best to remain clean and germ-free during this pandemic, if you’re using gloves please make sure you’re using them properly. When it comes to wearing PPE (personal protective equipment) if you’re wearing or using them wrong, they won’t be protecting you.

I recently came across a thread of posts on Twitter by Dr. Jacquelyn Gill that really got me thinking about this big time. Dr. Gill for those who might not be aware is an associate professor at the University of Maine’s Climate Change Institute. She began this thread noting that when she goes on her weekly grocery runs she sees just how many people are using gloved improper

In [13]:
bigfoot, vaccine, flat, pizza, climate = split_data(data)

In [14]:
for entry in bigfoot:
    if entry['doc_id'] == 'C000bf':
        print(entry)

{'doc_id': 'C000bf', 'URL': 'https://thedailyconspiracy.com/2018/07/12/targeted-individuals-cant-fight-back/', 'website': 'thedailyconspiracy.com', 'seeds': 'big.foot; jfk.assassination', 'date': '2018-07-12', 'subcorpus': 'conspiracy', 'title': 'Targeted Individuals Can’t Fight Back – The Daily Conspiracy', 'txt': 'Did you know that an estimated ten thousand people are reporting unusual assaults on their minds that they claim break down their personalities and undermine health and well-being? These “targeted individuals” or TIs believe that unseen, powerful forces are monitoring, manipulating, and torturing them.\r\n\r\nAlmost all TIs report hearing voices that no one else can detect. They say their phones and other electronic devices are delivering unwanted and very disturbing, repetitive messages like “Your mother is coming over to kill you. Go to the kitchen and get a knife. Now.” Wounds that appear on the hands or other body parts are thought to be from illicit microchip implantat

In [39]:
print(len(bigfoot),len(vaccine),len(flat),len(pizza),len(climate))

2727 7104 2251 1371 3055


In [41]:
check_dups(bigfoot)
check_dups(vaccine)
check_dups(flat)
check_dups(pizza)
check_dups(climate)

no dup!
no dup!
no dup!
no dup!
no dup!


***Bigfoot Analysis***

In [70]:
print(polarity_analysis(bigfoot))
print(avg_text_length(bigfoot))
print(avg_sent_num(bigfoot))
print(avg_share_comment_react(bigfoot))

(2586, 141)
971.6501650165017
48.893656032269895
(300.66336633663366, 260.4198753208654, 987.4965163182985)


***Vaccine Analysis***

In [10]:
print(polarity_analysis(vaccine))
print(avg_text_length(vaccine))
print(avg_sent_num(vaccine))

(6826, 278)
879.3942849099099
41.36176801801802


***Flat Earth Analysis***

In [11]:
print(polarity_analysis(flat))
print(avg_text_length(flat))
print(avg_sent_num(flat))

(2122, 129)
945.6934695690804
47.486894713460686


***Pizzagate Analysis***

In [12]:
print(polarity_analysis(pizza))
print(avg_text_length(pizza))
print(avg_sent_num(pizza))

(1298, 73)
1025.6688548504742
49.797228300510575


***Climate Analysis***

In [13]:
print(polarity_analysis(climate))
print(avg_text_length(climate))
print(avg_sent_num(climate))

(2955, 100)
899.2124386252045
40.50605564648118


In [14]:
vocab = []
for i in range(len(climate)):
    vocab.append(climate[i]['txt'])

In [15]:
tfIdfVectorizer=TfidfVectorizer(use_idf=True)
tfIdf = tfIdfVectorizer.fit_transform(vocab)
df = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
print(df.head(25))

            TF-IDF
gloves    0.436004
gill      0.392139
you       0.386571
re        0.205685
things    0.202407
dr        0.199021
wearing   0.169770
to        0.141854
ppe       0.127311
properly  0.118707
if        0.117936
touching  0.110451
thread    0.105282
doing     0.094899
using     0.089076
use       0.086781
of        0.083498
she       0.082366
them      0.080203
shouldn   0.077382
and       0.075222
aware     0.069162
people    0.067272
the       0.066668
gloved    0.066053




In [16]:
lsa = TruncatedSVD(algorithm='arpack').fit(tfIdf)

In [19]:
print(get_model_topics(lsa, tfIdf, lsa_topics))

NameError: name 'lsa_topics' is not defined

In [18]:
def get_model_topics(model, vectorizer, topics, n_top_words=20):
    word_dict = {}
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        word_dict[topics[topic_idx]] = top_features

    return pd.DataFrame(word_dict)