In [58]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import csr_matrix, diags

# load data
rawData = pd.read_csv("truth_seeker.csv")

In [59]:
rawData.rename(columns={'Unnamed: 0': 'ind'}, inplace=True)

rawData.shape # (134198, 9)
rawData.columns # 'ind', 'author', 'statement', 'target', 'BinaryNumTarget',
    # 'manual_keywords', 'tweet', '5_label_majority_answer',
    # '3_label_majority_answer'
    
# Randomly sample 1% of the rows in rawData
smallData = rawData.sample(frac=0.01, random_state=42)
rawData = smallData


In [60]:
rawData.head()

Unnamed: 0,ind,author,statement,target,BinaryNumTarget,manual_keywords,tweet,5_label_majority_answer,3_label_majority_answer
122451,122451,Daniel Funke,At 17 years old Kyle (Rittenhouse) was perfect...,False,0.0,"Kyle,possess,rifle,legal",@fattycattie @robquinnpc @SammyTMcCarty @WCCO ...,Mostly Agree,Agree
14029,14029,Jill Terreri,"""I actually in 2005 ran on Medicare for all.""",True,1.0,"2005, ran on Medicare","never forget, Matt Santos ran on Medicare for ...",Mostly Agree,Agree
51449,51449,Miriam Valverde,COVID-19 vaccinations are a violation of the N...,False,0.0,"vaccine,Nuremberg code,viloation",@BilldeBlasio You are mandating an experimenta...,NO MAJORITY,Agree
17981,17981,Manuela Tobias,"""You know what Amazon paid in federal income t...",True,1.0,"Amazon, federal income taxes, zero",@SenSanders The working class is literally bei...,Mostly Agree,Agree
66284,66284,Julie Kliegman,"""Crimea became part of Ukraine only in 1954. C...",True,1.0,"Crimea, historically, Russia",@PeterTong9 @timand2037 @ClimateAudit But none...,Mostly Agree,Agree


In [61]:
def determine_outcome(row):
    if (row['target'] == True and row['3_label_majority_answer'] == 'Agree') or (row['target'] == False and row['3_label_majority_answer'] == 'Disagree'):
        return 'real'
    elif (row['target'] == True and row['3_label_majority_answer'] == 'Disagree') or (row['target'] == False and row['3_label_majority_answer'] == 'Agree'):
        return 'fake'
    else:
        return None

rawData['outcome'] = rawData.apply(determine_outcome, axis=1)
reshaped = rawData[rawData['outcome'].notnull()]
reshaped['outcome'].value_counts() # fake: 65213, real: 68985

outcome
fake    673
real    669
Name: count, dtype: int64

In [62]:
topics_list = [topic.split('nn') for topic in reshaped['tweet'].astype(str)]
topics_list = [[topic.replace('^.* -', '') for topic in topics] for topics in topics_list]
topics_unique = sorted(set([item for sublist in topics_list for item in sublist]))


In [63]:
# dummy matrix
topics_dummies = [[topic in topics for topic in topics_unique] for topics in topics_list]
topics_dummies = csr_matrix(topics_dummies)

In [64]:
topics_dummies.shape 

(1342, 1531)

In [65]:
# labelling for ease
texts = {index: text for index, text in zip(rawData['ind'], rawData['tweet'])}

In [67]:
# building corpus
corpus = list(texts.values())

# corpus cleaning
vectorizer = CountVectorizer(strip_accents='unicode', stop_words='english', lowercase=True, token_pattern=r'\b[a-zA-Z]{3,}\b')
dtm_slam = vectorizer.fit_transform(corpus)

In [68]:
# row names
dtm_slam.index = rawData['ind']

In [70]:
# dropping terms that occur in <3 documents
drop = dtm_slam.sum(axis=0).A1 < 3
dtm_slam = dtm_slam[:, ~drop]

dtm_slam.shape

(1342, 1895)

In [71]:
# converting to sparse matrix for memory/performance
dtm = csr_matrix(dtm_slam)

In [73]:
# normalizing (preferred)
dtm_norm = diags(1 / dtm.sum(axis=1).A1) @ dtm
dtm_norm.index = rawData['ind']

In [74]:
# summary statistics
print("DTM:")
print(f"Mean entry: {dtm_norm.mean()}")
print(f"Dimensions: {dtm_norm.shape}")

DTM:
Mean entry: 0.0005277044854881266
Dimensions: (1342, 1895)
