In [3]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from transformers import pipeline

# Notebook-wide display settings.
# Let pandas show up to 255 characters of a column value before truncating it.
pd.set_option('display.max_colwidth', 255)
# Draw every matplotlib figure with the "dark_background" style sheet.
plt.style.use("dark_background")

In [6]:
# Load each raw dataset under ./data into its own DataFrame (one variable per file).
# All sources are CSV except qanon_archive, which is read from JSON.
anon1 = pd.read_csv('./data/anon1.csv')
az_gop = pd.read_csv('./data/AZGOP.csv')
biggs1 = pd.read_csv('./data/biggs.csv')
biggs2 = pd.read_csv('./data/biggs2.csv')
boebert = pd.read_csv('./data/boebert.csv')
brooks = pd.read_csv('./data/brooks.csv')
cpac1 = pd.read_csv('./data/cpac1.csv')
cpac2 = pd.read_csv('./data/cpac2.csv')
daines = pd.read_csv('./data/daines.csv')
gosar = pd.read_csv('./data/gosar.csv')
# hash_* frames: presumably one file per hashtag search — TODO confirm.
hash_8balls = pd.read_csv('./data/hash8balls.csv')
hash_1933 = pd.read_csv('./data/hash1933.csv')
hash_boogaloo = pd.read_csv('./data/hashboogaloo.csv')
hash_civilwar = pd.read_csv('./data/hashcivilwar.csv')
hash_frens = pd.read_csv('./data/hashfrens.csv')
hash_joebiggs = pd.read_csv('./data/hashjoebiggs.csv')
hash_oathkeepers = pd.read_csv('./data/hashoathkeepers.csv')
hash_patriot = pd.read_csv('./data/hashpatriot.csv')
hash_proudboys = pd.read_csv('./data/hashproudboys.csv')
hash_qanon = pd.read_csv('./data/hashQanon.csv')
hash_SS = pd.read_csv('./data/hashSS.csv')
hawley = pd.read_csv('./data/hawley.csv')
# NOTE(review): mtg1 is read from congress1.csv (and rosendale1 below from
# congress2.csv), unlike the other files whose names match their variable —
# confirm this mapping is correct.
mtg1 = pd.read_csv('./data/congress1.csv')
mtg2 = pd.read_csv('./data/mtg2.csv')
poli1 = pd.read_csv('./data/poli1.csv')
# poli2 lives in a subdirectory rather than directly under ./data.
poli2 = pd.read_csv('./data/1900_USpol_jan9/us_tweets.csv')
# Disabled: this file failed to load; the cause has not been investigated yet.
# jan20 = pd.read_csv('./data/jan20_archive/tweets_v4.csv') #not sure what's wrong here
jan6 = pd.read_csv('./data/tweets_2021-01-06.csv')
qanon_archive = pd.read_json('./data/qanon_archive/posts.json')
riot_tweets = pd.read_csv('./data/riot_tweets_2021-01-06.csv')
rosendale1 = pd.read_csv('./data/congress2.csv')
rosendale2 = pd.read_csv('./data/rosendale2.csv')
trump_tweets = pd.read_csv('./data/trump_tweets_01-08-2021.csv')


# The parler data is a mass of text files.
# Will decide later whether anything can be done with it.
# parler = pd.read_csv('./data/parler_archive/Parler/Parler/*')

In [7]:
def check_shape(dfs):
    """Print the ``shape`` of every frame in ``dfs``, one per line, in order.

    Parameters
    ----------
    dfs : iterable
        Objects exposing a ``.shape`` attribute (e.g. pandas DataFrames).

    Returns
    -------
    None
        The shapes are written to standard output only.
    """
    # The generator is lazy, so each shape is read and printed in turn.
    for dims in (frame.shape for frame in dfs):
        print(dims)

In [8]:
def count_rows(dfs):
    """Return the combined number of rows of every frame in ``dfs``.

    Parameters
    ----------
    dfs : iterable
        Objects exposing a ``.shape`` tuple (e.g. pandas DataFrames).

    Returns
    -------
    int
        Sum of ``shape[0]`` over all items; ``0`` when ``dfs`` is empty.
    """
    return sum(frame.shape[0] for frame in dfs)

In [9]:
# Frames included in the shape / row-count summaries.
# `hash_8balls` is the name bound by the load cell; the earlier `hash_eights`
# is not assigned in any visible cell and raises NameError on a fresh kernel.
# NOTE(review): `poli2` and `jan6` are loaded but not listed here — confirm
# that leaving them out is intended.
dfs = [az_gop, anon1, biggs1, biggs2, boebert, brooks, cpac1, cpac2, daines, gosar, 
       hash_1933, hash_8balls, hash_SS, hash_frens, hash_qanon, hash_boogaloo, 
       hash_civilwar, hash_joebiggs, hash_oathkeepers, hash_patriot, hash_proudboys, 
       hawley, mtg1, mtg2, poli1, qanon_archive, riot_tweets, rosendale1, 
       rosendale2, trump_tweets]

In [10]:
# Print the shape of every frame in `dfs`, in list order.
check_shape(dfs)

(2866, 15)
(1732, 5)
(3912, 5)
(319, 5)
(2925, 5)
(649, 5)
(3196, 16)
(857, 5)
(805, 5)
(1505, 5)
(250001, 5)
(27644, 5)
(8975, 5)
(250001, 5)
(250001, 5)
(250001, 5)
(250001, 5)
(4188, 5)
(28659, 5)
(250001, 5)
(250001, 5)
(1468, 5)
(46, 5)
(5197, 5)
(21059, 16)
(4953, 4)
(82309, 14)
(360, 5)
(63, 5)
(56571, 9)


In [11]:
# Total number of rows across all frames in `dfs`.
count_rows(dfs)

2010265

In [12]:
# Summarise riot_tweets: column names, non-null counts, dtypes and memory use.
riot_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82309 entries, 0 to 82308
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   tweet_id          82309 non-null  int64  
 1   text              82308 non-null  object 
 2   query             82309 non-null  object 
 3   user_id           82309 non-null  int64  
 4   user_name         2478 non-null   object 
 5   follower_count    82309 non-null  int64  
 6   user_tweet_count  82309 non-null  int64  
 7   likes             82309 non-null  int64  
 8   retweets          82309 non-null  int64  
 9   location_name     285 non-null    object 
 10  longitude         285 non-null    float64
 11  latitude          285 non-null    float64
 12  user_location     15890 non-null  object 
 13  date              82309 non-null  object 
dtypes: float64(2), int64(6), object(6)
memory usage: 8.8+ MB


In [13]:
# Peek at the first rows of the tweet text column.
riot_tweets['text'].head()

0    we can adjust by population to get crude excess death rates 2020 will also be the highest since 1940 even with measures taken to limit covid spread 2020 will still top 1951 the year of a major flu epidemic without all our efforts 2020 could have been ...
1    turning to labour or remainers like what would you do different while sat on one of the worst economic recessions amp death rates in the g7 its like driving a car off a cliff amp while youre falling offering the wheel to the guy in the passenger seat ...
2    numbers of deaths are affected by population size we can also consider death rates as a proportion of the population crude death rates are shown below all improvements in the first decade of this century will be reversed this year spanish flu did the ...
3                     here is a list of governors who preside over states with higher deathpermillion rates than fls ron desantis whom the media have anointed public enemy 1 1 murphy d 2 cuomo d 3 baker r 4 raimondo d 5 

In [14]:
# Draw 10 random tweets to eyeball the text. A fixed random_state (arbitrary
# value) makes the same rows come back on every Restart & Run All.
riot_tweets['text'].sample(10, random_state=42)

61622    im in the capitol im safe and my team and i are sheltering in place the president of the united states has incited a riot that has now stormed the capitol there are rioters roaming the halls of the capitol i saw them with my own eyes our country deser...
59591                                                                                                                                                                                                                   this is what the law and order president wanted
146      when people comment on slow vaccination rates someone always jumps in with its a marathon not a sprint maybe but when youre vaccinating the most atrisk first and the death aversion curve looks anything like the one below you should be sprinting out of...
66938         trends are nothing without you stay united and spread love unveil avpl trend common watsapp status release date and time yourself go to gif section gt type the keyword gt use the gif and spread 

In [28]:
# Rows whose text contains the substring "qanon"; na=False makes rows with
# missing text count as non-matches so the mask is strictly boolean.
riot_tweets.loc[
    riot_tweets['text'].str.contains("qanon", na=False)
]

Unnamed: 0,tweet_id,text,query,user_id,user_name,follower_count,user_tweet_count,likes,retweets,location_name,longitude,latitude,user_location,date


In [21]:
# Rows whose text contains both "civil" and "war".
# na=False treats missing text as a non-match so each mask is strictly boolean,
# matching the "qanon" filter; `text` has one null row per riot_tweets.info().
# NOTE(review): these are plain substring matches, so tweets such as
# "civil rights act ... warnock" also qualify. Switch to a phrase or
# word-boundary pattern if only "civil war" mentions are wanted.
riot_tweets[
    riot_tweets['text'].str.contains('civil', na=False)
    & riot_tweets['text'].str.contains('war', na=False)
]

Unnamed: 0,tweet_id,text,query,user_id,user_name,follower_count,user_tweet_count,likes,retweets,location_name,longitude,latitude,user_location,date
512,1346819122970185730,what happened last night isnt about democrats or republicans its about our country we now have a real chance to beat covid fight climate change make work pay and pass a new civil rights act congratulations to raphael warnock and jon ossoff,covid,596450900,,1302,44086,0,2038,,,,,2021-01-06
528,1346819116963774464,what happened last night isnt about democrats or republicans its about our country we now have a real chance to beat covid fight climate change make work pay and pass a new civil rights act congratulations to raphael warnock and jon ossoff,covid,1409133672,,1346,13579,0,2038,,,,,2021-01-06
715,1346819070016933889,what happened last night isnt about democrats or republicans its about our country we now have a real chance to beat covid fight climate change make work pay and pass a new civil rights act congratulations to raphael warnock and jon ossoff,covid,1314004250704277504,,59,3892,0,2038,,,,,2021-01-06
755,1346819062282776578,what happened last night isnt about democrats or republicans its about our country we now have a real chance to beat covid fight climate change make work pay and pass a new civil rights act congratulations to raphael warnock and jon ossoff,covid,330338176,,5754,64286,0,2038,,,,PA,2021-01-06
773,1346819058122039296,what happened last night isnt about democrats or republicans its about our country we now have a real chance to beat covid fight climate change make work pay and pass a new civil rights act congratulations to raphael warnock and jon ossoff,covid,15400260,,2461,27542,0,2038,,,,NY,2021-01-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63982,1346916014999666689,i had reps from twitter tell me that the president advocating for a civil war didnt violate their tos this year i dont care what fun widgets they put on his tweets their products made this completely frictionless just disgusting and embarrassing,president,16842165,alex bledsoe,3222,25725,0,186,,,,WI,2021-01-06
64210,1346916008003592197,i cannot be the only person watching this and worrying this agentprovocateur who is wearing the mask of sitting president inciting violence spewing racial divide and hatred is brewing the start of civil war part 2 americanbetrayal,president,939872966,,113,8606,0,0,,,,USA,2021-01-06
64609,1346915994338533385,since donald trump has turned on mike pence mike pence can call mitch mcconnell and invoke the 25th amendment do a written declaration to deem trump unable to discharge duties of a president after inciting this civil war they could remove him tonight ...,president,2807448258,,1556,41446,0,38,,,,,2021-01-06
78411,1346822075219107840,you wont go to local officials and protest when they lock down schools or pass mask mandates or shut down businesses or waste local money you are too busy for that but you are going to usher in a civil war,mask,159669793,,467,18259,1,0,,,,,2021-01-06


In [None]:
# Display the tweet text column of riot_tweets.
riot_tweets['text']

In [None]:
# NOTE(review): `df` is not assigned at notebook level in the visible cells
# (it only appears as a loop variable inside the helper functions), so this
# raises NameError on a fresh kernel — confirm which frame was intended.
df.columns

In [None]:
# Column dtypes of `df`.
# NOTE(review): `df` is not bound at notebook level in the visible cells —
# confirm which frame this should inspect.
df.dtypes

In [None]:
# Python type of the `created_at` column selection.
# NOTE(review): `df` is not bound at notebook level in the visible cells —
# confirm which frame (one with a `created_at` column) was intended.
type(df['created_at'])

In [None]:
# Python type of the `created_at` value at index label 0.
# NOTE(review): `df` is not bound at notebook level in the visible cells —
# confirm which frame (one with a `created_at` column) was intended.
type(df['created_at'][0])