In [1]:
import ujson as json
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_json("ext_act_lang_filtered_pre_proc.ndjson", lines=True)

In [3]:
test = df.head(500)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

Goals: 
1. Pull the top n term freqs for overall, posts, and comments
2. Pull the top hashtags for each of those

Steps:
1. vectorize
2. pull feature names & counts for overall, posts and comments
    * this requires filtering the dfs
    * likely to be additional functions
3. check
    * for those that are hashtags -> get those counts
        * dict comp

In [5]:
# function to override the pre-proc that occurs within the vectorizer
# just returns the original string -> because I already had it clean
def dummy_func(x):
    """Identity pre-processor: hand the input back untouched.

    Passed to CountVectorizer as ``preprocessor`` so the vectorizer's own
    pre-processing is skipped (the text was already cleaned upstream).
    """
    return x

In [6]:
def cust_tokenizer(x):
    """Split x on runs of whitespace and return the resulting list of tokens.

    Used as the CountVectorizer ``tokenizer`` so tokens are kept exactly as
    written (including a leading "#").
    """
    tokens = x.split()
    return tokens

In [41]:
# lightweight function to grep for hashtags
def hashtag_search(x):
    """Return every hashtag found in x, in order of appearance.

    A hashtag is "#" followed by one or more word characters (letters, digits,
    underscore), so "#TRUMP2020" is returned whole instead of being cut off at
    the first digit.

    Parameters
    ----------
    x : str
        Text to scan. Anything that is not a string (e.g. None or NaN from a
        missing cell) is treated as containing no hashtags.

    Returns
    -------
    list of str
        The matches, possibly empty. An empty list is not a null value to
        pandas, so filter the resulting column on list length rather than
        with notna().
    """
    if not isinstance(x, str):
        return []

    return re.findall(r"#\w+", x)

In [7]:
# function to take a given df and an n 
# returns the term freqs for that df sorted
def vect_tf(df, n):
    """Count 1- to 3-gram terms in df["bo"] and return the most frequent ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose "bo" column holds the text. The text is used as-is
        (dummy_func) and split on whitespace (cust_tokenizer) because the
        vectorizer's built-in pre-processing and tokenizing are too aggressive.
    n : int
        How many entries to keep in each table (applied as the slice ``[0:n]``).

    Returns
    -------
    list
        ``[top_n_freqs, top_n_hashtag_freqs]`` - two dicts mapping term ->
        count, ordered from most to least frequent (ties keep vocabulary
        order). The second dict is restricted to terms containing "#".
        Both dicts are empty when df has no rows.
    """
    # nothing to fit: CountVectorizer would raise on an empty corpus, so an
    # empty slice (e.g. a file without comments) yields empty tables instead
    if len(df) == 0:
        return [{}, {}]

    # initializing the vectorizer
    # min doc freq of 1 keeps every term, to avoid being too lossy
    vectorizer = CountVectorizer(min_df=1, preprocessor=dummy_func, tokenizer=cust_tokenizer, ngram_range=(1,3))

    # applying the vectorizer
    tf = vectorizer.fit_transform(df["bo"])

    # pulling the terms; get_feature_names() was removed in scikit-learn 1.2,
    # so prefer its replacement and fall back only on old installations
    if hasattr(vectorizer, "get_feature_names_out"):
        tf_feature_names = vectorizer.get_feature_names_out()
    else:
        tf_feature_names = vectorizer.get_feature_names()

    # pulling the term counts (column sums of the document-term matrix)
    term_counts = np.asarray(tf.sum(axis=0))[0]

    # vocabulary indices from most to least frequent; the stable sort keeps
    # equal counts in vocabulary order, matching sorted(..., reverse=True)
    ranked = np.argsort(-term_counts, kind="stable")

    top_n_freqs = {tf_feature_names[i]: term_counts[i] for i in ranked[0:n]}

    # same ranking, restricted to the terms that contain a hashtag
    has_hashtag = np.fromiter(("#" in name for name in tf_feature_names),
                              dtype=bool, count=len(tf_feature_names))
    hashtag_ranked = ranked[has_hashtag[ranked]]

    top_n_hashtag_freqs = {tf_feature_names[i]: term_counts[i] for i in hashtag_ranked[0:n]}

    return [top_n_freqs, top_n_hashtag_freqs]

In [8]:
# function to take given df, term_dict, and n -> returns overview of terms
def vect_overview(df, term_dict, n):
    """Fill term_dict with top-n term/hashtag tables for all rows, posts and comments.

    For each section ("overall", "posts", "comments") vect_tf is run on the
    matching rows of df (all rows, t == "p", t == "c") and its two results are
    stored under term_dict[section]. Entries are written with setdefault, so
    keys that already exist in term_dict are left as they are.

    Returns the same term_dict object that was passed in.
    """
    # (section name, key for the hashtag table, value of "t" to keep - None keeps all rows)
    # NOTE: "comments_hashtag" has no trailing "s", unlike the other two keys
    layout = (
        ("overall", "overall_hashtags", None),
        ("posts", "posts_hashtags", "p"),
        ("comments", "comments_hashtag", "c"),
    )

    for section, hashtag_key, row_type in layout:
        # filter lazily so only one filtered copy of the frame is alive at a time
        subset = df if row_type is None else df[df["t"]==row_type]
        tables = vect_tf(subset, n)

        bucket = term_dict.setdefault(section, {})
        bucket.setdefault(section + "_terms", tables[0])
        bucket.setdefault(hashtag_key, tables[1])

    return term_dict

In [9]:
# current state -> trying to use the modification of a list as the return type
# ~ unsure if it's actually working
# expected is a triple nested dict {file:{overall:{overall_terms:{},overall_hashtags:{}}, posts:{...}, comments:{...}}}
#

In [10]:
# the ndjson inputs to summarise, one per file
file_list = [
    "min_act_lang_filtered_pre_proc.ndjson",
    "mod_act_lang_filtered_pre_proc.ndjson",
    "ver_act_lang_filtered_pre_proc.ndjson",
    "ext_act_lang_filtered_pre_proc.ndjson",
]

In [11]:
%%time

# per-file overview tables, keyed by file name
term_dict = {}

for file in file_list:

    # load the ndjson file (one JSON record per line)
    df = pd.read_json(file, lines=True)

    # create the (empty) entry for this file and let vect_overview fill it
    # with the top-50 tables; the filled dict is stored back under the file name
    term_dict[file] = vect_overview(df, term_dict.setdefault(file, {}), 50)

CPU times: total: 2h 19min 48s
Wall time: 2h 20min 2s


In [27]:
type(term_dict["min_act_lang_filtered_pre_proc.ndjson"]["overall"]["overall_terms"]["parler"])

numpy.int64

In [63]:
# converting the type from a numpy int to a regular int so we can serialize it into json
# kinda messy though
# walk file -> section -> table and overwrite each count with a plain int
for file, sections in term_dict.items():
    for tables in sections.values():
        for counts in tables.values():
            for term in counts:
                # only the value is replaced, so the dict keeps its size and order
                counts[term] = int(counts[term])
                

In [64]:
term_dict

{'min_act_lang_filtered_pre_proc.ndjson': {'overall': {'overall_terms': {'parler': 2616199,
    'everyone': 2425771,
    'looking': 2287494,
    'forward': 2236112,
    'joined': 2211886,
    'looking forward': 2194244,
    'meeting': 2193320,
    'joined parler': 2181246,
    'forward meeting': 2175400,
    'parler looking': 2175205,
    'looking forward meeting': 2175073,
    'parler looking forward': 2174773,
    'meeting everyone': 2174491,
    'forward meeting everyone': 2174427,
    'joined parler looking': 2174344,
    'trump': 1592144,
    'people': 1526405,
    'like': 1213371,
    'get': 1188364,
    'know': 1119164,
    'would': 1114237,
    'u': 1074311,
    'need': 1035396,
    'one': 1002628,
    'president': 901492,
    'time': 846073,
    'go': 756742,
    'going': 725249,
    'election': 717252,
    'right': 714286,
    'biden': 697858,
    'think': 693403,
    'want': 677818,
    'see': 670639,
    'good': 645582,
    'vote': 636927,
    'god': 631022,
    'country': 

In [70]:
# serializing the term_dict

# serialize before opening the file: opening with 'w' truncates it, so if
# dumps() fails (e.g. on a value it cannot encode) no empty file is left behind
json_obj = json.dumps(term_dict)

with open('top50terms#s.json', 'w') as outfile:
    outfile.write(json_obj)

In [67]:
term_dict

{'min_act_lang_filtered_pre_proc.ndjson': {'overall': {'overall_terms': {'parler': 2616199,
    'everyone': 2425771,
    'looking': 2287494,
    'forward': 2236112,
    'joined': 2211886,
    'looking forward': 2194244,
    'meeting': 2193320,
    'joined parler': 2181246,
    'forward meeting': 2175400,
    'parler looking': 2175205,
    'looking forward meeting': 2175073,
    'parler looking forward': 2174773,
    'meeting everyone': 2174491,
    'forward meeting everyone': 2174427,
    'joined parler looking': 2174344,
    'trump': 1592144,
    'people': 1526405,
    'like': 1213371,
    'get': 1188364,
    'know': 1119164,
    'would': 1114237,
    'u': 1074311,
    'need': 1035396,
    'one': 1002628,
    'president': 901492,
    'time': 846073,
    'go': 756742,
    'going': 725249,
    'election': 717252,
    'right': 714286,
    'biden': 697858,
    'think': 693403,
    'want': 677818,
    'see': 670639,
    'good': 645582,
    'vote': 636927,
    'god': 631022,
    'country': 

In [114]:
# running through the files and returning a top-level dictionary

In [123]:
df = pd.read_json(file, lines=True)

In [125]:
%%time
# top-50 term and hashtag frequencies for posts only (rows where t == "p")

# custom pre-processor and tokenizer so the text is counted exactly as written;
# min_df=1 keeps every term and no max_df is set, for fear of losing things
vectorizer = CountVectorizer(
    min_df=1,
    preprocessor=dummy_func,
    tokenizer=cust_tokenizer,
    ngram_range=(1, 3),
)

# document-term matrix for the posts
tf = vectorizer.fit_transform(df.loc[df["t"] == "p", "bo"])

# the terms, in vocabulary order
tf_feature_names = vectorizer.get_feature_names()

# total count of each term across all posts
term_counts = np.asarray(tf.sum(axis=0)).ravel()

# term -> count
term_freqs = {term: count for term, count in zip(tf_feature_names, term_counts)}

# term -> count, restricted to terms that contain a hashtag
hashtag_freqs = {term: count for term, count in term_freqs.items() if "#" in term}

# top 50 terms overall: sorted() returns a list of (term, count) tuples,
# which is sliced before being turned back into a dict
overall = dict(sorted(term_freqs.items(), key=lambda pair: pair[1], reverse=True)[:50])

# finding the top 50 terms that are hashtags
hashtags = dict(sorted(hashtag_freqs.items(), key=lambda pair: pair[1], reverse=True)[:50])

CPU times: total: 1min 49s
Wall time: 1min 49s


In [137]:
%%time
term_list = vect_tf(df, 50)

CPU times: total: 2min 11s
Wall time: 2min 11s


In [138]:
term_list

[{'trump': 178037,
  'news': 90784,
  'biden': 71472,
  'say': 57241,
  'democrat': 55862,
  'u': 53020,
  'president': 52380,
  'video': 50902,
  'people': 48909,
  'right': 47464,
  'new': 45316,
  'one': 44454,
  'time': 44155,
  '#news': 42017,
  'state': 41604,
  'fox': 41578,
  '#justice': 41486,
  '#freedom': 41467,
  '#news #justice': 41350,
  'american': 41139,
  'get': 39088,
  'like': 37970,
  'report': 37333,
  'fox news': 37039,
  'election': 36543,
  'would': 36143,
  '#sovereignty': 35442,
  '#freedom #sovereignty': 35441,
  'god': 35113,
  'coronavirus': 33785,
  'know': 33226,
  'joe': 33008,
  'via': 31775,
  'america': 31174,
  'see': 30543,
  'want': 29404,
  'vote': 28370,
  'go': 27096,
  'twitter': 26739,
  'president trump': 26508,
  'day': 26388,
  'house': 26181,
  'need': 25834,
  'medium': 25365,
  'call': 24742,
  'year': 24587,
  'china': 24217,
  'police': 23978,
  '#thegreatawakening': 23897,
  '#justice #thegreatawakening': 23579},
 {'#news': 42017,
  '

In [11]:
# top-50 term frequencies over every row of df (posts and comments together)

# default CountVectorizer pre-processing and tokenizing;
# min_df=1 keeps every term and no max_df is set, for fear of losing things
vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 3))

# document-term matrix for the whole "bo" column
tf = vectorizer.fit_transform(df["bo"])

# the terms, in vocabulary order
tf_feature_names = vectorizer.get_feature_names()

# total count of each term across all rows
term_counts = np.asarray(tf.sum(axis=0)).ravel()

# term -> count
term_freqs = {term: count for term, count in zip(tf_feature_names, term_counts)}

# top 50 terms: sorted() returns a list of (term, count) tuples,
# which is sliced before being turned back into a dict
dict(sorted(term_freqs.items(), key=lambda pair: pair[1], reverse=True)[:50])

CPU times: total: 1min 43s
Wall time: 1min 43s


In [110]:
# this is the test to check if I'm competent or not

{'00': 13,
 '00 anyway': 1,
 '00 anyway flushed': 1,
 '00 buckshot': 4,
 '00 buckshot antifaus': 1,
 '00 buckshot messy': 1,
 '00 buckshot recommended': 1,
 '00 buckshot slug': 1,
 '00 est': 1,
 '00 est fucking': 1,
 '00 got': 1,
 '00 got get': 1,
 '00 major': 1,
 '00 major presidential': 1,
 '00 morning': 1,
 '00 morning goonna': 1,
 '00 pm': 1,
 '00 pm wall': 1,
 '00 question': 1,
 '00 question never': 1,
 '00 stop': 1,
 '00 stop steal': 1,
 '00 tariff': 1,
 '00 tariff tariff': 1,
 '000': 35,
 '000 000': 1,
 '000 000 people': 1,
 '000 1000': 1,
 '000 ballot': 1,
 '000 ballot brought': 1,
 '000 black': 1,
 '000 black black': 1,
 '000 buy': 1,
 '000 buy photo': 1,
 '000 cancer': 2,
 '000 cancer research': 2,
 '000 coincidentally': 1,
 '000 coincidentally buried': 1,
 '000 coronavirus': 2,
 '000 coronavirus 700': 1,
 '000 coronavirus death': 1,
 '000 freebreathers': 1,
 '000 guess': 1,
 '000 guess one': 1,
 '000 infected': 1,
 '000 infected coronavirus': 1,
 '000 inhab': 1,
 '000 inhab 

In [100]:
vectorizer = CountVectorizer(min_df=1, preprocessor=dummy_func, tokenizer=cust_tokenizer, ngram_range=(1,3))

In [101]:
test_tf = vectorizer.fit_transform(["#MAGA #TRUMP2020 Let's vote out these bloodsucking demoncrats!"])

In [102]:
test_counts = np.asarray(test_tf.sum(axis=0))[0]

In [103]:
test_names = vectorizer.get_feature_names()

In [104]:
# pair each term of the test sentence with its own count
# (test_counts - not term_counts, which belongs to the corpus cells)
test_freqs = dict(zip(test_names, test_counts))

In [112]:
# print every term in term_freqs that contains a "#"
# NOTE(review): term_freqs is whatever an earlier-run cell left in the kernel -
# confirm which corpus/vectorizer it came from before reading the output
for key in term_freqs:
    if "#" in key:
        print(key)

In [46]:
%%time
# top-50 term frequencies for comments only (rows where t == "c")

# default CountVectorizer pre-processing and tokenizing;
# min_df=1 keeps every term and no max_df is set, for fear of losing things
vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 3))

# document-term matrix for the comments
tf = vectorizer.fit_transform(df.loc[df["t"] == "c", "bo"])

# the terms, in vocabulary order
tf_feature_names = vectorizer.get_feature_names()

# total count of each term across all comments
term_counts = np.asarray(tf.sum(axis=0)).ravel()

# term -> count
term_freqs = {term: count for term, count in zip(tf_feature_names, term_counts)}

# top 50 terms: sorted() returns a list of (term, count) tuples,
# which is sliced before being turned back into a dict
dict(sorted(term_freqs.items(), key=lambda pair: pair[1], reverse=True)[:50])

CPU times: total: 16 s
Wall time: 16.1 s


{'like': 9329,
 'one': 9113,
 'know': 8920,
 'get': 8648,
 'would': 8411,
 'people': 8107,
 'trump': 6590,
 'good': 6589,
 'exactly': 6382,
 'right': 6244,
 'time': 6105,
 'think': 5487,
 'need': 5409,
 'amen': 5331,
 'see': 5311,
 'go': 5305,
 'god': 4909,
 'yes': 4903,
 'going': 4709,
 'want': 4660,
 'got': 4396,
 'thanks': 4203,
 'never': 4105,
 'lol': 3955,
 'thank': 3936,
 'year': 3876,
 'make': 3815,
 'cannot': 3755,
 'say': 3710,
 'great': 3670,
 'state': 3650,
 'back': 3523,
 'agree': 3505,
 'take': 3495,
 'many': 3445,
 'love': 3395,
 'thing': 3277,
 'way': 3246,
 'well': 3206,
 'look': 3131,
 'president': 3129,
 'democrat': 3100,
 'day': 3092,
 'could': 2910,
 'even': 2861,
 'vote': 2755,
 'said': 2744,
 'america': 2714,
 'much': 2627,
 'country': 2591}

In [None]:
# this finds hashtag term frequency
# sklearn removes the punctuation! -> that's the issue we're facing here :(
# https://stackoverflow.com/questions/39254134/how-to-preserve-punctuation-marks-in-scikit-learn-text-countvectorizer-or-tfidfv
# https://kavita-ganesan.com/how-to-use-countvectorizer/#.YiUnHOjMLD4

In [39]:
test = pd.read_json("ext_act_lang_filtered_pre_proc.ndjson", lines=True)

In [40]:
# regex pattern needed to find hashtags (#[a-zA-Z]+)

In [42]:
re.findall(r"(#[a-zA-Z]+)", "I hate these DEMONCRATS! #MAGA #TRUMP2020 We need to vote these suckers out!")

['#MAGA', '#TRUMP']

In [43]:
test["hashtags"] = test["bo"].apply(hashtag_search)

In [44]:
# NOTE(review): hashtag_search returns a list (empty when nothing matches), and
# an empty list is not NaN, so notna() keeps every row - see the [] rows in the
# output. To keep only rows that have hashtags, filter on list length instead,
# e.g. test["hashtags"].str.len() > 0
test[test["hashtags"].notna()]

Unnamed: 0,t,id,cd,c,u,un,dmn,bo,bo_lang,conf,hashtags
0,p,3619348b91524430882f2b887838a3e4,20201018,1602989910000,8b67993183a14587a001010058d089d2,chucknellis,noqreport.com,facebooks public policy manager global electio...,English,[0.8107684255000001],[]
1,p,1365051bfe6243599e6af1055b71c4a2,20201202,1606897009000,eeeb8dd25b7142b1bc69cbdbe1d8bb62,ThomasFox,rumble.com,usps driver drop shocking claim 200000 ballot ...,English,[0.6809220314000001],[]
2,p,d755c2ce09e74c9b9b34953aae839554,20190829,1567070618000,21031f424913456591d9a9aed4ff26c7,Cobrarick98,i.imgur.com,antifa touch trump supporter coming take infec...,English,[0.8768866658000001],[]
3,p,a0124e3f9afa4fe29d404e41e3ac9a5f,20201009,1602262587000,6e6d4f8e7479446f8f06d5d5c0fae9a3,AppleJax,image-cdn.parler.com,guilty charged election dirty democrat may pul...,English,[0.9819905758],[]
4,p,857c1349a1cc4bfba84cb75a9a34ec93,20201026,1603746735000,eeeb8dd25b7142b1bc69cbdbe1d8bb62,ThomasFox,deadline.com,60 minute trump walkoff soar 17m viewer watche...,English,[0.7728871703],[]
...,...,...,...,...,...,...,...,...,...,...,...
1266677,c,1f31dd86a29646598640c6ce5ca7d97c,20200830,1598759370000,e11890c2ffc348df9cb703651a3cc9c0,LibertyElaine,image-cdn.parler.com,libertyelaine pelosi rigged midterm #midtermsr...,English,[0.368270874],"[#midtermsrigged, #pelosicrimefamily, #pelosim..."
1266678,c,3a1e2b23206f4955bfc25a24acaeff90,20200523,1590226917000,8f30c4c4215249b583741639d4070a39,Clearskies,image-cdn.parler.com,going need bigger gun,English,[0.897129178],[]
1266679,c,dc3fa8b7c12c4e3c898af4b14c99cbf5,20200727,1595810868000,6e6d4f8e7479446f8f06d5d5c0fae9a3,AppleJax,image-cdn.parler.com,yeah know bullshit also racist blame everythin...,English,[0.8486205935000001],[]
1266680,c,1dc1abde35234e32a872bcf76bd6ab45,20200820,1597891420000,efaac1ece6a893e0fab48f558e3a5c57,BethocAeilflaed,image-cdn.parler.com,full wine thing keeping sane teaching online,English,[0.6988052726],[]
