In [3]:
import ujson as json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import fasttext
from pycountry import languages
import re

In [2]:
# function to help figure out the language
def lang_class(x):
    """Run the global fastText ``model`` on one piece of text.

    Newlines are replaced with spaces first, because fastText's ``predict``
    handles a single line at a time.

    Parameters
    ----------
    x : str
        Text to classify.

    Returns
    -------
    The value returned by ``model.predict`` for the single-line text,
    passed through unchanged.
    """
    single_line = " ".join(x.split("\n"))
    return model.predict(single_line)

In [3]:
# function to help retrieve the language name using the language code
def lang_from_code(x):
    """Translate a fastText label into a human-readable language name.

    Parameters
    ----------
    x : sequence
        Container whose first element is (or stringifies to) a fastText
        label such as ``"__label__en"``.

    Returns
    -------
    str
        * the pycountry language name when the code has 2 or 3 letters and
          pycountry knows it;
        * the bare code when pycountry has no entry for it, or when the code
          has any other length;
        * the stringified label itself when it does not start with
          ``__label__<letters>``.
    """
    label = str(x[0])

    found = re.match(r"__label__([a-zA-Z]+)", label)
    if found is None:
        # not a fastText label: hand the text back instead of failing on .group()
        return label
    code = found.group(1)

    # pick the pycountry lookup field from the code length; any other length
    # has no lookup, so the code itself is the best available answer
    field = {2: "alpha_2", 3: "alpha_3"}.get(len(code))
    if field is None:
        return code

    try:
        record = languages.get(**{field: code})
    except LookupError:
        # some pycountry versions raise on an unknown code instead of returning None
        record = None

    # record is None (or lacks .name) when the code is unknown -> fall back to the code
    return getattr(record, "name", code)

In [4]:
# function to get the counts for domains in series passed into it
# pass in the series and it'll iterate through the whole thing
def count_dmns(series) -> dict:
    """Tally how often each domain occurs in ``series``.

    An element that is a ``list`` contributes one count per entry it holds;
    any other element is counted once under its own value.

    Parameters
    ----------
    series : iterable
        Values to count (for example the ``dmn`` column of a DataFrame).

    Returns
    -------
    dict
        Maps each distinct value to its number of occurrences, with keys in
        first-seen order.
    """
    tally = {}

    def _bump(key):
        # start at 1 for a new key, otherwise add one to the running count
        tally[key] = tally.get(key, 0) + 1

    for entry in series:
        if type(entry) != list:
            _bump(entry)
            continue
        for domain in entry:
            _bump(domain)

    return tally

In [10]:
# keep only the rows that actually have a domain (non-NaN "dmn")
# NOTE(review): `df` is loaded by a cell further down the notebook, so this
# cell only works once that cell has been run
dmns_df = df[df["dmn"].notna()]

# domains on rows that are comments (t == "c")
dmns_df.loc[dmns_df["t"] == "c", "dmn"]

# domains on rows that are posts (t == "p")
dmns_df.loc[dmns_df["t"] == "p", "dmn"]

# domains on rows whose body ("bo") is missing
dmns_df.loc[dmns_df["bo"].isna(), "dmn"]

0          image-cdn.parler.com
3          image-cdn.parler.com
4          image-cdn.parler.com
6          image-cdn.parler.com
8          image-cdn.parler.com
                   ...         
2254418    image-cdn.parler.com
2254420    image-cdn.parler.com
2254421    image-cdn.parler.com
2254422    image-cdn.parler.com
2254424    image-cdn.parler.com
Name: dmn, Length: 855447, dtype: object

1984413                   wo.an
1984421    image-cdn.parler.com
1984423                youtu.be
1984438          www.google.com
1984444    image-cdn.parler.com
                   ...         
2254421    image-cdn.parler.com
2254422    image-cdn.parler.com
2254423    image-cdn.parler.com
2254424    image-cdn.parler.com
2254425    image-cdn.parler.com
Name: dmn, Length: 36581, dtype: object

#### Domain Analysis

Analysis to do with domains:
* top domains within category
* compare between comments and posts
* compare when body is empty

In [5]:
%%time
# load the newline-delimited JSON export into a DataFrame
df = pd.read_json("ext_act.ndjson", lines=True)

# subset of rows whose "dmn" value is present
dmns_df = df.dropna(subset=["dmn"])

CPU times: total: 11.8 s
Wall time: 11.8 s


In [49]:
# accumulator for the domain results: a dict of dicts that a later cell fills
# with one entry per category ("overall", "posts", "comments", "empty")
dmn_data = {}

In [52]:
# number of distinct domains counted (dmns_count is the dict built by count_dmns)
len(dmns_count)

16203

In [53]:
%%time
# number of domains to keep per category
top_n = 50


def _top_counts(counts: dict, n: int) -> dict:
    """Return the ``n`` highest-count entries of ``counts``, largest first."""
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked[:n])


# domain tallies: overall, per provenance, and for rows without a body
dmns_count = count_dmns(dmns_df["dmn"])
# provenance value is "post" (singular), matching the other cells that filter on prov
posts_dmns = count_dmns(dmns_df.loc[dmns_df["prov"] == "post", "dmn"])
comments_dmns = count_dmns(dmns_df.loc[dmns_df["prov"] == "comment", "dmn"])
empty_post_dmns = count_dmns(dmns_df.loc[dmns_df["bo"].isna(), "dmn"])

# plain assignment (not setdefault) so re-running the cell refreshes every entry
dmn_data["overall"] = _top_counts(dmns_count, top_n)
dmn_data["posts"] = _top_counts(posts_dmns, top_n)
dmn_data["comments"] = _top_counts(comments_dmns, top_n)
dmn_data["empty"] = _top_counts(empty_post_dmns, top_n)

# display the empty-body ranking as the cell output
dmn_data["empty"]

CPU times: total: 20.5 s
Wall time: 35.8 s


{'image-cdn.parler.com': 271122,
 'youtu.be': 105968,
 'youtube.com': 44452,
 'twitter.com': 40682,
 'i.imgur.com': 35042,
 'facebook.com': 25407,
 'tiny.iavian.net': 15757,
 'thegatewaypundit.com': 14716,
 'bbc.co.uk': 13140,
 'breitbart.com': 9450,
 'foxnews.com': 9158,
 'reuters.com': 5622,
 'naturalnews.com': 5366,
 'nypost.com': 4994,
 'newswars.com': 4839,
 'americanthinker.com': 4817,
 'washingtonexaminer.com': 4540,
 'zerohedge.com': 4502,
 'parler.com': 4492,
 'townhall.com': 4452,
 'www.youtube.com': 4328,
 'westernjournal.com': 3893,
 'cnn.com': 3712,
 'tmq2.wordpress.com': 3522,
 'rumble.com': 3285,
 'nytimes.com': 3110,
 'bbc.com': 3067,
 'instagram.com': 3019,
 'twitchy.com': 2654,
 'thehill.com': 2644,
 'dailycaller.com': 2631,
 'aljazeera.com': 2601,
 'theepochtimes.com': 2486,
 'thefederalist.com': 2478,
 'michaelsavage.com': 2424,
 'redstate.com': 2297,
 'wnd.com': 2217,
 'washingtonpost.com': 2187,
 'jornaldacidadeonline.com.br': 2157,
 'bitchute.com': 2019,
 'www.br

#### Top Domains Overall

In [30]:
%%time
# keep only the rows that carry a domain
dmns_df = df.dropna(subset=["dmn"])

# tally every domain across all of those rows
dmns_count = count_dmns(dmns_df["dmn"])

# 50 most frequent domains, highest count first
dict(sorted(dmns_count.items(), key=lambda pair: pair[1], reverse=True)[:50])

CPU times: total: 1.55 s
Wall time: 1.55 s


{'image-cdn.parler.com': 353114,
 'youtu.be': 161416,
 'youtube.com': 85515,
 'twitter.com': 84931,
 'thegatewaypundit.com': 83150,
 'breitbart.com': 57699,
 'i.imgur.com': 54973,
 'facebook.com': 31766,
 'foxnews.com': 30889,
 'townhall.com': 22175,
 'tiny.iavian.net': 22121,
 'counterglobalist.news': 20205,
 'westernjournal.com': 18487,
 'pjmedia.com': 16205,
 'zerohedge.com': 14650,
 'nypost.com': 14595,
 'gab.com': 14577,
 'www.youtube.com': 13692,
 'bbc.co.uk': 13418,
 'twitchy.com': 13277,
 'dennismichaellynch.com': 13153,
 'thefederalist.com': 11721,
 'dailycaller.com': 11335,
 'steve-finnell.blogspot.com': 11183,
 'washingtonexaminer.com': 10829,
 'www.foxnews.com': 10520,
 'www.thegatewaypundit.com': 10471,
 'infowars.com': 10025,
 'theepochtimes.com': 9884,
 'www.breitbart.com': 9511,
 'redstate.com': 8978,
 'americanthinker.com': 8844,
 'dmlnewsapp.com': 8535,
 'parler.com': 8369,
 'newsmax.com': 8190,
 'thehill.com': 7920,
 'bizpacreview.com': 7446,
 'theblaze.com': 7240,
 

#### Top Domains in Posts

In [31]:
%%time
# tally the domains on rows whose provenance is "post"
# NOTE(review): the tally is stored under the name `comments_dmns` although it
# holds post counts; the name is kept because another cell reads it
comments_dmns = count_dmns(dmns_df.loc[dmns_df["prov"] == "post", "dmn"])

# 50 most frequent post domains, highest count first
dict(sorted(comments_dmns.items(), key=lambda pair: pair[1], reverse=True)[:50])

CPU times: total: 2.27 s
Wall time: 2.27 s


{'image-cdn.parler.com': 325714,
 'youtu.be': 160562,
 'youtube.com': 84917,
 'twitter.com': 84158,
 'thegatewaypundit.com': 83129,
 'breitbart.com': 57681,
 'i.imgur.com': 53704,
 'facebook.com': 31743,
 'foxnews.com': 30878,
 'townhall.com': 22170,
 'tiny.iavian.net': 22121,
 'counterglobalist.news': 20173,
 'westernjournal.com': 18481,
 'pjmedia.com': 16201,
 'zerohedge.com': 14639,
 'nypost.com': 14580,
 'gab.com': 14565,
 'www.youtube.com': 13641,
 'bbc.co.uk': 13417,
 'twitchy.com': 13275,
 'dennismichaellynch.com': 13152,
 'thefederalist.com': 11715,
 'dailycaller.com': 11316,
 'washingtonexaminer.com': 10821,
 'steve-finnell.blogspot.com': 10728,
 'www.foxnews.com': 10520,
 'www.thegatewaypundit.com': 10471,
 'infowars.com': 10021,
 'theepochtimes.com': 9872,
 'www.breitbart.com': 9509,
 'redstate.com': 8977,
 'americanthinker.com': 8841,
 'dmlnewsapp.com': 8535,
 'parler.com': 8311,
 'newsmax.com': 8181,
 'thehill.com': 7917,
 'bizpacreview.com': 7444,
 'theblaze.com': 7236,
 

#### Top Domains in comments

In [32]:
%%time
# tally the domains on rows whose provenance is "comment"
# NOTE(review): the tally is stored under the name `posts_dmns` although it
# holds comment counts; the name is kept in case other cells read it
posts_dmns = count_dmns(dmns_df.loc[dmns_df["prov"] == "comment", "dmn"])

# 50 most frequent comment domains, highest count first
dict(sorted(posts_dmns.items(), key=lambda pair: pair[1], reverse=True)[:50])

CPU times: total: 234 ms
Wall time: 235 ms


{'image-cdn.parler.com': 27400,
 'i.imgur.com': 1269,
 'youtu.be': 854,
 'twitter.com': 773,
 'youtube.com': 598,
 'media0.giphy.com': 470,
 'media2.giphy.com': 465,
 'steve-finnell.blogspot.com': 455,
 'media1.giphy.com': 449,
 'media3.giphy.com': 417,
 'media.tenor.com': 308,
 'truepundit.com': 245,
 'media4.giphy.com': 94,
 'pbs.twimg.com': 87,
 'qanon.pub': 71,
 'google.com': 64,
 'share.par.pw': 63,
 'biblegateway.com': 63,
 'www.google.com': 59,
 'parler.com': 58,
 'www.youtube.com': 51,
 'yourhillary.us': 48,
 'frontpagemag.com': 46,
 'tmq2.wordpress.com': 38,
 'missingchildreneurope.eu': 37,
 'c.tenor.com': 35,
 'censoredbyjack.com': 32,
 'counterglobalist.news': 32,
 't.co': 32,
 'en.wikipedia.org': 31,
 'understandingthethreat.com': 28,
 'www.frontpagemag.com': 27,
 'thedailycoin.org': 27,
 'insight2bp.homestead.com': 25,
 'overcast.fm': 24,
 'facebook.com': 23,
 'thegatewaypundit.com': 21,
 'dailymail.co.uk': 20,
 'banned.video': 19,
 'dailycaller.com': 19,
 'breitbart.com':

#### Top Domains When the Body Is Empty

In [33]:
%%time
# tally the domains on rows whose body ("bo") is missing
empty_post_dmns = count_dmns(dmns_df.loc[dmns_df["bo"].isna(), "dmn"])

# 50 most frequent of those domains, highest count first
dict(sorted(empty_post_dmns.items(), key=lambda pair: pair[1], reverse=True)[:50])

CPU times: total: 750 ms
Wall time: 745 ms


{'image-cdn.parler.com': 271122,
 'youtu.be': 105968,
 'youtube.com': 44452,
 'twitter.com': 40682,
 'i.imgur.com': 35042,
 'facebook.com': 25407,
 'tiny.iavian.net': 15757,
 'thegatewaypundit.com': 14716,
 'bbc.co.uk': 13140,
 'breitbart.com': 9450,
 'foxnews.com': 9158,
 'reuters.com': 5622,
 'naturalnews.com': 5366,
 'nypost.com': 4994,
 'newswars.com': 4839,
 'americanthinker.com': 4817,
 'washingtonexaminer.com': 4540,
 'zerohedge.com': 4502,
 'parler.com': 4492,
 'townhall.com': 4452,
 'www.youtube.com': 4328,
 'westernjournal.com': 3893,
 'cnn.com': 3712,
 'tmq2.wordpress.com': 3522,
 'rumble.com': 3285,
 'nytimes.com': 3110,
 'bbc.com': 3067,
 'instagram.com': 3019,
 'twitchy.com': 2654,
 'thehill.com': 2644,
 'dailycaller.com': 2631,
 'aljazeera.com': 2601,
 'theepochtimes.com': 2486,
 'thefederalist.com': 2478,
 'michaelsavage.com': 2424,
 'redstate.com': 2297,
 'wnd.com': 2217,
 'washingtonpost.com': 2187,
 'jornaldacidadeonline.com.br': 2157,
 'bitchute.com': 2019,
 'www.br

**TODO:** Decide how best to compare these domain rankings (overall, posts, comments, empty body); this is still an open question.

**TODO:** Decide how to store these results; this is still an open question.

#### Applying language classification

In [7]:
# file name of the pretrained fastText model used for the language classification
PRETRAINED_MODEL_PATH = 'lid.176.bin'
# `model` is the global that lang_class() calls .predict() on
model = fasttext.load_model(PRETRAINED_MODEL_PATH)



First, look at which languages are present.

In [8]:
%%time
# classify every non-missing body; rows without a body end up as NaN in bo_lang
df["bo_lang"] = df["bo"].dropna().apply(lang_class)

CPU times: total: 34 s
Wall time: 34 s


In [9]:
%%time
# split the raw predictions held in bo_lang into a label column and a
# confidence column
# assumes every non-null bo_lang value is a 2-item (labels, confidences)
# result from model.predict — TODO confirm
# the rows and the index both come from the same Series, so they cannot get
# out of step with each other
bo_lang_raw = df.loc[df["bo_lang"].notna(), "bo_lang"]
df[["bo_lang", "conf"]] = pd.DataFrame(bo_lang_raw.tolist(), index=bo_lang_raw.index)

CPU times: total: 3 s
Wall time: 3.01 s


In [10]:
%%time
# turn each non-missing label in bo_lang into a language name; rows without a
# label stay NaN
df["bo_lang"] = df["bo_lang"].dropna().apply(lang_from_code)


CPU times: total: 5.88 s
Wall time: 5.89 s


In [11]:
# preview the frame, which now carries the bo_lang and conf columns
df

Unnamed: 0,t,id,cd,c,u,un,ls,cn,uv,url,...,cv,rt,po,pa,z,cz,sc,dv,bo_lang,conf
0,p,50f27e734766438698d4d5c2d689a5f1,20201109,1604927870000,87be30dacdc74cd2936bcbe4ece6010d,Bullscricker,1.610175e+12,1.0,61.0,"[{'id': 'NfKA3', 'u': 'https://image-cdn.parle...",...,,,,,,,,,,
1,p,3619348b91524430882f2b887838a3e4,20201018,1602989910000,8b67993183a14587a001010058d089d2,chucknellis,1.610249e+12,-1.0,113.0,"[{'id': 'rqVjB', 'u': 'https://noqreport.com/2...",...,,,,,,,,,English,[0.8107684254646301]
2,p,1365051bfe6243599e6af1055b71c4a2,20201202,1606897009000,eeeb8dd25b7142b1bc69cbdbe1d8bb62,ThomasFox,1.607430e+12,17.0,426.0,"[{'id': 'ZHMCp', 'u': 'https://rumble.com/vbiz...",...,,,,,,,,,English,[0.6809220314025879]
3,p,5f5e02a226454cbb9f56741a3aa6adce,20200824,1598283523000,e04afa3752f94366813196fb88022440,Kreermary,1.606561e+12,1.0,13.0,"[{'id': 'Zu4DC', 'u': 'https://image-cdn.parle...",...,,,,,,,,,,
4,p,3e689379be5b4e1fa333bbf7f5e1fad2,20201219,1608385852000,46563f08c3a34bebbf4960877d77230f,Starblazer692003,1.610230e+12,-1.0,181.0,"[{'id': 'KbQpt', 'u': 'https://image-cdn.parle...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2254421,c,1ff3e15913c34456828bc70a7966bda1,20201009,1602255581000,e11890c2ffc348df9cb703651a3cc9c0,LibertyElaine,1.608365e+12,1.0,,"[{'id': 'QnA7M', 'u': 'https://image-cdn.parle...",...,,,7f18556bd531453c952295a4a90fe3e5,,2.0,,0.0,,,
2254422,c,0a94034622254a0f96f943a2f7b98621,20200829,1598718069000,f8b37b14d52f493eab60c60486d5e6c6,Aj118,1.607136e+12,,1.0,"[{'id': 'gm0vg', 'u': 'https://image-cdn.parle...",...,,,6f961bfcc04c44da938f62f0032f6eb4,,1.0,,1.0,,,
2254423,c,1dc1abde35234e32a872bcf76bd6ab45,20200820,1597891420000,efaac1ece6a893e0fab48f558e3a5c57,BethocAeilflaed,1.607239e+12,2.0,3.0,"[{'id': 'Ye37z', 'u': 'https://image-cdn.parle...",...,138398b39eef4eedafc3ee14743af8ff,,2c8487380e7146c8becec36618d7b8a7,241d0ceb76454cdda278f441e450c84c,3.0,1.0,3.0,,English,[0.6988052725791931]
2254424,c,4b80d4aa6f9d4775a8ba82f940272507,20200817,1597632198000,f8b37b14d52f493eab60c60486d5e6c6,Aj118,1.607136e+12,,,"[{'id': 'Z6BDS', 'u': 'https://image-cdn.parle...",...,,,cdaceedc947d46159376fa0f495d51ae,,2.0,,0.0,,,


#### Comparing languages

In [17]:
# ten most common detected languages over all rows that have one
df["bo_lang"].dropna().value_counts().head(10)

English       1266682
Portuguese      25279
Japanese        21815
Spanish         12620
German          10262
Chinese          8512
French           6606
Catalan          4367
Russian          4361
Italian          3756
Name: bo_lang, dtype: int64

In [22]:
# ten most common detected languages overall, as counts and as shares
# rows without a detected language (no body) are left out of both columns
print("Overall")
print(
    pd.concat(
        [
            df["bo_lang"].dropna().value_counts().head(10),
            df["bo_lang"].dropna().value_counts(normalize=True).head(10),
        ],
        axis=1,
        keys=["count", "pct"],
    )
)

Overall
              count       pct
English     1266682  0.916547
Portuguese    25279  0.018291
Japanese      21815  0.015785
Spanish       12620  0.009132
German        10262  0.007425
Chinese        8512  0.006159
French         6606  0.004780
Catalan        4367  0.003160
Russian        4361  0.003156
Italian        3756  0.002718


In [24]:
# ten most common detected languages among rows whose provenance is "post"
df.loc[df["prov"] == "post", "bo_lang"].dropna().value_counts().head(10)

English       1061052
Portuguese      24135
Japanese        13695
Spanish          7227
German           5597
Chinese          5029
French           3852
Catalan          3815
Russian          2231
Italian          1557
Name: bo_lang, dtype: int64

In [27]:
# ten most common detected languages among posts, as counts and as shares
print(
    pd.concat(
        [
            df.loc[df["prov"] == "post", "bo_lang"].dropna().value_counts().head(10),
            df.loc[df["prov"] == "post", "bo_lang"].dropna().value_counts(normalize=True).head(10),
        ],
        axis=1,
        keys=["count", "pct"],
    )
)

              count       pct
English     1061052  0.933251
Portuguese    24135  0.021228
Japanese      13695  0.012045
Spanish        7227  0.006357
German         5597  0.004923
Chinese        5029  0.004423
French         3852  0.003388
Catalan        3815  0.003355
Russian        2231  0.001962
Italian        1557  0.001369


In [39]:
# ten most common detected languages among rows whose provenance is "comment"
df.loc[df["prov"] == "comment", "bo_lang"].dropna().value_counts().head(10)

English       205630
Japanese        8120
Spanish         5393
German          4665
Chinese         3483
French          2754
Italian         2199
Russian         2130
Portuguese      1144
Esperanto        938
Name: bo_lang, dtype: int64

In [28]:
# ten most common detected languages among comments, as counts and as shares
print(
    pd.concat(
        [
            df.loc[df["prov"] == "comment", "bo_lang"].dropna().value_counts().head(10),
            df.loc[df["prov"] == "comment", "bo_lang"].dropna().value_counts(normalize=True).head(10),
        ],
        axis=1,
        keys=["count", "pct"],
    )
)

             count       pct
English     205630  0.839056
Japanese      8120  0.033133
Spanish       5393  0.022006
German        4665  0.019035
Chinese       3483  0.014212
French        2754  0.011237
Italian       2199  0.008973
Russian       2130  0.008691
Portuguese    1144  0.004668
Esperanto      938  0.003827


#### Trying Sentiment Analysis