In [1]:
import pandas as pd

In [2]:
# List the working directory. The .jsonl names loaded further down are taken
# from this listing (including the "timesline1" spelling of one file).
!ls

Inspection of translation.ipynb         metadata.ipynb
data_exploration.ipynb                  positive_case_timeslice2.jsonl
filtered_negative_case_timeslice1.jsonl positive_case_timesline1.jsonl
filtered_negative_case_timeslice2.jsonl


In [3]:
def _extract(df, conds):    
    cond = []
    for k, v in conds.items():
        cond.append(df[k] == v)

    cond = list(map(all, zip(*cond)))
    out = df[cond]
    return out

In [4]:
# The four JSON-lines files to inspect. "timesline1" is the spelling shown by
# the directory listing, so it is kept as-is.
filenames = [
    "filtered_negative_case_timeslice1.jsonl",
    "filtered_negative_case_timeslice2.jsonl",
    "positive_case_timesline1.jsonl",
    "positive_case_timeslice2.jsonl",
]

# One DataFrame per file, in the same order as `filenames`.
dfs = [pd.read_json(name, lines=True) for name in filenames]
df_neg1, df_neg2, df_pos1, df_pos2 = dfs

# Overall 

- Empty columns: "extraAttributes", "segments", "mentionedUsers"

## Language

In [5]:
def num_not_na(df, col):
    """Count the entries of column ``col`` in ``df`` that are not NA/null."""
    return df[col].count()

In [6]:
# Per-file totals: document count, English/French split, and how many rows
# carry a non-null translated text / translated title.
col = ["# Doc", "# En Doc", "# Fr Doc",
       "# translatedContentText (Fr->En)", "# translatedTitle (Fr->En)"]
data = []

for df in dfs:
    lang = df["language"]
    data.append([
        len(df),
        len(df[lang == "English"]),
        len(df[lang == "French"]),
        num_not_na(df, "translatedContentText"),
        num_not_na(df, "translatedTitle"),
    ])

# One row per input file, displayed as the cell output.
pd.DataFrame(data, columns=col, index=filenames)

Unnamed: 0,# Doc,# En Doc,# Fr Doc,# translatedContentText (Fr->En),# translatedTitle (Fr->En)
filtered_negative_case_timeslice1.jsonl,7762,325,7437,5849,5822
filtered_negative_case_timeslice2.jsonl,624,30,594,520,496
positive_case_timesline1.jsonl,691,218,473,299,292
positive_case_timeslice2.jsonl,513,237,276,179,178


## Mediatype

In [7]:
# Document counts per mediaType for each file. The tally uses count() on
# contentText, so rows with a null contentText are not included.
types = ["News", "Blog", "Twitter",
         "Reddit", "Forum", "Other"]
col = ["# Doc"] + ["# " + t for t in types]
data = []

for df in dfs:
    counts_by_type = df.groupby("mediaType")["contentText"].count().to_dict()
    # Leading cell is the file's total size; absent media types count as 0.
    data.append([len(df)] + [counts_by_type.get(t, 0) for t in types])

pd.DataFrame(data, columns=col, index=filenames)

Unnamed: 0,# Doc,# News,# Blog,# Twitter,# Reddit,# Forum,# Other
filtered_negative_case_timeslice1.jsonl,7762,1715,655,2846,370,2032,144
filtered_negative_case_timeslice2.jsonl,624,126,65,308,44,75,6
positive_case_timesline1.jsonl,691,127,326,80,5,143,10
positive_case_timeslice2.jsonl,513,78,305,46,0,80,4


### mediaType word count

In [8]:
# Mean and standard deviation of the per-document word count, by mediaType.
data = []

for df, fname in zip(dfs, filenames):
    # Word count = number of whitespace-separated tokens in contentText.
    # The column is written onto the loaded frame itself.
    df["wordCount"] = df["contentText"].apply(lambda text: len(text.split()))
    stats = df.groupby("mediaType")["wordCount"].describe().to_dict()
    mean, std = stats["mean"], stats["std"]

    # Media types missing from a file are rendered as "0 + 0".
    data.append([
        "{} + {}".format(round(mean.get(t, 0), 2), round(std.get(t, 0), 2))
        for t in types
    ])

print("Word counts: Mean + Std")
pd.DataFrame(data, columns=col[1:], index=filenames)

Word counts: Mean + Std


Unnamed: 0,# News,# Blog,# Twitter,# Reddit,# Forum,# Other
filtered_negative_case_timeslice1.jsonl,628.64 + 813.18,1089.39 + 2408.32,27.36 + 14.47,75.46 + 145.76,335.92 + 599.99,90.27 + 95.98
filtered_negative_case_timeslice2.jsonl,682.71 + 653.37,683.17 + 1270.45,31.77 + 14.63,67.16 + 86.4,290.37 + 357.86,187.67 + 149.6
positive_case_timesline1.jsonl,1546.94 + 1715.71,8237.43 + 11017.45,27.99 + 14.7,169.2 + 85.7,856.79 + 1522.62,1450.0 + 4094.06
positive_case_timeslice2.jsonl,1502.83 + 1130.1,11735.17 + 10638.1,32.46 + 13.43,0 + 0,913.61 + 1510.26,289.0 + 183.58


In [9]:
print("Word counts: Detailed", end="\n\n\n")

for df, fname in zip(dfs, filenames):
    # Word count = number of whitespace-separated tokens in contentText.
    df["wordCount"] = df["contentText"].apply(lambda text: len(text.split()))
    # describe() table per mediaType, without the quartile columns.
    res = (
        df.groupby("mediaType")["wordCount"]
        .describe()
        .drop(columns=["25%", "75%"])
    )
    print(fname, end="\n\n")
    print(res, end="\n\n\n\n")

Word counts: Detailed


filtered_negative_case_timeslice1.jsonl

            count         mean          std   min    50%      max
mediaType                                                        
Blog        655.0  1089.389313  2408.320105  10.0  302.0  22225.0
Forum      2032.0   335.915354   599.990990   1.0  154.0   7352.0
News       1715.0   628.642566   813.179788  12.0  452.0  12180.0
Other       144.0    90.270833    95.979326   8.0   60.5    659.0
Reddit      370.0    75.464865   145.764930   1.0   33.0   1798.0
Twitter    2846.0    27.361560    14.466457   1.0   27.0     61.0



filtered_negative_case_timeslice2.jsonl

           count        mean          std   min    50%     max
mediaType                                                     
Blog        65.0  683.169231  1270.445561  15.0   83.0  5793.0
Forum       75.0  290.373333   357.855919  10.0  179.0  1655.0
News       126.0  682.706349   653.369360  36.0  513.5  4084.0
Other        6.0  187.666667   149.603030  56.0 

# English mediatype

In [10]:
# Same mediaType tally as above, restricted to rows whose language is English.
types = ["News", "Blog", "Twitter",
         "Reddit", "Forum", "Other"]
col = ["# Doc"] + ["# " + t for t in types]
data = []

for df in dfs:
    df_en = df[df["language"] == "English"]
    counts_by_type = df_en.groupby("mediaType")["contentText"].count().to_dict()
    # Leading cell is the number of English rows; absent media types count as 0.
    data.append([len(df_en)] + [counts_by_type.get(t, 0) for t in types])

pd.DataFrame(data, columns=col, index=filenames)

Unnamed: 0,# Doc,# News,# Blog,# Twitter,# Reddit,# Forum,# Other
filtered_negative_case_timeslice1.jsonl,325,1,27,276,7,12,2
filtered_negative_case_timeslice2.jsonl,30,0,0,28,2,0,0
positive_case_timesline1.jsonl,218,1,161,32,2,21,1
positive_case_timeslice2.jsonl,237,0,214,9,0,14,0


### mediaType word count

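- Turns out that for English there is at most one News article per file, so its word count has no spread: the standard deviation is undefined (NaN) rather than the article length being fixed.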

In [11]:
# Mean and standard deviation of the word count by mediaType, English rows only.
data = []

for df, fname in zip(dfs, filenames):
    # The word count is computed on the full frame first, then rows are filtered.
    df["wordCount"] = df["contentText"].apply(lambda text: len(text.split()))
    df_en = df[df["language"] == "English"]

    stats = df_en.groupby("mediaType")["wordCount"].describe().to_dict()
    mean, std = stats["mean"], stats["std"]

    # Missing media types render as "0 + 0"; a group with a single document
    # has a NaN standard deviation.
    data.append([
        "{} + {}".format(round(mean.get(t, 0), 2), round(std.get(t, 0), 2))
        for t in types
    ])

print("Word counts: Mean + Std")
pd.DataFrame(data, columns=col[1:], index=filenames)

Word counts: Mean + Std


Unnamed: 0,# News,# Blog,# Twitter,# Reddit,# Forum,# Other
filtered_negative_case_timeslice1.jsonl,538.0 + nan,4841.44 + 5780.53,21.53 + 14.79,255.57 + 211.55,1265.83 + 1833.61,318.5 + 62.93
filtered_negative_case_timeslice2.jsonl,0 + 0,0 + 0,22.46 + 15.13,310.0 + 156.98,0 + 0,0 + 0
positive_case_timesline1.jsonl,1268.0 + nan,15085.01 + 12208.57,24.44 + 16.0,126.5 + 50.2,2855.9 + 3250.23,13093.0 + nan
positive_case_timeslice2.jsonl,0 + 0,15884.4 + 10117.68,27.78 + 16.49,0 + 0,2995.07 + 2666.15,0 + 0


In [12]:
print("Word counts: Detailed", end="\n\n\n")

for df, fname in zip(dfs, filenames):
    # The word count is computed on the full frame first, then rows are filtered.
    df["wordCount"] = df["contentText"].apply(lambda text: len(text.split()))
    df_en = df[df["language"] == "English"]

    # describe() table per mediaType, without the quartile columns.
    res = (
        df_en.groupby("mediaType")["wordCount"]
        .describe()
        .drop(columns=["25%", "75%"])
    )
    print(fname, end="\n\n")
    print(res, end="\n\n\n\n")

Word counts: Detailed


filtered_negative_case_timeslice1.jsonl

           count         mean          std    min     50%      max
mediaType                                                         
Blog        27.0  4841.444444  5780.527460   53.0  2357.0  22225.0
Forum       12.0  1265.833333  1833.614505  152.0   713.0   6853.0
News         1.0   538.000000          NaN  538.0   538.0    538.0
Other        2.0   318.500000    62.932504  274.0   318.5    363.0
Reddit       7.0   255.571429   211.552875   19.0   317.0    575.0
Twitter    276.0    21.528986    14.788050    1.0    23.0     57.0



filtered_negative_case_timeslice2.jsonl

           count        mean         std    min    50%    max
mediaType                                                    
Reddit       2.0  310.000000  156.977705  199.0  310.0  421.0
Twitter     28.0   22.464286   15.127806    2.0   19.0   43.0



positive_case_timesline1.jsonl

           count          mean           std      min      50%      max


# French mediatype

In [13]:
# Same mediaType tally as above, restricted to rows whose language is French.
types = ["News", "Blog", "Twitter",
         "Reddit", "Forum", "Other"]
col = ["# Doc"] + ["# " + t for t in types]
data = []

for df in dfs:
    df_fr = df[df["language"] == "French"]
    counts_by_type = df_fr.groupby("mediaType")["contentText"].count().to_dict()
    # Leading cell is the number of French rows; absent media types count as 0.
    data.append([len(df_fr)] + [counts_by_type.get(t, 0) for t in types])

pd.DataFrame(data, columns=col, index=filenames)

Unnamed: 0,# Doc,# News,# Blog,# Twitter,# Reddit,# Forum,# Other
filtered_negative_case_timeslice1.jsonl,7437,1714,628,2570,363,2020,142
filtered_negative_case_timeslice2.jsonl,594,126,65,280,42,75,6
positive_case_timesline1.jsonl,473,126,165,48,3,122,9
positive_case_timeslice2.jsonl,276,78,91,37,0,66,4


### mediaType word count


In [14]:
# Mean and standard deviation of the word count by mediaType, French rows only.
data = []

for df, fname in zip(dfs, filenames):
    # The word count is computed on the full frame first, then rows are filtered.
    df["wordCount"] = df["contentText"].apply(lambda text: len(text.split()))
    df_fr = df[df["language"] == "French"]

    stats = df_fr.groupby("mediaType")["wordCount"].describe().to_dict()
    mean, std = stats["mean"], stats["std"]

    # Media types missing from a file are rendered as "0 + 0".
    data.append([
        "{} + {}".format(round(mean.get(t, 0), 2), round(std.get(t, 0), 2))
        for t in types
    ])

print("Word counts: Mean + Std")
pd.DataFrame(data, columns=col[1:], index=filenames)

Word counts: Mean + Std


Unnamed: 0,# News,# Blog,# Twitter,# Reddit,# Forum,# Other
filtered_negative_case_timeslice1.jsonl,628.7 + 813.41,928.07 + 2007.95,27.99 + 14.29,71.99 + 142.4,330.39 + 581.93,87.06 + 92.55
filtered_negative_case_timeslice2.jsonl,682.71 + 653.37,683.17 + 1270.45,32.7 + 14.28,55.6 + 64.92,290.37 + 357.86,187.67 + 149.6
positive_case_timesline1.jsonl,1549.15 + 1722.38,1555.86 + 2046.22,30.35 + 13.41,197.67 + 101.93,512.68 + 402.02,156.33 + 169.64
positive_case_timeslice2.jsonl,1502.83 + 1130.1,1977.64 + 1669.52,33.59 + 12.58,0 + 0,472.09 + 468.38,289.0 + 183.58


In [15]:
print("Word counts: Detailed", end="\n\n\n")

for df, fname in zip(dfs, filenames):
    # The word count is computed on the full frame first, then rows are filtered.
    df["wordCount"] = df["contentText"].apply(lambda text: len(text.split()))
    df_fr = df[df["language"] == "French"]

    # describe() table per mediaType, without the quartile columns.
    res = (
        df_fr.groupby("mediaType")["wordCount"]
        .describe()
        .drop(columns=["25%", "75%"])
    )
    print(fname, end="\n\n")
    print(res, end="\n\n\n\n")

Word counts: Detailed


filtered_negative_case_timeslice1.jsonl

            count        mean          std   min    50%      max
mediaType                                                       
Blog        628.0  928.074841  2007.951096  10.0  259.0  18743.0
Forum      2020.0  330.391089   581.928555   1.0  153.0   7352.0
News       1714.0  628.695449   813.414159  12.0  451.0  12180.0
Other       142.0   87.056338    92.549222   8.0   60.0    659.0
Reddit      363.0   71.991736   142.397787   1.0   33.0   1798.0
Twitter    2570.0   27.987938    14.293545   1.0   28.0     61.0



filtered_negative_case_timeslice2.jsonl

           count        mean          std   min    50%     max
mediaType                                                     
Blog        65.0  683.169231  1270.445561  15.0   83.0  5793.0
Forum       75.0  290.373333   357.855919  10.0  179.0  1655.0
News       126.0  682.706349   653.369360  36.0  513.5  4084.0
Other        6.0  187.666667   149.603030  56.0  152.5  