In [1]:
import pandas as pd
from langdetect import detect

In [2]:
def _extract(df, conds):
    """Return the rows of ``df`` that satisfy every equality condition in ``conds``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    conds : dict
        Maps a column name to the value that column must equal. All
        conditions are combined with logical AND.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with every column and the original index labels.
        An empty ``conds`` selects every row.

    Raises
    ------
    KeyError
        If a key of ``conds`` is not a column of ``df``.
    """
    # Start from an all-True mask so that zero conditions (or zero rows) still
    # yield a *row* selection; an empty list passed to ``df[...]`` would be
    # interpreted as "select no columns" instead.
    mask = pd.Series(True, index=df.index)
    for column, value in conds.items():
        mask &= df[column] == value
    return df[mask]

In [3]:
# NOTE(review): "timesline1" is kept exactly as written; presumably it matches
# the file name on disk - confirm before renaming.
filenames = [
    "filtered_negative_case_timeslice1.jsonl",
    "filtered_negative_case_timeslice2.jsonl",
    "positive_case_timesline1.jsonl",
    "positive_case_timeslice2.jsonl",
]

# Read every JSON-lines file into its own frame, in the same order as `filenames`.
dfs = [pd.read_json(path, lines=True) for path in filenames]

# Individual handles onto the same frame objects held in `dfs`.
df_neg1, df_neg2, df_pos1, df_pos2 = dfs

# Translation errors

### Evaluation of the detection ability of `detect` 

- All four files score above the majority-class baseline (always guessing the most likely class).

In [4]:
def lang_detect(text):
    """Detect the language of ``text`` and return a readable label.

    Parameters
    ----------
    text : str or None
        Text to classify. Missing values (``None`` or a float NaN, which is
        how pandas represents absent entries) are never passed to ``detect``.

    Returns
    -------
    str or None
        ``None`` for missing input; ``"English"`` / ``"French"`` when
        ``detect`` reports ``"en"`` / ``"fr"``; any other code reported by
        ``detect`` unchanged; ``"NA"`` when ``detect`` raises.

    Notes
    -----
    NOTE(review): langdetect documents its algorithm as non-deterministic
    unless ``DetectorFactory.seed`` is set - consider fixing the seed so the
    reported rates are reproducible.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return None

    try:
        code = detect(text)
    except Exception:
        # Best effort: report "NA" instead of aborting a whole ``apply`` run.
        # Catching ``Exception`` (not a bare ``except``) still lets
        # KeyboardInterrupt / SystemExit propagate.
        code = "NA"

    return {"en": "English", "fr": "French"}.get(code, code)

In [5]:
# Score `lang_detect` on the original text against each file's `language` column.
for df, fname in zip(dfs, filenames):
    detected = df["contentText"].apply(lang_detect)
    df["detected lang"] = detected

    # Fraction of rows whose detected label equals the recorded language.
    n_correct = (detected == df["language"]).sum()
    accuracy = n_correct / len(df)

    print(f"Language detection accuracy for {fname} is {accuracy}")

Language detection accuracy for filtered_negative_case_timeslice1.jsonl is 0.979515588765782
Language detection accuracy for filtered_negative_case_timeslice2.jsonl is 0.9839743589743589
Language detection accuracy for positive_case_timesline1.jsonl is 0.984081041968162
Language detection accuracy for positive_case_timeslice2.jsonl is 0.9941520467836257


### translatedTitle

In [6]:
# Collect, per file, the translated titles that are still detected as French.
data = []

for df, fname in zip(dfs, filenames):
    title_lang = df["translatedTitle"].apply(lang_detect)
    df["transTitleLang"] = title_lang

    # Share of the non-missing translated titles that are detected as English.
    n_english = title_lang.eq("English").sum()
    n_translated = df["translatedTitle"].notna().sum()
    rate = n_english / n_translated
    print(f"Rate of English for {fname}: {rate}")

    # A translated title detected as French is marked in `badTransTitle`.
    df["badTransTitle"] = title_lang.eq("French")

    flagged = df.loc[df["badTransTitle"], ["title", "translatedTitle", "transTitleLang"]].copy()
    # Record which file and which row each flagged title came from.
    flagged.insert(0, "Filename", fname)
    flagged.insert(1, "Original Index", flagged.index)
    data.append(flagged)

Rate of English for filtered_negative_case_timeslice1.jsonl: 0.9445207832360014
Rate of English for filtered_negative_case_timeslice2.jsonl: 0.9616935483870968
Rate of English for positive_case_timesline1.jsonl: 0.976027397260274
Rate of English for positive_case_timeslice2.jsonl: 0.9550561797752809


In [7]:
# Stack the per-file frames; `reset_index` moves the per-file row labels into an
# "index" column and renumbers the rows from 0.
title_df = pd.concat(data, axis=0).reset_index(drop=False)

In [8]:
# Show the flagged titles (the rendered output elides the middle rows).
title_df

Unnamed: 0,index,Filename,Original Index,title,translatedTitle,transTitleLang
0,18,filtered_negative_case_timeslice1.jsonl,18,En direct. Présidentielle 2022 : Marine Le Pen...,Live. President 2022: Marine Le Pen envisages ...,French
1,117,filtered_negative_case_timeslice1.jsonl,117,[ALERTE] Russie/Ukraine… Suite/suite 14-02- 21...,[ALERT] Russia/Ukraine... Continue/cont'd 14-0...,French
2,200,filtered_negative_case_timeslice1.jsonl,200,[ALERTE] Russie/Ukraine… Suite/suite 14-02- 21...,[ALERT] Russia/Ukraine... Continue/cont'd 14-0...,French
3,221,filtered_negative_case_timeslice1.jsonl,221,[ALERTE] Russie/Ukraine… Suite/suite 1402-2022...,[ALERT] Russia/Ukraine... Continue/cont'd 1402...,French
4,224,filtered_negative_case_timeslice1.jsonl,224,[ALERTE] Russie/Ukraine… Suite/suite 14-02- 21...,[ALERT] Russia/Ukraine... Continue/cont'd 14-0...,French
...,...,...,...,...,...,...
125,480,positive_case_timesline1.jsonl,480,[ALERTE] Russie/Ukraine… Suite/suite 14-02- 21...,[ALERT] Russia/Ukraine... Continue/cont'd 14-0...,French
126,597,positive_case_timesline1.jsonl,597,[ALERTE] Russie/Ukraine… Suite/suite sur le fo...,[ALERT] Russia/Ukraine... Continue/suite on th...,French
127,133,positive_case_timeslice2.jsonl,133,L’empire du mensonge en matière d’armes biolog...,Le empire du lie en matière d'armes biologique...,French
128,153,positive_case_timeslice2.jsonl,153,@UPR_Asselineau ⚠️#Donbass - INFO CONFIRMEE -\...,@UPR_Asselineau //t.co/nSM4lPq72f,French


### translatedContentText

In [9]:
# Collect, per file, the translated texts whose detected language is neither
# English nor missing.
data = []

for df, fname in zip(dfs, filenames):
    text_lang = df["translatedContentText"].apply(lang_detect)
    df["transTextLang"] = text_lang

    # Share of the non-missing translated texts that are detected as English.
    n_english = text_lang.eq("English").sum()
    n_translated = df["translatedContentText"].notna().sum()
    rate = n_english / n_translated
    print(f"Rate of English for {fname}: {rate}")

    # Any label other than None or "English" marks the row in `badTransText`.
    df["badTransText"] = text_lang.map(lambda lang: lang not in (None, "English"))

    flagged = df.loc[
        df["badTransText"],
        ["contentText", "translatedContentText", "transTextLang"],
    ].copy()
    # Record which file and which row each flagged text came from.
    flagged.insert(0, "Filename", fname)
    flagged.insert(1, "Original Index", flagged.index)
    data.append(flagged)

Rate of English for filtered_negative_case_timeslice1.jsonl: 0.9847837237134552
Rate of English for filtered_negative_case_timeslice2.jsonl: 0.9865384615384616
Rate of English for positive_case_timesline1.jsonl: 1.0
Rate of English for positive_case_timeslice2.jsonl: 0.994413407821229


In [10]:
# Stack the per-file frames; `reset_index` moves the per-file row labels into an
# "index" column and renumbers the rows from 0.
text_df = pd.concat(data, axis=0).reset_index(drop=False)
text_df

Unnamed: 0,index,Filename,Original Index,contentText,translatedContentText,transTextLang
0,335,filtered_negative_case_timeslice1.jsonl,335,enfonçage de porte ouverte ou truisme ?,Open door or truism?,nl
1,350,filtered_negative_case_timeslice1.jsonl,350,RT @UPR_Asselineau 🚨Suite prévisible 1- le Con...,RT @UPR_Asselineau,et
2,351,filtered_negative_case_timeslice1.jsonl,351,RT @UPR_Asselineau ⚠️ la Russie n'est que «pay...,RT @UPR_Asselineau,et
3,535,filtered_negative_case_timeslice1.jsonl,535,U stupid aussie wanker,U stupid also wanker,af
4,653,filtered_negative_case_timeslice1.jsonl,653,Il ne se prive pas de le faire avec ceux qu’il...,He doesn't stop doing it with people he doesn'...,af
...,...,...,...,...,...,...
92,375,filtered_negative_case_timeslice2.jsonl,375,@EmmanuelMacron Bnsoir macron,@EmmanuelMacron Bnsoir macron,French
93,451,filtered_negative_case_timeslice2.jsonl,451,RT @UPR_Asselineau 🚨L'UKRAINE À COURT DE MUNIT...,RT @UPR_Asselineau,et
94,452,filtered_negative_case_timeslice2.jsonl,452,RT @UPR_Asselineau 🚨L'UKRAINE À COURT DE MUNIT...,RT @UPR_Asselineau,et
95,529,filtered_negative_case_timeslice2.jsonl,529,RT @UPR_Asselineau ▪️que les Africains rejette...,RT @UPR_Asselineau ▪,et


In [11]:
# Write both inspection tables to CSV files in the working directory.
for table, csv_path in (
    (title_df, "transTitlesInspected.csv"),
    (text_df, "transTextsInspected.csv"),
):
    table.to_csv(csv_path)