In [2]:
import pandas as pd

# For a 0.001% sample of WP2 outputs for data release 2:
### load urls and detected languages

In [118]:
!for x in ../../sample0.01_sample0.001/*; do zstdcat $x/all.zst|jq -c '{"lang":.lang,"u":.u}' |zstd>$x/url_lang.zst; done

In [3]:
from pathlib import Path
dfs = []
for p in Path('../../sample0.01_sample0.001/').glob('*/url_lang.zst'):
    df = pd.read_json(p, lines=True, encoding_errors='ignore')
    df['crawl'] = p.parent.name
    dfs.append(df)
#     break
df = pd.concat(dfs, ignore_index=True)

In [4]:
len(df)

1055157

In [5]:
df.head()

Unnamed: 0,lang,u,crawl
0,spa_Latn,http://academia.org.mx/academicos-2001/item/ma...,CC-MAIN-2018-26
1,eng_Latn,http://aftek.com/testingservices/testjig%20arc...,CC-MAIN-2018-26
2,,http://albany.upickem.net/engine/Welcome.aspx?...,CC-MAIN-2018-26
3,jpn_Jpan,http://allinfor.ga/468217yuasa-pe2dd_qa9bdd_q4...,CC-MAIN-2018-26
4,ita_Latn,http://carirac.com/differenze/honda-logo-opel-...,CC-MAIN-2018-26


### Split urls into parts

In [6]:
df['site'] = df.u.str.split('//').str[-1].str.split('/').str[0]
df['tld'] = df.site.str.split('.').str[-1]
df['domain'] = df.site.str.split('.').str[-2]
df

Unnamed: 0,lang,u,crawl,site,tld,domain
0,spa_Latn,http://academia.org.mx/academicos-2001/item/ma...,CC-MAIN-2018-26,academia.org.mx,mx,org
1,eng_Latn,http://aftek.com/testingservices/testjig%20arc...,CC-MAIN-2018-26,aftek.com,com,aftek
2,,http://albany.upickem.net/engine/Welcome.aspx?...,CC-MAIN-2018-26,albany.upickem.net,net,upickem
3,jpn_Jpan,http://allinfor.ga/468217yuasa-pe2dd_qa9bdd_q4...,CC-MAIN-2018-26,allinfor.ga,ga,allinfor
4,ita_Latn,http://carirac.com/differenze/honda-logo-opel-...,CC-MAIN-2018-26,carirac.com,com,carirac
...,...,...,...,...,...,...
1055152,ita_Latn,https://www.svapofone.com/tag-prodotto/g25/,CC-MAIN-2022-49,www.svapofone.com,com,svapofone
1055153,eng_Latn,https://www.theyucatantimes.com/2021/11/exclus...,CC-MAIN-2022-49,www.theyucatantimes.com,com,theyucatantimes
1055154,eng_Latn,https://www.urbandictionary.com/define.php?ter...,CC-MAIN-2022-49,www.urbandictionary.com,com,urbandictionary
1055155,ita_Latn,https://www.visititaly.ch/fiera-turistica-a-be...,CC-MAIN-2022-49,www.visititaly.ch,ch,visititaly


### look at most/least frequent langauges and TLDs

In [7]:
lang_cnts = df.lang.value_counts().head(50)
lang_cnts

eng_Latn    404169
zho_Hans    123171
rus_Cyrl     63778
deu_Latn     37568
jpn_Jpan     30682
spa_Latn     26790
fra_Latn     25277
por_Latn     18504
pol_Latn     17658
zho_Hant     16805
nld_Latn     16383
ita_Latn     14150
zsm_Latn     13571
jav_Latn     11511
mlt_Latn     10152
swe_Latn      9908
tur_Latn      9248
pes_Arab      7472
dan_Latn      7123
vie_Latn      7011
zul_Latn      6722
nno_Latn      6232
arb_Arab      6179
kor_Hang      5957
ces_Latn      5730
ind_Latn      4996
cat_Latn      4853
hun_Latn      4807
ell_Grek      4444
ron_Latn      4273
slk_Latn      3594
ilo_Latn      3261
fin_Latn      3041
tha_Thai      2950
lvs_Latn      2841
nob_Latn      2550
ukr_Cyrl      2508
hat_Latn      2410
hrv_Latn      2299
bul_Cyrl      2258
vec_Latn      2135
srd_Latn      1985
lit_Latn      1892
arz_Arab      1868
lim_Latn      1801
est_Latn      1781
slv_Latn      1746
grn_Latn      1612
heb_Hebr      1556
prs_Arab      1275
Name: lang, dtype: int64

In [8]:
lang_cnts = df.lang.value_counts().tail(25)
lang_cnts

snd_Arab    9
shn_Mymr    8
san_Deva    8
fij_Latn    8
lua_Latn    8
cjk_Latn    7
kmb_Latn    7
kas_Arab    7
mos_Latn    7
dik_Latn    7
asm_Beng    5
kik_Latn    4
umb_Latn    4
mai_Deva    4
tum_Latn    4
sot_Latn    4
tir_Ethi    4
kas_Deva    3
kam_Latn    3
kon_Latn    2
taq_Latn    2
fon_Latn    2
ace_Arab    2
kac_Latn    1
acq_Arab    1
Name: lang, dtype: int64

In [9]:
tld_cnts = df.tld.value_counts().head(100)
tld_cnts

com       591681
ru         44734
net        43759
org        43172
cn         39040
           ...  
bid          299
name         297
la           273
am           260
webcam       243
Name: tld, Length: 100, dtype: int64

# Study P(tld|lang) and P(lang|tld)

### for clarity leave only 50 most frequent languages and 100 most frequent TLDs

In [11]:
fdf = df[df.lang.isin(df.lang.value_counts().head(50).index) & df.tld.isin(df.tld.value_counts().head(100).index)]

In [12]:
df_langp = fdf.groupby('tld').lang.value_counts(normalize=True)
df_tldp = fdf.groupby('lang').tld.value_counts(normalize=True)
df_langp

tld  lang    
ae   eng_Latn    0.726027
     arb_Arab    0.150685
     arz_Arab    0.030822
     nld_Latn    0.020548
     jav_Latn    0.013699
                   ...   
za   lit_Latn    0.000926
     lvs_Latn    0.000926
     rus_Cyrl    0.000926
     slv_Latn    0.000926
     zho_Hans    0.000926
Name: lang, Length: 3294, dtype: float64

In [13]:
ldf = df_langp.to_frame().rename(columns={'lang':'P(lang|tld)'}).reset_index()
tdf = df_tldp.to_frame().rename(columns={'tld':'P(tld|lang)'}).reset_index()

qdf = ldf.merge(tdf, on=['tld','lang'], how='inner')

In [14]:
# Which TLDs have more than 50% of documents in a single language?
qdf[qdf["P(lang|tld)"]>0.5].sort_values(by='P(tld|lang)', ascending=False).style.background_gradient(axis=0)  

Unnamed: 0,tld,lang,P(lang|tld),P(tld|lang)
749,cz,ces_Latn,0.806546,0.726953
699,com,eng_Latn,0.506948,0.689378
2457,ro,ron_Latn,0.765946,0.671305
1294,hu,hun_Latn,0.867026,0.642707
1208,gr,ell_Grek,0.806471,0.610577
802,de,deu_Latn,0.816396,0.610003
2315,pl,pol_Latn,0.885822,0.600057
2524,ru,rus_Cyrl,0.869051,0.599387
1379,il,heb_Hebr,0.625203,0.498703
1576,it,ita_Latn,0.73216,0.491739


# Select TLDs to sample from for a few languages we might be interested in labeling some data for; just to debug the data selection procedure

In [15]:
langs = [
    'nob_Latn', 'nno_Latn',
    'rus_Cyrl','glg_Latn','eus_Latn','cat_Latn','spa_Latn',
         'fin_Latn','ces_Latn',
    'gle_Latn', 'gla_Latn','cym_Latn',
    'est_Latn','heb_Hebr','hun_Latn','fra_Latn',
    'prs_Arab']

### When selecting documents from some TLD as an approximation of selecting documents in a given language, P(lang|tld) corresponds to precision and P(tld|lang) to recall. 

In [16]:
# tmp = qdf[qdf['P(tld|lang)']>0.1]
tmp = qdf
toptld = tmp.sort_values(by='P(lang|tld)', ascending=False).groupby('lang').apply(lambda g: g.head(3))
toptld[toptld.lang.isin(langs)].style.background_gradient(axis=0)  

Unnamed: 0_level_0,Unnamed: 1_level_0,tld,lang,P(lang|tld),P(tld|lang)
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cat_Latn,439,cat,cat_Latn,0.8025,0.066681
cat_Latn,3172,xxx,cat_Latn,0.041573,0.023058
cat_Latn,975,es,cat_Latn,0.019198,0.023473
ces_Latn,749,cz,ces_Latn,0.806546,0.726953
ces_Latn,3102,webcam,ces_Latn,0.052632,0.002107
ces_Latn,1020,eu,ces_Latn,0.031465,0.020896
est_Latn,945,ee,est_Latn,0.70557,0.302445
est_Latn,3133,ws,est_Latn,0.007212,0.001706
est_Latn,3190,xxx,est_Latn,0.006742,0.010233
fin_Latn,1062,fi,fin_Latn,0.726786,0.404573


In [17]:
qdf[qdf.lang.str.contains('fra_Latn')].sort_values(by="P(lang|tld)", ascending=False).style.background_gradient(axis=0)  

Unnamed: 0,tld,lang,P(lang|tld),P(tld|lang)
1130,fr,fra_Latn,0.762872,0.289585
175,be,fra_Latn,0.203617,0.028187
497,ch,fra_Latn,0.125561,0.018092
394,ca,fra_Latn,0.112544,0.021847
2721,tk,fra_Latn,0.087786,0.000929
2400,pro,fra_Latn,0.057971,0.001131
1017,eu,fra_Latn,0.055791,0.008521
654,co,fra_Latn,0.044724,0.004159
1464,info,fra_Latn,0.03683,0.013044
305,blog,fra_Latn,0.033003,0.000404
