In [63]:
import os
import csv
import cld3
import pandas as pd

from tqdm import tqdm
from itertools import chain

tqdm.pandas()

In [2]:
!ls /eee/tgnews/meta/all_clean

20191101.tsv  20191106.tsv  20191111.tsv  20191116.tsv	20191121.tsv
20191102.tsv  20191107.tsv  20191112.tsv  20191117.tsv	20191122.tsv
20191103.tsv  20191108.tsv  20191113.tsv  20191118.tsv	20191123.tsv
20191104.tsv  20191109.tsv  20191114.tsv  20191119.tsv	20191124.tsv
20191105.tsv  20191110.tsv  20191115.tsv  20191120.tsv	20191125.tsv


In [3]:
def detect_lang(text):
    
    lang_pred = cld3.get_language(text)
    
    if lang_pred is not None:
        return lang_pred.language
    
    return None

In [4]:
load_path = "/eee/tgnews/meta/all_clean"
dump_path = "/eee/tgnews/meta/all_lang"

for table_name in os.listdir(load_path):
    
    table = pd.read_csv(os.path.join(load_path, table_name), sep='\t', keep_default_na=False, quoting=csv.QUOTE_NONE)
    table["lang"] = table["text"].progress_apply(detect_lang)
    
    table.to_csv(os.path.join(dump_path, table_name), sep='\t', index=False, quoting=csv.QUOTE_NONE)

100%|██████████| 82168/82168 [00:45<00:00, 1804.48it/s]
100%|██████████| 33686/33686 [00:18<00:00, 1869.99it/s]
100%|██████████| 39811/39811 [00:21<00:00, 1879.05it/s]
100%|██████████| 65288/65288 [00:35<00:00, 1840.27it/s]
100%|██████████| 17025/17025 [00:09<00:00, 1855.26it/s]
100%|██████████| 25336/25336 [00:13<00:00, 1846.80it/s]
100%|██████████| 85834/85834 [00:47<00:00, 1801.92it/s]
100%|██████████| 18171/18171 [00:09<00:00, 1846.85it/s]
100%|██████████| 26492/26492 [00:14<00:00, 1816.35it/s]
100%|██████████| 27382/27382 [00:14<00:00, 1841.26it/s]
100%|██████████| 25337/25337 [00:13<00:00, 1818.82it/s]
100%|██████████| 99066/99066 [00:54<00:00, 1811.50it/s]
100%|██████████| 46523/46523 [00:24<00:00, 1874.77it/s]
100%|██████████| 65066/65066 [00:35<00:00, 1845.61it/s]
100%|██████████| 63752/63752 [00:34<00:00, 1839.48it/s]
100%|██████████| 73818/73818 [00:40<00:00, 1833.48it/s]
100%|██████████| 78129/78129 [00:42<00:00, 1822.57it/s]
100%|██████████| 24997/24997 [00:13<00:00, 1860.

In [5]:
table.head(1)

Unnamed: 0,path,og:site_name,og:url,og:title,og:description,article:published_time,text,related_links,lang
0,/eee/tgnews/data/20191113/21/78958786282511225...,The Insider,https://theins.ru/news/187548,Бундестаг принял поправки к газовой директиве ...,Бундестаг большинством голосов принял поправки...,2019-11-13T21:27:32+00:00,Бундестаг большинством голосов принял поправки...,,ru


In [54]:
#Считаем количество всевозможных языков для каждого источника

load_path = "/eee/tgnews/meta/all_lang"

site_langs = {}
for table_name in tqdm(os.listdir(load_path)):
    
    table = pd.read_csv(os.path.join(load_path, table_name), usecols=["path", "og:site_name", "lang"], sep='\t', quoting=csv.QUOTE_NONE)
    table = table.dropna()
    
    for (site_name, lang), count in table.groupby(["og:site_name", "lang"])["path"].count().iteritems():
        
        site_langs.setdefault(site_name, {})
        
        if lang not in {"ru", "en"}:
            lang = "other"
        
        site_langs[site_name].setdefault(lang, 0)
        site_langs[site_name][lang] += count

100%|██████████| 25/25 [00:12<00:00,  2.96it/s]


In [55]:
#Оставляем только те источники, которые содержат более статей

site_langs = {site_name:langs for site_name, langs in site_langs.items() if sum(langs.values()) >= 250}
print(f"Всего {len(site_langs)} таких источников")

Всего 1100 таких источников


In [56]:
#Нормируем число статей каждого языка для каждого источника

for site_name, langs in tqdm(site_langs.items()):
    
    sum_count = sum(langs.values())
    site_langs[site_name] = {lang:count/sum_count for lang, count in langs.items()}

100%|██████████| 1100/1100 [00:00<00:00, 1230329.17it/s]


In [57]:
#Нормируем списки источников, для которых мы уверены в языке

sites = {"ru":[], "en":[], "other":[]}

for site_name, langs in site_langs.items():
    for lang, weight in langs.items():
        
        if weight > 0.99:
            sites[lang].append(site_name)
            break

In [58]:
#Сохраняем списки

for lang, lang_sites in sites.items():
    with open(f"../lang_list/{lang}.txt", "w") as fl:
        fl.write('\n'.join(lang_sites))

In [59]:
!realpath ../lang_list

/home/arina/Documents/TelegramNews/lang_list


In [60]:
sites = dict(chain(*[[(site, lang) for site in sites] for lang, sites in sites.items()]))

In [62]:
#Скоректируем языки с учётом списков

load_path = "/eee/tgnews/meta/all_lang"
dump_path = "/eee/tgnews/meta/all_lang_list"

for table_name in tqdm(os.listdir(load_path)):
    
    table = pd.read_csv(os.path.join(load_path, table_name), sep='\t', keep_default_na=False, quoting=csv.QUOTE_NONE)
    
    table["lang"] = table["lang"].apply(lambda x: x if x in {"ru", "en"} else "other")
    table["lang"] = table.apply(lambda x: sites[x["og:site_name"]] if x["og:site_name"] in sites else x["lang"] , axis=1)
    
    table.to_csv(os.path.join(dump_path, table_name), sep='\t', index=False, quoting=csv.QUOTE_NONE)

100%|██████████| 25/25 [01:17<00:00,  2.05s/it]


In [71]:
#Разделяем на языки и сохраняем в разные таблицы

load_path = "/eee/tgnews/meta/all_lang_list"
dump_path = "/eee/tgnews/meta/"

table_ru = []
table_en = []

for table_name in tqdm(os.listdir(load_path)):
    
    table = pd.read_csv(os.path.join(load_path, table_name), sep='\t', keep_default_na=False, quoting=csv.QUOTE_NONE)
    
    table_ru.append(table[table["lang"] == "ru"])
    table_en.append(table[table["lang"] == "en"])
    
table_ru = pd.concat(table_ru, axis=0, ignore_index=True, sort=False).drop(columns="lang")
table_en = pd.concat(table_en, axis=0, ignore_index=True, sort=False).drop(columns="lang")

table_ru.to_csv(os.path.join(dump_path, "ru.tsv"), sep='\t', index=False, quoting=csv.QUOTE_NONE)
table_en.to_csv(os.path.join(dump_path, "en.tsv"), sep='\t', index=False, quoting=csv.QUOTE_NONE)

100%|██████████| 25/25 [00:25<00:00,  1.42it/s]


In [72]:
len(table_ru), len(table_en)

(266102, 233388)