In [1]:
import re
import os
import csv
import artm
import numpy as np
import pandas as pd

from tqdm import tqdm
from itertools import chain 
from collections import Counter
from ufal.udpipe import Model, Sentence, ProcessingError

In [2]:
# Directory holding the per-language metadata tables.
load_path = "/eee/tgnews/meta/"

# Only these columns are read from each TSV file.
columns = ["path", "og:site_name", "og:url"]

# Both tables are read with identical options; empty fields stay empty strings
# (keep_default_na=False) and quote characters are treated as plain text.
table_ru, table_en = (
    pd.read_csv(
        os.path.join(load_path, file_name),
        sep='\t',
        usecols=columns,
        keep_default_na=False,
        quoting=csv.QUOTE_NONE,
    )
    for file_name in ("ru.tsv", "en.tsv")
)

In [3]:
# Number of rows in the Russian and English tables.
table_ru.shape[0], table_en.shape[0]

(266102, 233388)

In [4]:
# Extract tokens from a URL.

def url_to_tokens(url, lowercase=True, split_pattern="[^a-z]+", filter_pattern="[a-z]{3,}"):
    """Turn the directory part of a URL path into a list of tokens.

    The scheme and host are removed, the path is split on "/", and the last
    path segment is discarded.

    Args:
        url: URL string.
        lowercase: Lower-case every path segment when true.
        split_pattern: Regex used to split each segment further; ``None`` disables splitting.
        filter_pattern: Regex a token must match (``re.search``) to be kept;
            ``None`` disables filtering.

    Returns:
        List of tokens in the order they appear in the URL.
    """
    path = re.sub("https*://[^/]*", "", url).strip("/")
    segments = path.split("/")[:-1]

    if lowercase:
        segments = list(map(str.lower, segments))

    if split_pattern is not None:
        pieces = []
        for segment in segments:
            pieces.extend(re.split(split_pattern, segment))
        segments = pieces

    if filter_pattern is not None:
        segments = [piece for piece in segments if re.search(filter_pattern, piece)]

    return segments

# Add a "tokens:url" column (list of URL tokens) to both tables.
for table in (table_ru, table_en):
    table["tokens:url"] = table["og:url"].apply(url_to_tokens)

In [5]:
# Build the list of the most frequent URL tokens.

# Thresholds a token must reach to be kept: total occurrences and number of distinct sites.
MIN_TOKEN_COUNT = 100
MIN_TOKEN_SITES = 10

# Per-site token counts. groupby(...).sum() concatenates the list-valued "tokens:url"
# cells of each site; the Russian table is processed first, then the English one.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
site_tokens = {}

for table in (table_ru, table_en):
    for site, site_url_tokens in table.groupby("og:site_name")["tokens:url"].sum().items():
        site_tokens.setdefault(site, Counter()).update(site_url_tokens)

# token -> total number of occurrences over all sites
token_count = {}
# token -> list of sites whose URLs contain the token
token_sites = {}

for site, site_counts in site_tokens.items():
    for token, count in site_counts.items():
        token_count[token] = token_count.get(token, 0) + count
        token_sites.setdefault(token, []).append(site)

tokens = [
    token
    for token, count in token_count.items()
    if count >= MIN_TOKEN_COUNT and len(token_sites[token]) >= MIN_TOKEN_SITES
]

# One token per line, no trailing newline.
with open("../url_tokens.txt", "w") as fl:
    fl.write('\n'.join(tokens))

In [6]:
def predict_class_news_by_url(url_tokens):
    """Guess news categories from the tokens of an article URL.

    The tokens are joined with spaces and every category whose marker list
    contains at least one substring of that joined text is reported.

    Args:
        url_tokens: Iterable of string tokens.

    Returns:
        List of matched category names in the fixed order Society, Economy,
        Sports, Technology, Entertainment, Science; ``None`` when nothing matches.

    NOTE(review): matching is by substring, so e.g. "art" also fires on
    "article", "party" and "smart" — confirm this is intended.
    """
    markers_by_class = (
        ("Society", ("society", "politic", "incident", "criminal", "proisshestviya",
                     "obshchestvo", "social", "murder", "politika", "crime", "kriminal",
                     "parliament", "president", "law", "government", "political")),
        ("Economy", ("econom", "ekonomika", "business", "industry", "biznes",
                     "bank", "energy", "company", "finance")),
        ("Sports", ("sport", "hockey", "football", "tennis", "basketball",
                    "box", "fitness", "athletic", "golf")),
        ("Technology", ("auto", "tehn", "internet", "digital", "mobile",
                        "gadgets", "smart")),
        ("Entertainment", ("art", "kino", "culture", "kultura", "entertainment", "afisha",
                           "cinema", "book", "game", "music", "party", "gaming")),
        ("Science", ("nauka", "health", "research", "science", "medicine")),
    )

    joined = " ".join(url_tokens)

    classes = [
        label
        for label, markers in markers_by_class
        if any(marker in joined for marker in markers)
    ]

    return classes or None

# Add a "class" column: list of URL-derived categories, or None when no category matched.
for table in (table_ru, table_en):
    table["class"] = table["tokens:url"].apply(predict_class_news_by_url)

In [6]:
# URL token to inspect.
url_token = "features"

# Paths of all articles (Russian first, then English) whose URL tokens contain `url_token`.
paths = [
    path
    for table in (table_ru, table_en)
    for path in table.loc[table["tokens:url"].apply(lambda url_tokens: url_token in url_tokens), "path"]
]

In [22]:
# Print one randomly chosen article from `paths`.
# The sampled article shown in the output declares charset utf-8, so the encoding is
# passed explicitly instead of relying on the platform default.
with open(paths[np.random.choice(len(paths))], encoding="utf-8") as fl:
    print(fl.read())

<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8"/>
    <meta property="og:url" content="https://nowtoronto.com/movies/features/disney-frozen-2-indigenous-culture-sami/"/>
    <meta property="og:site_name" content="NOW"/>
    <meta property="article:published_time" content="2019-11-19T16:32:33+00:00"/>
    <meta property="og:title" content="Disney signed a contract with Indigenous people before making Frozen II"/>
    <meta property="og:description" content="After a debate about cultural appropriation surrounded Frozen, Disney agreed in writing to respectfully portray Sámi culture in the sequel"/>
  </head>
  <body>
    <article>
      <h1>Disney signed a contract with Indigenous people before making Frozen II</h1>
      <h2>After a debate about cultural appropriation surrounded Frozen, Disney agreed in writing to respectfully portray Sámi culture in the sequel</h2>
      <address><time datetime="2019-11-19T16:32:33+00:00">19 Nov 2019, 16:32</time> by <a rel="author">Radheyan S

In [7]:
# How often each URL-derived category occurs in the Russian table (unlabelled rows skipped).
Counter(chain.from_iterable(table_ru["class"].dropna())).most_common()

[('Society', 25169),
 ('Entertainment', 18662),
 ('Economy', 6747),
 ('Sports', 6638),
 ('Science', 2394),
 ('Technology', 2159)]

In [8]:
# How often each URL-derived category occurs in the English table (unlabelled rows skipped).
Counter(chain.from_iterable(table_en["class"].dropna())).most_common()

[('Entertainment', 36445),
 ('Sports', 20183),
 ('Economy', 9490),
 ('Society', 8379),
 ('Science', 2132),
 ('Technology', 1219)]

**ru**

In [24]:
# Attach the preprocessed Russian columns to the metadata table, matching rows on "path".
# NOTE(review): this rebinds table_ru, so running the cell a second time joins onto the
# already-joined frame.
preproc_ru = pd.read_csv(os.path.join(load_path, "ru_preproc.csv"), keep_default_na=False)
table_ru = table_ru.join(preproc_ru.set_index("path"), on="path")

In [10]:
# Load the UDPipe model from the Russian model file.
# NOTE(review): the result of Model.load is not checked before use.
udpipe_model = Model.load('/eee/tgnews/misc/ru.udpipe')
# Reader used by conllu_encode: a tokenizer is requested with the "conllu" option and
# newConlluInputFormat() is then called on the result.
# NOTE(review): presumably this yields a CoNLL-U input reader rather than a raw-text
# tokenizer — confirm against the UDPipe bindings.
# NOTE(review): the same reader is later applied to table_en as well.
conllu_tokenizer = udpipe_model.newTokenizer("conllu").newConlluInputFormat()

def conllu_encode(conllu_text):
    """Read every sentence from ``conllu_text`` with the module-level ``conllu_tokenizer``.

    Args:
        conllu_text: Text handed to ``conllu_tokenizer.setText``.
            Presumably CoNLL-U formatted — TODO confirm against the preprocessing step.

    Returns:
        A list with one entry per sentence read; each entry is ``sentence.words[1:]``.

    NOTE(review): ``error`` is never inspected after the loop, so a reader failure
    cannot be told apart from the end of the input.
    """
    
    sentence = Sentence()
    error = ProcessingError()
    
    conllu_tokenizer.setText(conllu_text)
    
    sentences = []
    # The same `sentence` object is passed to nextSentence on every iteration;
    # the loop ends when nextSentence returns a falsy value.
    while conllu_tokenizer.nextSentence(sentence, error):
        # Skip the first word; presumably the artificial root node at index 0 — confirm.
        sentences.append(sentence.words[1:]) 
        
    return sentences

In [25]:
# Replace the title and text columns of the Russian table with their parsed sentences.
# NOTE(review): the columns are overwritten in place, so this cell must run exactly once.
for column in ("og:title", "text"):
    table_ru[column] = table_ru[column].apply(conllu_encode)

In [12]:
def vowpal_wabbit_encode(sents):
    """Encode parsed sentences as a Vowpal Wabbit bag-of-words string.

    Only words whose ``upostag`` is ADJ, INTJ, NOUN, PROPN or VERB are kept and
    each is represented by its ``lemma``. Characters that have a structural
    meaning in a Vowpal Wabbit line (":" separates token and count, "|" starts
    a modality, whitespace separates tokens) are removed from the lemma; lemmas
    that become empty are dropped.

    Args:
        sents: Iterable of sentences, each an iterable of objects exposing
            ``lemma`` and ``upostag`` attributes.

    Returns:
        Space-separated tokens in order of first occurrence; a token that
        occurs more than once is written as ``token:count``. Empty string when
        nothing is kept.
    """
    upostags = {'ADJ', 'INTJ', 'NOUN', 'PROPN', 'VERB'}

    counts = Counter()
    for sent in sents:
        for word in sent:
            if word.upostag not in upostags:
                continue
            # split()/join removes every kind of whitespace, including inner spaces.
            token = ''.join(word.lemma.replace(":", "").replace("|", "").split())
            if token:
                counts[token] += 1

    return ' '.join(token if count == 1 else f'{token}:{count}' for token, count in counts.items())

In [26]:
# Replace the parsed title and text columns of the Russian table with their Vowpal Wabbit strings.
# NOTE(review): the columns are overwritten in place, so this cell must run exactly once.
for column in ("og:title", "text"):
    table_ru[column] = table_ru[column].apply(vowpal_wabbit_encode)

In [29]:
# Dump the Russian corpus in Vowpal Wabbit format, one document per line:
#   <path> |title_tokens <title tokens> |text_tokens <text tokens>
# The encoding is explicit because the lemmas are Cyrillic and the platform default
# encoding is not guaranteed to be UTF-8. Lines are streamed instead of being joined
# into one large in-memory string.
# NOTE(review): BigARTM presumably expects this file in UTF-8 — confirm.
with open("../table_ru_vw.txt", "w", encoding="utf-8") as fl:
    for path, title, text in zip(table_ru["path"], table_ru["og:title"], table_ru["text"]):
        fl.write(path + " |title_tokens " + title + " |text_tokens " + text + '\n')

# Convert the Vowpal Wabbit file into BigARTM batches stored under ../batches_ru.
batch_vectorizer = artm.BatchVectorizer(data_path='../table_ru_vw.txt',
                                        data_format='vowpal_wabbit',
                                        target_folder='../batches_ru')

In [30]:
# Filter the dictionary built by the BatchVectorizer with the min_df=3 / min_tf=5 thresholds.
# NOTE(review): presumably min_df is a document-frequency cutoff and min_tf a
# term-frequency cutoff — confirm against the BigARTM documentation.
batch_vectorizer.dictionary.filter(min_df=3, min_tf=5)

# Persist the dictionary twice: via save_text (.txt) and via save (.dict).
batch_vectorizer.dictionary.save_text(dictionary_path='../models/dictionary_ru.txt')
batch_vectorizer.dictionary.save(dictionary_path='../models/dictionary_ru.dict')

In [31]:
# Model size and how many top tokens to report per topic.
num_topics = 8
num_tokens = 10

# Modalities written to the Vowpal Wabbit file and the name of every topic.
class_ids = ["title_tokens", "text_tokens"]
topic_names = ["Society", "Economy", "Sports", "Technology", "Entertainment", "Science", "Other", "background"]

# Russian documents that received at least one URL-derived class.
docs_for_sst = table_ru.loc[table_ru["class"].notna(), ["path", "class"]]
n_docs = docs_for_sst.shape[0]

doc_titles = list(docs_for_sst["path"])
# For every labelled document: positions of its classes inside topic_names.
topic_indecies = [
    [topic_names.index(topic) for topic in doc_classes]
    for doc_classes in docs_for_sst["class"]
]

# One row per labelled document: 1 at each labelled topic, then rows are scaled to sum to 1.
doc_topic_coef = np.zeros((n_docs, num_topics))
for row, columns_of_row in enumerate(topic_indecies):
    doc_topic_coef[row, columns_of_row] = 1

doc_topic_coef = doc_topic_coef / doc_topic_coef.sum(axis=1, keepdims=True)
doc_topic_coef = doc_topic_coef.tolist()

In [32]:
# Topic model over both modalities, with named topics and the filtered dictionary.
# NOTE(review): no seed is passed; presumably results differ between runs — confirm.
model = artm.ARTM(num_topics=num_topics, 
                  class_ids=class_ids,
                  topic_names=topic_names,
                  dictionary=batch_vectorizer.dictionary, 
                  theta_columns_naming="title")

# Theta regularizer fed with the labelled documents (doc_titles) and their per-topic
# coefficients (doc_topic_coef).
# NOTE(review): presumably this pulls labelled documents towards their URL-derived topics — confirm.
model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='SST', tau=100, 
                                                         doc_titles=doc_titles, 
                                                         doc_topic_coef=doc_topic_coef))

# Phi regularizer restricted to the "background" topic.
model.regularizers.add(artm.SmoothSparsePhiRegularizer('SmoothPhi_back', tau=1, gamma=0, 
                                                       topic_names=["background"]))

# Decorrelation regularizer; no topic_names are given here.
model.regularizers.add(artm.DecorrelatorPhiRegularizer('DecPhi', 
                                                       tau=0.1, 
                                                       gamma=0))

# One top-tokens score per modality, named TT_<modality>.
for class_id in class_ids:
    model.scores.add(artm.TopTokensScore(name=f'TT_{class_id}', class_id=class_id, num_tokens=num_tokens))
    
# Offline fitting with 5 passes over the collection.
model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=5)

In [33]:
# For every tracked score print its name, then each topic name followed by its
# top tokens joined with "; ", and a blank line after the score.
for score_name, tracker in model.score_tracker.items():
    print(score_name)
    for topic_name, top_tokens in tracker.last_tokens.items():
        print(topic_name, '; '.join(top_tokens), sep='\n')
    print()

TT_title_tokens
Society
Россия; Украина; область; дело; задержать; человек; район; погибнуть; суд; житель
Economy
год; рубль; миллион; Украина; миллиард; газ; банк; мочь; бюджет; рынок
Sports
матч; сборная; Кубок; победа; тренер; Динамо; чемпионат; команда; игрок; выиграть
Technology
США; Трамп; выборы; представить; президент; поезд; автомобиль; Зеленский; Турция; Лукашенко
Entertainment
новый; год; стать; ноябрь; день; пройти; рассказать; фильм; конкурс; Москва
Science
ученый; врач; назвать; Сирия; рак; продукт; здоровье; опасный; обнаружить; рассказывать
Other
школа; знак; приложение; школьник; Госдума; Гугл; пользователь; Мелитополь; умный; мусор
background
к; ро; ча; примета; м; н; ний; р; в; ние

TT_text_tokens
Society
ноябрь; Россия; сообщать; дело; человек; Украина; сообщить; область; время; район
Economy
рубль; компания; миллион; миллиард; Россия; банк; проект; тысяча; рынок; цена
Sports
команда; матч; сборная; клуб; чемпионат; игра; победа; тренер; сезон; игрок
Technology
США;

In [34]:
# Persist the fitted Russian model in two forms: dump_artm_model and save.
# NOTE(review): both calls write under ../models, which presumably must already exist — confirm.
model.dump_artm_model('../models/cl_news_model_ru.dump_tm')
model.save('../models/cl_news_model_ru.save_tm')

**en**

In [9]:
# Attach the preprocessed English columns to the metadata table, matching rows on "path".
# NOTE(review): this rebinds table_en, so running the cell a second time joins onto the
# already-joined frame.
preproc_en = pd.read_csv(os.path.join(load_path, "en_preproc.csv"), keep_default_na=False)
table_en = table_en.join(preproc_en.set_index("path"), on="path")

In [11]:
# Replace the title and text columns of the English table with their parsed sentences.
# NOTE(review): conllu_encode uses the reader created from the Russian model file — confirm
# that this is intended for English input.
for column in ("og:title", "text"):
    table_en[column] = table_en[column].apply(conllu_encode)

In [13]:
# Replace the parsed title and text columns of the English table with their Vowpal Wabbit strings.
# NOTE(review): the columns are overwritten in place, so this cell must run exactly once.
for column in ("og:title", "text"):
    table_en[column] = table_en[column].apply(vowpal_wabbit_encode)

In [14]:
# Dump the English corpus in Vowpal Wabbit format, one document per line:
#   <path> |title_tokens <title tokens> |text_tokens <text tokens>
# The encoding is explicit so that non-ASCII characters in the lemmas do not depend on
# the platform default encoding. Lines are streamed instead of being joined into one
# large in-memory string.
# NOTE(review): BigARTM presumably expects this file in UTF-8 — confirm.
with open("../table_en_vw.txt", "w", encoding="utf-8") as fl:
    for path, title, text in zip(table_en["path"], table_en["og:title"], table_en["text"]):
        fl.write(path + " |title_tokens " + title + " |text_tokens " + text + '\n')

# Convert the Vowpal Wabbit file into BigARTM batches stored under ../batches_en.
batch_vectorizer = artm.BatchVectorizer(data_path='../table_en_vw.txt',
                                        data_format='vowpal_wabbit',
                                        target_folder='../batches_en')

In [15]:
# Filter the dictionary built by the BatchVectorizer with the min_df=3 / min_tf=5 thresholds.
# NOTE(review): presumably min_df is a document-frequency cutoff and min_tf a
# term-frequency cutoff — confirm against the BigARTM documentation.
batch_vectorizer.dictionary.filter(min_df=3, min_tf=5)

# Persist the dictionary twice: via save_text (.txt) and via save (.dict).
batch_vectorizer.dictionary.save_text(dictionary_path='../models/dictionary_en.txt')
batch_vectorizer.dictionary.save(dictionary_path='../models/dictionary_en.dict')

In [16]:
# Model size and how many top tokens to report per topic.
num_topics = 8
num_tokens = 10

# Modalities written to the Vowpal Wabbit file and the name of every topic.
class_ids = ["title_tokens", "text_tokens"]
topic_names = ["Society", "Economy", "Sports", "Technology", "Entertainment", "Science", "Other", "background"]

# English documents that received at least one URL-derived class.
docs_for_sst = table_en.loc[table_en["class"].notna(), ["path", "class"]]
n_docs = docs_for_sst.shape[0]

doc_titles = list(docs_for_sst["path"])
# For every labelled document: positions of its classes inside topic_names.
topic_indecies = [
    [topic_names.index(topic) for topic in doc_classes]
    for doc_classes in docs_for_sst["class"]
]

# One row per labelled document: 1 at each labelled topic, then rows are scaled to sum to 1.
doc_topic_coef = np.zeros((n_docs, num_topics))
for row, columns_of_row in enumerate(topic_indecies):
    doc_topic_coef[row, columns_of_row] = 1

doc_topic_coef = doc_topic_coef / doc_topic_coef.sum(axis=1, keepdims=True)
doc_topic_coef = doc_topic_coef.tolist()

In [17]:
# Topic model over both modalities, with named topics and the filtered dictionary.
# NOTE(review): no seed is passed; presumably results differ between runs — confirm.
model = artm.ARTM(num_topics=num_topics, 
                  class_ids=class_ids,
                  topic_names=topic_names,
                  dictionary=batch_vectorizer.dictionary, 
                  theta_columns_naming="title")

# Theta regularizer fed with the labelled documents (doc_titles) and their per-topic
# coefficients (doc_topic_coef).
# NOTE(review): presumably this pulls labelled documents towards their URL-derived topics — confirm.
model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='SST', tau=100, 
                                                         doc_titles=doc_titles, 
                                                         doc_topic_coef=doc_topic_coef))

# Phi regularizer restricted to the "background" topic.
model.regularizers.add(artm.SmoothSparsePhiRegularizer('SmoothPhi_back', tau=1, gamma=0, 
                                                       topic_names=["background"]))

# Decorrelation regularizer; no topic_names are given here.
model.regularizers.add(artm.DecorrelatorPhiRegularizer('DecPhi', 
                                                       tau=0.1, 
                                                       gamma=0))

# One top-tokens score per modality, named TT_<modality>.
for class_id in class_ids:
    model.scores.add(artm.TopTokensScore(name=f'TT_{class_id}', class_id=class_id, num_tokens=num_tokens))
    
# Offline fitting with 5 passes over the collection.
model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=5)

In [18]:
# For every tracked score print its name, then each topic name followed by its
# top tokens joined with "; ", and a blank line after the score.
for score_name, tracker in model.score_tracker.items():
    print(score_name)
    for topic_name, top_tokens in tracker.last_tokens.items():
        print(topic_name, '; '.join(top_tokens), sep='\n')
    print()

TT_title_tokens
Society
election; trump; court; impeachment; president; house; Ukraine; vote; Impeachment; Johnson
Economy
trade; China; US; market; bank; price; business; deal; company; Q
Sports
win; have; game; v; man; star; make; city; team; play
Technology
Friday; Google; black; get; best; New; deal; Christmas; Pro; Disney
Entertainment
say; new; year; have; make; US; get; day; protest; first
Science
man; death; child; health; study; drug; attack; hospital; find; murder
Other
school; star; crash; student; woman; police; shoot; dog; dead; celebrity
background
IndiaGlitz.co; sums; Burna; Lottery; Lanez; Winners; Telugu; Davido; en; Nomination

TT_text_tokens
Society
president; party; state; trump; mister; election; government; house; politic; court
Economy
company; market; year; business; cent; trade; price; bank; growth; stock
Sports
have; game; go; play; make; year; team; first; time; get
Technology
new; make; feature; re; Google; black; use; device; get; look
Entertainment
say; ye

In [19]:
# Persist the fitted English model in two forms: dump_artm_model and save.
# NOTE(review): both calls write under ../models, which presumably must already exist — confirm.
model.dump_artm_model('../models/cl_news_model_en.dump_tm')
model.save('../models/cl_news_model_en.save_tm')