In [1]:
import os
import re
import csv
import pickle
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from itertools import chain
from collections import Counter
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from ufal.udpipe import Model, Sentence, ProcessingError
from sklearn.metrics import balanced_accuracy_score, classification_report

warnings.filterwarnings("ignore")

In [2]:
!ls /eee/tgnews/meta/all_lang_list

20191101.tsv  20191106.tsv  20191111.tsv  20191116.tsv	20191121.tsv
20191102.tsv  20191107.tsv  20191112.tsv  20191117.tsv	20191122.tsv
20191103.tsv  20191108.tsv  20191113.tsv  20191118.tsv	20191123.tsv
20191104.tsv  20191109.tsv  20191114.tsv  20191119.tsv	20191124.tsv
20191105.tsv  20191110.tsv  20191115.tsv  20191120.tsv	20191125.tsv


In [3]:
load_path = "/eee/tgnews/meta/"

columns = ["path", "og:site_name", "og:url", "og:title"]

table_ru = pd.read_csv(os.path.join(load_path, "ru.tsv"), sep='\t', usecols=columns, keep_default_na=False, quoting=csv.QUOTE_NONE)
table_en = pd.read_csv(os.path.join(load_path, "en.tsv"), sep='\t', usecols=columns, keep_default_na=False, quoting=csv.QUOTE_NONE)

In [4]:
#Выделение токенов из url

def url_to_tokens(url, lowercase=True, split_pattern="[^a-z]+", filter_pattern="[a-z]{3,}"):
    
    tokens = re.sub("https*://[^/]*", "", url).strip("/").split("/")[:-1]
    
    if lowercase:
        tokens = [token.lower() for token in tokens]
        
    if split_pattern is not None:
        tokens = list(chain(*[re.split(split_pattern, token) for token in tokens]))
        
    if filter_pattern is not None:
        tokens = list(filter(lambda x: re.search(filter_pattern, x), tokens))
    
    return tokens

table_ru["tokens:url"] = table_ru["og:url"].apply(url_to_tokens)
table_en["tokens:url"] = table_en["og:url"].apply(url_to_tokens)

In [5]:
#Формируем список наиболее частотных токенов

site_tokens = dict([(site, Counter(tokens)) for site, tokens in table_ru.groupby("og:site_name")["tokens:url"].sum().iteritems()])

for site, tokens in table_en.groupby("og:site_name")["tokens:url"].sum().iteritems():
    
    site_tokens.setdefault(site, Counter())
    site_tokens[site] += Counter(tokens)
    
token_count = {}
token_sites = {}

for site, tokens in site_tokens.items():
    for token, count in tokens.items():
        
        token_count.setdefault(token, 0)
        token_count[token] += count
        
        token_sites.setdefault(token, [])
        token_sites[token].append(site)
        
tokens = [token for token, count in token_count.items() if (count >= 100) and (len(token_sites[token]) >= 10)]

with open("../url_tokens.txt", "w") as fl:
    fl.write('\n'.join(tokens))

In [6]:
url_token = "recipes"

paths = table_ru[table_ru["tokens:url"].apply(lambda x: url_token in x)]["path"].tolist()
paths = paths + table_en[table_en["tokens:url"].apply(lambda x: url_token in x)]["path"].tolist()

In [7]:
with open(paths[np.random.choice(range(len(paths)))]) as fl:
    print(fl.read())

<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8"/>
    <meta property="og:url" content="https://www.telegraph.co.uk/recipes/0/seeded-crackersfor-cheese-recipe/"/>
    <meta property="og:site_name" content="The Telegraph"/>
    <meta property="article:published_time" content="2019-11-22T12:00:00+00:00"/>
    <meta property="og:title" content="Seeded crackers for cheese recipe"/>
    <meta property="og:description" content="There are some excellent ready-made crackers available (Peter&amp;rsquo;s Yard are my particular weakness), but there&amp;rsquo;s still something special about home-made ones, particularly if you present them with a whole soft cheese, or a good sized chunk of stilton or cheddar."/>
  </head>
  <body>
    <article>
      <h1>Seeded crackers for cheese recipe</h1>
      <address><time datetime="2019-11-22T12:00:00+00:00">22 Nov 2019, 12:00</time> by <a rel="author" href="https://www.telegraph.co.uk/authors/xanthe-clay/" target="_blank">Xanthe Clay</a></address>

In [6]:
#По ключевым словам из url определяем является ли статья новостью и формируем обуч. выборку

def predict_is_news_by_url(url_tokens):
    
    url_tokens = " ".join(url_tokens)
    
    for part in ["news",
                 "novosti"]:
    
        if part in url_tokens:
            return 1
        
    for part in ["fashion",
                 "style",
                 "beauty",
                 "trend",
                 "sex",
                 "recipe"]:
    
        if part in url_tokens:
            return 0
    
    return None

table_ru["is_news"] = table_ru["tokens:url"].apply(predict_is_news_by_url)
table_en["is_news"] = table_en["tokens:url"].apply(predict_is_news_by_url)

In [7]:
#считываем разметку

table_ru = table_ru.set_index('og:url')
table_en = table_en.set_index('og:url')

for table_name in os.listdir("../articles_with_true_is_news"):
    
    table_with_true_is_news = pd.read_csv(os.path.join("../articles_with_true_is_news", table_name)).set_index("og:url")
    
    table_ru.update(table_with_true_is_news)
    table_en.update(table_with_true_is_news)
    
table_ru = table_ru.reset_index()
table_en = table_en.reset_index()

In [8]:
#Удаляем 100000 строчек с is_news == 1

table_ru = table_ru.drop(np.random.choice(table_ru[table_ru["is_news"] == 1].index, 100000))
table_en = table_en.drop(np.random.choice(table_en[table_en["is_news"] == 1].index, 100000))

In [9]:
Counter(table_ru["is_news"].dropna()), Counter(table_en["is_news"].dropna())

(Counter({1.0: 53577, 0.0: 1619}), Counter({0.0: 2557, 1.0: 23251}))

In [10]:
#Удаляем 100000 строчек с is_news == None

table_ru = table_ru.drop(np.random.choice(table_ru[table_ru["is_news"].isna()].index, 100000))
table_en = table_en.drop(np.random.choice(table_en[table_en["is_news"].isna()].index, 100000))

In [11]:
sum(table_ru["is_news"].isna()), sum(table_en["is_news"].isna())

(71009, 77030)

Далее будем пользоваться данными предобработанными с помощью команд:

python preproc_udpipe.py /eee/tgnews/misc/ru.udpipe /eee/tgnews/meta/ru.tsv /eee/tgnews/meta/ru_preproc.csv 10 path og:title text

python preproc_udpipe.py /eee/tgnews/misc/en.udpipe /eee/tgnews/meta/en.tsv /eee/tgnews/meta/en_preproc.csv 10 path og:title text

In [12]:
#Добавляем предобработанные тексты

table_ru = table_ru.join(pd.read_csv(os.path.join(load_path, "ru_preproc.csv"), keep_default_na=False).set_index("path"), 
                         on="path", rsuffix="_preproc")
table_en = table_en.join(pd.read_csv(os.path.join(load_path, "en_preproc.csv"), keep_default_na=False).set_index("path"), 
                         on="path", rsuffix="_preproc")

In [13]:
#Извлечение предложений из формата conllu

udpipe_model = Model.load('/eee/tgnews/misc/ru.udpipe')
conllu_tokenizer = udpipe_model.newTokenizer("conllu").newConlluInputFormat()

def conllu_encode(conllu_text):
    
    sentence = Sentence()
    error = ProcessingError()
    
    conllu_tokenizer.setText(conllu_text)
    
    sentences = []
    while conllu_tokenizer.nextSentence(sentence, error):
        sentences.append(sentence.words[1:]) 
        
    return sentences

table_ru["og:title_preproc"] = table_ru["og:title_preproc"].apply(conllu_encode)
table_ru["text"] = table_ru["text"].apply(conllu_encode)

table_en["og:title_preproc"] = table_en["og:title_preproc"].apply(conllu_encode)
table_en["text"] = table_en["text"].apply(conllu_encode)

In [14]:
flags_na_ru = table_ru["is_news"].isna()
flags_na_en = table_en["is_news"].isna()

In [15]:
#Преобразование текста в вектор признаков (морфология)

with open("../morph_list/Upostags.txt") as fl:
    upostag_map = fl.read().split('\n')
    
with open("../morph_list/Feats.txt") as fl:
    feat_map = fl.read().split('\n')
    
upostag_map = dict(zip(upostag_map, range(len(upostag_map))))
feat_map = dict(zip(feat_map, range(len(feat_map))))

def get_vec(item_map, items):
    
    vec = np.zeros(len(item_map))
    
    if not len(items):
        return vec
    
    n_items = 0
    for item in items:
        
        if item in item_map:
            
            vec[item_map[item]] += 1
            n_items += 1
            
    return vec / n_items

In [16]:
#Валидация логистической регрессии

def validation(X, y, penalty="l2", l1_ratio=0):
    
    kf = KFold(n_splits=10)

    scores = []
    for train_index, test_index in kf.split(X, y):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        lr = LogisticRegression(penalty=penalty, l1_ratio=l1_ratio, solver='saga', class_weight='balanced', n_jobs=-1).fit(X_train, y_train)
        scores.append(balanced_accuracy_score(y_test, lr.predict(X_test)))

    print(f"balanced_accuracy: {sum(scores) / len(scores)}\n")
    print(classification_report(y_test, lr.predict(X_test)))

In [17]:
#Подбор комбинации признаков

def feature(table, y):
    
    title_upostag = table["og:title_preproc"].apply(lambda x: [[word.upostag for word in sent] for sent in x]) \
                                             .apply(lambda x: get_vec(upostag_map, list(chain(*x))))

    title_upostag = np.array(title_upostag.tolist())
    
    text_upostag = table["text"].apply(lambda x: [[word.upostag for word in sent] for sent in x]) \
                                .apply(lambda x: get_vec(upostag_map, list(chain(*x))))

    text_upostag = np.array(text_upostag.tolist())
    
    title_text_upostag = (table["og:title_preproc"].apply(lambda x: [[word.upostag for word in sent] for sent in x]) + \
                          table["text"].apply(lambda x: [[word.upostag for word in sent] for sent in x]))

    title_text_upostag = title_text_upostag.apply(lambda x: get_vec(upostag_map, list(chain(*x))))
    title_text_upostag = np.array(title_text_upostag.tolist())
    
    title_feats = table["og:title_preproc"].apply(lambda x: [[word.feats.split("|") for word in sent if word.feats] for sent in x]) \
                                           .apply(lambda x: [list(chain(*sent)) for sent in x]) \
                                           .apply(lambda x: get_vec(feat_map, list(chain(*x))))

    title_feats = np.array(title_feats.tolist())
    
    text_feats = table["text"].apply(lambda x: [[word.feats.split("|") for word in sent if word.feats] for sent in x]) \
                              .apply(lambda x: [list(chain(*sent)) for sent in x]) \
                              .apply(lambda x: get_vec(feat_map, list(chain(*x))))

    text_feats = np.array(text_feats.tolist())
    
    title_text_feats = (table["og:title_preproc"].apply(lambda x: [[word.feats.split("|") for word in sent if word.feats] for sent in x]) \
                                                 .apply(lambda x: [list(chain(*sent)) for sent in x]) + \
                        table["text"].apply(lambda x: [[word.feats.split("|") for word in sent if word.feats] for sent in x]) \
                                     .apply(lambda x: [list(chain(*sent)) for sent in x]))

    title_text_feats = title_text_feats.apply(lambda x: get_vec(feat_map, list(chain(*x))))
    title_text_feats = np.array(title_text_feats.tolist())
    
    print("titles upostag")
    validation(title_upostag, y)
    
    print("texts upostag")
    validation(text_upostag, y)
    
    print("sum titles texts upostag")
    validation((title_upostag + text_upostag) / 2, y)

    print("concat titles and texts upostag ")
    validation(np.concatenate([title_upostag, text_upostag], axis=1) / 2, y)
    
    print("join titles and texts upostag")
    validation(title_text_upostag, y)
    
    print("titles feats")
    validation(title_feats, y)
    
    print("texts feats")
    validation(text_feats, y)
    
    print("sum titles texts feats")
    validation((title_feats + text_feats) / 2, y)

    print("concat titles and texts feats ")
    validation(np.concatenate([title_feats, text_feats], axis=1) / 2, y)
    
    print("join titles and texts feats")
    validation(title_text_feats, y)
    
    print("concat titles upostag and titles feats")
    validation(np.concatenate([title_upostag, title_feats], axis=1) / 2, y)
    
    print("concat texts upostag and texts feats")
    validation(np.concatenate([text_upostag, text_feats], axis=1) / 2, y)
    
    print("concat titles, texts upostag and texts feats")
    validation(np.concatenate([title_upostag, text_upostag, text_feats], axis=1) / 3, y)

In [18]:
#Сохранение модели

def dump_model(model, scaler, path):
    
    from collections import namedtuple
    
    num_coeffs = model.coef_.size
    
    if scaler is None:
        scaler = namedtuple('DummyScaler', ['mean_', 'scale_'])(np.zeros(num_coeffs), np.ones(num_coeffs))
    
    with open(path, 'w') as f:
        f.write(f'{model.coef_.shape[1]}\n')
        f.write(f'{model.intercept_[0]}\n')
        for c, m, s in zip(model.coef_.flatten(), scaler.mean_.flatten(), scaler.scale_.flatten()):
            f.write(f'{c} {m} {s}\n')

**ru**

In [19]:
table = table_ru[~flags_na_ru]
y = table["is_news"].values

In [22]:
feature(table, y)

titles upostag
balanced_accuracy: 0.6865171523774437

              precision    recall  f1-score   support

         0.0       0.06      0.76      0.12       160
         1.0       0.99      0.66      0.79      5341

    accuracy                           0.66      5501
   macro avg       0.53      0.71      0.45      5501
weighted avg       0.96      0.66      0.77      5501

texts upostag
balanced_accuracy: 0.6959547725328192

              precision    recall  f1-score   support

         0.0       0.08      0.69      0.14       160
         1.0       0.99      0.75      0.85      5341

    accuracy                           0.75      5501
   macro avg       0.53      0.72      0.50      5501
weighted avg       0.96      0.75      0.83      5501

sum titles texts upostag
balanced_accuracy: 0.7040129450767412

              precision    recall  f1-score   support

         0.0       0.03      0.99      0.07       160
         1.0       1.00      0.17      0.29      5341

    accurac

In [20]:
def X(table):

    text_feats = table["text"].apply(lambda x: [[word.feats.split("|") for word in sent if word.feats] for sent in x]) \
                              .apply(lambda x: [list(chain(*sent)) for sent in x]) \
                              .apply(lambda x: get_vec(feat_map, list(chain(*x))))

    return np.array(text_feats.tolist())

lr = LogisticRegression(class_weight='balanced', solver='saga', n_jobs=-1)
lr.fit(X(table), y)

dump_model(model=lr, scaler=None, path="../models/news_model_ru.logreg")
pickle.dump(lr, open("../models/news_model_ru.logreg.pkl", 'wb'))

Предсказание на неразмеченных данных

In [24]:
table = table_ru[flags_na_ru]
table["predict"] = lr.predict_proba(X(table))[:, 1]

b1 = 0.25
b2 = 0.35

table_middle = table[(table["predict"] > b1) & (table["predict"] < b2)]

table_middle = table_middle[["path", "og:url", "predict"]].join(pd.concat([table_ru[["path", "og:title"]], table_en[["path", "og:title"]]], 
                                                                          ignore_index=True, axis=0).set_index("path"), on="path")

table_middle = table_middle[["og:url", "og:title", "predict"]]
table_middle["is_news"] = 1

table_middle.sort_values("predict").to_excel("../articles_with_true_is_news_ru.xlsx", index=False)

**en**

In [21]:
table = table_en[~flags_na_en]
y = table["is_news"].values

In [26]:
feature(table, y)

titles upostag
balanced_accuracy: 0.6455908274576098

              precision    recall  f1-score   support

         0.0       0.15      0.55      0.23       246
         1.0       0.93      0.66      0.77      2345

    accuracy                           0.65      2591
   macro avg       0.54      0.61      0.50      2591
weighted avg       0.86      0.65      0.72      2591

texts upostag
balanced_accuracy: 0.6991939113531349

              precision    recall  f1-score   support

         0.0       0.18      0.69      0.29       246
         1.0       0.95      0.68      0.80      2345

    accuracy                           0.68      2591
   macro avg       0.57      0.68      0.54      2591
weighted avg       0.88      0.68      0.75      2591

sum titles texts upostag
balanced_accuracy: 0.6594081458388381

              precision    recall  f1-score   support

         0.0       0.15      0.56      0.23       246
         1.0       0.93      0.66      0.77      2345

    accurac

In [22]:
lr = LogisticRegression(class_weight='balanced', solver='saga', n_jobs=-1)
lr.fit(X(table), y)

dump_model(model=lr, scaler=None, path="../models/news_model_en.logreg")
pickle.dump(lr, open("../models/news_model_en.logreg.pkl", 'wb'))

Предсказание на неразмеченных данных

In [None]:
table = table_en[flags_na_en]
table["predict"] = lr.predict_proba(X(table))[:, 1]

b1 = 0.45
b2 = 0.55

table_middle = table[(table["predict"] > b1) & (table["predict"] < b2)]

table_middle = table_middle[["path", "og:url", "predict"]].join(pd.concat([table_ru[["path", "og:title"]], table_en[["path", "og:title"]]], 
                                                                          ignore_index=True, axis=0).set_index("path"), on="path")

table_middle = table_middle[["og:url", "og:title", "predict"]]
table_middle["is_news"] = 1

table_middle.sort_values("predict").to_excel("../articles_with_true_is_news_en.xlsx", index=False)

Копируем таблицу на компьютер 

scp arina@128.0.134.202:~/Documents/TelegramNews/articles_with_true_is_news_(ru или en).xlsx .

Размечаем

Копируем обратно 

scp articles_with_true_is_news_(ru или en).xlsx arina@128.0.134.202:~/Documents/TelegramNews/

In [None]:
table_middle = pd.read_excel("../articles_with_true_is_news_ru.xlsx", usecols=["og:url", "is_news"])
table_middle = table_middle[~table_middle["is_news"].isna()]
table_middle = table_middle[:table_middle["is_news"].tolist().index(-1)]

#ИЗМЕНИ ИНДЕКС У PART
table_middle.to_csv("../articles_with_true_is_news/part_1_ru.csv", index=False)

In [None]:
table_middle = pd.read_excel("../articles_with_true_is_news_en.xlsx", usecols=["og:url", "is_news"])
table_middle = table_middle[~table_middle["is_news"].isna()]
table_middle = table_middle[:table_middle["is_news"].tolist().index(-1)]

#ИЗМЕНИ ИНДЕКС У PART
table_middle.to_csv("../articles_with_true_is_news/part_0_en.csv", index=False)