In [21]:
import pandas as pd
import numpy as np
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from sklearn.metrics import pairwise_distances

In [5]:
def process_parser_data(file_name, start_date, end_date):
    """Load a parsed-news CSV dump and return a cleaned, de-duplicated frame.

    Parameters
    ----------
    file_name : str or path-like
        Comma-separated file with a header row that contains at least the
        columns ``date, url, edition, title, text, authors, topics``.
    start_date, end_date : str or datetime-like
        Optional bounds on the ``date`` column. ``start_date`` is inclusive,
        ``end_date`` is exclusive. Any falsy value (``''``, ``None``)
        disables the corresponding bound.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``date`` with the seven columns listed above. Rows
        lacking ``text`` or ``title`` are dropped, literal ``\\n`` sequences in
        ``text`` are replaced by spaces, an ``edition`` of ``"-"`` becomes
        missing, and duplicates on ``(title, text)`` and then on ``url`` are
        removed keeping the latest row.

    Side effects
    ------------
    Prints ``DataFrame.info()`` and the first five rows to stdout.
    """
    csv_options = dict(
        sep=',', quotechar='\"', escapechar='\\',
        encoding='utf-8', header=0, index_col=False)
    try:
        # pandas >= 1.3: malformed lines are reported with a warning and skipped.
        raw = pd.read_csv(file_name, on_bad_lines='warn', **csv_options)
    except TypeError:
        # Older pandas does not know `on_bad_lines`; use the legacy spelling.
        raw = pd.read_csv(file_name, error_bad_lines=False, **csv_options)

    columns = ["date", "url", "edition", "title", "text", "authors", "topics"]
    has_content = raw["text"].notnull() & raw["title"].notnull()
    # Explicit copy so the column assignments below never write into a view
    # of `raw` (avoids SettingWithCopyWarning).
    dataset = raw.loc[has_content, columns].copy()

    dataset["date"] = pd.to_datetime(dataset["date"])
    # Replace the two-character sequence backslash + "n", not a real newline.
    dataset["text"] = dataset["text"].str.replace("\\n", " ", regex=False)
    dataset["edition"] = dataset["edition"].apply(lambda x: None if x == "-" else x)

    if start_date:
        dataset = dataset[dataset["date"] >= start_date]
    if end_date:
        dataset = dataset[dataset["date"] < end_date]

    # A stable sort keeps file order among rows with equal dates, so the
    # `keep='last'` de-duplication below is deterministic.
    dataset = dataset.sort_values("date", kind="mergesort")
    dataset = dataset.drop_duplicates(subset=["title", "text"], keep='last')
    dataset = dataset.drop_duplicates(subset=["url"], keep='last')

    # `info()` prints its report itself and returns None, so it must not be
    # wrapped in print().
    dataset.info()
    print(dataset.head(5))
    return dataset

In [6]:
# Load the complete dump: the empty strings leave both date bounds disabled.
dataset = process_parser_data(
    file_name='./datasets/meduza_19800101_20191002.csv',
    start_date='',
    end_date='',
)

b'Skipping line 57426: expected 17 fields, saw 19\n'


<class 'pandas.core.frame.DataFrame'>
Int64Index: 72569 entries, 49408 to 74060
Data columns (total 7 columns):
date       72569 non-null datetime64[ns]
url        72569 non-null object
edition    0 non-null object
title      72569 non-null object
text       72569 non-null object
authors    0 non-null float64
topics     0 non-null float64
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 4.4+ MB
None
                     date                                                url  \
49408 2014-09-12 09:04:00  https://meduza.io/news/2014/09/12/v-den-rozhde...   
49409 2014-10-07 13:19:00  https://meduza.io/news/2014/10/07/v-pravitelst...   
49410 2014-10-08 08:36:00  https://meduza.io/cards/7-russkiy-veb-chto-s-n...   
49411 2014-10-09 09:15:00  https://meduza.io/news/2014/10/09/sk-potrebova...   
49412 2014-10-10 08:27:00  https://meduza.io/cards/pridetsya-li-platit-za...   

      edition                                              title  \
49408    None  В день рождения Пут

In [28]:
# Normalise titles: lower-case them, then trim digit characters from both
# ends of each string (digits inside a title are left in place).
lowercase_titles = dataset['title'].str.lower()
dataset['title_clean'] = lowercase_titles.str.strip(string.digits)
dataset['title_clean'].head(10)

49408    в день рождения путина предложили отмечать ден...
49409    в правительстве назвали четырехдневную рабочую...
49410         отключат ли россию от глобального интернета?
49411               ск потребовал 190 млрд с афк «система»
49412                      придется ли платить за youtube?
49413    под донецком обстреляли автомобиль бывшего «на...
49414           втб и сбербанк создадут альтернативу swift
49415                  ротенберги передали активы сыновьям
49416    обнародован список зарегистрировавшихся в роск...
49417    путин ответил на санкции созданием госкорпорац...
Name: title_clean, dtype: object

In [29]:
# Fit a TF-IDF model with default settings on the cleaned titles and display
# the resulting sparse document-term matrix.
clean_titles = dataset['title_clean']
vectorizer = TfidfVectorizer()
titles_tfidf = vectorizer.fit_transform(clean_titles)
titles_tfidf

<72569x72365 sparse matrix of type '<class 'numpy.float64'>'
	with 637384 stored elements in Compressed Sparse Row format>

In [38]:
# Re-fit with the vocabulary capped at 1000 features; this rebinds both
# `vectorizer` and `titles_tfidf` from the earlier cell.
# NOTE(review): this reads the raw 'title' column, not 'title_clean' —
# confirm that is intentional.
raw_titles = dataset['title']
vectorizer = TfidfVectorizer(max_features=1000)
titles_tfidf = vectorizer.fit_transform(raw_titles)
titles_tfidf

<72569x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 276971 stored elements in Compressed Sparse Row format>

In [87]:
# Number of leading rows used for the clustering experiment below.
train_size = 20_000

In [88]:
# Densify only the first `train_size` rows. Slicing the sparse matrix before
# `.toarray()` gives the same frame as slicing afterwards, without
# materialising the whole TF-IDF matrix as a dense array first.
titles_tfidf_sample = pd.DataFrame(titles_tfidf[:train_size].toarray())

# Square matrix of pairwise cosine distances between the sampled titles.
distances = pairwise_distances(titles_tfidf_sample,
                               metric='cosine',
                               n_jobs=3)

# No fixed cluster count: average-linkage merging stops once the linkage
# distance reaches `distance_threshold`.
cluster_options = dict(
    distance_threshold=0.04,
    n_clusters=None,
    linkage="average")
try:
    # scikit-learn >= 1.2 names this parameter `metric`.
    clustering = AgglomerativeClustering(metric="precomputed", **cluster_options)
except TypeError:
    # Older scikit-learn only accepts the legacy `affinity` keyword.
    clustering = AgglomerativeClustering(affinity="precomputed", **cluster_options)

labels = clustering.fit_predict(distances)

In [89]:
# (number of clustered titles, number of distinct cluster labels)
labels.size, np.unique(labels).size

(20000, 16110)

In [90]:
# Show the label entries assigned to cluster 0 (one entry per member).
in_cluster_zero = labels == 0
labels[in_cluster_zero]

array([0, 0], dtype=int64)

In [94]:
# Print the URLs of every article assigned to cluster 3, space-separated.
# `labels` lines up positionally with the first `train_size` rows of `dataset`.
in_cluster_three = labels == 3
cluster_urls = dataset.iloc[:train_size].loc[in_cluster_three, 'url']
print(*cluster_urls)

https://meduza.io/news/2014/10/22/kinokompaniyu-amedia-vystavili-na-prodazhu https://meduza.io/news/2015/12/10/restoran-praga-vystavili-na-prodazhu https://meduza.io/news/2016/01/10/osobnyak-osnovatelya-playboy-vystavyat-na-prodazhu https://meduza.io/news/2016/02/01/na-turetskih-kurortah-vystavili-na-prodazhu-1300-oteley
