In [None]:
import numpy as np
import pandas as pd

import nltk, re
from nltk.corpus import stopwords
nltk.download('stopwords')

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the monthly dump and the two daily dumps, then stack them into one frame.
text_paths = [
    'data/news/texts_31days.csv',
    'data/news/texts_12.04-13.04.csv',
    'data/news/texts_13.04-14.04.csv',
]
texts_month, texts_first, texts_second = [
    pd.read_csv(path, index_col=0) for path in text_paths
]
texts = pd.concat([texts_month, texts_first, texts_second])
# The first CSV column was used as the index; label it url_id.
texts.index.names = ['url_id']

In [None]:
# Using texts.csv to make urls for each url_id
def make_urls_df(path='data/news/texts.csv'):
    '''Build the table of article URLs from a texts CSV.

    A ``pagePath`` column of the form ``/t/<first word of tag>/<url_id>`` is
    derived for every row.

    Parameters
    ----------
    path : str, optional
        Location of the texts CSV. The file must provide the ``url_id``,
        ``tag`` and ``subtitle`` columns. Defaults to the path this notebook
        originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        The CSV contents without ``subtitle`` and ``tag``, with ``url_id``
        converted to ``str`` and ``pagePath`` appended as the last column.
    '''
    texts = pd.read_csv(path)

    # Only the first whitespace-separated word of the tag is used in the path.
    first_tag = texts['tag'].str.split().str.get(0)
    url_id = texts['url_id'].astype(str)

    return (
        texts
        .assign(url_id=url_id, pagePath='/t/' + first_tag + '/' + url_id)
        .drop(['subtitle', 'tag'], axis=1)
    )

# Keep only complete rows and one row per distinct title.
urls = (
    make_urls_df()
    .dropna(how='any')
    .drop_duplicates(['title'])
)

In [None]:
# Text cleaning
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

def clean_text(text, remove_stopwords=True):
    '''Lower-case and tokenize ``text``, optionally dropping Russian stop words.

    Parameters
    ----------
    text : str
        Raw text to clean.
    remove_stopwords : bool, optional
        When True (the default), tokens found in NLTK's Russian stop-word
        list are removed.

    Returns
    -------
    list of str
        The tokens produced by the module-level ``tokenizer``.
    '''
    tokens = tokenizer.tokenize(text.lower())

    if remove_stopwords:
        # Build the stop-word set once and keep it on the function: this
        # function is applied to every title, and rebuilding the set on each
        # call is pure overhead.
        stops = getattr(clean_text, '_stops', None)
        if stops is None:
            stops = clean_text._stops = frozenset(stopwords.words("russian"))
        tokens = [w for w in tokens if w not in stops]

    return tokens

# PoS tagging
from ufal.udpipe import Model, Pipeline
modelfile = 'models/udpipe_syntagrus.model'

def _merge_propn_runs(tagged):
    '''Collapse each run of consecutive ``*PROPN`` tokens into one ``a::b_PROPN`` token.'''
    merged = []
    run = []
    # The trailing None is a sentinel: it flushes a run that sits at the very
    # end of the input, which would otherwise be lost.
    for token in list(tagged) + [None]:
        if token is not None and token.endswith('PROPN'):
            run.append(token)
            continue
        if len(run) > 1:
            # rsplit keeps lemmas that themselves contain an underscore intact.
            merged.append('::'.join(t.rsplit('_', 1)[0] for t in run) + '_PROPN')
        elif run:
            merged.append(run[0])
        run = []
        if token is not None:
            merged.append(token)
    return merged


def tag_ud(text, modelfile='udpipe_syntagrus.model'):
    '''Lemmatize and PoS-tag ``text`` with a UDPipe model.

    Parameters
    ----------
    text : str
        Raw text passed to the UDPipe pipeline (tokenizer input, CoNLL-U output).
    modelfile : str, optional
        Path of the UDPipe model file.

    Returns
    -------
    list of str
        ``lemma_UPOS`` tokens (lemma lower-cased). Consecutive proper nouns are
        joined into a single ``lemma1::lemma2_PROPN`` token.

    Raises
    ------
    ValueError
        If the model file cannot be loaded.
    '''
    # Loading a model is expensive, so loaded models are kept on the function,
    # keyed by path, and reused by later calls.
    cache = tag_ud.__dict__.setdefault('_models', {})
    model = cache.get(modelfile)
    if model is None:
        model = Model.load(modelfile)
        if model is None:
            raise ValueError('Cannot load UDPipe model from ' + modelfile)
        cache[modelfile] = model

    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    processed = pipeline.process(text)

    tagged = []
    for line in processed.split('\n'):
        # Skip CoNLL-U comment lines and the blank lines between sentences.
        if not line or line.startswith('#'):
            continue
        fields = line.split('\t')
        # Column 2 is read as the lemma and column 3 as the PoS tag.
        tagged.append(fields[2].lower() + '_' + fields[3])

    return _merge_propn_runs(tagged)

In [None]:
import gensim
from gensim.models.keyedvectors import KeyedVectors

ft_model = gensim.models.fasttext.FastText.load('models/araneum_none_fasttextcbow_300_5_2018/araneum_none_fasttextcbow_300_5_2018.model')
# word2vec_model = gensim.models.KeyedVectors.load_word2vec_format("models/word2vec/ruscorpora_upos_skipgram_300_5_2018.vec", binary=False)

In [None]:
# doc2vec for every news title
vec_dim = 300

def create_average_vec(doc):
    '''Average the fastText vectors of the in-vocabulary words of ``doc``.

    Parameters
    ----------
    doc : iterable of str
        Tokens of one document.

    Returns
    -------
    numpy.ndarray
        Vector of length ``vec_dim``; all zeros when no token of ``doc`` is in
        the model vocabulary.
    '''
    # Use the model's KeyedVectors for both the membership test and the lookup;
    # indexing the model object directly is deprecated.
    vectors = ft_model.wv
    average = np.zeros((vec_dim,), dtype='float32')
    num_words = 0.
    for word in doc:
        # NOTE(review): out-of-vocabulary words are skipped here rather than
        # being looked up anyway; confirm that is intended.
        if word in vectors.vocab:
            average = np.add(average, vectors[word])
            num_words += 1.
    if num_words != 0.:
        average = np.divide(average, num_words)
    return average


def create_doc2vec(text):
    '''Turn one piece of text into its averaged word-vector representation.

    ``text`` is converted to ``str``, cleaned with ``clean_text`` and the
    resulting tokens are averaged by ``create_average_vec``.
    '''
    tokens = clean_text(str(text))
    return create_average_vec(tokens)

# One averaged vector per title, stored next to the title it was built from.
title_vectors = urls['title'].map(create_doc2vec)
urls['doc2vec'] = title_vectors
# Persist the table so the later cells can start from the saved file.
urls.to_csv('data/news/urls_with_unique_titles_fasttext.csv')

In [1]:
# USEFUL CODE STARTS HERE
import numpy as np
import pandas as pd

from datetime import datetime
from datetime import timezone
from zipfile import ZipFile
import os

In [2]:
# Reload the saved URL table; the first CSV column is used as the index.
# NOTE(review): doc2vec appears to come back as text (dtype object, bracketed
# strings in the output below), not as numeric arrays - confirm downstream use.
urls = pd.read_csv('data/news/urls_with_unique_titles_fasttext.csv', index_col=0)

In [3]:
def make_df(start_time, end_time):
    '''Load the visit logs that fall inside a time interval.

    Parameters
    ----------
    start_time, end_time : str
        Interval bounds, passed on to ``make_timestamps_from_datetime``.

    Returns
    -------
    pandas.DataFrame
        Columns ``fullVisitorId``, ``url_id``, ``visitStartTime`` plus
        ``user_id``, an integer code assigned to each distinct ``fullVisitorId``.

    Raises
    ------
    ValueError
        If no timestamp file falls inside the requested interval.
    '''
    # NOTE(review): file names come from the extracted directory while the
    # contents are read from the zip archive, so both must exist and agree.
    timestamps = sorted(os.listdir('data/news/timestamps'))
    start_timestamp, end_timestamp = make_timestamps_from_datetime(start_time, end_time, timestamps)
    interval = make_interval(start_timestamp, end_timestamp, timestamps)
    if not interval:
        # pd.concat would raise a ValueError with an unhelpful message here.
        raise ValueError(
            'No timestamp files found between {} and {}'.format(start_time, end_time)
        )

    columns = ['fullVisitorId', 'url_id', 'visitStartTime']
    frames = []
    with ZipFile('data/news/timestamps.zip') as timestamps_zip:
        for file in interval:
            # Close every archive member as soon as it has been parsed.
            with timestamps_zip.open("timestamps/" + file) as member:
                frames.append(pd.read_csv(member, header=None, names=columns))

    df = pd.concat(frames)
    df['user_id'] = pd.factorize(df['fullVisitorId'])[0]
    return df


# Data range quoted by the original author:
#   first = 12/03/2017 07:00:00 (1491818423), last = 14/04/2017 11:11:29 (1491991225)
# NOTE(review): under the UTC conversion used below, 1491818423 is
# 10/04/2017 10:00:23 and 1491991225 is 12/04/2017 10:00:25, so the dates and
# the epoch values above disagree - confirm the real bounds.
def _to_epoch_seconds(moment):
    '''Convert a 'DD/MM/YYYY HH:MM:SS' string, read as UTC, to whole epoch seconds.'''
    parsed = datetime.strptime(moment, '%d/%m/%Y %H:%M:%S')
    # int, not float: the value is later turned into text and compared with
    # file names, where a trailing '.0' would distort the comparison.
    return int((parsed - datetime(1970, 1, 1)).total_seconds())


def make_timestamps_from_datetime(start_time, end_time, timestamps):
    '''Translate the requested interval bounds into timestamps.

    Parameters
    ----------
    start_time : str
        ``'first'`` or a ``'DD/MM/YYYY HH:MM:SS'`` string.
    end_time : str
        ``'last'`` or a ``'DD/MM/YYYY HH:MM:SS'`` string.
    timestamps : sequence
        Available timestamps in ascending order; only its first and last
        elements are used, for ``'first'`` and ``'last'``.

    Returns
    -------
    tuple
        ``(start_timestamp, end_timestamp)``. A keyword bound yields the
        matching element of ``timestamps`` unchanged; a date string yields
        integer seconds since 1970-01-01, with the string read as UTC.

    Raises
    ------
    ValueError
        If a date string does not match the expected format.
    IndexError
        If ``'first'`` or ``'last'`` is requested and ``timestamps`` is empty.
    '''
    if start_time == 'first':
        start_timestamp = timestamps[0]
    else:
        start_timestamp = _to_epoch_seconds(start_time)

    if end_time == 'last':
        end_timestamp = timestamps[-1]
    else:
        end_timestamp = _to_epoch_seconds(end_time)

    return (start_timestamp, end_timestamp)


def _epoch_value(value):
    '''Read ``value`` as a number, ignoring a file extension; None when it is not numeric.'''
    text = str(value)
    stem = text.split('.', 1)[0]
    if not stem.isdecimal():
        return None
    try:
        # Handles plain and float-formatted numbers such as '1491991200.0'.
        return float(text)
    except ValueError:
        # Something like '1491991200.csv': fall back to the part before the dot.
        return float(stem)


def make_interval(start_timestamp, end_timestamp, timestamps):
    '''Select the timestamps lying inside ``[start_timestamp, end_timestamp]``.

    Bounds and timestamps are compared numerically whenever all of them look
    like numbers (an optional file extension is ignored), so a float-formatted
    bound or values with different digit counts are handled correctly. If
    either bound or the timestamp is not numeric, the plain string comparison
    is used instead.

    Parameters
    ----------
    start_timestamp, end_timestamp : str or number
        Inclusive interval bounds.
    timestamps : iterable of str
        Candidate timestamps (file names).

    Returns
    -------
    list of str
        The elements of ``timestamps`` inside the interval, in input order.
    '''
    start_text, end_text = str(start_timestamp), str(end_timestamp)
    start_value, end_value = _epoch_value(start_text), _epoch_value(end_text)

    def in_range(name):
        value = _epoch_value(name)
        if value is None or start_value is None or end_value is None:
            return start_text <= name <= end_text
        return start_value <= value <= end_value

    return [t for t in timestamps if in_range(t)]


def merge_df(df, urls):
    '''Attach article metadata to the visit log and index it by user and article.

    Parameters
    ----------
    df : pandas.DataFrame
        Visit log with at least ``fullVisitorId``, ``url_id`` and ``user_id``.
    urls : pandas.DataFrame
        Article table with at least ``url_id``, ``title`` and ``pagePath``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Left merge of ``df`` with ``urls`` on ``url_id``, indexed and sorted by
        ``(user_id, url_id)``. ``url_id`` is replaced by an integer code per
        distinct original id. Rows with any missing value (for example visits
        to articles absent from ``urls``) and fully duplicated rows are dropped.
    '''
    # Work on a converted copy so the caller's frame keeps its own dtypes.
    urls_typed = urls.assign(
        url_id=urls['url_id'].astype(int),
        title=urls['title'].astype(str),
    )

    merged = pd.merge(df, urls_typed, on='url_id', how='left')
    # Replace the raw article ids with compact integer codes.
    merged['url_id'] = pd.factorize(merged['url_id'])[0]

    return (
        merged
        .set_index(['user_id', 'url_id'])
        .sort_index()
        .dropna(how='any')
        .drop_duplicates()
        .astype({'fullVisitorId': str, 'title': str, 'pagePath': str})
    )

In [4]:
# You can choose a time interval between 12/03/2017 07:00:00 and 14/04/2017 11:11:29.
# You can also pass 'first' or 'last' as arguments.
start_time = '12/04/2017 10:00:00'
end_time = '14/04/2017 11:11:29'

df = make_df(start_time, end_time)
df_result = merge_df(df, urls)

In [5]:
# Click score of an article = number of rows recorded for its pagePath.
scores = df_result['pagePath'].value_counts()
# Series.map looks every path up in `scores` in one vectorised pass, instead of
# running a Python lambda once per row.
df_result['scores'] = df_result['pagePath'].map(scores)

In [6]:
# OPTIONAL: saving the full dataset can take some time
# df_result.to_csv('data/news/news_final_dataset.csv')

In [7]:
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the output.
df_result.info()
display(df_result.head(20))
display(df_result.tail(20))

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2810230 entries, (0, 0) to (1594565, 263)
Data columns (total 6 columns):
fullVisitorId     object
visitStartTime    int64
title             object
pagePath          object
doc2vec           object
scores            int64
dtypes: int64(2), object(4)
memory usage: 162.6+ MB


None

Unnamed: 0_level_0,Unnamed: 1_level_0,fullVisitorId,visitStartTime,title,pagePath,doc2vec,scores
user_id,url_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,-3506357654279954220,1491991460,Егор Кончаловский: Сергей Бодров-младший вряд ...,/t/памятники/926495,[ 9.80217196e-03 2.96098627e-02 -2.8206456...,18
0,322,-3506357654279954220,1491991460,Мизулина пожелала Бузовой и Тарасову сохранить...,/t/ольгабузова/926408,[ -2.92406976e-02 3.41553651e-02 -1.7812741...,19
0,460,-3506357654279954220,1491991411,63-летний Борис Гребенщиков закрутил роман с 3...,/t/роман/995193,[ 3.77307553e-03 2.24123616e-02 -1.5817986...,1267
0,496,-3506357654279954220,1491991415,Егор Кончаловский о курсах российского кино в ...,/t/образование/962629,[ 6.83865882e-03 5.39650992e-02 -4.4041827...,36
0,615,-3506357654279954220,1491991419,Егор Кончаловский снимет фильм про футбол к ЧМ...,/t/футбол/929364,[ 1.33793261e-02 1.61679927e-02 -1.9726116...,16
0,778,-3506357654279954220,1491991869,Глюкоза продемонстрировала нижнее бельё на бал...,/t/натальяионова/925963,[-0.01354337 -0.03651074 0.03594377 -0.004558...,8
0,855,-3506357654279954220,1491991842,"""Проклятье Мадонны"": Фанатка Киркорова на конц...",/t/филиппкиркоров/926029,[ -5.78167848e-02 -8.13967884e-02 2.3190723...,7
0,891,-3506357654279954220,1491991876,Портал Gawker выплатит Халку Хогану $31 млн за...,/t/халкхоган/925992,[ 0.02201042 -0.07285431 0.02588295 0.021550...,3
0,1027,-3506357654279954220,1491991876,Жена Владимира Кехмана Ида ждёт ребенка,/t/владимиркехман/925990,[ 1.17481044e-02 3.21980305e-02 2.2344827...,5
1,1,4462027711335455019,1491991407,Бабушкопереводчик. Понимаете ли вы устаревший ...,/t/квн/973967,[ 0.03069289 0.07980222 -0.03808387 -0.020487...,17


Unnamed: 0_level_0,Unnamed: 1_level_0,fullVisitorId,visitStartTime,title,pagePath,doc2vec,scores
user_id,url_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1594367,11116,-9171727571029563544,1492168179,Против диктора ВГТРК Дениса Стойкова возбудили...,/t/вгтрк/338348,[ -1.30737145e-02 4.37873136e-03 -3.2131470...,12
1594403,405,-8818761725042633618,1492168170,"""Дети Терминатора"" показали будущее",/t/фестивали/988274,[ -1.60844252e-02 6.81089833e-02 -1.9694723...,644
1594404,480,-2645857799535966917,1492168217,К Пасхе Caviar выпустила Apple iPhone 7 Credo ...,/t/технологии/996896,[ 1.38974506e-02 -5.57469130e-02 3.0415961...,451
1594509,6352,-4318814951530510783,1492168180,Сколько стоят проститутки в разных странах,/t/проститутки/400504,[ -1.14188910e-01 1.43640337e-03 1.6642497...,55
1594510,15945,5247444881633947648,1492168171,Сhange.org против РОИ — какие петиции работают...,/t/футбол/876936,[ -4.99384552e-02 -1.38225947e-02 -1.3780001...,4
1594511,2878,4352482798791194064,1492168182,"Парочка занималась сексом в детском купе ""Сапс...",/t/ржд/898961,[ 0.00624714 -0.0775308 0.08442804 -0.014373...,21
1594536,39647,1730641630717181912,1492168180,Песков: Позиции Британии и США могут нанести в...,/t/новости/908452,[ 4.73317644e-03 1.10630216e-02 2.9174991...,1
1594537,176,7769592663963565178,1492168185,"""Педофил"" в тёмных очках оказался назойливым б...",/t/происшествия/997274,[ 0.02649269 0.09860864 -0.0154133 -0.035669...,4283
1594538,12660,-4436992379488362537,1492168194,"5 московских борделей, которые работают и посл...",/t/проститутки/400321,[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,6
1594542,19423,2667040721705386079,1492168192,Валентина Матвиенко: Россия очень встревожена ...,/t/новости/126288,[-0.01650023 0.038729 -0.01020037 0.025530...,6
