In [1]:
import numpy as np
import pandas as pd

import nltk, re
from nltk.corpus import stopwords
nltk.download('stopwords')

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /home/ideis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
texts_month = pd.read_csv('data/news/texts_31days.csv', index_col=0)
texts_first = pd.read_csv('data/news/texts_12.04-13.04.csv', index_col=0)
texts_second = pd.read_csv('data/news/texts_13.04-14.04.csv', index_col=0)
texts = pd.concat([texts_month, texts_first, texts_second])
texts.index.names = ['url_id']

In [3]:
# Using texts.csv to make urls for each url_id
def make_urls_df():
    texts = pd.read_csv('data/news/texts.csv')

    tag_cleaned = texts['tag'].str.split().str.get(0)
    texts['tag_cleaned'] = tag_cleaned
    texts['url_id'] = texts['url_id'].astype(str)
    texts['pagePath'] = '/t/' + texts['tag_cleaned'] + '/' + texts['url_id']

    urls = texts.drop(['subtitle', 'tag', 'tag_cleaned'], axis=1)
    return urls

urls = make_urls_df()
urls.dropna(how='any', inplace=True)
urls.drop_duplicates(['title'], inplace=True)

In [4]:
# Text cleaning
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

def clean_text(text, remove_stopwords=True):
    '''Clean the text, with the option to remove stopwords'''
    text = text.lower()
    tokens = tokenizer.tokenize(text)
    
    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("russian"))
        tokens = [w for w in tokens if not w in stops]
    
    text = " ".join(tokens)
    return text

# PoS tagging
from ufal.udpipe import Model, Pipeline
modelfile = 'models/udpipe_syntagrus.model'

def tag_ud(text, modelfile='udpipe_syntagrus.model'):
    model = Model.load(modelfile)
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    processed = pipeline.process(text)
    output = [l for l in processed.split('\n') if not l.startswith('#')]
    tagged = [w.split('\t')[2].lower() + '_' + w.split('\t')[3] for w in output if w]
    tagged_propn = []
    propn  = []
    for t in tagged:
        if t.endswith('PROPN'):
            if propn:
                propn.append(t)
            else:
                propn = [t]
        else:
            if len(propn) > 1:
                name = '::'.join([x.split('_')[0] for x in propn]) + '_PROPN'
                tagged_propn.append(name)
            elif len(propn) == 1:
                tagged_propn.append(propn[0])
            tagged_propn.append(t)
            propn = []
    return tagged_propn

In [15]:
import gensim

word2vec_model = gensim.models.KeyedVectors.load_word2vec_format("models/word2vec/ruscorpora_upos_skipgram_300_5_2018.vec", binary=False)

In [16]:
# doc2vec for every news title
vec_dim = 300

def create_average_vec(doc):
    average = np.zeros((vec_dim,), dtype='float32')
    num_words = 0.
    for word in doc:
        if word in word2vec_model:
            average = np.add(average, word2vec_model[word])
            num_words += 1.
    if num_words != 0.:
        average = np.divide(average, num_words)
    return average


def create_doc2vec(text):
    text = str(text)
    processed_text = clean_text(text)
    processed_ud = tag_ud(text=processed_text, modelfile=modelfile)
    vec = create_average_vec(processed_ud)
    return vec

urls['doc2vec'] = urls['title'].apply(create_doc2vec)
urls.to_csv('data/news/unique_titles_urls_with_doc2vec.csv')

Unnamed: 0,url_id,title,pagePath,doc2vec
0,20,Андрей Мягков прооперирован в Германии,/t/новости/20,"[-0.0707825, 0.066976, 0.0525985, -0.0433205, ..."
1,21,Брат Гуса Хиддинка написал песню для России,/t/новости/21,"[-0.0603787, 0.0231322, 0.059768, -0.0441762, ..."
2,25,Литва встает на защиту внука Пугачевой,/t/новости/25,"[-0.0449517, -0.0494433, 0.0192017, -0.0299023..."
3,28,Dell выпустят смартфон на базе Android,/t/новости/28,"[-0.0411953, 0.0155413, 0.000527667, -0.007926..."
4,29,Ребенка ранили после ссоры в маршрутке,/t/новости/29,"[-0.0481922, 0.0016745, -0.0038685, -0.0150153..."
5,34,Владимиру Путину изготовили личный биоЧИП,/t/новости/34,"[-0.000262, 0.0151814, 0.0158028, -0.000719999..."
6,42,Перед смертью Япончика соборовал священник,/t/новости/42,"[-0.0715463, -0.0621635, 0.0723903, -0.052593,..."
7,44,300 км/ч на шоссе общего пользования,/t/новости/44,"[-0.020147, -0.02574, 0.0193276, -0.027425, -0..."
8,49,"Чубайс платит 4,3 млрд за счетчики для нефти",/t/новости/49,"[-0.0354266, 0.0628282, -0.0084598, 0.031462, ..."
9,61,Карикатурист поймал вора за 15 минут,/t/новости/61,"[-0.0745085, -0.021892, 0.0608343, -0.0498217,..."


In [17]:
from datetime import datetime
from datetime import timezone
from zipfile import ZipFile
import os

def make_df(start_time, end_time):
    timestamps = sorted(os.listdir('data/news/timestamps'))
    start_timestamp, end_timestamp = make_timestamps_from_datetime(start_time, end_time, timestamps)
    interval = make_interval(start_timestamp, end_timestamp, timestamps)
    with ZipFile('data/news/timestamps.zip') as timestamps_zip:
        df_list = [pd.read_csv(timestamps_zip.open("timestamps/" + file), header=None, names=['fullVisitorId', 'url_id', 'visitStartTime']) for file in interval]
    df = pd.concat(df_list)
    labels, levels = pd.factorize(df['fullVisitorId'])
    df['user_id'] = labels
    return df


# first = 12/03/2017 07:00:00, last = 14/04/2017 11:11:29 1491818423 1491991225
def make_timestamps_from_datetime(start_time, end_time, timestamps):
    if start_time == 'first':
        start_timestamp = timestamps[0]
    else:
        start_datetime = datetime.strptime(start_time, '%d/%m/%Y %H:%M:%S')
        start_timestamp = (start_datetime - datetime(1970, 1, 1)).total_seconds()
        
    if end_time == 'last':
        end_timestamp = timestamps[-1]
    else:
        end_datetime = datetime.strptime(end_time, '%d/%m/%Y %H:%M:%S')
        end_timestamp = (end_datetime - datetime(1970, 1, 1)).total_seconds()
    return (start_timestamp, end_timestamp)


def make_interval(start_timestamp, end_timestamp, timestamps):
    start_timestamp = str(start_timestamp)
    end_timestamp = str(end_timestamp)
    interval = [t for t in timestamps if t >= start_timestamp and t <= end_timestamp]
    return interval


def merge_df(df, urls):
    df['fullVisitorId'] = df['fullVisitorId'].astype(str)
    urls['url_id'] = urls['url_id'].astype(int)
    urls['title'] = urls['title'].astype(str)
    
    df_result = pd.merge(df, urls, on='url_id', how='left')

    labels, levels = pd.factorize(df_result['url_id'])
    df_result['url_id'] = labels
    df_result.set_index(['user_id', 'url_id'], inplace=True)
    df_result.sort_index(inplace=True)
    df_result.dropna(how='any',inplace=True)
    df_result.drop_duplicates(inplace=True)
    return df_result


df = make_df('16/03/2017 9:00:00', '16/03/2017 10:00:00')
df_result = merge_df(df, urls)

In [19]:
display(df_result.info())
display(df_result.head(20))
display(df_result.tail(20))

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 0 entries
Data columns (total 5 columns):
fullVisitorId     0 non-null object
visitStartTime    0 non-null int64
title             0 non-null object
pagePath          0 non-null object
doc2vec           0 non-null object
dtypes: int64(1), object(4)
memory usage: 684.9+ KB


None

Unnamed: 0_level_0,Unnamed: 1_level_0,fullVisitorId,visitStartTime,title,pagePath,doc2vec
user_id,url_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


Unnamed: 0_level_0,Unnamed: 1_level_0,fullVisitorId,visitStartTime,title,pagePath,doc2vec
user_id,url_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
