In [None]:
import numpy as np
import pandas as pd

import nltk, re
from nltk.corpus import stopwords
# Download the NLTK 'stopwords' corpus; clean_text reads the Russian list from it.
nltk.download('stopwords')

import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Read the monthly dump and the two daily dumps (first CSV column is the index),
# then stack them into a single frame whose index is named url_id.
_TEXT_DUMPS = (
    'data/news/texts_31days.csv',
    'data/news/texts_12.04-13.04.csv',
    'data/news/texts_13.04-14.04.csv',
)
texts_month, texts_first, texts_second = (
    pd.read_csv(dump_path, index_col=0) for dump_path in _TEXT_DUMPS
)
texts = pd.concat([texts_month, texts_first, texts_second])
texts.index.name = 'url_id'

In [None]:
# Build a pagePath for every url_id found in texts.csv
def make_urls_df():
    '''Read data/news/texts.csv and return it with a pagePath column added.

    pagePath is '/t/<first word of tag>/<url_id>'; url_id is converted to str.
    The 'subtitle' and 'tag' columns (and the temporary 'tag_cleaned' one)
    are not part of the returned frame.
    '''
    frame = pd.read_csv('data/news/texts.csv')

    # Only the first whitespace-separated word of the tag goes into the path.
    frame['tag_cleaned'] = frame['tag'].str.split().str[0]
    frame['url_id'] = frame['url_id'].astype(str)
    frame['pagePath'] = '/t/' + frame['tag_cleaned'] + '/' + frame['url_id']

    return frame.drop(columns=['subtitle', 'tag', 'tag_cleaned'])

# Keep only complete rows, and only the first row for each distinct title.
urls = (
    make_urls_df()
    .dropna(how='any')
    .drop_duplicates(['title'])
)

In [None]:
# Text cleaning
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

# Russian stop words; filled on first use so the corpus is read only once
# instead of once per clean_text call.
_RUSSIAN_STOPWORDS = None


def _russian_stopwords():
    '''Return the NLTK Russian stop words as a frozenset, loading them on first call.'''
    global _RUSSIAN_STOPWORDS
    if _RUSSIAN_STOPWORDS is None:
        _RUSSIAN_STOPWORDS = frozenset(stopwords.words("russian"))
    return _RUSSIAN_STOPWORDS


def clean_text(text, remove_stopwords=True):
    '''Lower-case and tokenize the text, with the option to remove stopwords.

    text: the string to process.
    remove_stopwords: when true, tokens found in the NLTK Russian stop-word
        list are dropped.

    Returns the list of tokens (runs of \\w characters), not a joined string.
    '''
    tokens = tokenizer.tokenize(text.lower())

    # Optionally, remove stop words
    if remove_stopwords:
        stops = _russian_stopwords()
        tokens = [w for w in tokens if w not in stops]

    return tokens

# PoS tagging
from ufal.udpipe import Model, Pipeline
# Path of the UDPipe model file. Note that tag_ud's own default for its
# `modelfile` parameter is a different path, so pass this value explicitly.
modelfile = 'models/udpipe_syntagrus.model'

# (model, pipeline) pairs keyed by model path, so each model file is loaded once.
# The model is stored next to its pipeline to keep it alive while the pipeline is used.
_UD_PIPELINES = {}


def _merge_propn_runs(tagged):
    '''Collapse every run of consecutive *PROPN tags into one 'a::b_PROPN' tag.'''
    merged = []
    run = []

    def flush():
        # Emit the pending proper-noun run (if any) and reset it.
        if len(run) > 1:
            merged.append('::'.join(x.split('_')[0] for x in run) + '_PROPN')
        elif run:
            merged.append(run[0])
        del run[:]

    for tag in tagged:
        if tag.endswith('PROPN'):
            run.append(tag)
        else:
            flush()
            merged.append(tag)
    # A run that reaches the end of the text must be emitted too; without this
    # final flush, trailing proper nouns were silently lost.
    flush()
    return merged


def tag_ud(text, modelfile='udpipe_syntagrus.model'):
    '''Tag `text` with UDPipe and return a list of 'lemma_POS' strings.

    text: raw text; it is tokenized by the UDPipe pipeline.
    modelfile: path of the UDPipe model to use.

    Each token becomes '<lower-cased column 3>_<column 4>' of the CoNLL-U
    output. Consecutive PROPN tokens are merged into a single
    'first::second_PROPN' entry.

    Raises RuntimeError if the model file cannot be loaded.
    '''
    cached = _UD_PIPELINES.get(modelfile)
    if cached is None:
        model = Model.load(modelfile)
        if model is None:
            raise RuntimeError('Cannot load UDPipe model from %r' % (modelfile,))
        pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
        cached = _UD_PIPELINES[modelfile] = (model, pipeline)
    pipeline = cached[1]

    processed = pipeline.process(text)
    tagged = []
    for line in processed.split('\n'):
        # Skip blank separators and '#' comment lines of the CoNLL-U output.
        if not line or line.startswith('#'):
            continue
        columns = line.split('\t')
        tagged.append(columns[2].lower() + '_' + columns[3])
    return _merge_propn_runs(tagged)

In [None]:
import gensim
from gensim.models.keyedvectors import KeyedVectors

# Load the saved fastText model; create_average_vec looks word vectors up in it.
ft_model = gensim.models.fasttext.FastText.load('models/araneum_none_fasttextcbow_300_5_2018/araneum_none_fasttextcbow_300_5_2018.model')
# Alternative embedding source, currently disabled:
# word2vec_model = gensim.models.KeyedVectors.load_word2vec_format("models/word2vec/ruscorpora_upos_skipgram_300_5_2018.vec", binary=False)

In [None]:
# doc2vec for every news title
vec_dim = 300

def create_average_vec(doc):
    '''Average the fastText vectors of the words in `doc`.

    doc: iterable of word strings. Words missing from ft_model.wv.vocab are
        ignored.

    Returns an array of length vec_dim; it is all zeros when no word of
    `doc` is in the vocabulary.
    '''
    total = np.zeros((vec_dim,), dtype='float32')
    known_words = [word for word in doc if word in ft_model.wv.vocab]
    for word in known_words:
        total = total + ft_model[word]
    if not known_words:
        return total
    return total / float(len(known_words))


def create_doc2vec(text):
    '''Turn one value into its averaged word vector.

    The value is converted with str(), tokenized by clean_text and the
    tokens are averaged by create_average_vec.
    '''
    # The tag_ud PoS-tagging step is currently disabled; plain tokens are used.
    tokens = clean_text(str(text))
    return create_average_vec(tokens)

# Compute one averaged vector per title and store the frame on disk.
# NOTE(review): to_csv presumably writes each vector as its text form, so the
# 'doc2vec' column would come back as strings when the file is re-read — confirm.
urls['doc2vec'] = urls['title'].apply(create_doc2vec)
urls.to_csv('data/news/urls_with_unique_titles_fasttext.csv')

In [1]:
# USEFUL CODE STARTS HERE
import numpy as np
import pandas as pd

from datetime import datetime
from datetime import timezone
from zipfile import ZipFile
import os

In [2]:
# Load the title table from disk; the first CSV column is used as the index.
urls = pd.read_csv('data/news/urls_with_unique_titles_fasttext.csv', index_col=0)

In [3]:
def make_df(start_time, end_time):
    '''Load the visit logs whose file names fall inside a time interval.

    start_time / end_time: either a 'dd/mm/YYYY HH:MM:SS' string or the
        keywords 'first' / 'last' (see make_timestamps_from_datetime).

    File names are listed from the data/news/timestamps directory, while the
    contents are read from the 'timestamps/' folder of data/news/timestamps.zip.

    Returns a DataFrame with columns fullVisitorId, url_id, visitStartTime and
    user_id, where user_id is an integer code assigned per distinct
    fullVisitorId.

    Raises ValueError when no file falls inside the interval.
    '''
    timestamps = sorted(os.listdir('data/news/timestamps'))
    start_timestamp, end_timestamp = make_timestamps_from_datetime(start_time, end_time, timestamps)
    interval = make_interval(start_timestamp, end_timestamp, timestamps)
    if not interval:
        # pd.concat would fail on an empty list with a message that hides the cause.
        raise ValueError('No timestamp files between %r and %r' % (start_time, end_time))

    columns = ['fullVisitorId', 'url_id', 'visitStartTime']
    df_list = []
    with ZipFile('data/news/timestamps.zip') as timestamps_zip:
        for file in interval:
            # Close every archive member as soon as it has been parsed.
            with timestamps_zip.open("timestamps/" + file) as member:
                df_list.append(pd.read_csv(member, header=None, names=columns))
    df = pd.concat(df_list)
    df['user_id'] = pd.factorize(df['fullVisitorId'])[0]
    return df


# Original note: first = 12/03/2017 07:00:00 (1491818423), last = 14/04/2017 11:11:29 (1491991225)
# NOTE(review): those epoch values do not appear to match those dates when read as UTC — confirm the real range.
def _to_epoch_seconds(moment):
    '''Convert a 'dd/mm/YYYY HH:MM:SS' string to whole seconds since 1970-01-01 (no timezone applied).'''
    parsed = datetime.strptime(moment, '%d/%m/%Y %H:%M:%S')
    # The format has one-second resolution, so int() loses nothing. An int also
    # renders as e.g. '1491991200' rather than '1491991200.0', which keeps the
    # string comparison in make_interval inclusive at the boundaries.
    return int((parsed - datetime(1970, 1, 1)).total_seconds())


def make_timestamps_from_datetime(start_time, end_time, timestamps):
    '''Resolve the interval bounds into timestamps.

    start_time: 'first' to use timestamps[0], otherwise a
        'dd/mm/YYYY HH:MM:SS' string.
    end_time: 'last' to use timestamps[-1], otherwise a
        'dd/mm/YYYY HH:MM:SS' string.
    timestamps: sorted sequence of available timestamp names.

    Returns (start_timestamp, end_timestamp). A bound taken from `timestamps`
    is returned as stored there; a parsed bound is an int number of seconds.

    Raises ValueError if a date string does not match the expected format.
    '''
    start_timestamp = timestamps[0] if start_time == 'first' else _to_epoch_seconds(start_time)
    end_timestamp = timestamps[-1] if end_time == 'last' else _to_epoch_seconds(end_time)
    return (start_timestamp, end_timestamp)


def make_interval(start_timestamp, end_timestamp, timestamps):
    '''Select the entries of `timestamps` lying between the two bounds, inclusive.

    Both bounds are converted with str() and compared to the entries as
    strings, i.e. lexicographically.
    '''
    lower, upper = str(start_timestamp), str(end_timestamp)
    return [name for name in timestamps if lower <= name <= upper]


def merge_df(df, urls):
    '''Attach article data to the visit log and index it by (user_id, url_id).

    df: visit log with at least the columns fullVisitorId, url_id and user_id.
    urls: article table with at least the columns url_id, title and pagePath.
        It is not modified.

    The frames are left-joined on url_id, url_id is then replaced by an
    integer code per distinct value, rows containing any missing value and
    exact duplicate rows are removed, and fullVisitorId, title and pagePath
    are converted to str.

    Returns the resulting DataFrame, sorted by its (user_id, url_id) index.
    '''
    # Work on a private copy so the dtype conversions do not leak into the
    # caller's frame.
    urls = urls.copy()
    urls['url_id'] = urls['url_id'].astype(int)
    urls['title'] = urls['title'].astype(str)

    df_result = pd.merge(df, urls, on='url_id', how='left')

    # Replace the raw ids by compact integer codes.
    df_result['url_id'] = pd.factorize(df_result['url_id'])[0]
    df_result.set_index(['user_id', 'url_id'], inplace=True)
    df_result.sort_index(inplace=True)
    # Visits whose url_id has no match in `urls` carry NaN here and are dropped.
    df_result.dropna(how='any', inplace=True)
    df_result.drop_duplicates(inplace=True)

    for column in ('fullVisitorId', 'title', 'pagePath'):
        df_result[column] = df_result[column].astype(str)

    return df_result

In [None]:
# You can choose a time interval between 12/03/2017 07:00:00 and 14/04/2017 11:11:29
# You can also pass 'first' or 'last' as arguments
start_time = '12/04/2017 10:00:00'
stop_time = '14/04/2017 11:11:29'

# Load the visits for the interval, then join them with the article table.
df = make_df(start_time, stop_time)
df_result = merge_df(df, urls)

In [None]:
# Click score of an article = number of rows in df_result that share its pagePath
scores = df_result['pagePath'].value_counts()
# Look every path up in the counts with a single vectorized map instead of
# calling a Python lambda once per row.
df_result['scores'] = df_result['pagePath'].map(scores)

In [None]:
# OPTIONALLY: write the merged, scored dataset to disk
df_result.to_csv('data/news/news_final_dataset.csv')

In [None]:
# info() prints its summary itself and returns None, so wrapping it in
# display() would only add a stray "None" to the output.
df_result.info()
display(df_result.head(20))
display(df_result.tail(20))