In [1]:
import numpy as np
import pandas as pd

In [2]:
texts_month = pd.read_csv('data/texts_31days.csv', index_col=0)
texts_first = pd.read_csv('data/texts_12.04-13.04.csv', index_col=0)
texts_second = pd.read_csv('data/texts_13.04-14.04.csv', index_col=0)
texts = pd.concat([texts_month, texts_first, texts_second])
texts.index.names = ['url_id']

texts.to_csv('data/texts.csv')

In [14]:
from datetime import datetime
from datetime import timezone
from zipfile import ZipFile
import os

def make_df(start_time, end_time):
    timestamps = sorted(os.listdir('data/timestamps'))
    start_timestamp, end_timestamp = make_timestamps_from_datetime(start_time, end_time, timestamps)
    interval = make_interval(start_timestamp, end_timestamp, timestamps)
    with ZipFile('data/timestamps.zip') as timestamps_zip:
        df_list = [pd.read_csv(timestamps_zip.open("timestamps/" + file), header=None, names=['fullVisitorId', 'url_id', 'visitStartTime']) for file in interval]
    df = pd.concat(df_list)
    labels, levels = pd.factorize(df['fullVisitorId'])
    df['user_id'] = labels
    df.set_index(['user_id','url_id'], inplace=True, drop=True)
    return df

# first = 12/03/2017 07:00:00, last = 14/04/2017 11:11:29 1491818423 1491991225
def make_timestamps_from_datetime(start_time, end_time, timestamps):
    if start_time == 'first':
        start_timestamp = timestamps[0]
    else:
        start_datetime = datetime.strptime(start_time, '%d/%m/%Y %H:%M:%S')
        start_timestamp = (start_datetime - datetime(1970, 1, 1)).total_seconds()
        
    if end_time == 'last':
        end_timestamp = timestamps[-1]
    else:
        end_datetime = datetime.strptime(end_time, '%d/%m/%Y %H:%M:%S')
        end_timestamp = (end_datetime - datetime(1970, 1, 1)).total_seconds()
    return (start_timestamp, end_timestamp)

def make_interval(start_timestamp, end_timestamp, timestamps):
    start_timestamp = str(start_timestamp)
    end_timestamp = str(end_timestamp)
    interval = [t for t in timestamps if t >= start_timestamp and t <= end_timestamp]
    return interval

# Using texts.csv to make urls for each url_id
def make_urls_df():
    texts = pd.read_csv('data/texts.csv')
    tag_cleaned = texts['tag'].str.split().str.get(0)
    texts['tag_cleaned'] = tag_cleaned
    texts['url_id'] = texts['url_id'].astype(str)
    texts['pagePath'] = '/t/' + texts['tag_cleaned'] + '/' + texts['url_id']
    texts.set_index(['url_id'], inplace=True)
    urls = texts.drop(['subtitle', 'tag', 'title', 'tag_cleaned'], axis=1)
    return urls

def merge_df(df, urls):
    df.reset_index(level=['url_id'], inplace=True)
    df.reset_index(level=['user_id'], inplace=True)
    urls.reset_index(level=['url_id'], inplace=True)
    urls['url_id'] = urls['url_id'].astype(int)
    df['fullVisitorId'] = df['fullVisitorId'].astype(str)
    df_result = pd.merge(df, urls, on='url_id', how='left')
    labels, levels = pd.factorize(df_result['url_id'])
    df_result['url_id'] = labels
    df_result.set_index(['user_id', 'url_id'], inplace=True)
    return df_result

df = make_df('15/03/2017 10:00:00', '16/03/2017 10:00:00')
urls = make_urls_df()
df_result = merge_df(df, urls)

In [15]:
display(df_result.info())
display(df_result.head(150))
display(df_result.tail(150))

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 11969533 entries, (0, 0) to (1012770, 12641)
Data columns (total 3 columns):
fullVisitorId     object
visitStartTime    int64
pagePath          object
dtypes: int64(1), object(2)
memory usage: 373.4+ MB


None

Unnamed: 0_level_0,Unnamed: 1_level_0,fullVisitorId,visitStartTime,pagePath
user_id,url_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,1036536157958312061,1489571954,/t/экономика/939611
0,0,1036536157958312061,1489571954,/t/экономика/939611
1,1,-4611953592302795458,1489571994,/t/новости/163224
1,1,-4611953592302795458,1489571994,/t/новости/163224
1,1,-4611953592302795458,1489571994,/t/новости/163224
2,2,5259739520833585814,1489572000,/t/происшествия/985017
2,2,5259739520833585814,1489572000,/t/новости/985017
2,2,5259739520833585814,1489572000,/t/новости/985017
3,2,6061855442569952128,1489571984,/t/происшествия/985017
3,2,6061855442569952128,1489571984,/t/новости/985017


Unnamed: 0_level_0,Unnamed: 1_level_0,fullVisitorId,visitStartTime,pagePath
user_id,url_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
934528,2347,442690634461732406,1489658313,/t/мнения/984473
934528,2347,442690634461732406,1489658313,/t/мнения/984473
1002862,13238,3373014268182322385,1489658288,/t/технологии/985616
1002862,13238,3373014268182322385,1489658288,/t/авто/985616
1013315,13238,7198888419577621261,1489658302,/t/технологии/985616
1013315,13238,7198888419577621261,1489658302,/t/авто/985616
294489,40031,7121298180140998711,1489658318,/t/спорт/985929
939873,40031,-1494589981853249330,1489658303,/t/спорт/985929
434859,40031,-4998192895174106326,1489658343,/t/спорт/985929
1008658,40031,-7196422238886125038,1489658305,/t/спорт/985929
