In [8]:
# Standard library
import random as rn
import os
import sys
import gc

# Numerical computing
import numpy as np
import pandas as pd
import scipy as sp

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
%matplotlib inline
sns.set(style="whitegrid")

# Warnings: every warning category is silenced for the whole session; the
# narrower FutureWarning-only filter is kept below, commented out.
import warnings
# warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
gc.collect()

# Parent of the current working directory, appended to sys.path once.
# Presumably this is where utils.py and graphs.py live - TODO confirm.
BASE_DIR = os.path.dirname(os.getcwd())
if BASE_DIR not in sys.path: sys.path.append(BASE_DIR)

# NOTE(review): star-imports hide which names this notebook takes from the
# project modules; consider importing the needed names explicitly.
from utils import *
from graphs import *

# One fixed seed is applied to NumPy's global RNG and to the `random` module.
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
# here likely only affects child processes, not this kernel - confirm.
SEED = 29082013
os.environ['PYTHONHASHSEED']=str(SEED)
np.random.seed(SEED)
rn.seed(SEED)

# Data folder, relative to the working directory; its contents are listed as
# the cell output.
subfolder = "data"
os.listdir(subfolder)

['CONTENT_CATEGORY.csv',
 '.ipynb_checkpoints',
 'device_data.csv',
 'sampleSubmission.csv',
 'SITE_ID.csv',
 'PAGE.csv',
 'CONTENT_CATEGORY_TOP.csv',
 'conversiones.csv',
 'CONTENT_CATEGORY_BOTTOM.csv',
 'pageviews.csv']

In [9]:
def get_schedule(val):
    """Return the schedule label of the first bucket whose upper bound exceeds ``val``.

    ``val`` is presumably an hour of the day (0-23) - confirm against callers.
    Buckets are half-open: a value equal to a bound belongs to the next
    bucket. Anything not below the last bound gets 'luego_del_trabajo'.
    """
    # (exclusive upper bound, label), in ascending order of the bound.
    buckets = (
        (6, 'madrugada'),
        (9, 'antes_del_trabajo'),
        (13, 'trabajo_manana'),
        (16, 'almuerzo'),
        (19, 'trabajo_tarde'),
    )
    for upper_bound, label in buckets:
        if val < upper_bound:
            return label
    return 'luego_del_trabajo'
    
def get_day_cut(val):
    """Return the label of the first day bucket whose upper bound exceeds ``val``.

    ``val`` is presumably a day of the month (1-31) - confirm against callers.
    Buckets are half-open: a value equal to a bound belongs to the next
    bucket. Anything not below the last bound gets 'fin_de_mes'.
    """
    # (exclusive upper bound, label), in ascending order of the bound.
    buckets = (
        (7, 'inicio_mes'),
        (13, '7_12'),
        (18, 'quincena'),
        (25, '18_24'),
    )
    for upper_bound, label in buckets:
        if val < upper_bound:
            return label
    return 'fin_de_mes'

In [3]:
col_user = 'USER_ID'

# Read every column as text, with FEC_EVENT parsed as a timestamp.
pageviews_path = "{}/pageviews.csv".format(subfolder)
data = pd.read_csv(pageviews_path, parse_dates=["FEC_EVENT"], dtype=str)

# Order the events by user and, within each user, by event time.
data = data.sort_values([col_user, "FEC_EVENT"])
data.shape

(17936934, 8)

In [4]:
# Column dtypes after loading: FEC_EVENT was parsed as a date, every other
# column was read with dtype=str.
data.dtypes

FEC_EVENT                  datetime64[ns]
PAGE                               object
CONTENT_CATEGORY                   object
CONTENT_CATEGORY_TOP               object
CONTENT_CATEGORY_BOTTOM            object
SITE_ID                            object
ON_SITE_SEARCH_TERM                object
USER_ID                            object
dtype: object

In [7]:
# Preview only a handful of rows: rendering 1000 rows of the sorted frame
# bloats the saved notebook without adding information.
data.head(10)

Unnamed: 0,FEC_EVENT,PAGE,CONTENT_CATEGORY,CONTENT_CATEGORY_TOP,CONTENT_CATEGORY_BOTTOM,SITE_ID,ON_SITE_SEARCH_TERM,USER_ID
284,2018-01-01 09:56:47,14,4,2,4,2,1,0
285,2018-01-01 09:57:19,14,4,2,4,2,1,0
286,2018-01-01 09:57:48,12,4,2,4,2,1,0
287,2018-01-01 09:57:48,10,4,2,4,2,1,0
288,2018-01-01 09:57:49,11,4,2,4,2,1,0
289,2018-01-01 10:02:09,11,4,2,4,2,1,0
290,2018-01-01 10:02:09,12,4,2,4,2,1,0
291,2018-01-01 10:02:09,10,4,2,4,2,1,0
292,2018-01-01 10:07:12,11,4,2,4,2,1,0
293,2018-01-01 10:07:12,12,4,2,4,2,1,0


In [19]:
# One row per user holding the most frequent value of every other column.
# value_counts() orders by descending count and drops nulls, so its first
# index label is the mode. When a user's column is entirely null the result
# is empty and `.index[0]` would raise IndexError, so NaN is returned instead.
group_data = data.groupby(
    by=['USER_ID']
).agg(
    lambda x: x.value_counts().index[0] if x.count() else np.nan
).add_suffix(
    '_group'
).sort_index()

group_data.head(20)

Unnamed: 0_level_0,FEC_EVENT_group,PAGE_group,CONTENT_CATEGORY_group,CONTENT_CATEGORY_TOP_group,CONTENT_CATEGORY_BOTTOM_group,SITE_ID_group,ON_SITE_SEARCH_TERM_group
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2018-08-10 14:25:19,10,4,2,4,2,1
1,2018-03-14 10:28:44,10,4,2,4,2,1
10,2018-01-10 09:45:57,10,4,2,4,2,1
100,2018-01-03 09:51:31,2,2,2,2,3,1
1000,2018-02-03 22:53:00,2,2,2,2,3,1
10000,2018-04-03 21:11:31,85,2,2,2,3,1
10001,2018-01-02 17:35:32,2,2,2,2,3,1
10002,2018-06-22 17:35:47,14,2,2,2,2,1
10003,2018-04-05 17:55:08,3,2,2,2,3,1
10004,2018-03-08 08:56:08,10,4,2,4,2,1


In [20]:
# Last rows of the per-user summary. The USER_ID index holds strings, so the
# sort order is lexicographic rather than numeric.
group_data.tail(20)

Unnamed: 0_level_0,FEC_EVENT_group,PAGE_group,CONTENT_CATEGORY_group,CONTENT_CATEGORY_TOP_group,CONTENT_CATEGORY_BOTTOM_group,SITE_ID_group,ON_SITE_SEARCH_TERM_group
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9981,2018-04-10 20:11:13,3,2,2,2,3,1
9982,2018-04-18 10:10:54,2,2,2,2,3,1
9983,2018-05-14 16:34:08,2,2,2,2,3,1
9984,2018-01-26 11:22:52,3,2,2,2,3,1
9985,2018-01-13 21:13:33,14,4,2,4,2,1
9986,2018-04-25 15:46:34,202,2,2,2,1,1
9987,2018-01-02 20:25:06,2,2,2,2,3,1
9988,2018-03-25 21:12:53,3,2,2,2,3,1
9989,2018-01-04 09:28:48,3,2,2,2,3,1
999,2018-01-09 13:22:28,2,2,2,2,2,1


In [11]:
# Print, for every aggregated column, the shape of its array of distinct values.
for col_name, col_values in group_data.items():
    print(col_name, col_values.unique().shape)

FEC_EVENT (11506,)
PAGE (60,)
CONTENT_CATEGORY (8,)
CONTENT_CATEGORY_TOP (4,)
CONTENT_CATEGORY_BOTTOM (8,)
SITE_ID (4,)
ON_SITE_SEARCH_TERM (1,)


In [16]:
# Keep only events whose calendar month is January through September.
# NOTE(review): this rebinds `data`, so every cell run afterwards sees the
# filtered frame; the filter considers the month only, not the year.
data = data[data.FEC_EVENT.dt.month < 10]

# Most frequent value of each column per user, computed on the filtered events.
# value_counts() drops nulls, so a user whose column is entirely null yields an
# empty result; return NaN there instead of failing on `.index[0]`.
group_data_less_10 = data.groupby(by=['USER_ID']).agg(
    lambda x: x.value_counts().index[0] if x.count() else np.nan
).add_suffix('_group').sort_index()
group_data_less_10.head(20)

Unnamed: 0_level_0,FEC_EVENT_group,PAGE_group,CONTENT_CATEGORY_group,CONTENT_CATEGORY_TOP_group,CONTENT_CATEGORY_BOTTOM_group,SITE_ID_group,ON_SITE_SEARCH_TERM_group
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2018-08-10 14:25:19,10,4,2,4,2,1
1,2018-03-14 10:28:44,10,4,2,4,2,1
10,2018-01-10 09:45:57,10,4,2,4,2,1
100,2018-01-03 09:51:31,2,2,2,2,3,1
1000,2018-02-03 22:53:00,2,2,2,2,3,1
10000,2018-04-03 21:11:31,85,2,2,2,3,1
10001,2018-01-02 17:35:32,2,2,2,2,3,1
10002,2018-06-22 17:35:47,14,2,2,2,2,1
10003,2018-04-05 17:55:08,3,2,2,2,3,1
10004,2018-03-08 08:56:08,10,4,2,4,2,1


In [17]:
# Print, for every aggregated column, the shape of its array of distinct values.
for col_name, col_values in group_data_less_10.items():
    print(col_name, col_values.unique().shape)

FEC_EVENT_group (11179,)
PAGE_group (62,)
CONTENT_CATEGORY_group (8,)
CONTENT_CATEGORY_TOP_group (5,)
CONTENT_CATEGORY_BOTTOM_group (8,)
SITE_ID_group (4,)
ON_SITE_SEARCH_TERM_group (1,)


In [None]:
"""
FEC_EVENT (11506,)
PAGE (60,)
CONTENT_CATEGORY (8,)
CONTENT_CATEGORY_TOP (4,)
CONTENT_CATEGORY_BOTTOM (8,)
SITE_ID (4,)
ON_SITE_SEARCH_TERM (1,)
"""

In [15]:
# Compare the per-user summaries: rows (users) left after the month filter
# versus rows in the summary built before it.
group_data_less_10.shape, group_data.shape

((11387, 7), (11676, 7))