In [1]:
#Importing Essential libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from datetime import datetime
from sklearn.model_selection import train_test_split, KFold

In [2]:
#Reading the four input CSV files into DataFrames (train, test, view log, item data)
tr, ts, view, item = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'view_log.csv', 'item_data.csv')
)

In [3]:
#Parse impression_time into datetime64 for both the train and the test frame
for frame in (tr, ts):
    frame['impression_time'] = pd.to_datetime(frame['impression_time'])

#Show the resulting train schema (dtypes and non-null counts)
tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237609 entries, 0 to 237608
Data columns (total 7 columns):
impression_id      237609 non-null object
impression_time    237609 non-null datetime64[ns]
user_id            237609 non-null int64
app_code           237609 non-null int64
os_version         237609 non-null object
is_4G              237609 non-null int64
is_click           237609 non-null int64
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 12.7+ MB


### <font color='green'>Objective:</font>
> For every unique `user_id` in the view_log dataset, aggregate that user's entire view log into a single row of user-level features.

### <font color='green'>Approach:</font>
> 1. Merge view_log and item dataset

> 2. Prepare user level aggregation data from the merged dataset

In [4]:
#Converting server_time field to datetime format in view_log dataset
view['server_time'] = pd.to_datetime(view.server_time)

#Deriving dayoftheweek, month, hour and day from servertime in view_log
#dt.day_name() replaces dt.weekday_name (deprecated in pandas 0.23, removed in 1.0);
#it returns the same English day names, so the derived feature names do not change
view['server_weekday'] = view.server_time.dt.day_name()
view['server_month'] = view.server_time.dt.month
#NOTE: the hour is rounded to the nearest hour (not truncated), so e.g. 10:45 -> 11 and
#late-evening times such as 23:45 wrap to hour 0 while server_day keeps the original date
view['server_hour'] = view.server_time.dt.round('H').dt.hour
view['server_day'] = view.server_time.dt.day

#Merging view_log and item_data
#merge defaults to an inner join: view rows whose item_id is absent from item_data are dropped
view_item = view.merge(item, on='item_id')
view_item.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3116840 entries, 0 to 3116839
Data columns (total 14 columns):
server_time       datetime64[ns]
device_type       object
session_id        int64
user_id           int64
item_id           int64
server_weekday    object
server_month      int64
server_hour       int64
server_day        int64
item_price        int64
category_1        int64
category_2        int64
category_3        int64
product_type      int64
dtypes: datetime64[ns](1), int64(11), object(2)
memory usage: 356.7+ MB


#### Creating a new aggregated dataset for every user_id from the merged dataset

In [5]:
#Per-user summary statistics over the merged view/item log:
#  device_type, session_id        -> number of distinct values
#  item_id                        -> number of distinct items and total number of views
#  item_price                     -> minimum, maximum and mean price of the viewed items
#  category_1/2/3, product_type   -> number of distinct values
agg_col = dict(
    device_type=['nunique'],
    session_id=['nunique'],
    item_id=['nunique', 'count'],
    item_price=['min', 'max', 'mean'],
    category_1=['nunique'],
    category_2=['nunique'],
    category_3=['nunique'],
    product_type=['nunique'],
)

per_user = view_item.groupby(['user_id'])
user_agg = per_user.agg(agg_col)

#Flatten the (column, statistic) MultiIndex into names like user_item_price_mean
flat_names = []
for parts in user_agg.columns.values:
    flat_names.append('user_{}'.format('_'.join(parts).strip()))
user_agg.columns = flat_names

In [6]:
#How many visits & unique items in each weekday, month, hour, day, category_1 & category_2?
#For every distinct value x of each column c listed below, two per-user features are built:
#  user_<c>_<x>_item_count   -> number of view rows of that user where c == x
#  user_<c>_<x>_item_nunique -> number of distinct item_id among those rows
#Users without any matching row get 0.
col_l = ['server_weekday', 'server_month', 'server_hour', 'server_day', 'category_1', 'category_2']

#Collect the new columns first and attach them with a single concat; inserting
#hundreds of columns one at a time fragments the frame and slows every insert
new_cols = {}
for c in col_l:
    for x in view_item[c].unique():
        #filter the large merged frame once per value and reuse it for both features
        items_by_user = view_item.loc[view_item[c] == x].groupby('user_id')['item_id']
        prefix = 'user_' + str(c) + '_' + str(x)

        #reindex aligns to every user in user_agg; users absent from the subset become NaN -> 0
        new_cols[prefix + '_item_count'] = items_by_user.size().reindex(user_agg.index).fillna(0)
        new_cols[prefix + '_item_nunique'] = items_by_user.nunique().reindex(user_agg.index).fillna(0)

new_features = pd.DataFrame(new_cols, index=user_agg.index)

#drop any same-named columns first so re-running this cell does not duplicate features
user_agg = pd.concat(
    [user_agg.drop(columns=new_features.columns, errors='ignore'), new_features],
    axis=1,
)

In [7]:
#Write the user-level features to user_agg.csv; reset_index turns the user_id index
#into an ordinary column so it is kept in the file while the row index is not written
user_agg_export = user_agg.reset_index()
user_agg_export.to_csv('user_agg.csv', index=False)