In [1]:
import pandas as pd
import numpy as np
import json
import gzip
import seaborn as sns
from datetime import datetime
from pandas.io.json import json_normalize
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.externals import joblib
%matplotlib inline

In [42]:
def open_json(data_file):    
    with open(data_file, 'r') as f:
        json_records = []
        for line in f:
            record = json.loads(line)
            json_records.append(record)
    return json_records

def gender_to_bin(g):
    if g == 'M':
        return 1
    else:
        return 0

def to_weekday(timestamp):
    try:
        weekday = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S").weekday()
    except ValueError:
        weekday = datetime.strptime(timestamp, "%Y-%m-%d").weekday()
    except TypeError:
        weekday = 9
    return weekday

def convert_date(timestamp):
    try:
        res = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
    except ValueError:
        res = datetime.strptime(timestamp, "%Y-%m-%d")
    except TypeError:
        res = 9
    return res

def get_data_pageview(page_type):
    json_records = open_json('data/sub_data/{}'.format(page_type))
    df = pd.DataFrame(json_records, columns=['uid', 'gender', 'page_type','timestamp'])
    df.columns = ['uid', 'gender', '{}'.format(page_type), 'date']
    df.gender = list(map(gender_to_bin, df.gender.values))
    df.date = list(map(to_weekday, df.date.values))
    df = pd.get_dummies(df, columns=['{}'.format(page_type),'date'])
    return df.groupby(['uid','gender'], as_index=False).sum()

def get_target_pageview(page_type):
    json_records = open_json('data/sub_target/{}'.format(page_type))
    df = pd.DataFrame(json_records, columns=['uid', 'page_type','timestamp'])
    df.columns = ['uid', '{}'.format(page_type), 'date']
    df.date = list(map(to_weekday, df.date.values))
    df = pd.get_dummies(df, columns=['{}'.format(page_type),'date'])
    return df.groupby(['uid'], as_index=False).sum()

## Create 5 DataFrames: catalog, purchase, pageview, target_purchase, target_pageview

In [43]:
%%time
catalog = pd.read_csv('data/catalog', usecols=['pid', 'current_price','original_price', 'category', 'sub_category', 'sub_sub_category'])
catalog = catalog.fillna(value=0)
catalog = catalog[catalog.current_price!=0]
catalog['original_price'] = np.where(catalog['original_price'] == 0, catalog['current_price'], catalog['original_price'])
catalog['diff_price'] = catalog['original_price'] - catalog['current_price']

purchase_data = open_json('data/sub_data/purchase')           
purchase = json_normalize(purchase_data, 'products', ['date','gender','uid'])

pageview_data = open_json('data/sub_data/products')   
pageview = pd.DataFrame(pageview_data, columns=['productId', 'timestamp', 'gender', 'uid'])
pageview.columns = ['pid', 'date', 'gender', 'uid']


purchase_data = open_json('data/sub_target/purchase')           
target_purchase = json_normalize(purchase_data, 'products', ['date','uid'])

pageview_data = open_json('data/sub_target/purchase')   
target_pageview = pd.DataFrame(pageview_data, columns=['productId', 'timestamp', 'uid'])
target_pageview.columns = ['pid', 'date', 'uid']

CPU times: user 8.73 s, sys: 112 ms, total: 8.84 s
Wall time: 9.42 s


## Transform gender and date into numbers

In [44]:
%%time
purchase.gender = list(map(gender_to_bin, purchase.gender.values))
purchase['hour'] = purchase['date']
purchase['month'] = purchase['date']
#purchase['day'] = purchase['date']
#pageview['hour'] = pageview['date']
#pageview['month'] = pageview['date']
#pageview['day'] = pageview['date']
#purchase.day = list(map(lambda t:convert_date(t).day, purchase.day.values))
purchase.month = list(map(lambda t:convert_date(t).month, purchase.month.values))
purchase.hour = list(map(lambda t:convert_date(t).hour, purchase.hour.values))
purchase.date = list(map(to_weekday, purchase.date.values))
pageview.gender = list(map(gender_to_bin, pageview.gender.values))
#pageview.day = list(map(lambda t:convert_date(t).day, pageview.day.values))
#pageview.month = list(map(lambda t:convert_date(t).month, pageview.month.values))
#pageview.hour = list(map(lambda t:convert_date(t).hour, pageview.hour.values))
pageview.date = list(map(to_weekday, pageview.date.values))

target_purchase['hour'] = target_purchase['date']
target_purchase['month'] = target_purchase['date']
#target_purchase['day'] = target_purchase['date']
#target_pageview['hour'] = target_pageview['date']
#target_pageview['month'] = target_pageview['date']
#target_pageview['day'] = target_pageview['date']
#target_purchase.day = list(map(lambda t:convert_date(t).day, target_purchase.day.values))
target_purchase.month = list(map(lambda t:convert_date(t).month, target_purchase.month.values))
target_purchase.hour = list(map(lambda t:convert_date(t).hour, target_purchase.hour.values))
target_purchase.date = list(map(to_weekday, target_purchase.date.values))
#target_pageview.day = list(map(lambda t:convert_date(t).day, target_pageview.day.values))
#target_pageview.month = list(map(lambda t:convert_date(t).month, target_pageview.month.values))
#target_pageview.hour = list(map(lambda t:convert_date(t).hour, target_pageview.hour.values))
target_pageview.date = list(map(to_weekday, target_pageview.date.values))

CPU times: user 22.1 s, sys: 12 ms, total: 22.1 s
Wall time: 22.1 s


## Set dummies vars for categorical features

In [45]:
%%time
categorical_p = ['category', 'sub_category', 'sub_sub_category', 'date','hour','month']
categorical_v = ['category', 'sub_category', 'sub_sub_category', 'date']

purchase = purchase.join(catalog.set_index('pid'), on='pid')
purchase = pd.get_dummies(purchase, columns=categorical_p).iloc[:,1:]

pageview = pageview.join(catalog.set_index('pid'), on='pid')
pageview = pd.get_dummies(pageview, columns=categorical_v).iloc[:,1:]

target_purchase = target_purchase.join(catalog.set_index('pid'), on='pid')
target_purchase = pd.get_dummies(target_purchase, columns=categorical_p).iloc[:,1:]

target_pageview = target_pageview.join(catalog.set_index('pid'), on='pid')
target_pageview = pd.get_dummies(target_pageview, columns=categorical_v).iloc[:,1:]

CPU times: user 1.21 s, sys: 460 ms, total: 1.67 s
Wall time: 1.67 s


## Merge users with same uid (sum entries) and join DFs

In [46]:
purchase.current_price = purchase.current_price.values*purchase.quantity
purchase.original_price = purchase.original_price.values*purchase.quantity
purchase.diff_price = purchase.diff_price.values*purchase.quantity

In [47]:
%%time
purchase = purchase.groupby(['uid','gender'], as_index=False).sum()
pageview = pageview.groupby(['uid','gender'], as_index=False).sum()
data = purchase.join(pageview.set_index(['uid','gender']), on=['uid','gender'], rsuffix='_v')


target_purchase = target_purchase.groupby(['uid'], as_index=False).sum()
target_pageview = target_pageview.groupby(['uid'], as_index=False).sum()
target_data = target_purchase.join(target_pageview.set_index('uid'), on='uid', rsuffix='_v')

CPU times: user 1.82 s, sys: 1.87 s, total: 3.69 s
Wall time: 8.15 s


In [48]:
purchase.to_csv('grouped_purchase_data.csv')
pageview.to_csv('grouped_product_pageview_data.csv')
target_purchase.to_csv('grouped_target_purchase_data.csv')
target_pageview.to_csv('grouped_target_product_pageview_data.csv')

## Additional features

In [7]:
search = get_data_pageview('home')
target_search = get_target_pageview('home')

data2 = data.join(search.set_index(['uid','gender']), on=['uid','gender'], rsuffix='_s')
target_data2 = target_data.join(target_search.set_index('uid'), on='uid', rsuffix='_s')

In [8]:
len(search.uid.values)

14213

In [49]:
data = data.fillna(value=0)
target_data = target_data.fillna(value=0)

In [50]:
features = data.columns[2:]
target = data.columns[1]
features2 = target_data.columns[1:]
features = list(set(features)&set(features2))

In [51]:
X = data[features]
Y = data[target]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=23492)

In [52]:
%%time
forest = RandomForestClassifier(n_estimators=100)
forest.fit(x_train, y_train)
y_pred = forest.predict(x_test)
score1 = metrics.accuracy_score(y_test, y_pred)
score2 = metrics.f1_score(y_test, y_pred, average='binary')
score3 = metrics.roc_auc_score(y_test, y_pred)
#joblib.dump(forest, '83.pkl')
print score1, score2, score3

0.769458987784 0.808467449616 0.756076647811
CPU times: user 2.33 s, sys: 24 ms, total: 2.36 s
Wall time: 2.41 s


In [53]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100,learning_rate=1.0,max_depth=1,random_state=0)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
score1 = metrics.accuracy_score(y_test, y_pred)
score2 = metrics.f1_score(y_test, y_pred, average='binary')
score3 = metrics.roc_auc_score(y_test, y_pred)
print score1, score2, score3

0.77294938918 0.811912678907 0.759096883389


## Output

In [16]:
X = target_data[features]
answer = forest.predict(X)
users = target_data.uid.values
ans = []
for i,u in enumerate(users):
    if answer[i]:
        g = 'M'
    else:
        g = 'F'
    obj = {'a':u, 'b':g}
    ans.append(obj)
    
import csv

with open('ans10.csv', 'wb') as f:
    w = csv.DictWriter(f, fieldnames=['a','b'])
    for obj in ans:
        w.writerow(obj)

In [18]:
data.to_csv('final_data.csv')