In [1]:
import csv
import gzip
import json
from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sns
from pandas.io.json import json_normalize
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
def open_json(data_file):
    """Read a JSON-lines file and return its records as a list.

    Each non-blank line of ``data_file`` is parsed with ``json.loads``.
    Blank or whitespace-only lines (for example a trailing newline at the
    end of the file) are skipped instead of raising a decode error.

    Parameters
    ----------
    data_file : str
        Path of the file to read; it is opened in text mode.

    Returns
    -------
    list
        One parsed object per non-blank line, in file order. An empty file
        yields an empty list.

    Raises
    ------
    ValueError
        If a non-blank line is not valid JSON (raised by ``json.loads``).
    """
    with open(data_file, 'r') as f:
        # line.strip() is empty for blank lines, which json.loads cannot parse.
        return [json.loads(line) for line in f if line.strip()]

def gender_to_bin(g):
    """Encode a gender label as an integer: 1 for 'M', 0 for anything else."""
    return 1 if g == 'M' else 0

def bin_to_gender(g):
    """Decode an integer gender flag: 'M' for a value equal to 1, 'F' otherwise."""
    return 'M' if g == 1 else 'F'

def to_weekday(timestamp):
    """Return the weekday index (Monday == 0) of a timestamp string.

    The string is parsed as "%Y-%m-%d %H:%M:%S" first and, if that format
    does not match, as "%Y-%m-%d". A value ``strptime`` rejects with a
    TypeError (i.e. not a string) maps to the fixed weekday 1. A string
    matching neither format lets the second ValueError propagate.
    """
    try:
        parsed = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
    except TypeError:
        # Non-string input: fall back to the constant default.
        return 1
    except ValueError:
        # No time component: retry with the date-only layout.
        parsed = datetime.strptime(timestamp, "%Y-%m-%d")
    return parsed.weekday()

## Create 5 DataFrames: catalog, purchase, pageview, target_purchase, target_pageview

In [129]:
%%time
# Product catalog: only the product id, price and category hierarchy are read.
catalog = pd.read_csv('data/catalog.gz', usecols=['pid', 'current_price', 'category', 'sub_category', 'sub_sub_category'])
# Missing prices become 0. fillna replaces an element-by-element loop that
# wrote through `Series.values`; that array is not guaranteed to be a writable
# view of the column (it is read-only under pandas Copy-on-Write).
catalog['current_price'] = catalog['current_price'].fillna(0)

# Labelled purchases: one row per entry of each record's 'products' list,
# with the record-level date, gender and uid repeated on every row.
purchase_data = open_json('data/purchase_data')
purchase = json_normalize(purchase_data, 'products', ['date', 'gender', 'uid'])

# Labelled page views, renamed to share the column names used by `purchase`.
pageview_data = open_json('data/products_data')
pageview = pd.DataFrame(pageview_data, columns=['productId', 'timestamp', 'gender', 'uid'])
pageview.columns = ['pid', 'date', 'gender', 'uid']


# Target (unlabelled) purchases: same layout, without the gender column.
# NOTE: `purchase_data` / `pageview_data` are deliberately reused, so after
# this cell they hold the target records, not the labelled ones.
purchase_data = open_json('data/purchase_new_target')
target_purchase = json_normalize(purchase_data, 'products', ['date', 'uid'])

# Target page views, renamed the same way as the labelled ones.
pageview_data = open_json('data/products_new_target')
target_pageview = pd.DataFrame(pageview_data, columns=['productId', 'timestamp', 'uid'])
target_pageview.columns = ['pid', 'date', 'uid']

CPU times: user 10.2 s, sys: 104 ms, total: 10.3 s
Wall time: 11.1 s


## Transform gender and date into numbers

In [130]:
%%time
# Gender labels exist only on the labelled frames; encode them as 1 ('M') / 0.
for labelled_frame in (purchase, pageview):
    labelled_frame['gender'] = [gender_to_bin(label) for label in labelled_frame['gender'].values]

# Every frame carries a 'date' column; replace it with the weekday index.
for dated_frame in (purchase, pageview, target_purchase, target_pageview):
    dated_frame['date'] = [to_weekday(stamp) for stamp in dated_frame['date'].values]

CPU times: user 21.8 s, sys: 8 ms, total: 21.8 s
Wall time: 21.8 s


## Create dummy variables for the categorical features

In [131]:
%%time
categorical = ['category', 'sub_category', 'sub_sub_category', 'date']


def _with_catalog_dummies(frame):
    """Join catalog attributes on 'pid', one-hot encode `categorical`, drop the first column."""
    joined = frame.join(catalog.set_index('pid'), on='pid')
    encoded = pd.get_dummies(joined, columns=categorical)
    # The positional slice discards whatever column comes first
    # (presumably the product id -- TODO confirm against the input schema).
    return encoded.iloc[:, 1:]


purchase = _with_catalog_dummies(purchase)

pageview = _with_catalog_dummies(pageview)

target_purchase = _with_catalog_dummies(target_purchase)

target_pageview = _with_catalog_dummies(target_pageview)

CPU times: user 1.36 s, sys: 288 ms, total: 1.65 s
Wall time: 1.65 s


## Merge users with same uid (sum entries) and join DFs

In [132]:
# Number of target purchase rows before they are aggregated per user.
len(target_purchase['uid'])

12687

In [133]:
%%time
# Labelled data: sum every column per (uid, gender), then attach the page-view
# totals to the purchase totals. Overlapping page-view columns get a '_v' suffix.
user_keys = ['uid', 'gender']
purchase = purchase.groupby(user_keys, as_index=False).sum()
pageview = pageview.groupby(user_keys, as_index=False).sum()
pageview_by_user = pageview.set_index(user_keys)
data = purchase.join(pageview_by_user, on=user_keys, rsuffix='_v')


# Target data: same aggregation, keyed on uid only (no gender column there).
target_purchase = target_purchase.groupby(['uid'], as_index=False).sum()
target_pageview = target_pageview.groupby(['uid'], as_index=False).sum()
target_pageview_by_user = target_pageview.set_index('uid')
target_data = target_purchase.join(target_pageview_by_user, on='uid', rsuffix='_v')

CPU times: user 2.54 s, sys: 1.79 s, total: 4.34 s
Wall time: 1min 17s


In [134]:
# Rows without a match in the join are left with NaN in the joined columns;
# replace every NaN with 0.
data = data.fillna(0)
target_data = target_data.fillna(0)

In [135]:
# Label column: the second column of `data`
# (expected to be 'gender', the second groupby key -- TODO confirm).
target = data.columns[1]
# Candidate columns of the target frame, skipping its first two columns.
features2 = target_data.columns[2:]

# Keep the columns present in both frames, in the order they appear in `data`.
# A plain `list(set(a) & set(b))` yields an arbitrary order that can change
# between interpreter runs (string hash randomization), which silently changes
# the column order fed to the model and makes results irreproducible.
shared_columns = set(features2)
seen_columns = set()
features = []
for column in data.columns[3:]:  # skips the first three columns of `data`
    if column in shared_columns and column not in seen_columns:
        seen_columns.add(column)
        features.append(column)

In [136]:
# Feature matrix and label vector for the labelled users.
X, Y = data[features], data[target]
# Hold out 40% of the users for evaluation; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.4,
    random_state=42,
)

In [137]:
%%time
# Fit a 100-tree random forest on the training split and measure accuracy on
# the held-out split. random_state is fixed so the score and the predictions
# made later from `forest` are reproducible across kernel restarts; without it
# every run builds a different forest. A score saved from an unseeded run can
# therefore differ slightly from what this cell now produces.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(x_train, y_train)
y_pred = forest.predict(x_test)
score = metrics.accuracy_score(y_test, y_pred)

CPU times: user 3.51 s, sys: 284 ms, total: 3.8 s
Wall time: 17 s


In [138]:
score

0.78917975567190224

In [140]:
# Predict a gender flag for every target user and export (uid, gender) rows.
X = target_data[features]
answer = forest.predict(X)
users = target_data.uid.values

# One record per user: 'a' is the uid, 'b' is 'M' for a truthy prediction and
# 'F' otherwise.
ans = [
    {'a': uid, 'b': 'M' if predicted else 'F'}
    for uid, predicted in zip(users, answer)
]

# Write the rows without a header line. DataFrame.to_csv is used instead of
# csv.DictWriter on a file opened with 'wb': on Python 3 the csv module writes
# str, so a binary-mode file raises TypeError on the first row.
# NOTE: line endings follow the pandas default rather than the csv module's
# '\r\n'.
pd.DataFrame(ans, columns=['a', 'b']).to_csv('ans6.csv', header=False, index=False)

In [141]:
# Export the target user ids only, one per line, in target_data order.
# X and answer are recomputed so this cell still defines the same names when
# run on its own; the predictions are not part of this file.
X = target_data[features]
answer = forest.predict(X)
users = target_data.uid.values

# One record per user holding just the uid (the gender label that used to be
# computed here was never written, so it is no longer built).
ans = [{'a': uid} for uid in users]

# Write the rows without a header line. DataFrame.to_csv is used instead of
# csv.DictWriter on a file opened with 'wb': on Python 3 the csv module writes
# str, so a binary-mode file raises TypeError on the first row.
# NOTE: line endings follow the pandas default rather than the csv module's
# '\r\n'.
pd.DataFrame(ans, columns=['a']).to_csv('usu.csv', header=False, index=False)

In [None]:
# Preview the first five rows of the per-user training frame.
data.head(5)