In [1]:
import pandas as pd
import numpy as np
import json
import gzip
import seaborn as sns
from pandas.io.json import json_normalize
%matplotlib inline

# Data Cleaning

## Open Catalog dataset

In [2]:
# Load the product catalog (pid, prices and the three category levels) from the
# compressed CSV; pandas infers the compression from the '.gz' extension by default.
catalog = pd.read_csv('data/catalog.gz')

In [3]:
# Preview the first rows of the catalog.
catalog.head()

Unnamed: 0,pid,current_price,original_price,category,sub_category,sub_sub_category
0,c9fe04e5097c087b6b6eeccc3adc4a142e14aa46,699.0,,c1bd5fd999bd577743936898ada96496b547af3c,f08770a96fb546673053ab799f5ea9cada06c06a,2d2c44a2d8f18a6271f0e8057313af68a46d0f24
1,e00988f42f7fc5f14ec6a0e7905789110f23c5de,150.0,,37b2661cc181c987f68770e43990152026920ba5,11e623a37e87cf7995c466723ec99688d55cae8c,11e623a37e87cf7995c466723ec99688d55cae8c
2,1e2e7c0d4082295728c1684a702cb43e1d332d3e,549.9,1049.9,c37df0ae71b97699a478def3001a3516a905a51d,f08770a96fb546673053ab799f5ea9cada06c06a,d6605426aaa703bf19874c0caac79a661b73de33
3,a279dd2284eb57533ca417c258ede0a0526a6f6e,0.0,0.0,c1bd5fd999bd577743936898ada96496b547af3c,6d9d48ae11ee1909235f31e9bfe5d36aa1462cb3,9a2e3cb56f6a1756fd35a4fd70172a67ecf13639
4,405b0362ae6ec149700164811b0e7773c8300e9d,79.9,0.0,c37df0ae71b97699a478def3001a3516a905a51d,78af0aac89e0f15a6c9fc70b5bff79d98c6dcc43,2541a58f702844477aab540e5df7b859a1e3d5de


## Open a dataset with one JSON object per line (may contain NaN values)

In [4]:
%%time
with gzip.open('data/test.gz', 'r') as f:
    json_records = []
    for line in f:
        record = json.loads(line)
        json_records.append(record)
data = pd.DataFrame(json_records)

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 2.4 ms


In [5]:
# Preview the first rows of the parsed event data.
data.head()

Unnamed: 0,date,event_type,page_type,productId,products,source,timestamp,uid
0,,pageview,cart,,,,2016-01-01,54cfe1703c536e98bf02c7c4cd5b4e280f07d74c
1,,pageview,checkout,,,,2016-01-02,c808d5ecb73bee00a9730064b7652bec10778f20
2,,pageview,checkout,,,,2016-01-02,7ba0cbaaf2000d440b474ff9182383c5525d0d56
3,,pageview,confirmation,,,,2016-01-02,3b2790a1f746e59298c8c277e3ec2279343ce52d
4,,pageview,checkout,,,,2016-01-02,3b2790a1f746e59298c8c277e3ec2279343ce52d


## Open only the purchase events from the dataset

In [6]:
%%time
with open('data/purchase_data', 'r') as f:
    json_records = []
    for line in f:
        record = json.loads(line)
        json_records.append(record)
purchase = pd.DataFrame(json_records)

CPU times: user 532 ms, sys: 60 ms, total: 592 ms
Wall time: 666 ms


In [7]:
# Preview the raw purchase events ('products' still holds a list per event).
purchase.head()

Unnamed: 0,date,event_type,gender,products,source,uid
0,2017-03-27 14:19:00,purchase,M,[{u'pid': u'db3eae1855619bd4462848bb7b473b2193...,desktop,1f9acd4f729c04e18e5d72bebca5e5524e67687e
1,2016-05-26 11:21:40,purchase,M,[{u'pid': u'418e19155782fc7f12ced8332c8ed025ae...,,07be0ebd1e6f2412e4a9a3f679ded05796a1e225
2,2016-03-05 06:52:27,purchase,M,[{u'pid': u'4b17c3aa28d53208c9fd147057b40fdd81...,,3cf9949b8f90daadd8f67ec46214e64d6b9cfa5b
3,2016-11-07 11:36:00,purchase,M,[{u'pid': u'acd9858f0d4d4c61ee783080a1a305b4f0...,desktop,7f1c45f10de66a7023bfa9f9ed053c5b3ae03f56
4,2016-08-28 20:48:58,purchase,M,[{u'pid': u'fca1dc5059edb8a8ee2fa424ce7a01958b...,,79ddd2b0c65947a8a11249c2fa71499055507e11


## Unnest products

In [8]:
%%time
with open('data/purchase_data', 'r') as f:
    json_records = []
    for line in f:
        record = json.loads(line)
        products = record.pop('products')
        record.pop('source')
        record.pop('event_type')
        for obj in products:
            new_record = record.copy()
            new_record.update(obj)
            json_records.append(new_record)
purchase = pd.DataFrame(json_records)

CPU times: user 580 ms, sys: 16 ms, total: 596 ms
Wall time: 595 ms


In [9]:
# Preview the unnested purchases (one row per purchased product).
purchase.head()

Unnamed: 0,date,gender,pid,quantity,uid
0,2017-03-27 14:19:00,M,db3eae1855619bd4462848bb7b473b219345ee87,1.0,1f9acd4f729c04e18e5d72bebca5e5524e67687e
1,2017-03-27 14:19:00,M,3aed9b0313f9226111de8aeabaedccf8db07d428,1.0,1f9acd4f729c04e18e5d72bebca5e5524e67687e
2,2016-05-26 11:21:40,M,418e19155782fc7f12ced8332c8ed025aec227a5,1.0,07be0ebd1e6f2412e4a9a3f679ded05796a1e225
3,2016-03-05 06:52:27,M,4b17c3aa28d53208c9fd147057b40fdd8108f790,1.0,3cf9949b8f90daadd8f67ec46214e64d6b9cfa5b
4,2016-03-05 06:52:27,M,31771f432cbc8e7ee3140f70b1e866fb5a2d739a,1.0,3cf9949b8f90daadd8f67ec46214e64d6b9cfa5b


### Using json_normalize

In [10]:
%%time
with open('data/purchase_data', 'r') as f:
    json_records = []
    for line in f:
        record = json.loads(line)
        json_records.append(record)
purchase = json_normalize(json_records, 'products', ['uid','date','gender'])

CPU times: user 776 ms, sys: 24 ms, total: 800 ms
Wall time: 800 ms


In [11]:
# Preview the frame produced by json_normalize.
purchase.head()

Unnamed: 0,pid,quantity,date,gender,uid
0,db3eae1855619bd4462848bb7b473b219345ee87,1.0,2017-03-27 14:19:00,M,1f9acd4f729c04e18e5d72bebca5e5524e67687e
1,3aed9b0313f9226111de8aeabaedccf8db07d428,1.0,2017-03-27 14:19:00,M,1f9acd4f729c04e18e5d72bebca5e5524e67687e
2,418e19155782fc7f12ced8332c8ed025aec227a5,1.0,2016-05-26 11:21:40,M,07be0ebd1e6f2412e4a9a3f679ded05796a1e225
3,4b17c3aa28d53208c9fd147057b40fdd8108f790,1.0,2016-03-05 06:52:27,M,3cf9949b8f90daadd8f67ec46214e64d6b9cfa5b
4,31771f432cbc8e7ee3140f70b1e866fb5a2d739a,1.0,2016-03-05 06:52:27,M,3cf9949b8f90daadd8f67ec46214e64d6b9cfa5b


In [12]:
# Attach the catalog columns (prices and categories) to each purchase row by
# looking up its 'pid' in the catalog; DataFrame.join defaults to a left join,
# so every purchase row is kept.
# NOTE(review): `purchase` is reassigned from itself, so re-running this cell
# would operate on the already-joined frame — confirm before executing it twice.
purchase = purchase.join(catalog.set_index('pid'), on='pid')

In [13]:
# Summarise column dtypes and non-null counts after the join.
purchase.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45628 entries, 0 to 45627
Data columns (total 10 columns):
pid                 45628 non-null object
quantity            45628 non-null float64
date                45628 non-null object
gender              45628 non-null object
uid                 45628 non-null object
current_price       45622 non-null float64
original_price      36256 non-null float64
category            45628 non-null object
sub_category        45628 non-null object
sub_sub_category    45628 non-null object
dtypes: float64(3), object(7)
memory usage: 3.5+ MB


In [14]:
# One-hot encode the categorical columns: get_dummies replaces each listed
# column with one indicator column per distinct value (prefixed by the column name).
categorical = ['category', 'sub_category', 'sub_sub_category','gender']
new_purchase = pd.get_dummies(purchase, columns=categorical)
new_purchase.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45628 entries, 0 to 45627
Columns: 109 entries, pid to gender_M
dtypes: float64(3), object(3), uint8(103)
memory usage: 6.6+ MB


In [15]:
# Preview the one-hot encoded frame.
new_purchase.head()

Unnamed: 0,pid,quantity,date,uid,current_price,original_price,category_11e623a37e87cf7995c466723ec99688d55cae8c,category_37b2661cc181c987f68770e43990152026920ba5,category_55599ee4735544f9319a030222f617a252d8ebf3,category_57ecb679698676e7a3ac843bef1e3f6844651a79,...,sub_sub_category_eb0619cfb60da489f61001b19331dad5d480f82b,sub_sub_category_ec6c67dfdd9a89bb590c67bdc7865bfdd2a810ee,sub_sub_category_ef76dd0072150112992c6db54e2a7d3d11c302cb,sub_sub_category_f08770a96fb546673053ab799f5ea9cada06c06a,sub_sub_category_f85ea193363a91d5d03995129ec35786cfa77e6f,sub_sub_category_f86a9cce8e46d9cac88fd9306a6515fef89fe3fc,sub_sub_category_f9e79addaa1175027c52d95685b4664fa7430f0f,sub_sub_category_fecdd01f32c4b0a1e0d2b70cc3891dc52f964329,gender_F,gender_M
0,db3eae1855619bd4462848bb7b473b219345ee87,1.0,2017-03-27 14:19:00,1f9acd4f729c04e18e5d72bebca5e5524e67687e,529.9,899.9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,3aed9b0313f9226111de8aeabaedccf8db07d428,1.0,2017-03-27 14:19:00,1f9acd4f729c04e18e5d72bebca5e5524e67687e,469.9,799.9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,418e19155782fc7f12ced8332c8ed025aec227a5,1.0,2016-05-26 11:21:40,07be0ebd1e6f2412e4a9a3f679ded05796a1e225,249.9,429.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4b17c3aa28d53208c9fd147057b40fdd8108f790,1.0,2016-03-05 06:52:27,3cf9949b8f90daadd8f67ec46214e64d6b9cfa5b,640.0,,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,31771f432cbc8e7ee3140f70b1e866fb5a2d739a,1.0,2016-03-05 06:52:27,3cf9949b8f90daadd8f67ec46214e64d6b9cfa5b,129.9,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [16]:
# Drop one indicator column per encoded variable so that each variable keeps a
# reference level. Removing 'gender_F' leaves 'gender_M' as the single gender
# column, which ends up as the last column of the frame.
dummy_cols = ['category_c1bd5fd999bd577743936898ada96496b547af3c',
'sub_category_f08770a96fb546673053ab799f5ea9cada06c06a',
'sub_sub_category_2d2c44a2d8f18a6271f0e8057313af68a46d0f24',
'gender_F']
# Pass the axis by keyword: the positional form `drop(labels, 1)` is deprecated
# and is rejected by pandas 2.x, while `axis=1` works on every pandas version.
purchase2 = new_purchase.drop(dummy_cols, axis=1)
purchase2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45628 entries, 0 to 45627
Columns: 105 entries, pid to gender_M
dtypes: float64(3), object(3), uint8(99)
memory usage: 6.4+ MB


---
# Machine Learning Algorithms

In [40]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import calibration_curve

## Separate train and test data

In [18]:
# Feature columns: everything from position 6 up to (but excluding) the last column.
# Per the frame preview above, positions 0-5 are pid, quantity, date, uid,
# current_price and original_price, so this slice keeps only the category
# indicator columns.
features = purchase2.columns[6:-1]
# Target: the last column, which is 'gender_M' (the info() output above reports
# the columns as running "pid to gender_M").
target = purchase2.columns[-1]

## Workaround: not all features are present in the target dataset

In [19]:
%%time
with open('data/purchase_target', 'r') as f:
    json_records = []
    for line in f:
        record = json.loads(line)
        json_records.append(record)
purchase3 = json_normalize(json_records, 'products', ['uid','date'])
purchase3 = purchase3.join(catalog.set_index('pid'), on='pid')
categorical = ['category', 'sub_category', 'sub_sub_category']
new_purchase2 = pd.get_dummies(purchase3, columns=categorical)
dummy_cols = ['category_c1bd5fd999bd577743936898ada96496b547af3c',
'sub_category_f08770a96fb546673053ab799f5ea9cada06c06a',
'sub_sub_category_2d2c44a2d8f18a6271f0e8057313af68a46d0f24']
purchase4 = new_purchase2.drop(dummy_cols, 1)
features2 = purchase4.columns[6:]

CPU times: user 256 ms, sys: 0 ns, total: 256 ms
Wall time: 256 ms


In [20]:
# Keep only the indicator columns present in BOTH the training and the target
# data. Sorting gives a deterministic column order: iterating a set of strings
# yields an arbitrary order that can change between interpreter runs (hash
# randomization), which would silently reorder the model's input columns.
new_features = sorted(set(features) & set(features2))

In [21]:
# Feature matrix restricted to the columns shared with the target dataset.
X = purchase2[new_features]
# Label vector: the 'gender_M' indicator column selected as `target`.
Y = purchase2[target]

In [22]:
# Hold out 40% of the rows for evaluation; random_state pins the shuffle so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

In [23]:
# Number of feature columns used for training.
len(x_train.columns)

86

## Naive Bayes

In [24]:
%%time
gnb = GaussianNB()
gnb.fit(x_train, y_train)
y_pred = gnb.predict(x_test)
score = metrics.accuracy_score(y_test, y_pred)

CPU times: user 76 ms, sys: 12 ms, total: 88 ms
Wall time: 86.4 ms


In [25]:
# Show the accuracy computed in the previous cell.
score

0.48197457812842426

## KNeighborsClassifier

In [19]:
%%time
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
score = metrics.accuracy_score(y_test, y_pred)

CPU times: user 1min 10s, sys: 60 ms, total: 1min 10s
Wall time: 1min 10s


In [20]:
# Show the accuracy computed in the previous cell.
score

0.71137409598948065

## Decision Tree

In [26]:
%%time
tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)
y_pred = tree.predict(x_test)
score = metrics.accuracy_score(y_test, y_pred)

CPU times: user 176 ms, sys: 0 ns, total: 176 ms
Wall time: 172 ms


In [27]:
# Show the accuracy computed in the previous cell.
score

0.75345167652859957

## Random Forest

In [43]:
%%time
forest = RandomForestClassifier(n_estimators=100)
forest.fit(x_train, y_train)
y_pred = tree.predict(x_test)
score = metrics.accuracy_score(y_test, y_pred)

CPU times: user 6.37 s, sys: 0 ns, total: 6.37 s
Wall time: 6.36 s


In [44]:
# Show the accuracy computed in the previous cell.
score

0.75345167652859957

## Gradient Boosting (scikit-learn GradientBoostingClassifier)

In [35]:
%%time
boost = GradientBoostingClassifier()
boost.fit(x_train, y_train)
y_pred = boost.predict(x_test)
score = metrics.accuracy_score(y_test, y_pred)

CPU times: user 5.45 s, sys: 4 ms, total: 5.46 s
Wall time: 5.45 s


In [36]:
# Show the accuracy computed in the previous cell.
score

0.75076703922857768

## SGDClassifier

In [31]:
# Fit a linear classifier trained with stochastic gradient descent (default
# settings), then measure its accuracy on the held-out split.
clf = linear_model.SGDClassifier().fit(x_train, y_train)
y_pred = clf.predict(x_test)
score = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

In [32]:
# Show the accuracy computed in the previous cell.
score

0.74786324786324787

## Logistic Regression

In [38]:
%%time
clf = linear_model.LogisticRegression(C=1e5)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
score = metrics.accuracy_score(y_test, y_pred)

CPU times: user 452 ms, sys: 8 ms, total: 460 ms
Wall time: 452 ms


In [39]:
# Show the accuracy computed in the previous cell.
score

0.74983563445101908

## SVM

In [49]:
%%time
svc = LinearSVC(C=1.0)
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)
score = metrics.accuracy_score(y_test, y_pred)

CPU times: user 356 ms, sys: 0 ns, total: 356 ms
Wall time: 345 ms


In [50]:
# Show the accuracy computed in the previous cell.
score

0.74983563445101908

---
# Generate output

In [25]:
# Build the feature matrix for the target dataset from the same shared columns
# that were used to train the models.
# NOTE(review): this rebinds `X`, which previously held the training features;
# re-running the train/test split cell after this one would use the wrong frame.
X = purchase4[new_features]
# Predict the 'gender_M' indicator for every target row with the fitted decision tree.
answer = tree.predict(X)

In [27]:
# Number of predictions (one per row of the target frame).
len(answer)

12681

In [32]:
# User id of every target row, in the same row order as `answer`.
users = purchase4.uid.values
len(users)

12681

In [33]:
# Number of distinct users; smaller than the row count because a user can
# appear on several purchased-product rows.
len(set(users))

3209

In [37]:
# Pair every row's user id ('a') with its predicted gender ('b'): a truthy
# prediction of the 'gender_M' indicator maps to 'M', anything else to 'F'.
ans = [
    {'a': u, 'b': 'M' if answer[i] else 'F'}
    for i, u in enumerate(users)
]

In [39]:
import csv

# Write one "uid,gender" line per prediction; no header row is emitted.
# NOTE(review): binary mode ('wb') is the Python 2 convention for the csv
# module — confirm the kernel version before running this under Python 3.
with open('ans.csv', 'wb') as f:
    csv.DictWriter(f, fieldnames=['a','b']).writerows(ans)