In [1]:
import pandas as pd
import numpy as np
import boto3
from datetime import datetime
from keys import *

# Load data from S3 bucket

In [2]:
s3 = boto3.resource('s3')
bucket = s3.Bucket("msds-630-finalproject")

client = boto3.client("s3", aws_access_key_id=access_key, aws_secret_access_key=secret_key)

obj = client.get_object(Bucket="msds-630-finalproject", Key="sessions.csv")
sessions = pd.read_csv(obj["Body"])

In [3]:
sessions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6239836 entries, 0 to 6239835
Data columns (total 22 columns):
app_id                        int64
session_id                    int64
start_timestamp               int64
timezone                      object
timezone_offset               float64
previous_sessions_duration    int64
user_created_timestamp        int64
is_user_first_session         bool
is_session                    bool
is_developer                  bool
is_wau                        bool
is_mau                        bool
country                       object
region                        object
city                          object
latitude                      float64
longitude                     float64
locale                        object
os_name                       object
session_index                 int64
device_id                     object
user_id_hash                  object
dtypes: bool(5), float64(3), int64(6), object(8)
memory usage: 839.1+ MB


In [4]:
sessions_selected = sessions[['user_id_hash','start_timestamp','previous_sessions_duration','is_developer',
                              'country','city','device_id']]

## Filter out is_developer

In [5]:
sessions_selected = sessions_selected[sessions_selected['is_developer']==False]

In [6]:
sessions_selected.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6237248 entries, 0 to 6239835
Data columns (total 7 columns):
user_id_hash                  object
start_timestamp               int64
previous_sessions_duration    int64
is_developer                  bool
country                       object
city                          object
device_id                     object
dtypes: bool(1), int64(2), object(4)
memory usage: 339.1+ MB


## Filter out December data

In [7]:
sessions_selected['start_timestamp'] = pd.to_datetime(sessions_selected['start_timestamp'], unit='ms')

In [8]:
dec1_cutoff = datetime(2018, 12, 1, 0, 0, 0)

In [9]:
df_train = sessions_selected[sessions_selected['start_timestamp'] < dec1_cutoff]

## Only select the following features from sessions

In [10]:
df_train = df_train[['user_id_hash','start_timestamp','previous_sessions_duration',
                              'country','city','device_id']]

In [11]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5447266 entries, 0 to 6239835
Data columns (total 6 columns):
user_id_hash                  object
start_timestamp               datetime64[ns]
previous_sessions_duration    int64
country                       object
city                          object
device_id                     object
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 290.9+ MB


Obtain the list of country and city each session at, if there are more than 1 we will only the first one.

In [13]:
df_train_countries = df_train.groupby('user_id_hash')['country'].apply(set)
df_train_city = df_train.groupby('user_id_hash')['city'].apply(set)

Get the number of seasons and mean of sessions durations, and number of unique device per user.

In [14]:
df_train_main = df_train.groupby('user_id_hash').agg({'user_id_hash': 'count', 'previous_sessions_duration': 'mean',
                                      'device_id': 'nunique'})

In [15]:
df_train_main['user_id_hash'] = df_train_main.index

In [16]:
df_train_countries['user_id_hash'] = df_train_countries.index
df_train_city['user_id_hash'] = df_train_city.index

Merge all those togehter

In [17]:
all_X = pd.merge(pd.DataFrame(df_train_main), pd.DataFrame(df_train_countries), on='user_id_hash')
all_X = pd.merge(all_X, pd.DataFrame(df_train_city), on='user_id_hash')

Defaulting to column, but this will raise an ambiguity error in a future version
  exec(code_obj, self.user_global_ns, self.user_ns)


Get the label that we are predicting

In [18]:
labels = pd.read_csv("features_train.csv", index_col = 0)
labels.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days,num_purchase,value_purchase
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.0,0.0,0.0,0.0
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.0,0.0,0.0,0.0
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.0,0.0,0.0,0.0
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.0,0.0,0.0,0.0
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.0,0.0,0.0,0.0


In [19]:
labels = labels[['user_id_hash','user_purchase_binary_7_days','user_purchase_binary_14_days']]
labels.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.0,0.0
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.0,0.0
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.0,0.0
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.0,0.0
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.0,0.0


In [20]:
labels.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312568 entries, 0 to 312567
Data columns (total 3 columns):
user_id_hash                    312568 non-null object
user_purchase_binary_7_days     312568 non-null float64
user_purchase_binary_14_days    312568 non-null float64
dtypes: float64(2), object(1)
memory usage: 9.5+ MB


Fill NAs

In [21]:
all_X = pd.merge(labels, all_X, how='left', on='user_id_hash')

In [22]:
all_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312568 entries, 0 to 312567
Data columns (total 7 columns):
user_id_hash                    312568 non-null object
user_purchase_binary_7_days     312568 non-null float64
user_purchase_binary_14_days    312568 non-null float64
previous_sessions_duration      311863 non-null float64
device_id                       311863 non-null float64
country                         311863 non-null object
city                            311863 non-null object
dtypes: float64(4), object(3)
memory usage: 19.1+ MB


In [23]:
all_X['country'] = all_X['country'].fillna('None')
all_X['city'] = all_X['city'].fillna('None')

Only select one country or city.

In [24]:
def convert(x):
    if type(x) is set:
        return list(x)[0]
    return x
all_X['country'] = all_X['country'].apply(lambda x: convert(x))
all_X['city'] = all_X['city'].apply(lambda x: convert(x))

In [25]:
all_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312568 entries, 0 to 312567
Data columns (total 7 columns):
user_id_hash                    312568 non-null object
user_purchase_binary_7_days     312568 non-null float64
user_purchase_binary_14_days    312568 non-null float64
previous_sessions_duration      311863 non-null float64
device_id                       311863 non-null float64
country                         312121 non-null object
city                            312064 non-null object
dtypes: float64(4), object(3)
memory usage: 19.1+ MB


In [26]:
train = pd.read_csv('train_id.csv')
val = pd.read_csv('val_id.csv')
train = train.rename(index=str, columns={'0':'user_id_hash'})
val = val.rename(index=str, columns={'0':'user_id_hash'})

In [59]:
train_X = pd.merge(train, all_X, how='left', on='user_id_hash')
val_X = pd.merge(val, all_X, how='left', on='user_id_hash')

In [60]:
train_y7 = pd.DataFrame(train_X['user_purchase_binary_7_days'])
train_y14 = pd.DataFrame(train_X['user_purchase_binary_14_days'])
val_y7 = pd.DataFrame(val_X['user_purchase_binary_7_days'])
val_y14 = pd.DataFrame(val_X['user_purchase_binary_14_days'])

In [61]:
test_X = all_X[['previous_sessions_duration','device_id','country','city']]

In [62]:
train_X['country'] = train_X['country'].fillna('None')
train_X['city'] = train_X['city'].fillna('None')
val_X['country'] = val_X['country'].fillna('None')
val_X['city'] = val_X['city'].fillna('None')
test_X['country'] = test_X['country'].fillna('None')
test_X['city'] = test_X['city'].fillna('None')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [63]:
train_X = train_X[['previous_sessions_duration','device_id','country','city']]
val_X = val_X[['previous_sessions_duration','device_id','country','city']]

In [64]:
train_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496356 entries, 0 to 496355
Data columns (total 4 columns):
previous_sessions_duration    249380 non-null float64
device_id                     249380 non-null float64
country                       496356 non-null object
city                          496356 non-null object
dtypes: float64(2), object(2)
memory usage: 18.9+ MB


In [65]:
train_y7.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496356 entries, 0 to 496355
Data columns (total 1 columns):
user_purchase_binary_7_days    249503 non-null float64
dtypes: float64(1)
memory usage: 7.6 MB


In [66]:
psd_mean = pd.DataFrame.mean(train_X['previous_sessions_duration'])
device_mean = pd.DataFrame.mean(train_X['device_id'])
train_X['previous_sessions_duration'] = train_X['previous_sessions_duration'].fillna(psd_mean)
train_X['device_id'] = train_X['device_id'].fillna(device_mean)

In [67]:
psd_mean = pd.DataFrame.mean(val_X['previous_sessions_duration'])
device_mean = pd.DataFrame.mean(val_X['device_id'])
val_X['previous_sessions_duration'] = val_X['previous_sessions_duration'].fillna(psd_mean)
val_X['device_id'] = val_X['device_id'].fillna(device_mean)

In [68]:
psd_mean = pd.DataFrame.mean(train_X['previous_sessions_duration'])
device_mean = pd.DataFrame.mean(train_X['device_id'])
test_X['previous_sessions_duration'] = test_X['previous_sessions_duration'].fillna(psd_mean)
test_X['device_id'] = test_X['device_id'].fillna(device_mean)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Encode the strings in the dataset

In [69]:
from sklearn import preprocessing

In [70]:
le_country = preprocessing.LabelEncoder()
le_country.fit(all_X['country'].astype(str))
train_X['country'] = le_country.transform(train_X['country'])
val_X['country'] = le_country.transform(val_X['country'])
test_X['country'] = le_country.transform(test_X['country'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [75]:
le_city = preprocessing.LabelEncoder()
le_city.fit(all_X['city'].astype(str))
train_X['city'] = le_city.transform(train_X['city'])
val_X['city'] = le_city.transform(val_X['city'])
test_X['city'] = le_city.transform(test_X['city'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [100]:
train_y7 = train_y7.fillna(0)
train_y14 = train_y14.fillna(0)
val_y7 = val_y7.fillna(0)
val_y14 = val_y14.fillna(0)

In [88]:
train_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496356 entries, 0 to 496355
Data columns (total 4 columns):
previous_sessions_duration    496356 non-null float64
device_id                     496356 non-null float64
country                       496356 non-null int64
city                          496356 non-null int64
dtypes: float64(2), int64(2)
memory usage: 18.9 MB


In [89]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [90]:
rf_7 = RandomForestClassifier(max_depth=30, n_estimators=100)
rf_14 = RandomForestClassifier(max_depth=30, n_estimators=100)

First predict the 1st week

In [91]:
rf_7.fit(train_X, train_y7)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=30, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [92]:
accuracy_score(rf_7.predict(train_X),train_y7)

0.9991941267960899

In [93]:
accuracy_score(rf_7.predict(val_X),val_y7)

0.9944049683880714

In [103]:
rf_model_week_pred = rf_7.predict(test_X)

Then predict the 14 days period

In [97]:
rf_14.fit(train_X, train_y14)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=30, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [98]:
accuracy_score(rf_14.predict(train_X),train_y14)

0.9989966878611319

In [101]:
accuracy_score(rf_14.predict(val_X),val_y14)

0.9931181111173278

In [104]:
rf_model_2week_pred = rf_14.predict(test_X)

In [108]:
result_model_rf = pd.DataFrame({'user_id_hash': all_X['user_id_hash'],'user_purchase_binary_7_days': rf_model_week_pred,
                              'user_purchase_binary_14_days': rf_model_2week_pred})

In [109]:
result_model_rf

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.0,0.0
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.0,0.0
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.0,0.0
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.0,0.0
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.0,0.0
5,7f7e3c5da13a555b2c2047c2c94b45d289ffa8083908cb...,0.0,0.0
6,28b5edd0510e8517462bae9d67a99f272d8ccc8ba2f79d...,0.0,0.0
7,b9433b9c4d559218e64469b6807cb516a5866572c1c30b...,0.0,0.0
8,607da741597a3b15fecf9a30c76e2e418b629c975e069a...,0.0,0.0
9,455ff7334b87646d3ecc05d2e0ef5a8d99dc8fa6573e54...,0.0,0.0


In [110]:
result_model1 = result_model1.sort_values(by=['user_purchase_binary_7_days', 'user_purchase_binary_14_days'])

Save as csv to submit

In [111]:
result_model1.to_csv('result_rf.csv',index=False)