In [1]:
import pandas as pd
import numpy as np
import boto3
from keys import *

In [2]:
from datetime import datetime

# Load data from S3 bucket 

In [3]:
s3 = boto3.resource('s3')
bucket = s3.Bucket("msds-630-finalproject")

client = boto3.client("s3", aws_access_key_id=access_key, aws_secret_access_key=secret_key)

obj = client.get_object(Bucket="msds-630-finalproject", Key="sessions.csv")
sessions = pd.read_csv(obj["Body"])

In [4]:
sessions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6239836 entries, 0 to 6239835
Data columns (total 22 columns):
app_id                        int64
session_id                    int64
start_timestamp               int64
timezone                      object
timezone_offset               float64
previous_sessions_duration    int64
user_created_timestamp        int64
is_user_first_session         bool
is_session                    bool
is_developer                  bool
is_wau                        bool
is_mau                        bool
country                       object
region                        object
city                          object
latitude                      float64
longitude                     float64
locale                        object
os_name                       object
session_index                 int64
device_id                     object
user_id_hash                  object
dtypes: bool(5), float64(3), int64(6), object(8)
memory usage: 839.1+ MB


# Select wanted columns

In [5]:
sessions_selected = sessions[['user_id_hash','start_timestamp','previous_sessions_duration','country','device_id']]

# Split data based on date to separate training, validation and test set

In [6]:
sessions_selected['start_timestamp'] = pd.to_datetime(sessions_selected['start_timestamp'], unit='ms')

dec1_cutoff = datetime(2018, 12, 1, 0, 0, 0)
dec14_cutoff = datetime(2018, 12, 14, 0, 0, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [7]:
sessions_selected.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6239836 entries, 0 to 6239835
Data columns (total 5 columns):
user_id_hash                  object
start_timestamp               datetime64[ns]
previous_sessions_duration    int64
country                       object
device_id                     object
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 238.0+ MB


In [8]:
df_train = sessions_selected[sessions_selected['start_timestamp'] < dec1_cutoff]
df_test = sessions_selected

## Get sessions count, average session duration, country count, device count

In [9]:
df_train = df_train.groupby('user_id_hash').agg({'user_id_hash': 'count', 'previous_sessions_duration': 'mean',
                                             'country': 'nunique', 'device_id': 'nunique'})
df_train = df_train.rename(index=str, columns={"user_id_hash": "session_count",
                                               "country": "country_count", "device_id": "device_count"})
df_train['user_id_hash'] = df_train.index

In [10]:
df_test = df_test.groupby('user_id_hash').agg({'user_id_hash': 'count', 'previous_sessions_duration': 'mean',
                                             'country': 'nunique', 'device_id': 'nunique'})
df_test = df_test.rename(index=str, columns={"user_id_hash": "session_count",
                                               "country": "country_count", "device_id": "device_count"})
df_test['user_id_hash'] = df_test.index

# Load the labels by user_id_hash, that have done in other script

In [11]:
labels = pd.read_csv("features_train.csv", index_col = 0)

# Training, Validation, Testing data have been splited in other script

In [12]:
train = pd.read_csv('train_id.csv')
val = pd.read_csv('val_id.csv')
#test = pd.read_csv('test_id.csv')

Join all tables based on user_id_hash

In [13]:
train = train.rename(index=str, columns={'0':'user_id_hash'})
val = val.rename(index=str, columns={'0':'user_id_hash'})
#test = test.rename(index=str, columns={'0':'user_id_hash'})

In [14]:
train = pd.merge(train, labels, on='user_id_hash')
train = pd.merge(train, df_train, on='user_id_hash')

Defaulting to column, but this will raise an ambiguity error in a future version
  exec(code_obj, self.user_global_ns, self.user_ns)


In [15]:
val = pd.merge(val, labels, on='user_id_hash')
val = pd.merge(val, df_train, on='user_id_hash')

Defaulting to column, but this will raise an ambiguity error in a future version
  exec(code_obj, self.user_global_ns, self.user_ns)


In [16]:
#test = pd.merge(test, labels, on='user_id_hash')
#test = pd.merge(test, df_test, on='user_id_hash')

Defaulting to column, but this will raise an ambiguity error in a future version
  exec(code_obj, self.user_global_ns, self.user_ns)


In [17]:
val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37172 entries, 0 to 37171
Data columns (total 9 columns):
user_id_hash                    37172 non-null object
user_purchase_binary_7_days     37172 non-null float64
user_purchase_binary_14_days    37172 non-null float64
num_purchase                    37172 non-null float64
value_purchase                  37172 non-null float64
session_count                   37172 non-null int64
previous_sessions_duration      37172 non-null float64
country_count                   37172 non-null int64
device_count                    37172 non-null int64
dtypes: float64(5), int64(3), object(1)
memory usage: 2.8+ MB


# Our first model is using Logistic Regression

In [22]:
train_X = train[['session_count','previous_sessions_duration','country_count','device_count']]
train_y7 = train['user_purchase_binary_7_days']
train_y14 = train['user_purchase_binary_14_days']

In [23]:
val_X = val[['session_count','previous_sessions_duration','country_count','device_count']]
val_y7 = val['user_purchase_binary_7_days']
val_y14 = val['user_purchase_binary_14_days']

In [47]:
test_X = df_test[['session_count','previous_sessions_duration','country_count','device_count']]

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [26]:
log_model_week = LogisticRegression()
log_model_week.fit(train_X, train_y7)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [27]:
accuracy_score(log_model_week.predict(train_X),train_y7)

0.9898478174611127

In [28]:
accuracy_score(log_model_week.predict(val_X),val_y7)

0.9318573119552351

In [48]:
model1_pred1week = log_model_week.predict_proba(test_X)[:,1]

In [30]:
log_model_2week = LogisticRegression()
log_model_2week.fit(train_X, train_y14)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [31]:
accuracy_score(log_model_2week.predict(train_X),train_y14)

0.9866895388031407

In [32]:
accuracy_score(log_model_2week.predict(val_X),val_y14)

0.9111965995910901

In [49]:
model1_pred2week = log_model_2week.predict_proba(test_X)[:,1]

In [58]:
result_model1 = pd.DataFrame({'user_id_hash': df_test['user_id_hash'],'user_purchase_binary_7_days': model1_pred1week,
                              'user_purchase_binary_14_days': model1_pred2week})

In [59]:
result_model1 = result_model1.sort_values(by=['user_purchase_binary_7_days', 'user_purchase_binary_14_days'])

In [60]:
len(result_model1)

621106

In [61]:
result_model1.to_csv('result_model1.csv',index=False)