# Shopee Marketing Analytics
Past Kaggle competitions have shown that tabular data requires a lot of data analysis and feature extraction to produce good results. The winning solutions are usually ensembles of gradient boosting techniques.

In [1]:
import math
import os
import time

import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, f1_score, recall_score, precision_score, SCORERS

from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

# Show full cell contents and up to 999 rows when a frame is displayed.
pd.options.display.max_colwidth = None
pd.set_option('display.max_rows', 999)

# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

/kaggle/input/student-shopee-code-league-marketing-analytics/users.csv
/kaggle/input/student-shopee-code-league-marketing-analytics/test.csv
/kaggle/input/student-shopee-code-league-marketing-analytics/sample_submission_0_1.csv
/kaggle/input/student-shopee-code-league-marketing-analytics/train.csv


In [2]:
# Load the three competition tables (train, test, users) in that order.
df_train, df_test, df_users = (
    pd.read_csv(f"../input/student-shopee-code-league-marketing-analytics/{table}.csv")
    for table in ("train", "test", "users")
)

In [3]:
# Replace every missing value in all three tables with the sentinel -1.
df_train, df_users, df_test = (
    frame.fillna(-1) for frame in (df_train, df_users, df_test)
)

In [4]:
# Lookup table: user_id -> (attr_1, attr_2, attr_3, age, domain).
user_dict = {
    user.user_id: (user.attr_1, user.attr_2, user.attr_3, user.age, user.domain)
    for user in df_users.itertuples()
}

In [5]:
def get_user_feature(user_id,i):
    """Return field ``i`` of the ``user_dict`` entry for ``user_id``.

    Returns the sentinel -2 when ``user_id`` has no entry in ``user_dict``.
    """
    try:
        profile = user_dict[user_id]
    except KeyError:
        return -2
    return profile[i]

In [6]:
def fill_ints(data):
    """Return ``data`` unchanged when it is numeric, otherwise the sentinel -1.

    Args:
        data: A single cell value. May be an ``int``, a string, a float, a
            NumPy scalar or any other object.

    Returns:
        ``data`` itself for integers, for strings where ``str.isnumeric()`` is
        true, and for other real numbers that are not NaN. ``-1`` for every
        other value (non-numeric strings, NaN, ``None``, arbitrary objects).
    """
    if isinstance(data, int):
        return data
    if isinstance(data, str):
        return data if data.isnumeric() else -1
    # Anything else used to crash on ``data.isnumeric()`` (floats, NumPy
    # integers, None). ``math.isnan`` accepts every real-number type and
    # raises TypeError for objects that are not numbers at all.
    try:
        missing = math.isnan(data)
    except TypeError:
        return -1
    return -1 if missing else data

In [7]:
def time_to_categorical_series(df,type="hour"):
    """Extract one calendar component of ``df['date_time']`` as a categorical.

    ``type`` selects the component: ``"hour"``, ``"dayofweek"`` or ``"month"``.
    Any other value yields ``None``.
    """
    if type not in ("hour", "dayofweek", "month"):
        return None
    # The accepted names match the attributes of the ``.dt`` accessor.
    component = getattr(df['date_time'].dt, type)
    return component.astype('category')
    
def time_to_categorical(df):
    """Add categorical ``hour``, ``dayofweek`` and ``month`` columns to ``df`` in place.

    Each column is derived from ``df['date_time']`` via
    ``time_to_categorical_series``.
    """
    parts = ('hour', 'dayofweek', 'month')
    # Compute every series before writing any column, then assign in order.
    extracted = [time_to_categorical_series(df, type=part) for part in parts]
    for part, series in zip(parts, extracted):
        df[part] = series

In [8]:
# Columns that are one-hot encoded.
cat_features = ['country_code', 'hour', 'dayofweek', 'month', 'domain']

# Columns used as plain numbers; the order fixes the column order of the
# numerical part of the feature matrix.
numerical_features = [
    'subject_line_length',
    *(f'last_{event}_day' for event in ('open', 'login', 'checkout')),
    *(
        f'{event}_count_last_{days}_days'
        for event in ('open', 'login', 'checkout')
        for days in (10, 30, 60)
    ),
    'attr1',
    'attr2',
    'attr3',
    'age',
]

In [9]:
def make_df_features(df,train=None,encoder=None):
    """Build the model feature matrix from a raw train/test frame.

    ``df`` is modified in place: the columns ``attr1``, ``attr2``, ``attr3``,
    ``age``, ``domain``, ``date_time``, ``hour``, ``dayofweek`` and ``month``
    are added, and the three ``last_*_day`` columns are cleaned with
    ``fill_ints``.

    Args:
        df: Frame containing ``user_id``, ``grass_date`` and the remaining
            columns named in ``cat_features`` / ``numerical_features``.
        train: Truthy to fit a new ``OneHotEncoder`` on ``df``; falsy to
            reuse ``encoder``.
        encoder: Previously fitted encoder. Required when ``train`` is falsy;
            ignored (and replaced) when ``train`` is truthy.

    Returns:
        ``(features, encoder)``: a float64 array whose columns are the one-hot
        encoded ``cat_features`` followed by ``numerical_features``, and the
        encoder that produced the one-hot part.

    Raises:
        ValueError: If ``train`` is falsy and ``encoder`` is ``None``.
    """
    # Fail before touching ``df`` instead of crashing on ``None.transform``.
    if not train and encoder is None:
        raise ValueError("encoder must be provided when train is falsy")

    # Position of each user attribute inside the tuples of ``user_dict``.
    user_columns = ('attr1', 'attr2', 'attr3', 'age', 'domain')
    for position, column in enumerate(user_columns):
        df[column] = df['user_id'].apply(
            lambda uid, position=position: get_user_feature(uid, position)
        )

    df['date_time'] = pd.to_datetime(df['grass_date'])
    for column in ('last_open_day', 'last_login_day', 'last_checkout_day'):
        df[column] = df[column].apply(fill_ints)
    time_to_categorical(df)

    cat = df.loc[:,cat_features].values
    if train:
        # ``sparse`` was renamed to ``sparse_output`` in scikit-learn 1.2 and
        # removed in 1.4; older releases reject the new name with TypeError.
        try:
            encoder = OneHotEncoder(handle_unknown='ignore',sparse_output=False)
        except TypeError:
            encoder = OneHotEncoder(handle_unknown='ignore',sparse=False)
        cat = encoder.fit_transform(cat).astype(np.float64)
    else:
        cat = encoder.transform(cat).astype(np.float64)
    val = df.loc[:,numerical_features].values.astype(np.float64)
    return np.concatenate([cat,val],axis=1),encoder

In [10]:
# Fit the encoder on the training frame and keep it for the test frame.
train_features, encoder = make_df_features(df_train, train=True)
train_labels = df_train['open_flag'].values

# Encode the test frame with the encoder fitted above; only the matrix is kept.
test_features = make_df_features(df_test, train=False, encoder=encoder)[0]

In [11]:
# Train a LightGBM classifier (fixed seed, otherwise default parameters) and
# predict a label for every test row.
clf = LGBMClassifier(random_state=42).fit(train_features, train_labels)
predictions = clf.predict(test_features)

In [12]:
# Keep only ``row_id``, attach the predictions and write the submission file.
df_test = df_test.drop(columns=df_test.columns.difference(['row_id']))
df_test['open_flag'] = predictions
df_test.to_csv('sub.csv', index=False)