# 0.0. IMPORTS

In [1]:
import random
import warnings

import numpy   as np
import pandas  as pd
import seaborn as sns

from sklearn    import model_selection as ms
from sklearn    import preprocessing   as pp
from sklearn    import metrics         as m
from scikitplot import metrics         as mt
from keras      import models          as ml
from keras      import layers          as l

In [2]:
# NOTE: no category is given, so every warning is silenced from this point on.
warnings.filterwarnings("ignore")

## 0.1. Aux Functions

In [3]:
# jupyter design
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    %config InlineBackend.figure_format = 'retina'

    plt.style.use('ggplot')
    plt.rcParams['figure.figsize'] = [16, 8]
    plt.rcParams['font.size'] = 24

#     display(HTML('<style>.container { width:100% !important; }</style>'))
#     pd.options.display.max_columns = None
#     pd.options.display.max_rows = None
#     pd.options.display.float_format = '{:,.4f}'.format
#     pd.set_option('display.expand_frame_repr', False)
#     pd.set_option('display.max_columns', None)
#     pd.set_option('display.max_rows', None)

    sns.set()
    
jupyter_settings()

# imbalanced metrics ( balanced accuracy and kappa score )
def imbalanced_metrics( model_name, y_val, yhat ):
    """Report balanced accuracy and Cohen's kappa for one model.

    Parameters
    ----------
    model_name : label used as the index of the returned row.
    y_val : true labels.
    yhat : predicted labels.

    Returns
    -------
    pandas.DataFrame with a single row indexed by ``model_name`` and two
    string columns, "balanced_accuracy" and "kappa_score", each holding the
    score rounded to 3 decimals.
    """
    scores = {
        "balanced_accuracy": m.balanced_accuracy_score( y_val, yhat ),
        "kappa_score": m.cohen_kappa_score( y_val, yhat ),
    }

    # scores are stored as text, rounded to three decimals
    formatted = { metric: f"{np.round( value, 3 )}" for metric, value in scores.items() }

    return pd.DataFrame( formatted, index=[model_name] )

# cross validation neural networks
def cross_validation(model_name, model, x, y, epochs=100, verbose=0, n_splits=5, random_state=42):
    """Stratified k-fold cross validation for a compiled Keras classifier.

    A fresh, re-initialised copy of ``model`` is trained in every fold.
    Fitting the same instance repeatedly would carry weights from one fold
    into the next, so each validation fold would be scored by a network that
    had already been trained on those very rows.

    Parameters
    ----------
    model_name : label used as the index of the returned row.
    model : compiled Keras model; only its architecture, loss and optimizer
        configuration are reused, and the instance itself is not trained here.
    x : pandas.DataFrame of features (indexed with ``.iloc``).
    y : pandas.Series of class labels (indexed with ``.iloc``).
    epochs : number of training epochs per fold.
    verbose : verbosity passed to ``fit``.
    n_splits : number of stratified folds.
    random_state : seed used to shuffle the folds.

    Returns
    -------
    pandas.DataFrame with a single row indexed by ``model_name`` holding the
    "mean +/- std" of the balanced accuracy and of Cohen's kappa across the
    folds, each rounded to 2 decimals.
    """
    balanced_accuracy_list = []
    kappa_score_list = []

    skf = ms.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for train_index, test_index in skf.split(x, y):

        x_train_cv = x.iloc[train_index]
        y_train_cv = y.iloc[train_index]

        x_test_cv = x.iloc[test_index]
        y_test_cv = y.iloc[test_index]

        # target variable encoding (fitted on the training fold only)
        ohe = pp.OneHotEncoder()
        y_train_cv = ohe.fit_transform( y_train_cv.values.reshape( -1, 1 ) ).toarray()

        # clone_model builds new layers with newly initialised weights; the
        # optimizer is rebuilt from its config so no optimizer state leaks
        # between folds either
        fold_model = ml.clone_model( model )
        fold_optimizer = type( model.optimizer ).from_config( model.optimizer.get_config() )
        fold_model.compile( loss=model.loss, optimizer=fold_optimizer )

        # model training
        fold_model.fit( x_train_cv, y_train_cv, epochs=epochs, verbose=verbose )

        # prediction: class probabilities -> labels, flattened to 1-D
        pred = fold_model.predict( x_test_cv )
        yhat = ohe.inverse_transform( pred ).ravel()

        # metrics
        balanced_accuracy_list.append( m.balanced_accuracy_score( y_test_cv, yhat ) )
        kappa_score_list.append( m.cohen_kappa_score( y_test_cv, yhat ) )

    balanced_accuracy_mean = np.round( np.mean( balanced_accuracy_list ), 2 )
    balanced_accuracy_std = np.round( np.std( balanced_accuracy_list ), 2 )
    kappa_score_mean = np.round( np.mean( kappa_score_list ), 2 )
    kappa_score_std = np.round( np.std( kappa_score_list ), 2 )

    return pd.DataFrame({"Balanced_Accuracy:": "{} +/- {}".format(balanced_accuracy_mean, balanced_accuracy_std),
                         "Kappa_Score": "{} +/- {}".format(kappa_score_mean, kappa_score_std)}, index=[model_name])

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


## 0.2. Reading Data

In [4]:
df_raw = pd.read_csv( "../datasets/train_users_2.csv" )

# 1.0. DATA DESCRIPTION

In [5]:
df1 = df_raw.copy()

## 1.1. Columns Descriptions

In [6]:
df1.columns

Index(['id', 'date_account_created', 'timestamp_first_active',
       'date_first_booking', 'gender', 'age', 'signup_method', 'signup_flow',
       'language', 'affiliate_channel', 'affiliate_provider',
       'first_affiliate_tracked', 'signup_app', 'first_device_type',
       'first_browser', 'country_destination'],
      dtype='object')

- **id:** user id

- **date_account_created:** the date of account creation

- **timestamp_first_active:** timestamp of the first activity; note that it can be earlier than date_account_created or date_first_booking because a user can search before signing up

- **date_first_booking:** date of first booking

- **gender**

- **age**

- **signup_method**

- **signup_flow:** the page a user came to sign up from

- **language:** international language preference

- **affiliate_channel:** what kind of paid marketing

- **affiliate_provider:** where the marketing is e.g. google, craigslist, other

- **first_affiliate_tracked:** what's the first marketing the user interacted with before signing up

- **signup_app**

- **first_device_type**

- **first_browser**

- **country_destination:** this is the target variable you are to predict

### 1.1.1. Rename Columns

No need, they're already in snake_case.

## 1.2. Data Dimensions

In [7]:
print( 'Number of rows: {}'.format( df1.shape[0] ) )
print( 'Number of columns: {}'.format( df1.shape[1] ) )

Number of rows: 213451
Number of columns: 16


## 1.3. NA Check

In [8]:
df1.isna().sum()

id                              0
date_account_created            0
timestamp_first_active          0
date_first_booking         124543
gender                          0
age                         87990
signup_method                   0
signup_flow                     0
language                        0
affiliate_channel               0
affiliate_provider              0
first_affiliate_tracked      6065
signup_app                      0
first_device_type               0
first_browser                   0
country_destination             0
dtype: int64

### 1.3.1. NA Fulfill

In [9]:
# Drop every row that has a missing value in any column; nothing is imputed.
# Per the NA counts in 1.3 this removes at least the 124,543 rows without a
# date_first_booking, out of 213,451 rows.
# NOTE(review): presumably those are users who never booked -- confirm that
# discarding them is intended.
df1 = df1.dropna()

## 1.4. Data Types

In [10]:
df1.dtypes

id                          object
date_account_created        object
timestamp_first_active       int64
date_first_booking          object
gender                      object
age                        float64
signup_method               object
signup_flow                  int64
language                    object
affiliate_channel           object
affiliate_provider          object
first_affiliate_tracked     object
signup_app                  object
first_device_type           object
first_browser               object
country_destination         object
dtype: object

### 1.4.1. Change types

In [11]:
# date account created - datetime
df1['date_account_created'] = pd.to_datetime( df1['date_account_created'] )

# timestamp first active - datetime
# The column is stored as int64 (see 1.4), so a format with dashes and colons
# can never match it. The digits are taken to encode YYYYMMDDHHMMSS, so the
# values are cast to str and parsed with a digit-only format.
df1['timestamp_first_active'] = pd.to_datetime( df1['timestamp_first_active'].astype( str ), format='%Y%m%d%H%M%S' )

# date first booking - datetime
df1['date_first_booking'] = pd.to_datetime( df1['date_first_booking'] )

# age - int (safe here because rows with a missing age were dropped in 1.3.1)
df1['age'] = df1['age'].astype( 'int64' )

## 1.5. Data Proportion

In [12]:
df1['country_destination'].value_counts( normalize=True )

US       0.708864
other    0.110854
FR       0.054085
IT       0.029763
GB       0.025935
ES       0.024864
CA       0.015696
DE       0.012454
NL       0.008801
AU       0.006366
PT       0.002318
Name: country_destination, dtype: float64

We clearly have imbalanced data, which will affect the models.

## 1.6. Descriptive Statistics

### 1.6.1. Numerical Attributes

### 1.6.2. Categorical Attributes

# 2.0. DATA FILTERING

In [13]:
df2 = df1.copy()

# 3.0. FEATURE ENGINEERING

In [14]:
df3 = df2.copy()

# 4.0. EXPLORATORY DATA ANALYSIS

In [15]:
# Continue from the previous stage (df3) so that anything added in the data
# filtering or feature engineering sections is not silently skipped.
df4 = df3.copy()

# 5.0. DATA PREPARATION

In [16]:
# The three date/time columns are left out of the model inputs.
df5 = df4.drop( ['date_account_created', 'timestamp_first_active', 'date_first_booking'], axis=1 )

# One-hot encode the features with id and target set aside, then put the
# id and target columns back in front of the encoded features.
df5_dummies = pd.get_dummies( df5.drop( ['id', 'country_destination'], axis=1 ) )
df5 = pd.concat( [ df5.loc[:, ['id', 'country_destination']], df5_dummies ], axis=1 )

# 6.0. FEATURE SELECTION

In [17]:
df6 = df5.copy()

# 7.0. MACHINE LEARNING

In [18]:
# features and target
X = df6.drop( columns=['id','country_destination'] )
Y = df6['country_destination'].copy()

# The classes are heavily imbalanced (see 1.5), so the hold-out split is
# stratified on the target to keep the class proportions the same in the
# train and validation sets.
x_train, x_val, y_train, y_val = ms.train_test_split( X, Y, test_size=0.2, random_state=42, stratify=Y )

## 7.1. Random Model

In [21]:
# NOTE: this baseline is disabled. The error recorded below shows that
# `random` was bound to `numpy.random` (which has no `choices`) rather than
# to the standard-library module when the cell ran. Re-import the stdlib
# module in this cell (`import random`) before re-enabling the code.
# country_destination_list = df6['country_destination'].drop_duplicates().sort_values().to_list()
# k = y_val.shape[0]
# weights = df1['country_destination'].value_counts( normalize=True ).sort_index().to_list()

# yhat_random = random.choices( population = country_destination_list,
#                               weights=weights,
#                               k=k )

AttributeError: module 'numpy.random' has no attribute 'choices'

### 7.1.1. Imbalanced Metrics and Confusion Matrix

In [86]:
# imbalanced_metrics( 'Random Model', y_val, yhat_random )

NameError: name 'yhat_random' is not defined

In [None]:
# # Confusion Matrix
# mt.plot_confusion_matrix( y_val, yhat_random, normalize=False );

## 7.2. Neural Network - MLP

In [None]:
# y train preparation
# One-hot encode the training target. The fitted encoder is kept because it
# is used again after prediction to map the network output back to labels.
ohe = pp.OneHotEncoder()
y_train_nn = ohe.fit_transform( y_train.values.reshape( -1, 1 ) ).toarray()

In [None]:
# model definition
# The output layer has one unit per one-hot column of the encoded target, so
# its width follows the data instead of a hard-coded class count.
n_classes = y_train_nn.shape[1]

model_nn = ml.Sequential()
model_nn.add( l.Dense( 128, input_dim=x_train.shape[1], activation='relu' ) )
model_nn.add( l.Dense( n_classes, activation='softmax' ) )

# model compile
model_nn.compile( loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'] )

# model training
model_nn.fit( x_train, y_train_nn, epochs=100, verbose=0 )

In [None]:
# prediction
pred_nn = model_nn.predict( x_val )

# Map the predicted class probabilities back to labels. inverse_transform
# returns one column per encoded feature, so the result is flattened to 1-D.
yhat_nn = ohe.inverse_transform( pred_nn ).reshape( 1, -1 )[0]

# prediction prepare
# y_val_nn is used by the metrics and confusion-matrix cells that follow, so
# it has to be defined here.
y_val_nn = y_val.to_numpy()

### 7.2.1. Imbalanced Metrics and Confusion Matrix

In [None]:
imbalanced_metrics( 'Neural Network', y_val_nn, yhat_nn )

In [None]:
# Confusion Matrix
mt.plot_confusion_matrix( y_val_nn, yhat_nn, normalize=False );

## 7.3. Cross Validation

In [None]:
# cv
cross_validation( "neural", model_nn, X, Y )

# 8.0. HYPERPARAMETER FINE TUNING

# 9.0. BUSINESS RESULTS

# 10.0. DEPLOY