# Entendimento do Problema

Objetivo do Problema:
- 1.0. Previsao do primeiro destino que um novo usuário irá escolher.
    - Porque?
    - Qual tipo de modelo de negócio do Airbnb?
        - Marketplace (Conectar pessoas que oferecem acomodacao, com pessoas que estao procurando acomodacao)
        - Oferta (pessoas oferecendo acomodacao)
            - Tamanho do portfólio.
            - Diversidade/Densidade de Portfólio.
            - Preco Medio
            
        - Demanda (pessoas procurando acomodacao)
            - Numero de Usuários
            - LTV (Lifetime Value)
            - CAC (Client Acquisition Cost)
            
            
           Gross Revenue = (Fee*Numero cliente) - CAC 

- Proposta da Solucao
- Modelo de Predição do primeiro destino de um novo usuário.
- 1.0. Predicoes e salva em tabela do banco de dados. 
- 2.0. API 
    - Input: usuario e suas caracteristicas
    - Output: usuario e suas caracteristicas com a **predicao do destino**

# 0.0 Imports

In [None]:
import random

# pyplot (not the top-level matplotlib package) is what the plotting cells use:
# plt.figure(...), plt.ylabel(...), plt.subplot(...), plt.xticks(...).
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats as ss
from scikitplot import metrics as mt
from sklearn import metrics as m
from sklearn import model_selection as ms
from sklearn import preprocessing as pp

from keras import layers as l
from keras import models as ml

from imblearn import combine as c
from imblearn import over_sampling as oversamp
from imblearn import under_sampling as us

##  0.1. Helper Functions

In [None]:
def cramer_v(x, y):
    """Bias-corrected Cramér's V between two categorical variables.

    Parameters
    ----------
    x, y : array-like (e.g. pandas Series) of equal length
        Categorical observations; they are cross-tabulated with ``pd.crosstab``.

    Returns
    -------
    float
        Association strength in [0, 1]. ``nan`` when the statistic is undefined,
        i.e. fewer than two observations or fewer than two categories on either side.
    """
    cm = pd.crosstab(x, y).values
    n = cm.sum()
    r, k = cm.shape

    # With a single row/column (or < 2 observations) the denominators below are zero.
    if n < 2 or min(r, k) < 2:
        return np.nan

    chi2 = ss.chi2_contingency(cm)[0]

    # The bias correction applies to phi^2 = chi2 / n, not to chi2 itself;
    # subtracting it from chi2 lets a perfect association come out above 1.
    phi2 = chi2 / n
    phi2corr = max(0, phi2 - (k - 1) * (r - 1) / (n - 1))

    kcorr = k - (k - 1) ** 2 / (n - 1)
    rcorr = r - (r - 1) ** 2 / (n - 1)

    denom = min(kcorr - 1, rcorr - 1)
    if denom <= 0:
        return np.nan

    return np.sqrt(phi2corr / denom)

##  0.2. Carregando dados

In [None]:
# Load the users training table; the path is relative to the notebook's working directory.
df_raw = pd.read_csv('../dados/train_users_2.csv', low_memory=True)
# (rows, columns) of the raw users table
df_raw.shape

In [None]:
# Load the sessions log; the path is relative to the notebook's working directory.
df_sessions = pd.read_csv('../dados/sessions.csv', low_memory=True)
# (rows, columns) of the raw sessions table
df_sessions.shape

# 1.0. Descrição dos Dados

In [None]:
df1 = df_raw.copy()

## 1.1. Dimensão dos Dados

In [None]:
# Row / column counts of the users table.
n_user_rows, n_user_cols = df1.shape
print(f'Usuários Número de Linhas: {n_user_rows}')
print(f'Usuários Número de Colunas: {n_user_cols}')

In [None]:
# Row / column counts of the sessions table (the label previously said "Usuários",
# copy-pasted from the users cell above).
print('Sessions Número de Linhas: {}'.format(df_sessions.shape[0]))
print('Sessions Número de Colunas: {}'.format(df_sessions.shape[1]))

##  1.2. Tipo Dados

In [None]:
df1.dtypes

In [None]:
df_sessions.dtypes

## 1.3. Check NA

In [None]:
df1.isna().sum() / len(df1) * 100

In [None]:
df_sessions.isna().sum() / len(df_sessions) * 100

In [None]:
# Users dataset

# date_first_booking: missing values (presumably users who never booked - confirm)
# are filled with the latest booking date present in the data.
date_first_booking_max = pd.to_datetime(df1['date_first_booking']).max().strftime('%Y-%m-%d')
df1['date_first_booking'] = df1['date_first_booking'].fillna(date_first_booking_max)

# age: keep only ages strictly between 15 and 120.
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the previous frame (SettingWithCopyWarning).
df1 = df1[(df1['age'] > 15) & (df1['age'] < 120)].copy()
avg_age = int(df1['age'].mean())
# NOTE(review): NaN compares False in the filter above, so rows with a missing age
# have already been dropped and this fillna has nothing left to fill. If imputing
# (rather than dropping) missing ages was the intent, the filter has to keep NaNs.
df1['age'] = df1['age'].fillna(avg_age)

# first_affiliate_tracked: drop rows where it is missing
df1 = df1[~df1['first_affiliate_tracked'].isna()].copy()

In [None]:
# Sessions dataset: discard any row with a missing value in one of these columns.
# Share of missing values noted by the author:
#   user_id 0.3% | action 0.7% | action_type 11% | action_detail 11% | secs_elapsed 1.2%
required_session_cols = ['user_id', 'action', 'action_type', 'action_detail', 'secs_elapsed']
df_sessions = df_sessions.dropna(subset=required_session_cols)

In [None]:
df1.isna().sum() / len(df1) * 100

In [None]:
df_sessions.isna().sum() / len(df_sessions) * 100

## 1.4. Mudando dtypes

In [None]:
df1.dtypes

In [None]:
 # date_account_created
# Plain date columns: let pandas infer the format.
for date_col in ('date_account_created', 'date_first_booking'):
    df1[date_col] = pd.to_datetime(df1[date_col])

# timestamp_first_active is a compact YYYYMMDDHHMMSS stamp and needs an explicit format.
df1['timestamp_first_active'] = pd.to_datetime(
    df1['timestamp_first_active'], format='%Y%m%d%H%M%S'
)

# age as whole years
df1['age'] = df1['age'].astype(int)

## 1.5. Valida Balanceamento

In [None]:
df1['country_destination'].value_counts(normalize=True) * 100

## 1.6. Análise descritiva

In [None]:
# Dtype groups used to split each table into numeric / categorical / datetime parts.
numeric_dtypes = ['int64', 'float64', 'int32']
datetime_dtypes = ['datetime64[ns]']
non_categorical_dtypes = ['int64', 'float64', 'datetime64[ns]', 'int32']

# Users
num_attributes = df1.select_dtypes(include=numeric_dtypes)
cat_attributes = df1.select_dtypes(exclude=non_categorical_dtypes)
time_attributes = df1.select_dtypes(include=datetime_dtypes)

# Sessions
num_attributes_sessions = df_sessions.select_dtypes(include=numeric_dtypes)
cat_attributes_sessions = df_sessions.select_dtypes(exclude=non_categorical_dtypes)
time_attributes_sessions = df_sessions.select_dtypes(include=datetime_dtypes)

In [None]:
def get_analise_descritiva(df):
    """Build a per-column descriptive-statistics table for a numeric DataFrame.

    Returns one row per column of ``df`` with the ``describe()`` statistics
    followed by the range (max - min), skewness and kurtosis. The column name
    is stored in the ``Atributos`` column.
    """
    summary = df.describe()

    # Each extra statistic becomes a single-row frame so it can be stacked under describe().
    value_range = (df.max() - df.min()).to_frame().T
    skewness = df.skew().to_frame().T
    kurt = df.kurtosis().to_frame().T

    stacked = pd.concat([summary, value_range, skewness, kurt])
    result = stacked.T.reset_index()
    result.columns = ['Atributos', 'count', 'mean', 'std', 'min', '25%', '50%', '75%',
                      'max', 'range', 'skew', 'kurtosis']
    return result

### 1.6.1. Numérico - Users

In [None]:
get_analise_descritiva(num_attributes)

### 1.6.2. Numérico - Sessions

In [None]:
get_analise_descritiva(num_attributes_sessions)

### 1.6.3. Categórico - Users

In [None]:
cat_attributes.drop('id', axis=1).describe() 

### 1.6.4. Categórico - Sessions

In [None]:
cat_attributes_sessions.drop('user_id', axis=1).describe() 

In [None]:
cat_attributes_list = cat_attributes_sessions.drop('user_id', axis=1).columns.tolist()

# Pairwise Cramér's V: one list per reference attribute, holding its association
# with every categorical attribute (itself included), in cat_attributes_list order.
corr_dict = {
    ref: [
        cramer_v(cat_attributes_sessions[ref], cat_attributes_sessions[feat])
        for feat in cat_attributes_list
    ]
    for ref in cat_attributes_list
}

In [None]:
# Square association matrix: rows and columns both labelled by attribute name.
tmp = pd.DataFrame(corr_dict, index=list(corr_dict))
sns.heatmap(tmp, annot=True)

# 2.0. Feature Engineering

In [None]:
df2 = df1.copy()
df2.head()

## 2.1. Criando Novas Features

In [None]:
# Date (midnight) of the first activity, without the time-of-day part.
df2['first_active'] = df2['timestamp_first_active'].dt.normalize()

# Day differences between the three key dates.
# days from first activity until first booking
df2['days_from_frist_active_until_booking'] = (df2['date_first_booking'] - df2['first_active']).dt.days

# days from first activity until account creation
df2['days_from_first_active_until_account_created'] = (df2['date_account_created'] - df2['first_active']).dt.days

# days from account creation until first booking
df2['days_from_account_created_until_first_booking'] = (df2['date_first_booking'] - df2['date_account_created']).dt.days

# Calendar parts for each date column. Target names are listed explicitly (in
# year, month, day, day-of-week, ISO-week order) so the columns are created under
# exactly the same names and in the same order as before.
date_part_columns = [
    # first activity
    ('first_active', ('year_first_active', 'month_fist_active', 'day_first_active',
                      'day_of_week_first_active', 'week_of_year_first_active')),
    # first booking
    ('date_first_booking', ('year_first_booking', 'month_first_booking', 'day_first_booking',
                            'day_of_week_first_booking', 'week_of_year_first_booking')),
    # account created
    ('date_account_created', ('year_account_created', 'month_account_created', 'day_account_created',
                              'day_of_week_account_created', 'week_of_year_account_created')),
]

for source_col, (year_col, month_col, day_col, dow_col, week_col) in date_part_columns:
    moment = df2[source_col].dt
    df2[year_col] = moment.year
    df2[month_col] = moment.month
    df2[day_col] = moment.day
    df2[dow_col] = moment.dayofweek
    df2[week_col] = moment.isocalendar().week

In [None]:
df2.head()

# 3.0. Filtragem Dados

In [None]:
df3 = df2.copy()

## 3.1. Filtragem Linhas

In [None]:
# Filtrando Idade maior que 15 e menor que 120 anos.
df3 = df3[(df3['age'] > 15) & (df3['age'] < 120)]

## 3.2. Seleção Colunas

In [None]:
# Raw date columns are dropped now that the derived features exist.
# ('date_account_created' was listed twice in the original list.)
cols = ['date_account_created', 'date_first_booking', 'timestamp_first_active', 'first_active']
df3 = df3.drop(cols, axis=1)

# 4.0. Preparação Dados

In [None]:
df4 = df3.copy()

## 4.1. Balanceamento Dataset

In [None]:
# One-hot encoder for the categorical predictors (kept so the encoding can be
# inverted after resampling).
ohe = pp.OneHotEncoder()

# Numerical: every numeric dtype, not only int64/float64. The date-part columns can
# come out as int32 / UInt32 (e.g. isocalendar().week), and with the narrower
# selection they fell through to the categorical list and were one-hot encoded.
col_num = df4.select_dtypes(include='number').columns.tolist()

# Categorical: everything that is neither numeric nor datetime, minus id and target.
col_cat = (
    df4.select_dtypes(exclude=['number', 'datetime'])
       .drop(['id', 'country_destination'], axis=1)
       .columns.tolist()
)

# encoding
df4_dummy = pd.DataFrame(ohe.fit_transform(df4[col_cat]).toarray(), index=df4.index)
# String labels: recent scikit-learn rejects frames whose column names mix int and str.
df4_dummy.columns = df4_dummy.columns.astype(str)

# join numerical and categorical; cast to float64 so the matrix has a single plain dtype
df42 = pd.concat([df4[col_num].astype('float64'), df4_dummy], axis=1)
df42.shape

### 4.1.1. Random Undersampling

In [None]:
 # ratio_balanced
# Target number of samples per class after undersampling: only 'NDF' is listed,
# capped at 10000 rows.
ratio_balanced = {'NDF': 10000 }
# define sampler (fixed random_state so the sampled rows are reproducible)
undersampling = us.RandomUnderSampler(sampling_strategy=ratio_balanced, random_state=32)

# apply sampler to the encoded feature matrix and the original target labels
X_under, y_under = undersampling.fit_resample(df42, df4['country_destination'])

In [None]:
df4['country_destination'].value_counts()

In [None]:
y_under.value_counts()

### 4.1.2. Random Oversampling

In [None]:
# ratio_balanced (explicit per-class targets are not used for this sampler)
#ratio_balanced = {'NDF': 10000 }

# define sampler: sampling_strategy='all' asks the sampler to resample every class
oversampling = oversamp.RandomOverSampler(sampling_strategy='all', random_state=32)

# apply sampler to the encoded feature matrix and the original target labels
X_over, y_over = oversampling.fit_resample(df42, df4['country_destination'])

In [None]:
df4['country_destination'].value_counts() 

In [None]:
y_over.value_counts() 

### 4.1.3. SMOTE + TOMEKLINK

In [None]:
# Per-class target sample counts for the SMOTE + Tomek links resampling below.
# Minority classes are written as `multiplier * count`.
# NOTE(review): the counts look like the current class sizes of the filtered users
# table - confirm against df4['country_destination'].value_counts() before reuse.
ratio_balanced =  {'NDF': 54852,
                'US':  48057,
                'other': 6*7511,
                'FR': 12*3669,
                'IT': 20*2014,
                'GB': 30*1758,
                'ES': 30*1685,
                'CA': 40*1064,
                'DE': 45*841,
                'NL': 80*595,
                'AU': 85*433,
                'PT': 300*157}

In [None]:
## - Como é muito demorado para executar, salvo em arquivo para depois carregar já com SmoteTomek

# # define sampler
# smt = c.SMOTETomek(sampling_strategy=ratio_balanced, random_state=32, n_jobs=-1)

# # apply sampler
# X_smt, y_smt = smt.fit_resample(df42, df4['country_destination'])

# X_smt.to_csv('../dados/X_SMOTETomek')
# y_smt.to_csv('../dados/y_SMOTETomek')

In [None]:
# df4['country_destination'].value_counts() 

In [None]:
# y_smt.value_counts() 

In [None]:
# # numerical data
# df43 = X_smt[ col_num ]

# # categorical data
# df44 = X_smt.drop(col_num, axis=1)
# df45 = pd.DataFrame(ohe.inverse_transform(df44), columns=col_cat, index=df44.index)

# # join numerical categorical
# df46 = pd.concat([df43, df45], axis=1)
# df46['country_destination'] = y_smt

## --Dummy variável alvo

In [None]:
# Identifier and target stay as-is; every other column goes through get_dummies.
id_and_target = ['id', 'country_destination']
df4_dummy = pd.get_dummies(df4.drop(columns=id_and_target))
df4 = pd.concat([df4[id_and_target], df4_dummy], axis=1)

# SMOTE variant (disabled):
# df4_dummy = pd.get_dummies(df46.drop(['country_destination'], axis=1))
# df4 = pd.concat([df4[['country_destination']], df4_dummy], axis=1)

# 5.0. Análise Exploratória(EDA) - Insights -> Dataset sem balanceamento.

In [None]:
df5 = df4.copy()

**H0.** Os usuários levam até 3 dias, em média, para fazer o cadastro no site em todos os destinos.

**Verdadeira.** Os usuários levam até 3 dias, em média para realizar o cadastro no site em todos os destinos

In [None]:
# Mean number of days between first activity and account creation, per destination.
plt.figure(figsize=(25, 12))
aux01 = (
    df5.groupby('country_destination')['days_from_first_active_until_account_created']
       .mean()
       .reset_index()
)
sns.barplot(data=aux01, x='country_destination', y='days_from_first_active_until_account_created');
plt.ylabel('Average days until Accout Creation');

**H1.** O numero de reservas do Airbnb cresce ou decresce ao longo do tempo?

**Depende.**

In [None]:
# Bookings per (year, month) of the first booking, excluding users with no destination (NDF).
plt.figure(figsize=(25,12))
booking_cols = ['year_first_booking', 'month_first_booking', 'country_destination']
aux01 = df5.loc[df5['country_destination'] != 'NDF', booking_cols]
aux01 = aux01.groupby(['year_first_booking', 'month_first_booking']).count().reset_index()

# "<year>-<month>" label for the x axis
aux01['year-month'] = aux01['year_first_booking'].astype(str) + '-' + aux01['month_first_booking'].astype(str)

sns.barplot(data=aux01, x='year-month', y='country_destination');
plt.xticks(rotation=90);

**H2.** O numero de reservas total cresce 10% ao ano em todos os anos.

**Falsa.** O numero de reservas total cresce 10% ao ano apenas entre 2011 e 2013.

In [None]:
# Year-over-year growth (%) of bookings, for years before 2015 and excluding NDF.
plt.figure(figsize=(25, 12))
is_booking = df5['country_destination'] != 'NDF'
before_2015 = df5['year_first_booking'] < 2015
aux01 = df5.loc[is_booking & before_2015, ['year_first_booking', 'country_destination']]
aux01 = aux01.groupby('year_first_booking').count().reset_index()

aux01['growth'] = aux01['country_destination'].pct_change() * 100

sns.barplot(data=aux01, x='year_first_booking', y='growth');

**H4.** Todos os canais de Marketing geram pelo menos 10% de reservas para todos os destinos.

**Falsa.** Nem todos os canais de Marketing geram pelo menos 10% de reservas para todos os destinos.

In [None]:
# df5 is a copy of the dummy-encoded df4, where `affiliate_provider` has already been
# expanded into indicator columns, so the raw column is read from df3 instead
# (df4 was copied from df3, so it holds the same rows before encoding).
affiliate_list = df3['affiliate_provider'].drop_duplicates().tolist()

# Size the subplot grid from the number of providers instead of a fixed 6x3.
grid_cols = 3
grid_rows = int(np.ceil(len(affiliate_list) / grid_cols))

plt.figure(figsize=(25, 50))
for i, provider in enumerate(affiliate_list):
    plt.subplot(grid_rows, grid_cols, i + 1)
    # share of each destination among the users of this provider
    df3.loc[df3['affiliate_provider'] == provider, 'country_destination'].value_counts(normalize=True).plot.bar();
    plt.title(provider);

**Outras Hipóteses para validar**

H04. Usuários do sexo feminino fazem 10% mais reservas para países fora dos USA. 

H05. O canal de Marketing Google representa 40% das reservas para países fora dos USA. 

H06. O destino dos USA representam mais de 20% em todos os canais. 

H07. A idade média das pessoas é de 35 anos em todos os destinos. 

H08. A porcentagem de usuários que usam o site na lingua inglês-americano para reservar acomodações em qualquer destino é maior que 90% 

H09. O número de reservas do Airbnb é crescente ou decrescente ao longo dos anos? 

H10. O número de reservas do Airbnb é crescente ao longo dos anos.

# 5.0. Seleção Features

In [None]:
# Remove datas originais pois é inútil para o modelo.
# cols_drop = ['date_account_created', 'timestamp_first_active', 'date_first_booking', 'first_active']
# df5 = df4.drop(cols_drop, axis=1)

df5 = df4.copy()

In [None]:
# Features (still including `id`) and target.
X = df5.drop('country_destination', axis=1)
y = df5['country_destination'].copy()

# Stratify on the target: the classes are heavily imbalanced, and a plain random
# split can leave the rarest destinations under-represented (or absent) on one side.
X_train, X_test, y_train, y_test = ms.train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 6.0. Machine Learning

In [None]:
# The user id is an identifier, not a predictor: remove it from both splits.
x_train, x_test = (split.drop(columns='id') for split in (X_train, X_test))

## 6.1. Baseline 

In [None]:
# Random baseline: draw each prediction from the class distribution.
# The distribution is estimated on the training target only (using the full table
# would leak test rows into the baseline), and labels/weights come from the same
# sorted Series so they are guaranteed to line up.
class_freq = y_train.value_counts(normalize=True).sort_index()
country_destination_list = class_freq.index.tolist()
country_destination_weights = class_freq.tolist()
k_num = y_test.shape[0]

# Dedicated seeded generator: reproducible draws without touching the global random state.
baseline_rng = random.Random(42)
yhat_random = baseline_rng.choices(population=country_destination_list,
                                   weights=country_destination_weights,
                                   k=k_num)
len(yhat_random)

### 6.1.1 Baseline Performance

In [None]:
# Scores of the random baseline on the test split.
acc_random = m.accuracy_score(y_test, yhat_random)
balanced_acc_random = m.balanced_accuracy_score(y_test, yhat_random)
kappa_random = m.cohen_kappa_score(y_test, yhat_random)

print(f'Accuracy: {acc_random}')
print(f'Balanced Accuracy:{balanced_acc_random}')
print(f'Kappa Score: {kappa_random}')

# Per-class precision / recall / F1
print(m.classification_report(y_test, yhat_random))

# Confusion matrix (raw counts)
mt.plot_confusion_matrix(y_test, yhat_random, normalize=False, figsize=(12,12))

## 6.2. Neural Network MLP

In [None]:
# One-hot encode the target for the softmax output.
ohe = pp.OneHotEncoder()
y_train_nn = ohe.fit_transform(y_train.values.reshape(-1, 1)).toarray()

# The output layer must have exactly one unit per encoded class; take the width
# from the encoded target instead of hard-coding it (it was 13 here and 12 in the
# cross-validation cell).
n_classes = y_train_nn.shape[1]

# Model definition: one hidden layer + softmax output
model = ml.Sequential()
model.add(l.Dense(256, input_dim=x_train.shape[1], activation='relu'))
model.add(l.Dense(n_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training
model.fit(x_train, y_train_nn, epochs=100, verbose=True)

### 6.2.1 NN Performance

In [None]:
# Class probabilities for the test split
pred_nn = model.predict(x_test)

# Map the probabilities back to class labels with the fitted encoder and flatten to 1-D
yhat_nn = ohe.inverse_transform(pred_nn).ravel()

# True labels as a plain array, to match yhat_nn
y_test_nn = y_test.to_numpy()

In [None]:
# Scores of the neural network on the test split.
acc_nn = m.accuracy_score(y_test_nn, yhat_nn)
balanced_acc_nn = m.balanced_accuracy_score(y_test_nn, yhat_nn)
kappa_nn = m.cohen_kappa_score(y_test_nn, yhat_nn)

for label, score in (('Acurácia: ', acc_nn),
                     ('Balanced Accuracy:', balanced_acc_nn),
                     ('Kappa Score: ', kappa_nn)):
    print(f'{label}{score}')

# Per-class precision / recall / F1
print(m.classification_report(y_test_nn, yhat_nn))

# Confusion matrix (raw counts)
mt.plot_confusion_matrix(y_test_nn, yhat_nn, normalize=False, figsize=(12,12))

# Acurácia: 0.8412426614481409
# Balanced Accuracy:0.16665800325744187
# Kappa Score: 0.7273389222364827

Acurácia: 0.7094976164283096

Balanced Accuracy:0.09153183873284591

Kappa Score: 0.004007337133695277

### 6.2.2 NN Performance - Cross-Validation

In [None]:
# Stratified k-fold cross-validation of the MLP on the training split.
num_folds = 5
kfold = ms.StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

balanced_acc_list = []
kappa_acc_list = []

for i, (train_ix, val_ix) in enumerate(kfold.split(x_train, y_train), start=1):
    print(f'Fold {i}/{num_folds}')

    # get fold
    x_train_fold = x_train.iloc[train_ix]
    y_train_fold = y_train.iloc[train_ix]

    x_val_fold = x_train.iloc[val_ix]
    y_val_fold = y_train.iloc[val_ix]

    # target one-hot encoding, fitted on this fold's training labels only
    ohe = pp.OneHotEncoder()
    y_train_fold_nn = ohe.fit_transform(y_train_fold.values.reshape(-1, 1)).toarray()

    # One output unit per encoded class; derived from the data instead of a
    # hard-coded 12 so the layer always matches the target width.
    n_classes = y_train_fold_nn.shape[1]

    # Model definition (a fresh model per fold, so no weights carry over)
    model = ml.Sequential()
    model.add(l.Dense(512, input_dim=x_train_fold.shape[1], activation='relu'))
    model.add(l.Dense(n_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Train
    model.fit(x_train_fold, y_train_fold_nn, epochs=50, batch_size=32, verbose=0)

    # Predict on the validation fold and map probabilities back to labels
    pred_nn = model.predict(x_val_fold)
    yhat_nn = ohe.inverse_transform(pred_nn).ravel()
    y_test_nn = y_val_fold.to_numpy()

    # Metrics
    ## Balanced Accuracy
    balanced_acc_nn = m.balanced_accuracy_score(y_test_nn, yhat_nn)
    balanced_acc_list.append(balanced_acc_nn)

    ## Kappa
    kappa_acc_nn = m.cohen_kappa_score(y_test_nn, yhat_nn)
    kappa_acc_list.append(kappa_acc_nn)


In [None]:
# Mean and standard deviation of each metric across the folds.
bal_mean, bal_std = np.mean(balanced_acc_list), np.std(balanced_acc_list)
kappa_mean, kappa_std = np.mean(kappa_acc_list), np.std(kappa_acc_list)

print(f'Avg Balanced Accuracy: {np.round(bal_mean, 2)} +/- {np.round(bal_std, 4)}')
print(f'Avg Kappa: {np.round(kappa_mean, 4)} +/- {np.round(kappa_std, 4)}')

Avg Balanced Accuracy: 0.09 +/- 0.0003

Avg Kappa: 0.003 +/- 0.0018