In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# import pymc3 as pm
# import arviz as az
import matplotlib.lines as mlines
import pickle as pk
# import warnings
# warnings.filterwarnings('ignore')
# from collections import OrderedDict
# import theano
# import theano.tensor as tt
import itertools

# from IPython.core.pylabtools import figsize
pd.set_option('display.max_columns', 30)
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

from scipy import sparse 

In [None]:
def cdisplay(*args):
    '''
    Display every object in `*args` with all DataFrame columns visible.

    The pandas option `display.max_columns` is set to `None` only for the
    duration of the call. `pd.option_context` restores the previous value on
    exit, including when `display` raises, so a failed display no longer
    leaves the global option changed.
    '''
    with pd.option_context('display.max_columns', None):
        # `display` is not imported in this file; it is expected to be the
        # helper that IPython/Jupyter injects into the notebook namespace.
        display(*args)


# Gráficos blog
I. Bank client data:

1. age: (numeric)   
2. job: type of job (categorical: 'admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed', 'unknown')  
3. marital: marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)  
4. education: (categorical: primary, secondary, tertiary and unknown)  
5. default: has credit in default? (categorical: 'no','yes','unknown')  
6. housing: has housing loan? (categorical: 'no','yes','unknown')  
7. loan: has personal loan? (categorical: 'no','yes','unknown')  
8. balance: Balance of the individual.  
    
II. Related with the last contact of the current campaign:  

9. contact: contact communication type (categorical: 'cellular','telephone')  
10. month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')  
11. day: last contact day of the month (numeric; the day of the week is derived later in this notebook as `day_of_week`)  
12. duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.  
13. campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)  

III. Other attributes:

14. pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; -1 means client was not previously contacted)  
15. previous: number of contacts performed before this campaign and for this client (numeric)  
16. poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')

17. deposit (the target, called `y` in the original dataset description) - has the client subscribed a term deposit? (binary: 'yes','no')

In [None]:
# Load the raw bank-marketing data from a path relative to the notebook.
df = pd.read_csv('data/bank-marketing/bank.csv')
# Last expression of the cell: shows (n_rows, n_columns).
df.shape

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Jittered strip plot of `age` for each value of the target column `deposit`.
sns.stripplot(x="deposit", y="age", data=df, jitter=True)
plt.show();

In [None]:
# Frequency of each education level.
df['education'].value_counts()

In [None]:
# Frequency of each job type.
df['job'].value_counts()

In [None]:
# Frequency of each outcome of the previous campaign.
df.poutcome.value_counts()

In [None]:
# Cast `education` to a pandas categorical before plotting it.
# Note: this mutates `df` in place; `df` is reloaded from disk in a later cell.
df['education'] = df['education'].astype('category')

In [None]:
# Jittered strip plot of education level against the target column `deposit`.
sns.stripplot(x="deposit", y="education", data=df, jitter=True)
plt.show();

In [None]:
# Cast `job` to a pandas categorical before plotting it (mutates `df` in place).
df['job'] = df['job'].astype('category')

In [None]:
# Jittered strip plot of job type against the target column `deposit`.
sns.stripplot(x="deposit", y="job", data=df, jitter=True)
plt.show();

In [None]:
# Show the first three rows of `df`.
df.head(3)

In [None]:
# Reference set of column names to compare against the columns of `df`.
t = {
    'age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
    'previous', 'poutcome', 'euribor3m',
}

In [None]:
# Column names actually present in the loaded frame.
s = set(df.columns)

In [None]:
# Reference names that do not appear among the columns of `df`.
t - s

In [None]:
# Frequency of each contact communication type.
df['contact'].value_counts()

# Preliminares
## Transformar datos

In [None]:
# Reload the raw data so the transformations below start from an unmodified
# frame (earlier cells cast some columns of `df` in place).
df = pd.read_csv('data/bank-marketing/bank.csv')

### Categoricas

In [None]:
convert_cat = ['job', 'marital', 'education', 'poutcome', 'contact']

# `astype` returns a new frame, so `df` itself is left untouched.
cat_df = df.astype({name: 'category' for name in convert_cat})

# Build "<day>-<month abbreviation>" strings and parse them to get a weekday.
# NOTE(review): the format carries no year, so the weekday is computed for
# whatever default year pandas assigns — confirm it matches the real dates.
day_month = df['day'].astype('str').str.cat(df['month'], sep='-')
cat_df['day_of_week'] = (
    pd.to_datetime(day_month, format='%d-%b')
    .dt.dayofweek
    .astype('category')
)

# Month abbreviation -> month number, stored as a categorical.
month_number = pd.to_datetime(cat_df['month'], format='%b').dt.month
cat_df['month'] = month_number.astype('category')

# Bin age into 10 equal-width intervals and keep only the integer bin codes.
age_bins = pd.cut(cat_df['age'], 10)
cat_df['age'] = age_bins.cat.codes

# `day` has already been used to derive `day_of_week`; drop it.
cat_df = cat_df.drop('day', axis=1)

Decidí descartar `'day'` por la intuición de que no tiene importancia en el outcome; de cierta forma, su información está contenida en `'day_of_week'`.

### Binarias

In [None]:
binary_cols = ['default', 'housing', 'loan', 'deposit']
bin_df = cat_df.copy()

# Inspect the distinct values of each candidate binary column before
# converting them; `display` renders each positional argument in turn.
display(*(bin_df[name].value_counts() for name in binary_cols))



In [None]:
binary_cols = ['default', 'housing', 'loan', 'deposit']

# Replace each listed column with a boolean flag that is True where the
# value is 'yes'. `assign` returns a new frame with the columns kept in
# their original positions.
bin_df = cat_df.assign(
    **{name: cat_df[name] == 'yes' for name in binary_cols}
)



In [None]:
# Rename the target column from `deposit` to `outcome` for the modelling steps.
reg_df = bin_df.rename(columns={'deposit': 'outcome'})
# Print column dtypes and non-null counts.
reg_df.info()

## Explorar variables

In [None]:
# Correlation of every numeric/boolean feature with the target, strongest first.
# The columns are selected explicitly because `reg_df` also holds categorical
# columns with string categories, which `DataFrame.corr()` cannot convert to
# float and, in pandas >= 2.0, no longer drops silently.
reg_df.select_dtypes(include=['number', 'bool']).corr()['outcome'].sort_values(ascending=False)

In [None]:
# Frequency of each `pdays` value.
reg_df['pdays'].value_counts()

In [None]:
# True when, on every row, `pdays == -1` holds exactly when `previous == 0`.
((reg_df['pdays'] == -1) == (reg_df['previous'] == 0)).all()

pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; -1 means client was not previously contacted)
previous: number of contacts performed before this campaign and for this client (numeric) 

pdays: número de días que pasaron después de que el cliente fue contactado por última vez desde una campaña anterior (numérico; -1 significa que el cliente no fue contactado previamente)
previous: número de contactos realizados antes de esta campaña y para este cliente (numérico)

Incluimos la variable **never_contacted** para poder reemplazar -1 por 0, ya que son clientes que nunca han sido contactados y no existe ninguna observación con ese valor.

In [None]:
# Count plot of `pdays`, excluding rows with the -1 sentinel.
sns.countplot(x='pdays', data=reg_df[reg_df['pdays'] != -1])

In [None]:
# Count plot of `previous`, excluding rows where it is 0.
sns.countplot(x='previous', data=reg_df[reg_df['previous'] != 0])

In [None]:
# Count plot of `campaign`, excluding rows where it is 0.
sns.countplot(x='campaign', data=reg_df[reg_df['campaign'] != 0])

In [None]:
# `previous` values restricted to rows where it is non-zero.
reg_df.loc[reg_df['previous'] != 0, 'previous']

In [None]:
# Natural log of the non-zero `previous` values (zeros are filtered out first,
# so log(0) is never evaluated).
plot_srs = np.log(reg_df.loc[reg_df['previous'] != 0, 'previous'])

# NOTE(review): `distplot` is deprecated in recent seaborn releases — consider
# `histplot`/`displot` if the plot needs to survive a seaborn upgrade.
sns.distplot(plot_srs)

In [None]:
# Count plot of `campaign` over all rows. The column is passed by keyword
# (`x=` / `data=`), matching the other count plots in this notebook, instead
# of positionally, which seaborn may interpret as `data`.
sns.countplot(x='campaign', data=reg_df)

## Modificar variables

**pdays** toma el valor -1 cuando el cliente nunca ha sido contactado. Ese caso debería ir incluido en el coeficiente del intercepto, por lo que reemplazamos las ocurrencias de `-1` por `0`.

In [None]:
# Map the -1 sentinel in `pdays` to 0. Re-running the cell is harmless,
# because no -1 values remain after the first run.
reg_df['pdays'] = reg_df['pdays'].replace(-1, 0)

**Variables enteras**

In [None]:
# Columns whose distributions are plotted in the next cell.
int_std_cols = ['balance', 'duration']

In [None]:
# One distribution plot per column; `plt.show()` inside the loop puts each
# column on its own figure.
for c in int_std_cols:
    sns.distplot(reg_df[c])
    plt.show()

**duration** tiene una distribución que se asemeja a una log-normal, por lo que la transformamos vía logaritmo.

In [None]:
# Replace `duration` with its natural logarithm: the new `log_duration`
# column is appended at the end and the raw column is dropped.
# NOTE(review): assumes `duration` is strictly positive — a 0 would yield -inf.
trans_reg_df = (
    reg_df
    .assign(log_duration=np.log(reg_df['duration']))
    .drop(columns=['duration'])
)

In [None]:
# Re-plot the distributions after the log transform, one figure per column.
for c in ['balance', 'log_duration']:
    sns.distplot(trans_reg_df[c])
    plt.show()

## Exportar datos

In [None]:
# Write the transformed frame to CSV in the working directory, without the index.
trans_reg_df.to_csv('trans_bank_marketing_codes.csv', index=False)

In [None]:
# Pickle the column -> dtype mapping next to the CSV.
# NOTE(review): presumably so the dtypes can be restored when the CSV is
# read back — confirm against the consumer.
column_dtypes = trans_reg_df.dtypes.to_dict()
with open('trans_bank_marketing_codes-dtype.pk', 'wb') as dtype_file:
    pk.dump(column_dtypes, dtype_file)

# Regresión logística con Scikit-Learn

In [None]:
# Target vector and feature matrix (every column except `outcome`).
y = trans_reg_df['outcome']
X = trans_reg_df.drop(columns='outcome')

# Hold out 25% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# Only the last expression of a cell is rendered, so the first line below
# produces no output; the cell shows the names of the boolean columns.
X_train.select_dtypes('bool')
X_train.select_dtypes('bool').columns

In [None]:
# Show the value distribution of every integer feature except `balance`.
# `items()` is used instead of `iteritems()`, which was removed in pandas 2.0.
for cname, col in X_train.select_dtypes('int').items():
    if cname != 'balance':
        display(col.value_counts())

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

In [None]:
# Print column dtypes and non-null counts of the transformed frame.
trans_reg_df.info()

In [None]:
# Group the feature columns by dtype; each group gets its own preprocessing step.
float_cols = list(X_train.select_dtypes('float').columns)
int_cols = list(X_train.select_dtypes('int').columns)
bool_cols = list(X_train.select_dtypes('bool').columns)
# NOTE(review): 'int8' is listed separately, presumably to pick up the integer
# bin codes stored in `age` — confirm they are not also matched by 'int'.
one_hot_cols = [
    *X_train.select_dtypes('category').columns,
    *X_train.select_dtypes('int8').columns,
]

# The order of this list fixes the column order of the transformed matrix.
transformer_list = [
    ('continuous_std', StandardScaler(), float_cols),
    ('minmax', MinMaxScaler(), int_cols),
    ('passthrough_bool', 'passthrough', bool_cols),
    ('one_hot', OneHotEncoder(drop='first'), one_hot_cols),
]

cols_trans = ColumnTransformer(transformers=transformer_list)

# Guard against features that no transformer is configured to handle.
covered_cols = int_cols + float_cols + bool_cols + one_hot_cols
assert X.columns.isin(covered_cols).all(), \
 'No todas las columnas son consideradas en ColumnTransformer'

In [None]:
# Fit the column transformer on the training features and show the result.
cols_trans.fit_transform(X_train)

In [None]:
from sklearn.linear_model import LogisticRegression

# Preprocessing ('ct') followed by a logistic regression with default
# settings ('lr').
pipeline_steps = [
    ('ct', cols_trans),
    ('lr', LogisticRegression()),
]
lr_pipe = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the preprocessing steps and the classifier on the training split.
lr_pipe.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out split.
lr_pipe.score(X_test, y_test)

In [None]:
# Predicted labels for the held-out split.
preds = lr_pipe.predict(X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. Passing them
# swapped exchanges the roles of false positives and false negatives, so the
# true labels now go first.
print('Accuracy of the full model: ', accuracy_score(y_test, preds))
print('f1 score of the full model: ', f1_score(y_test, preds))

In [None]:
# Stack the fitted classifier's intercept and flattened coefficients into a
# single weight vector, intercept first.
log_reg = lr_pipe[-1]
w_inicial = np.concatenate([log_reg.intercept_, log_reg.coef_.ravel()])
w_inicial

In [None]:
# Save the weight vector to the working directory; `np.save` appends the
# `.npy` extension to the given name.
np.save('optimo_sklearn', w_inicial)