In [1]:
import joblib
import lightgbm as lgb
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LassoCV, LinearRegression, RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder

import warnings
warnings.filterwarnings('ignore')

# Carregando os datasets

## conn_df

In [3]:
conn_df = pd.read_csv('../data/raw/conexoes_espec.csv', sep=';')

In [12]:
conn_df

Unnamed: 0,V1,V2,grau,proximidade,prob_V1_V2
0,1,2,trabalho,visita_frequente,0.589462
1,1,3,trabalho,visita_rara,0.708465
2,2,4,trabalho,visita_casual,
3,2,5,trabalho,visita_rara,0.638842
4,3,6,amigos,mora_junto,
...,...,...,...,...,...
999994,499998,999996,trabalho,visita_rara,
999995,499998,999997,trabalho,visita_rara,
999996,499999,999998,familia,visita_casual,0.451662
999997,499999,999999,familia,visita_rara,0.186973


In [5]:
conn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999999 entries, 0 to 999998
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   V1           999999 non-null  int64  
 1   V2           999999 non-null  int64  
 2   grau         999999 non-null  object 
 3   proximidade  999999 non-null  object 
 4   prob_V1_V2   499999 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 38.1+ MB


In [6]:
conn_df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
V1,999999.0,,,,250000.25,144337.495129,1.0,125000.5,250000.0,375000.0,500000.0
V2,999999.0,,,,500001.0,288674.990255,2.0,250001.5,500001.0,750000.5,1000000.0
grau,999999.0,3.0,trabalho,333548.0,,,,,,,
proximidade,999999.0,4.0,visita_rara,400238.0,,,,,,,
prob_V1_V2,499999.0,,,,0.485162,0.173963,0.074462,0.351677,0.486413,0.61821,0.942245


In [7]:
conn_df['grau'].unique()

array(['trabalho', 'amigos', 'familia'], dtype=object)

In [8]:
conn_df['proximidade'].unique()

array(['visita_frequente', 'visita_rara', 'visita_casual', 'mora_junto'],
      dtype=object)

In [9]:
# rev_conn_df = conn_df[['V2', 'V1', 'grau', 'proximidade']].copy(deep=True).rename(columns={'V1': 'V2', 'V2': 'V1'})
# rev_conn_df['prob_V1_V2'] = np.nan

# conn_df['is_collected'] = True
# rev_conn_df['is_collected'] = False

# conn_df = conn_df.append(rev_conn_df, ignore_index=True) 

## ind_df

In [19]:
ind_df = pd.read_csv('../data/raw/individuos_espec.csv', sep=';')

In [20]:
ind_df

Unnamed: 0,name,idade,estado_civil,qt_filhos,estuda,trabalha,pratica_esportes,transporte_mais_utilizado,IMC
0,1,44.0,divorciado,1.0,1.0,0.0,1.0,publico,22.200956
1,2,24.0,casado,0.0,0.0,0.0,1.0,publico,25.378720
2,3,35.0,solteiro,1.0,0.0,0.0,1.0,particular,19.952393
3,4,50.0,casado,1.0,1.0,1.0,0.0,publico,26.732053
4,5,30.0,solteiro,2.0,1.0,0.0,1.0,publico,15.295668
...,...,...,...,...,...,...,...,...,...
999995,999996,34.0,casado,1.0,0.0,1.0,1.0,publico,17.556771
999996,999997,40.0,casado,1.0,0.0,1.0,1.0,particular,24.848402
999997,999998,30.0,divorciado,1.0,0.0,0.0,1.0,publico,
999998,999999,33.0,casado,0.0,1.0,1.0,1.0,publico,16.979569


In [21]:
ind_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 9 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   name                       1000000 non-null  int64  
 1   idade                      904063 non-null   float64
 2   estado_civil               949927 non-null   object 
 3   qt_filhos                  971133 non-null   float64
 4   estuda                     959870 non-null   float64
 5   trabalha                   993647 non-null   float64
 6   pratica_esportes           850876 non-null   float64
 7   transporte_mais_utilizado  956967 non-null   object 
 8   IMC                        886130 non-null   float64
dtypes: float64(6), int64(1), object(2)
memory usage: 68.7+ MB


In [22]:
ind_df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
name,1000000.0,,,,500000.5,288675.278933,1.0,250000.75,500000.5,750000.25,1000000.0
idade,904063.0,,,,30.008431,10.951566,0.0,22.0,29.0,37.0,124.0
estado_civil,949927.0,4.0,solteiro,420637.0,,,,,,,
qt_filhos,971133.0,,,,0.928406,0.997289,0.0,0.0,1.0,1.0,9.0
estuda,959870.0,,,,0.442588,0.496693,0.0,0.0,0.0,1.0,1.0
trabalha,993647.0,,,,0.556739,0.496771,0.0,0.0,1.0,1.0,1.0
pratica_esportes,850876.0,,,,0.599823,0.489934,0.0,0.0,1.0,1.0,1.0
transporte_mais_utilizado,956967.0,3.0,publico,574779.0,,,,,,,
IMC,886130.0,,,,22.472079,7.073692,4.852828,17.420851,21.4312,26.378197,89.158204


In [23]:
ind_df['estado_civil'].unique()

array(['divorciado', 'casado', 'solteiro', nan, 'viuvo'], dtype=object)

In [24]:
ind_df['transporte_mais_utilizado'].unique()

array(['publico', 'particular', 'taxi', nan], dtype=object)

In [None]:
# Attach the individual attributes to both endpoints of every connection.
# The inner merge adds the V2 person's columns (suffix `_V2`), the outer merge
# adds the V1 person's columns (suffix `_V1`). Both are right joins, so every
# row of conn_df is kept.
df_raw = (
    ind_df.add_suffix('_V1')
    .merge(
        ind_df.add_suffix('_V2').merge(conn_df,
                                       how='right',
                                       left_on='name_V2',
                                       right_on='V2'),
        how='right',
        left_on='name_V1',
        right_on='V1',
    )
    # The raw endpoint ids are redundant with name_V1 / name_V2.
    .drop(columns=['V1', 'V2'])
)
df_raw

In [None]:
def logit(p):
    """Return the log-odds log(p / (1 - p)) of a probability `p`.

    Works element-wise on anything NumPy can divide and take the log of
    (scalars, arrays, pandas Series).
    """
    odds = p / (1 - p)
    return np.log(odds)

def logistic(y):
    """Map log-odds `y` to probabilities with the sigmoid 1 / (1 + exp(-y)).

    Evaluated as exp(min(y, 0)) / (1 + exp(-|y|)), which is algebraically the
    same as e^y / (1 + e^y) but never exponentiates a positive number. The
    naive form overflows for large positive `y` and yields inf / inf = nan.

    Built only from NumPy ufuncs, so scalars, arrays and pandas Series are all
    accepted (a Series in gives a Series out).
    """
    return np.exp(np.minimum(y, 0)) / (1 + np.exp(-np.abs(y)))

def cross_entropy_loss(y_true, y_pred):
    """Mean binary cross-entropy between two sets of log-odds.

    Parameters
    ----------
    y_true : log-odds of the reference probabilities p.
    y_pred : log-odds of the predicted probabilities q.

    Returns
    -------
    The mean of -(p*log(q) + (1-p)*log(1-q)), using the `.mean()` of whatever
    container the inputs produce.
    """
    p = logistic(y_true)
    # Take the logs straight from the logits instead of log(logistic(.)):
    #   log(q)     = -log(1 + e^(-y_pred))
    #   log(1 - q) = -log(1 + e^( y_pred))
    # so a saturated prediction gives a large finite loss rather than log(0).
    log_q = -np.logaddexp(0, np.negative(y_pred))
    log_not_q = -np.logaddexp(0, y_pred)
    return -(p*log_q + (1-p)*log_not_q).mean()

In [None]:
# Column groups of the merged connection table. Person-level columns appear
# once per endpoint (suffix _V1 / _V2); 'grau' and 'proximidade' describe the
# connection itself.
categorical_cols = [f'{col}_{side}'
                    for col in ('estado_civil', 'transporte_mais_utilizado')
                    for side in ('V1', 'V2')] + ['grau', 'proximidade']

# 0/1 indicator columns, all V1 columns first, then all V2 columns.
boolean_cols = [f'{col}_{side}'
                for side in ('V1', 'V2')
                for col in ('estuda', 'trabalha', 'pratica_esportes')]

# Numeric columns, same V1-then-V2 ordering.
numerical_cols = [f'{col}_{side}'
                  for side in ('V1', 'V2')
                  for col in ('idade', 'qt_filhos', 'IMC')]

# Identifiers and the target are never used as features.
cols_to_drop = ['name_V1', 'name_V2'] + ['prob_V1_V2']

In [None]:
# Baseline frame: an independent copy of the merged data so df_raw stays untouched.
df_bl = df_raw.copy(deep=True)

# Baseline target imputation: missing probabilities get the median of the
# observed ones. The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection; under pandas Copy-on-Write that
# chained in-place call would leave df_bl unchanged.
df_bl['prob_V1_V2'] = df_bl['prob_V1_V2'].fillna(df_bl['prob_V1_V2'].median(skipna=True))

In [None]:
# Target and features for the baseline; identifiers and the target itself are
# removed from the feature matrix.
y_bl = df_bl['prob_V1_V2']
X_bl = df_bl.drop(columns=cols_to_drop)

# 60/40 train/test split with a fixed seed. The target is modelled on the
# logit scale. The trailing underscore marks the not-yet-imputed feature frames.
X_bl_train_, X_bl_test_, y_bl_train, y_bl_test = train_test_split(
    X_bl, logit(y_bl), test_size=0.4, random_state=42)

In [None]:
# Impute on copies so the raw split frames stay available.
X_bl_train = X_bl_train_.copy(deep=True)
X_bl_test = X_bl_test_.copy(deep=True)

# One imputer per column family: mode for the categorical columns, mean for
# the 0/1 indicator columns, median for the numeric columns.
mode_imp = SimpleImputer(strategy='most_frequent')
mean_imp = SimpleImputer(strategy='mean')
median_imp = SimpleImputer(strategy='median')

# Each imputer is fitted on the training split only and then reused, without
# refitting, on the test split.
for imputer, imp_cols in ((mode_imp, categorical_cols),
                          (mean_imp, boolean_cols),
                          (median_imp, numerical_cols)):
    X_bl_train[imp_cols] = imputer.fit_transform(X_bl_train_[imp_cols])
    X_bl_test[imp_cols] = imputer.transform(X_bl_test_[imp_cols])

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` (and later
# removed the old name), so try the new spelling first and fall back for older
# releases.
try:
    one_enc = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    one_enc = OneHotEncoder(drop='first', sparse=False)
one_enc.fit(X_bl_train[categorical_cols])

# `get_feature_names` was replaced by `get_feature_names_out`; use whichever
# the installed release provides.
if hasattr(one_enc, 'get_feature_names_out'):
    one_hot_cols = one_enc.get_feature_names_out(categorical_cols)
else:
    one_hot_cols = one_enc.get_feature_names(categorical_cols)

# Append the dummy columns and remove the original categorical ones. The
# encoder fitted on the training split is reused unchanged for the test split.
X_bl_train[one_hot_cols] = one_enc.transform(X_bl_train[categorical_cols])
X_bl_train = X_bl_train.drop(columns=categorical_cols)

X_bl_test[one_hot_cols] = one_enc.transform(X_bl_test[categorical_cols])
X_bl_test = X_bl_test.drop(columns=categorical_cols)

In [None]:
bl_model = LinearRegression().fit(X_bl_train, y_bl_train)

In [None]:
train_loss = cross_entropy_loss(y_bl_train, bl_model.predict(X_bl_train))
test_loss = cross_entropy_loss(y_bl_test, bl_model.predict(X_bl_test))

print('baseline train loss:', train_loss)
print('baseline test loss:', test_loss)

In [None]:
ind_df_1 = ind_df.copy(deep=True)

In [None]:
ind_df_1

In [None]:
# Integer codes for the categorical labels: 0-3 are the marital-status values,
# 4-6 the transport values (codes follow the order of this list).
cat_to_values = {label: code for code, label in enumerate(
    ['divorciado', 'casado', 'solteiro', 'viuvo',
     'publico', 'particular', 'taxi'])}

# Replace the labels by their codes, drop the identifier and min-max scale
# every remaining column. The scaler returns an array, so `ind_df_1_` no longer
# carries column labels after this cell.
scaler = MinMaxScaler()
ind_df_1_ = scaler.fit_transform(
    ind_df_1.replace(cat_to_values).drop(columns='name'))

In [None]:
# %%time

# knn_imp = KNNImputer(weights='distance')

# ind_df_1_ = knn_imp.fit_transform(ind_df_1_)

In [None]:
# ind_df_1_ = pd.DataFrame(scaler.inverse_transform(ind_df_1_), columns=ind_df_1_.columns)
# ind_df_1 = pd.concat([ind_df_1['name'], ind_df_1_], axis=1)

In [None]:
# cols = ['estado_civil', 'transporte_mais_utilizado']
# values_to_cat = {v: k for (k, v) in cat_to_values.items()}
# ind_df_1[cols] = ind_df_1[cols].round().replace(values_to_cat)

In [None]:
# Replace ind_df_1 with the table stored in 'ind_df_filled_knn.csv' (read from
# the current working directory, default comma separator).
# NOTE(review): no visible cell writes this file; presumably it was produced by
# the commented-out KNN imputation cells above. Confirm, and document how to
# regenerate it so the notebook survives Restart & Run All.
ind_df_1 = pd.read_csv('ind_df_filled_knn.csv')

ind_df_1

In [None]:
conn_df

In [None]:
# Same endpoint join as for the raw data, but using ind_df_1 as the
# individuals table: V2 attributes are attached first, then V1 attributes.
# Right joins keep every row of conn_df.
df_1 = (
    ind_df_1.add_suffix('_V1')
    .merge(
        ind_df_1.add_suffix('_V2').merge(conn_df,
                                         how='right',
                                         left_on='name_V2',
                                         right_on='V2'),
        how='right',
        left_on='name_V1',
        right_on='V1',
    )
    # The raw endpoint ids duplicate name_V1 / name_V2.
    .drop(columns=['V1', 'V2'])
)
df_1

In [None]:
# Dummy-encode the features, min-max scale them and project onto the principal
# components needed to reach 90% explained variance.
pca = PCA(n_components=0.9, random_state=42)
scaler = MinMaxScaler()

df_1_ = pd.get_dummies(df_1.drop(columns=cols_to_drop), drop_first=True)
df_1_ = scaler.fit_transform(df_1_)
df_1_ = pca.fit_transform(df_1_)

# Label the components pc_1..pc_k and rescale the scores; this second
# fit_transform refits `scaler` on the component scores.
pca_comps = ['pc_{}'.format(k) for k in range(1, df_1_.shape[1] + 1)]
df_1_ = pd.DataFrame(scaler.fit_transform(df_1_), columns=pca_comps)

# Cumulative explained-variance ratio of the retained components.
pd.Series(pca.explained_variance_ratio_, index=pca_comps).cumsum()

In [None]:
df_1_ = df_1_.loc[:, :pca_comps[-1]]
df_1_['prob_V1_V2'] = df_1['prob_V1_V2']

df_1_

In [None]:
df_1_.describe().T

In [None]:
# Separate the connections whose probability is missing from the observed ones.
mask = df_1_['prob_V1_V2'].isna()
has_target = ~mask

# Component features for both groups; the observed targets go to the logit scale.
X_1_isna = df_1_[mask].drop(columns='prob_V1_V2')
X_1_notna = df_1_[has_target].drop(columns='prob_V1_V2')
y_1_notna = logit(df_1_['prob_V1_V2'][has_target])

In [None]:
knn_reg_imp = KNeighborsRegressor(weights='distance', n_jobs=-1)

knn_reg_imp.fit(X_1_notna, y_1_notna)

df_1_.loc[mask, 'prob_V1_V2'] = logistic(knn_reg_imp.predict(X_1_isna))

In [None]:
df_1['prob_V1_V2'] = df_1_['prob_V1_V2']

X_1_, y_1 = df_1.drop(cols_to_drop, axis=1), df_1['prob_V1_V2']
X_1 = pd.get_dummies(X_1_, drop_first=True)

In [None]:
X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(X_1,
                                                            logit(y_1),
                                                            test_size=0.4,
                                                            random_state=42)

In [None]:
lr_1_model = LinearRegression().fit(X_1_train, y_1_train)

In [None]:
train_loss = cross_entropy_loss(y_1_train, lr_1_model.predict(X_1_train))
test_loss = cross_entropy_loss(y_1_test, lr_1_model.predict(X_1_test))

print('lr_1 train loss:', train_loss)
print('lr_1 test loss:', test_loss)

In [None]:
ind_df_2 = ind_df.copy(deep=True)

In [None]:
ind_df_2

In [None]:
ind_df_2.isna().mean().sort_values()

In [None]:
# Correlate the numeric columns only: pandas >= 2.0 no longer drops the
# object columns (estado_civil, transporte_mais_utilizado) silently and raises instead.
ind_df_2.select_dtypes(include='number').corr().round(3)

In [None]:
ind_df_2['qt_filhos'].value_counts(normalize=True)

In [None]:
ind_df_2.loc[ind_df_2['qt_filhos'] > 1, 'qt_filhos'] = 2
ind_df_2['qt_filhos'].value_counts(normalize=True)

In [None]:
ind_df_2['estuda'].value_counts(normalize=True)

In [None]:
ind_df_2['trabalha'].value_counts(normalize=True)

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(16, 4), sharey=True)
sns.kdeplot(x='idade', hue='qt_filhos', data=ind_df_2, ax=ax[0])
sns.kdeplot(x='idade', hue='estuda', data=ind_df_2, ax=ax[1])
sns.kdeplot(x='idade', hue='trabalha', data=ind_df_2, ax=ax[2]);

In [None]:
# Bin age into 15 quantile groups labelled 1..15.
ind_df_2['idade_quantis'] = pd.qcut(ind_df_2['idade'], q=15, labels=range(1, 16))


def _pct_by_age_bin(column):
    """Share of each `column` value inside every age bin of ind_df_2, as a tidy frame with a 'pct' column."""
    shares = ind_df_2.groupby('idade_quantis')[column].value_counts(normalize=True)
    return shares.rename('pct').reset_index()


filhos_df, estuda_df, trabalha_df = (_pct_by_age_bin(col)
                                     for col in ('qt_filhos', 'estuda', 'trabalha'))

# One bar panel per variable, sharing the y axis.
fig, ax = plt.subplots(1, 3, figsize=(16, 4), sharey=True)
for axis, hue_col, frame in zip(ax,
                                ('qt_filhos', 'estuda', 'trabalha'),
                                (filhos_df, estuda_df, trabalha_df)):
    sns.barplot(x='idade_quantis', y='pct', hue=hue_col, data=frame, ax=axis)

# Only the left-most panel keeps its y label.
ax[1].set_ylabel('')
ax[2].set_ylabel('');

In [None]:
ind_df_2.loc[ind_df_2['idade_quantis'] == 1, 'idade'].describe().T

In [None]:
ind_df_2_ = ind_df_2.query('idade_quantis > 1').copy(deep=True)

In [None]:
# Correlate the numeric columns only: pandas >= 2.0 raises on the object
# columns instead of dropping them silently.
ind_df_2_.select_dtypes(include='number').corr().round(3)

In [None]:
def _pct_within(group_col, value_col):
    """Share of each `value_col` value inside every `group_col` group of ind_df_2_, as a tidy frame with a 'pct' column."""
    shares = ind_df_2_.groupby(group_col)[value_col].value_counts(normalize=True)
    return shares.rename('pct').reset_index()


hue_cols = ['qt_filhos', 'estuda', 'trabalha']
filhos_1, estuda_1, trabalha_1 = [_pct_within('estado_civil', h) for h in hue_cols]
filhos_2, estuda_2, trabalha_2 = [_pct_within('transporte_mais_utilizado', h) for h in hue_cols]

fig, ax = plt.subplots(3, 2, figsize=(9, 9), sharex=True)
fig.tight_layout(w_pad=4)

# One row per variable: left panel grouped by marital status, right panel
# grouped by transport.
by_civil = [filhos_1, estuda_1, trabalha_1]
by_transport = [filhos_2, estuda_2, trabalha_2]
for row, hue_col in enumerate(hue_cols):
    left, right = ax[row]

    sns.barplot(x='estado_civil', y='pct', hue=hue_col, data=by_civil[row], ax=left)
    if row < 2:
        # Only the bottom row keeps its x label.
        left.set_xlabel('')
    # The legend is drawn once per row, on the right-hand panel.
    left.get_legend().remove()

    sns.barplot(x='transporte_mais_utilizado', y='pct', hue=hue_col, data=by_transport[row], ax=right)
    if row < 2:
        right.set_xlabel('')
    right.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0, title=hue_col)

In [None]:
filhos_pct_df = ind_df_2_['qt_filhos'].value_counts(normalize=True)
filhos_pct_df

In [None]:
# Missing number of children: 0 for the youngest age quantile, otherwise the
# expected value under the distribution in filhos_pct_df (sum of value * share).
mask = ind_df_2['qt_filhos'].isna()
ind_df_2.loc[mask & (ind_df_2['idade_quantis'] == 1), 'qt_filhos'] = 0
# Assign the filled column back: under pandas Copy-on-Write a chained
# `df[col].fillna(..., inplace=True)` would leave ind_df_2 unchanged.
ind_df_2['qt_filhos'] = ind_df_2['qt_filhos'].fillna(sum(filhos_pct_df.index*filhos_pct_df.values))

In [None]:
estuda_pct_df = ind_df_2_['estuda'].value_counts(normalize=True)
estuda_pct_df

In [None]:
# Missing `estuda`: 1 for the youngest age quantile, otherwise the expected
# value under the distribution in estuda_pct_df (sum of value * share).
mask = ind_df_2['estuda'].isna()
ind_df_2.loc[mask & (ind_df_2['idade_quantis'] == 1), 'estuda'] = 1
# Assign the filled column back: under pandas Copy-on-Write a chained
# `df[col].fillna(..., inplace=True)` would leave ind_df_2 unchanged.
ind_df_2['estuda'] = ind_df_2['estuda'].fillna(sum(estuda_pct_df.index*estuda_pct_df.values))

In [None]:
trabalha_pct_df = ind_df_2_['trabalha'].value_counts(normalize=True)
trabalha_pct_df

In [None]:
# Ids of everyone who appears on either end of a 'trabalho' connection.
names_V1 = conn_df.loc[conn_df['grau'] == 'trabalho', 'V1'].unique()
names_V2 = conn_df.loc[conn_df['grau'] == 'trabalho', 'V2'].unique()
names = np.append(names_V1, names_V2)

mask = ind_df_2['trabalha'].isna()
# Match on the `name` column, not on the DataFrame index: the index is a
# 0-based RangeIndex while V1/V2 hold the 1-based person ids stored in `name`,
# so an index lookup would flag the wrong (off-by-one) rows.
ind_df_2.loc[mask & ind_df_2['name'].isin(names), 'trabalha'] = 1
# Applied second, so the youngest age quantile is set to 0 even when the
# previous rule matched.
ind_df_2.loc[mask & (ind_df_2['idade_quantis'] == 1), 'trabalha'] = 0
# Remaining gaps get the expected value under trabalha_pct_df. The result is
# assigned back because a chained `fillna(..., inplace=True)` does not update
# ind_df_2 under pandas Copy-on-Write.
ind_df_2['trabalha'] = ind_df_2['trabalha'].fillna(sum(trabalha_pct_df.index*trabalha_pct_df.values))

In [None]:
ind_df_2_['IMC_quantis'] = pd.qcut(ind_df_2_['IMC'], q=10, labels=range(1, 11))

In [None]:
# 3x3 grid: how the transport mode is distributed against every other variable.
fig, ax = plt.subplots(3, 3, figsize=(13, 11))
fig.tight_layout(h_pad=3, w_pad=4)

# Top-left panel: age density per transport mode.
sns.kdeplot(x='idade', hue='transporte_mais_utilizado', data=ind_df_2, ax=ax[0, 0])

# Panels 1..6 (row-major): share of each transport mode within the groups of `c`.
cols = ['idade_quantis', 'estado_civil', 'qt_filhos', 'estuda', 'trabalha', 'pratica_esportes']
for n, c in enumerate(cols, start=1):
    panel = ax[divmod(n, 3)]
    shares = ind_df_2.groupby(c)['transporte_mais_utilizado'].value_counts(normalize=True)
    tmp = shares.rename('pct').reset_index()
    g = sns.barplot(x=c, y='pct', hue='transporte_mais_utilizado', data=tmp, ax=panel)
    if c in ('qt_filhos', 'estuda', 'trabalha'):
        # These columns hold floats; show the tick labels with one decimal.
        g.set_xticklabels(labels=[f'{x:.1f}' for x in tmp[c].unique()])
    panel.get_legend().remove()

# Last two panels: IMC density (from ind_df_2) and shares per IMC quantile
# (from ind_df_2_, the frame that carries IMC_quantis).
shares = ind_df_2_.groupby('IMC_quantis')['transporte_mais_utilizado'].value_counts(normalize=True)
tmp = shares.rename('pct').reset_index()
sns.kdeplot(x='IMC', hue='transporte_mais_utilizado', data=ind_df_2, ax=ax[2, 1])
sns.barplot(x='IMC_quantis', y='pct', hue='transporte_mais_utilizado', data=tmp, ax=ax[2, 2])
ax[2, 2].get_legend().remove();

In [None]:
transp_pct_df = ind_df_2['transporte_mais_utilizado'].value_counts(normalize=True)
transp_pct_df

In [None]:
# Missing transport mode becomes its own category, 'outros'. The result is
# assigned back because a chained `fillna(..., inplace=True)` on the column
# selection does not update ind_df_2 under pandas Copy-on-Write.
ind_df_2['transporte_mais_utilizado'] = ind_df_2['transporte_mais_utilizado'].fillna('outros')

In [None]:
# 3x3 grid: how marital status is distributed against every other variable.
fig, ax = plt.subplots(3, 3, figsize=(13, 11))
fig.tight_layout(h_pad=3, w_pad=4)

# First two panels use ind_df_2: age density and shares per age quantile.
shares = ind_df_2.groupby('idade_quantis')['estado_civil'].value_counts(normalize=True)
tmp = shares.rename('pct').reset_index()

sns.kdeplot(x='idade', hue='estado_civil', data=ind_df_2, ax=ax[0, 0])
sns.barplot(x='idade_quantis', y='pct', hue='estado_civil', data=tmp, ax=ax[0, 1])

# Panels 2..6 (row-major) use ind_df_2_: share of each marital status within
# the groups of `c`.
cols = ['qt_filhos', 'estuda', 'trabalha', 'pratica_esportes', 'transporte_mais_utilizado']
for n, c in enumerate(cols, start=2):
    panel = ax[divmod(n, 3)]
    shares = ind_df_2_.groupby(c)['estado_civil'].value_counts(normalize=True)
    tmp = shares.rename('pct').reset_index()
    g = sns.barplot(x=c, y='pct', hue='estado_civil', data=tmp, ax=panel)
    if c in ('qt_filhos', 'estuda', 'trabalha'):
        # These columns hold floats; show the tick labels with one decimal.
        g.set_xticklabels(labels=[f'{x:.1f}' for x in tmp[c].unique()])
    panel.get_legend().remove()

# Last two panels: IMC density and shares per IMC quantile, both from ind_df_2_.
shares = ind_df_2_.groupby('IMC_quantis')['estado_civil'].value_counts(normalize=True)
tmp = shares.rename('pct').reset_index()
sns.kdeplot(x='IMC', hue='estado_civil', data=ind_df_2_, ax=ax[2, 1])
sns.barplot(x='IMC_quantis', y='pct', hue='estado_civil', data=tmp, ax=ax[2, 2])
ax[2, 2].get_legend().remove();

In [None]:
estcv_pct_df = ind_df_2_['estado_civil'].value_counts(normalize=True)
estcv_pct_df

In [None]:
# Missing marital status: 'solteiro' for the youngest age quantile, and a
# separate 'outros' category for everyone else.
mask = ind_df_2['estado_civil'].isna()
ind_df_2.loc[mask & (ind_df_2['idade_quantis'] == 1), 'estado_civil'] = 'solteiro'
# Assign the filled column back: under pandas Copy-on-Write a chained
# `df[col].fillna(..., inplace=True)` would leave ind_df_2 unchanged.
ind_df_2['estado_civil'] = ind_df_2['estado_civil'].fillna('outros')

In [None]:
# 3x3 grid: how sports practice is distributed against every other variable.
fig, ax = plt.subplots(3, 3, figsize=(13, 11))
fig.tight_layout(h_pad=3, w_pad=4)

# Top-left panel: age density per sports-practice value.
sns.kdeplot(x='idade', hue='pratica_esportes', data=ind_df_2, ax=ax[0, 0])

# Panels 1..6 (row-major): share of each sports-practice value within the groups of `c`.
cols = ['idade_quantis', 'estado_civil', 'qt_filhos', 'estuda', 'trabalha', 'transporte_mais_utilizado']
for n, c in enumerate(cols, start=1):
    panel = ax[divmod(n, 3)]
    shares = ind_df_2.groupby(c)['pratica_esportes'].value_counts(normalize=True)
    tmp = shares.rename('pct').reset_index()
    g = sns.barplot(x=c, y='pct', hue='pratica_esportes', data=tmp, ax=panel)
    if c in ('qt_filhos', 'estuda', 'trabalha'):
        # These columns hold floats; show the tick labels with one decimal.
        g.set_xticklabels(labels=[f'{x:.1f}' for x in tmp[c].unique()])
    panel.get_legend().remove()

# Last two panels: IMC density (from ind_df_2) and shares per IMC quantile
# (from ind_df_2_, the frame that carries IMC_quantis).
shares = ind_df_2_.groupby('IMC_quantis')['pratica_esportes'].value_counts(normalize=True)
tmp = shares.rename('pct').reset_index()
sns.kdeplot(x='IMC', hue='pratica_esportes', data=ind_df_2, ax=ax[2, 1])
sns.barplot(x='IMC_quantis', y='pct', hue='pratica_esportes', data=tmp, ax=ax[2, 2])
ax[2, 2].get_legend().remove();

In [None]:
esportes_pct_df = ind_df_2_['pratica_esportes'].value_counts(normalize=True)
esportes_pct_df

In [None]:
# Missing `pratica_esportes` gets the expected value under the distribution in
# esportes_pct_df (sum of value * share). `mask` is not used by the fill itself.
mask = ind_df_2['pratica_esportes'].isna()
# Assign the filled column back: under pandas Copy-on-Write a chained
# `df[col].fillna(..., inplace=True)` would leave ind_df_2 unchanged.
ind_df_2['pratica_esportes'] = ind_df_2['pratica_esportes'].fillna(sum(esportes_pct_df.index*esportes_pct_df.values))

In [None]:
# IMC and age: fill the gaps with the column median. The results are assigned
# back because a chained `fillna(..., inplace=True)` on a column selection does
# not update ind_df_2 under pandas Copy-on-Write.
ind_df_2['IMC'] = ind_df_2['IMC'].fillna(ind_df_2['IMC'].median(skipna=True))
ind_df_2['idade'] = ind_df_2['idade'].fillna(ind_df_2['idade'].median(skipna=True))

# The helper age-bin column is no longer needed. errors='ignore' keeps the
# cell re-runnable once the column is already gone.
ind_df_2 = ind_df_2.drop(columns='idade_quantis', errors='ignore')

In [None]:
# Same endpoint join as before, now with the rule-imputed individuals table
# ind_df_2: V2 attributes are attached first, then V1 attributes. Right joins
# keep every row of conn_df.
df_2 = (
    ind_df_2.add_suffix('_V1')
    .merge(
        ind_df_2.add_suffix('_V2').merge(conn_df,
                                         how='right',
                                         left_on='name_V2',
                                         right_on='V2'),
        how='right',
        left_on='name_V1',
        right_on='V1',
    )
    # The raw endpoint ids duplicate name_V1 / name_V2.
    .drop(columns=['V1', 'V2'])
)
df_2

In [None]:
# Dummy-encode the features of df_2, min-max scale them and keep the principal
# components needed to reach 90% explained variance.
pca = PCA(n_components=0.9, random_state=42)
scaler = MinMaxScaler()

df_2_ = pd.get_dummies(df_2.drop(columns=cols_to_drop), drop_first=True)
df_2_ = scaler.fit_transform(df_2_)
df_2_ = pca.fit_transform(df_2_)

# Label the components pc_1..pc_k and rescale the scores; this second
# fit_transform refits `scaler` on the component scores.
pca_comps = ['pc_{}'.format(k) for k in range(1, df_2_.shape[1] + 1)]
df_2_ = pd.DataFrame(scaler.fit_transform(df_2_), columns=pca_comps)

# Cumulative explained-variance ratio of the retained components.
pd.Series(pca.explained_variance_ratio_, index=pca_comps).cumsum()

In [None]:
df_2_ = df_2_.loc[:, :pca_comps[-1]]
df_2_['prob_V1_V2'] = df_2['prob_V1_V2']

df_2_

In [None]:
df_2_.describe().T

In [None]:
# Separate the connections whose probability is missing from the observed ones.
mask = df_2_['prob_V1_V2'].isna()
has_target = ~mask

# Component features for both groups; the observed targets go to the logit scale.
X_2_isna = df_2_[mask].drop(columns='prob_V1_V2')
X_2_notna = df_2_[has_target].drop(columns='prob_V1_V2')
y_2_notna = logit(df_2_['prob_V1_V2'][has_target])

In [None]:
# %%time

# knn_reg_imp = KNeighborsRegressor(weights='distance', n_jobs=-1)

# knn_reg_imp.fit(X_2_notna, y_2_notna)

# df_2_.loc[mask, 'prob_V1_V2'] = logistic(knn_reg_imp.predict(X_2_isna))

In [None]:
df_2['prob_V1_V2'] = df_2_['prob_V1_V2']

X_2_, y_2 = df_2.drop(cols_to_drop, axis=1), df_2['prob_V1_V2']
X_2 = pd.get_dummies(X_2_, drop_first=True)

In [None]:
# 60/40 train/test split with a fixed seed; the target is on the logit scale.
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(
    X_2, logit(y_2), test_size=0.4, random_state=42)

# The fit below is disabled; a previously saved estimator is loaded instead.
# lr_2_model = LinearRegression().fit(X_2_train, y_2_train)
# NOTE(review): no visible cell writes 'lr_2_model.pkl', so this cell fails on
# a fresh checkout without that file. joblib.load unpickles the file, which
# can execute arbitrary code; only load artifacts from a trusted source.
lr_2_model = joblib.load('lr_2_model.pkl')

In [None]:
train_loss = cross_entropy_loss(y_2_train, lr_2_model.predict(X_2_train))
test_loss = cross_entropy_loss(y_2_test, lr_2_model.predict(X_2_test))

print('lr_2 train loss:', train_loss)
print('lr_2 test loss:', test_loss)

In [None]:
# LightGBM handles missing feature values itself, so the un-imputed merged
# data is used. Only connections with an observed probability can supervise
# the model, though: rows with a missing prob_V1_V2 would yield NaN targets
# after logit(), so they are removed before encoding and splitting.
df_raw_ = pd.get_dummies(
    df_raw.dropna(subset=['prob_V1_V2']).drop(['name_V1', 'name_V2'], axis=1),
    drop_first=True)

# 60/40 train/test split with a fixed seed; the target is on the logit scale.
X_3_train, X_3_test, y_3_train, y_3_test = train_test_split(df_raw_.drop('prob_V1_V2', axis=1),
                                                            logit(df_raw_['prob_V1_V2']),
                                                            test_size=0.4,
                                                            random_state=42)

In [None]:
lgbm_reg = lgb.LGBMRegressor(random_state=42)
lgbm_reg.fit(X_3_train, y_3_train);

In [None]:
train_loss = cross_entropy_loss(y_3_train, lgbm_reg.predict(X_3_train))
test_loss = cross_entropy_loss(y_3_test, lgbm_reg.predict(X_3_test))

print('lgbr train loss:', train_loss)
print('lgbr test loss:', test_loss)