In [599]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder

In [65]:
conn_df = pd.read_csv('conexoes_espec.csv', sep=';')

In [66]:
conn_df.head(10)

Unnamed: 0,V1,V2,grau,proximidade,prob_V1_V2
0,1,2,trabalho,visita_frequente,0.589462
1,1,3,trabalho,visita_rara,0.708465
2,2,4,trabalho,visita_casual,
3,2,5,trabalho,visita_rara,0.638842
4,3,6,amigos,mora_junto,
5,3,7,familia,visita_casual,0.709608
6,4,8,familia,mora_junto,
7,4,9,amigos,visita_casual,0.465209
8,5,10,trabalho,visita_frequente,0.658706
9,5,11,trabalho,visita_casual,


In [67]:
conn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999999 entries, 0 to 999998
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   V1           999999 non-null  int64  
 1   V2           999999 non-null  int64  
 2   grau         999999 non-null  object 
 3   proximidade  999999 non-null  object 
 4   prob_V1_V2   499999 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 38.1+ MB


In [68]:
conn_df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
V1,999999.0,,,,250000.25,144337.495129,1.0,125000.5,250000.0,375000.0,500000.0
V2,999999.0,,,,500001.0,288674.990255,2.0,250001.5,500001.0,750000.5,1000000.0
grau,999999.0,3.0,trabalho,333548.0,,,,,,,
proximidade,999999.0,4.0,visita_rara,400238.0,,,,,,,
prob_V1_V2,499999.0,,,,0.485162,0.173963,0.074462,0.351677,0.486413,0.61821,0.942245


In [69]:
conn_df['grau'].unique()

array(['trabalho', 'amigos', 'familia'], dtype=object)

In [70]:
conn_df['proximidade'].unique()

array(['visita_frequente', 'visita_rara', 'visita_casual', 'mora_junto'],
      dtype=object)

In [71]:
# rev_conn_df = conn_df[['V2', 'V1', 'grau', 'proximidade']].copy(deep=True).rename(columns={'V1': 'V2', 'V2': 'V1'})
# rev_conn_df['prob_V1_V2'] = np.nan

# conn_df['is_collected'] = True
# rev_conn_df['is_collected'] = False

# conn_df = conn_df.append(rev_conn_df, ignore_index=True) 

In [72]:
conn_df

Unnamed: 0,V1,V2,grau,proximidade,prob_V1_V2
0,1,2,trabalho,visita_frequente,0.589462
1,1,3,trabalho,visita_rara,0.708465
2,2,4,trabalho,visita_casual,
3,2,5,trabalho,visita_rara,0.638842
4,3,6,amigos,mora_junto,
...,...,...,...,...,...
999994,499998,999996,trabalho,visita_rara,
999995,499998,999997,trabalho,visita_rara,
999996,499999,999998,familia,visita_casual,0.451662
999997,499999,999999,familia,visita_rara,0.186973


In [73]:
ind_df = pd.read_csv('individuos_espec.csv', sep=';')

In [74]:
ind_df.head()

Unnamed: 0,name,idade,estado_civil,qt_filhos,estuda,trabalha,pratica_esportes,transporte_mais_utilizado,IMC
0,1,44.0,divorciado,1.0,1.0,0.0,1.0,publico,22.200956
1,2,24.0,casado,0.0,0.0,0.0,1.0,publico,25.37872
2,3,35.0,solteiro,1.0,0.0,0.0,1.0,particular,19.952393
3,4,50.0,casado,1.0,1.0,1.0,0.0,publico,26.732053
4,5,30.0,solteiro,2.0,1.0,0.0,1.0,publico,15.295668


In [75]:
ind_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 9 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   name                       1000000 non-null  int64  
 1   idade                      904063 non-null   float64
 2   estado_civil               949927 non-null   object 
 3   qt_filhos                  971133 non-null   float64
 4   estuda                     959870 non-null   float64
 5   trabalha                   993647 non-null   float64
 6   pratica_esportes           850876 non-null   float64
 7   transporte_mais_utilizado  956967 non-null   object 
 8   IMC                        886130 non-null   float64
dtypes: float64(6), int64(1), object(2)
memory usage: 68.7+ MB


In [76]:
ind_df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
name,1000000.0,,,,500000.5,288675.278933,1.0,250000.75,500000.5,750000.25,1000000.0
idade,904063.0,,,,30.008431,10.951566,0.0,22.0,29.0,37.0,124.0
estado_civil,949927.0,4.0,solteiro,420637.0,,,,,,,
qt_filhos,971133.0,,,,0.928406,0.997289,0.0,0.0,1.0,1.0,9.0
estuda,959870.0,,,,0.442588,0.496693,0.0,0.0,0.0,1.0,1.0
trabalha,993647.0,,,,0.556739,0.496771,0.0,0.0,1.0,1.0,1.0
pratica_esportes,850876.0,,,,0.599823,0.489934,0.0,0.0,1.0,1.0,1.0
transporte_mais_utilizado,956967.0,3.0,publico,574779.0,,,,,,,
IMC,886130.0,,,,22.472079,7.073692,4.852828,17.420851,21.4312,26.378197,89.158204


In [77]:
ind_df['estado_civil'].unique()

array(['divorciado', 'casado', 'solteiro', nan, 'viuvo'], dtype=object)

In [78]:
ind_df['transporte_mais_utilizado'].unique()

array(['publico', 'particular', 'taxi', nan], dtype=object)

In [79]:
df_raw = pd.merge(ind_df.rename(columns=lambda x: x + '_V2'),
                  conn_df, 
                  how='right',
                  left_on='name_V2', 
                  right_on='V2')
df_raw = pd.merge(ind_df.rename(columns=lambda x: x + '_V1'),
                  df_raw, 
                  how='right',
                  left_on='name_V1', 
                  right_on='V1')
df_raw = df_raw.drop(['V1', 'V2'], axis=1)
df_raw

Unnamed: 0,name_V1,idade_V1,estado_civil_V1,qt_filhos_V1,estuda_V1,trabalha_V1,pratica_esportes_V1,transporte_mais_utilizado_V1,IMC_V1,name_V2,...,estado_civil_V2,qt_filhos_V2,estuda_V2,trabalha_V2,pratica_esportes_V2,transporte_mais_utilizado_V2,IMC_V2,grau,proximidade,prob_V1_V2
0,1,44.0,divorciado,1.0,1.0,0.0,1.0,publico,22.200956,2,...,casado,0.0,0.0,0.0,1.0,publico,25.378720,trabalho,visita_frequente,0.589462
1,1,44.0,divorciado,1.0,1.0,0.0,1.0,publico,22.200956,3,...,solteiro,1.0,0.0,0.0,1.0,particular,19.952393,trabalho,visita_rara,0.708465
2,2,24.0,casado,0.0,0.0,0.0,1.0,publico,25.378720,4,...,casado,1.0,1.0,1.0,0.0,publico,26.732053,trabalho,visita_casual,
3,2,24.0,casado,0.0,0.0,0.0,1.0,publico,25.378720,5,...,solteiro,2.0,1.0,0.0,1.0,publico,15.295668,trabalho,visita_rara,0.638842
4,3,35.0,solteiro,1.0,0.0,0.0,1.0,particular,19.952393,6,...,,1.0,0.0,1.0,0.0,publico,20.412942,amigos,mora_junto,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999994,499998,23.0,casado,0.0,1.0,1.0,0.0,publico,22.036885,999996,...,casado,1.0,0.0,1.0,1.0,publico,17.556771,trabalho,visita_rara,
999995,499998,23.0,casado,0.0,1.0,1.0,0.0,publico,22.036885,999997,...,casado,1.0,0.0,1.0,1.0,particular,24.848402,trabalho,visita_rara,
999996,499999,26.0,casado,3.0,0.0,0.0,0.0,taxi,19.543889,999998,...,divorciado,1.0,0.0,0.0,1.0,publico,,familia,visita_casual,0.451662
999997,499999,26.0,casado,3.0,0.0,0.0,0.0,taxi,19.543889,999999,...,casado,0.0,1.0,1.0,1.0,publico,16.979569,familia,visita_rara,0.186973


In [80]:
def logit(p):
    return np.log(p / (1-p))

def logistic(y):
    ey = np.exp(y)
    return ey / (1+ey)

def cross_entropy_loss(y_true, y_pred):
    p, q = logistic(y_true), logistic(y_pred)
    return -(p*np.log(q) + (1-p)*np.log(1-q)).mean()

In [152]:
categorical_cols = ['estado_civil_V1', 'estado_civil_V2',
                    'transporte_mais_utilizado_V1', 'transporte_mais_utilizado_V2',
                    'grau', 'proximidade']

boolean_cols = ['estuda_V1', 'trabalha_V1', 'pratica_esportes_V1',
                'estuda_V2', 'trabalha_V2', 'pratica_esportes_V2']

numerical_cols = ['idade_V1', 'qt_filhos_V1', 'IMC_V1',
                  'idade_V2', 'qt_filhos_V2', 'IMC_V2']

cols_to_drop = ['name_V1', 'name_V2', 'prob_V1_V2']

In [82]:
df_bl = df_raw.copy(deep=True)

df_bl['prob_V1_V2'].fillna(df_bl['prob_V1_V2'].median(), inplace=True)

In [83]:
X_bl = df_bl.drop(cols_to_drop, axis=1)
y_bl = df_bl['prob_V1_V2']

X_bl_train_, X_bl_test_, y_bl_train, y_bl_test = train_test_split(X_bl,
                                                                  logit(y_bl),
                                                                  test_size=0.4,
                                                                  random_state=42)

In [84]:
X_bl_train, X_bl_test = X_bl_train_.copy(deep=True), X_bl_test_.copy(deep=True)

mode_imp = SimpleImputer(strategy='most_frequent')
mean_imp = SimpleImputer(strategy='mean')
median_imp = SimpleImputer(strategy='median')

X_bl_train[categorical_cols] = mode_imp.fit_transform(X_bl_train_[categorical_cols])
X_bl_train[boolean_cols] = mean_imp.fit_transform(X_bl_train_[boolean_cols])
X_bl_train[numerical_cols] = median_imp.fit_transform(X_bl_train_[numerical_cols])

X_bl_test[categorical_cols] = mode_imp.transform(X_bl_test_[categorical_cols])
X_bl_test[boolean_cols] = mean_imp.transform(X_bl_test_[boolean_cols])
X_bl_test[numerical_cols] = median_imp.transform(X_bl_test_[numerical_cols])

In [85]:
# prox_ord = ['visita_rara', 'visita_casual', 'visita_frequente', 'mora_junto']
# transp_ord = ['particular', 'taxi', 'publico']
# ord_enc = OrdinalEncoder(categories=[transp_ord, transp_ord, prox_ord])

# X_bl_train[ordinal_cat_cols] = ord_enc.fit_transform(X_bl_train[ordinal_cat_cols])
# X_bl_test[ordinal_cat_cols] = ord_enc.transform(X_bl_test[ordinal_cat_cols])

In [87]:
one_enc = OneHotEncoder(drop='first', sparse=False)
one_enc.fit(X_bl_train[categorical_cols])

one_hot_cols = one_enc.get_feature_names(categorical_cols)

X_bl_train[one_hot_cols] = one_enc.transform(X_bl_train[categorical_cols])
X_bl_train.drop(categorical_cols, axis=1, inplace=True)

X_bl_test[one_hot_cols] = one_enc.transform(X_bl_test[categorical_cols])
X_bl_test.drop(categorical_cols, axis=1, inplace=True)

In [89]:
bl_model = LinearRegression().fit(X_bl_train, y_bl_train)

In [343]:
train_loss = cross_entropy_loss(y_bl_train, bl_model.predict(X_bl_train))
test_loss = cross_entropy_loss(y_bl_test, bl_model.predict(X_bl_test))

print('baseline train loss:', train_loss)
print('baseline test loss:', test_loss)

baseline train loss: 0.684272325646911
baseline test loss: 0.6842074405738243


In [402]:
df_1 = df_raw.copy(deep=True).sample(frac=0.1, random_state=42)

In [403]:
df_1

Unnamed: 0,name_V1,idade_V1,estado_civil_V1,qt_filhos_V1,estuda_V1,trabalha_V1,pratica_esportes_V1,transporte_mais_utilizado_V1,IMC_V1,name_V2,...,estado_civil_V2,qt_filhos_V2,estuda_V2,trabalha_V2,pratica_esportes_V2,transporte_mais_utilizado_V2,IMC_V2,grau,proximidade,prob_V1_V2
987230,493616,48.0,solteiro,0.0,1.0,0.0,,publico,,987232,...,divorciado,0.0,1.0,1.0,0.0,publico,23.629884,familia,visita_frequente,
79954,39978,33.0,casado,1.0,0.0,0.0,1.0,particular,36.497011,79956,...,viuvo,1.0,0.0,0.0,1.0,publico,15.739333,amigos,visita_frequente,
567130,283566,24.0,solteiro,1.0,0.0,0.0,1.0,particular,21.733414,567132,...,divorciado,1.0,0.0,0.0,1.0,publico,37.438250,trabalho,visita_casual,0.470959
500891,250446,,solteiro,0.0,1.0,0.0,1.0,particular,11.278218,500893,...,divorciado,4.0,0.0,1.0,0.0,publico,12.541784,familia,visita_casual,0.639066
55399,27700,31.0,solteiro,0.0,,1.0,1.0,particular,13.226648,55401,...,solteiro,1.0,1.0,1.0,0.0,publico,22.814826,trabalho,visita_rara,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395942,197972,25.0,divorciado,2.0,0.0,1.0,,publico,16.327677,395944,...,casado,1.0,0.0,0.0,1.0,particular,,familia,visita_rara,0.297126
417771,208886,24.0,solteiro,1.0,0.0,1.0,1.0,taxi,19.201943,417773,...,solteiro,1.0,1.0,1.0,1.0,particular,20.485265,familia,visita_rara,0.301117
713259,356630,28.0,solteiro,2.0,1.0,1.0,1.0,particular,15.976760,713261,...,divorciado,,1.0,1.0,1.0,particular,17.146773,amigos,visita_rara,0.400940
794021,397011,36.0,casado,0.0,0.0,1.0,0.0,particular,18.807305,794023,...,casado,1.0,0.0,0.0,1.0,particular,25.896600,amigos,visita_rara,


In [404]:
X_1 = df_1.drop(cols_to_drop, axis=1)
y_1 = df_1['prob_V1_V2']

X_1_train_, X_1_test_, y_1_train, y_1_test = train_test_split(X_1,
                                                              logit(y_1),
                                                              test_size=0.2,
                                                              random_state=42)

In [405]:
X_1_train, X_1_test = X_1_train_.copy(deep=True), X_1_test_.copy(deep=True)

mode_imp = SimpleImputer(strategy='most_frequent')
mean_imp = SimpleImputer(strategy='mean')
median_imp = SimpleImputer(strategy='median')

X_1_train[categorical_cols] = mode_imp.fit_transform(X_1_train_[categorical_cols])
X_1_train[boolean_cols] = mean_imp.fit_transform(X_1_train_[boolean_cols])
X_1_train[numerical_cols] = median_imp.fit_transform(X_1_train_[numerical_cols])

X_1_test[categorical_cols] = mode_imp.transform(X_1_test_[categorical_cols])
X_1_test[boolean_cols] = mean_imp.transform(X_1_test_[boolean_cols])
X_1_test[numerical_cols] = median_imp.transform(X_1_test_[numerical_cols])

In [406]:
one_enc = OneHotEncoder(drop='first', sparse=False)
one_enc.fit(X_1_train[categorical_cols])

one_hot_cols = one_enc.get_feature_names(categorical_cols)

X_1_train[one_hot_cols] = one_enc.transform(X_1_train[categorical_cols])
X_1_train.drop(categorical_cols, axis=1, inplace=True)

X_1_test[one_hot_cols] = one_enc.transform(X_1_test[categorical_cols])
X_1_test.drop(categorical_cols, axis=1, inplace=True)

In [407]:
X_1_train['is_train'], X_1_test['is_train'] = True, False

X_1 = X_1_train.append(X_1_test)
y_1 = y_1_train.append(y_1_test)

In [408]:
scaler = MinMaxScaler()
pca = PCA(n_components=0.9, random_state=42)

X_1_ = pca.fit_transform(scaler.fit_transform(X_1.drop('is_train', axis=1)))
pca_comps = [f'pc_{n+1}' for n in range(X_1_.shape[1])]
X_1_ = pd.DataFrame(scaler.fit_transform(X_1_), index=X_1.index, columns=pca_comps)

pd.Series(pca.explained_variance_ratio_, index=pca_comps).cumsum()

In [409]:
X_1_ = X_1_.loc[:, :'pc_16']
df_1_ = pd.concat([X_1_, y_1.rename('logit_prob_V1_V2')], axis=1)

df_1_

Unnamed: 0,idade_V1,qt_filhos_V1,estuda_V1,trabalha_V1,pratica_esportes_V1,IMC_V1,idade_V2,qt_filhos_V2,estuda_V2,trabalha_V2,...,transporte_mais_utilizado_V1_publico,transporte_mais_utilizado_V1_taxi,transporte_mais_utilizado_V2_publico,transporte_mais_utilizado_V2_taxi,grau_familia,grau_trabalho,proximidade_visita_casual,proximidade_visita_frequente,proximidade_visita_rara,logit_prob_V1_V2
868974,0.435644,0.000000,1.0,1.0,1.0,0.070383,0.329545,0.285714,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,
411690,0.267327,0.000000,0.0,1.0,1.0,0.221980,0.397727,0.142857,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.321389
616280,0.287129,0.142857,0.0,0.0,1.0,0.193617,0.215909,0.000000,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.234459
267494,0.356436,0.000000,1.0,1.0,1.0,0.112285,0.181818,0.142857,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.642401
420509,0.287129,0.428571,0.0,1.0,1.0,0.152509,0.306818,0.000000,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302463,0.297030,0.000000,1.0,0.0,0.0,0.281490,0.125000,0.000000,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,
666057,0.415842,0.000000,0.0,1.0,1.0,0.059846,0.306818,0.000000,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.383085
252899,0.326733,0.142857,0.0,0.0,1.0,0.126751,0.215909,0.285714,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,
612256,0.405941,0.142857,1.0,1.0,0.0,0.085908,0.272727,0.142857,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,-0.559819


In [410]:
df_1_.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
idade_V1,100000.0,0.276455,0.10368,0.0,0.207921,0.267327,0.336634,1.0
qt_filhos_V1,100000.0,0.133207,0.140252,0.0,0.0,0.142857,0.142857,1.0
estuda_V1,100000.0,0.445127,0.487042,0.0,0.0,0.0,1.0,1.0
trabalha_V1,100000.0,0.558133,0.495051,0.0,0.0,1.0,1.0,1.0
pratica_esportes_V1,100000.0,0.598143,0.451936,0.0,0.0,1.0,1.0,1.0
IMC_V1,100000.0,0.205185,0.082055,0.0,0.151108,0.193617,0.245065,1.0
idade_V2,100000.0,0.317138,0.118605,0.0,0.238636,0.306818,0.386364,1.0
qt_filhos_V2,100000.0,0.133146,0.140926,0.0,0.0,0.142857,0.142857,1.0
estuda_V2,100000.0,0.442484,0.48682,0.0,0.0,0.0,1.0,1.0
trabalha_V2,100000.0,0.555347,0.495363,0.0,0.0,1.0,1.0,1.0


In [411]:
knn_imp = KNNImputer(weights='distance')

y_1_ = knn_imp.fit_transform(df_1_)[:, -1]

In [412]:
df_1 = pd.concat([X_1, pd.Series(y_1_, index=X_1.index, name='logit_prob_V1_V2')], axis=1)

df_1

Unnamed: 0,idade_V1,qt_filhos_V1,estuda_V1,trabalha_V1,pratica_esportes_V1,IMC_V1,idade_V2,qt_filhos_V2,estuda_V2,trabalha_V2,...,transporte_mais_utilizado_V1_taxi,transporte_mais_utilizado_V2_publico,transporte_mais_utilizado_V2_taxi,grau_familia,grau_trabalho,proximidade_visita_casual,proximidade_visita_frequente,proximidade_visita_rara,is_train,logit_prob_V1_V2
868974,46.0,0.0,1.0,1.0,1.0,11.411365,31.0,2.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,True,0.082141
411690,29.0,0.0,0.0,1.0,1.0,23.690345,37.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,True,0.321389
616280,31.0,1.0,0.0,0.0,1.0,21.393032,21.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,True,1.234459
267494,38.0,0.0,1.0,1.0,1.0,14.805366,18.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,True,-0.642401
420509,31.0,3.0,0.0,1.0,1.0,18.063386,29.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,True,0.444466
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302463,32.0,0.0,1.0,0.0,0.0,28.510499,13.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,False,1.120099
666057,44.0,0.0,0.0,1.0,1.0,10.557943,29.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,False,-0.383085
252899,35.0,1.0,0.0,0.0,1.0,15.977035,21.0,2.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,False,-0.045296
612256,43.0,1.0,1.0,1.0,0.0,12.668878,26.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,False,-0.559819


In [413]:
df_1_train = df_1.loc[df_1['is_train']].drop('is_train', axis=1)
df_1_test = df_1.loc[~df_1['is_train']].drop('is_train', axis=1)

X_1_train = df_1_train.drop('logit_prob_V1_V2', axis=1)
y_1_train = df_1_train['logit_prob_V1_V2']
X_1_test = df_1_test.drop('logit_prob_V1_V2', axis=1)
y_1_test = df_1_test['logit_prob_V1_V2']

In [414]:
lr_1_model = LinearRegression().fit(X_1_train, y_1_train)

In [399]:
train_loss = cross_entropy_loss(y_1_train, lr_1_model.predict(X_1_train))
test_loss = cross_entropy_loss(y_1_test, lr_1_model.predict(X_1_test))

print('lr_1 train loss:', train_loss)
print('lr_1 test loss:', test_loss)

lr_1 train loss: 0.6598879658096841
lr_1 test loss: 0.6601786703950473


In [459]:
df_2 = df_raw.copy(deep=True)

In [460]:
X_2 = df_2.drop(cols_to_drop, axis=1)
y_2 = df_2['prob_V1_V2']

X_2_train_, X_2_test_, y_2_train, y_2_test = train_test_split(X_2,
                                                              logit(y_2),
                                                              test_size=0.4,
                                                              random_state=42)

In [477]:
X_2_train_.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 599999 entries, 333136 to 121958
Data columns (total 18 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   idade_V1                      542379 non-null  float64
 1   estado_civil_V1               569824 non-null  object 
 2   qt_filhos_V1                  582618 non-null  float64
 3   estuda_V1                     576154 non-null  float64
 4   trabalha_V1                   596134 non-null  float64
 5   pratica_esportes_V1           510554 non-null  float64
 6   transporte_mais_utilizado_V1  574015 non-null  object 
 7   IMC_V1                        531809 non-null  float64
 8   idade_V2                      542260 non-null  float64
 9   estado_civil_V2               570197 non-null  object 
 10  qt_filhos_V2                  582612 non-null  float64
 11  estuda_V2                     575694 non-null  float64
 12  trabalha_V2                   596174 no

In [502]:
names_1 = df_2.loc[X_2_train_.index, 'name_V1'].unique()
names_2 = df_2.loc[X_2_train_.index, 'name_V2'].unique()
names = np.append(names_1, names_2)

In [604]:
ind_df_2 = ind_df.loc[ind_df['name'].isin(names)]
ind_df_2

Unnamed: 0,name,idade,estado_civil,qt_filhos,estuda,trabalha,pratica_esportes,transporte_mais_utilizado,IMC
0,1,44.0,divorciado,1.0,1.0,0.0,1.0,publico,22.200956
1,2,24.0,casado,0.0,0.0,0.0,1.0,publico,25.378720
2,3,35.0,solteiro,1.0,0.0,0.0,1.0,particular,19.952393
3,4,50.0,casado,1.0,1.0,1.0,0.0,publico,26.732053
4,5,30.0,solteiro,2.0,1.0,0.0,1.0,publico,15.295668
...,...,...,...,...,...,...,...,...,...
999994,999995,33.0,casado,0.0,0.0,0.0,,particular,16.982246
999995,999996,34.0,casado,1.0,0.0,1.0,1.0,publico,17.556771
999996,999997,40.0,casado,1.0,0.0,1.0,1.0,particular,24.848402
999998,999999,33.0,casado,0.0,1.0,1.0,1.0,publico,16.979569


In [605]:
ind_df_2.isna().mean().sort_values()

name                         0.000000
trabalha                     0.006343
qt_filhos                    0.028885
estuda                       0.040348
transporte_mais_utilizado    0.043163
estado_civil                 0.050010
idade                        0.096145
IMC                          0.114082
pratica_esportes             0.149260
dtype: float64

In [625]:
df_imp = ind_df_2.drop('name', axis=1)
df_imp = df_imp.dropna(subset=df_imp.columns.drop('trabalha'), axis=0)

df_imp = pd.get_dummies(df_imp.dropna(subset=['trabalha'], axis=0), drop_first=True)
X_train_, X_test_, y_train, y_test = train_test_split(df_imp.drop('trabalha', axis=1),
                                                      df_imp_train['trabalha'],
                                                      test_size=0.4,
                                                      random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_)
X_test = scaler.transform(X_test_)

lr = LogisticRegressionCV(penalty='elasticnet',
                          solver='saga',
                          n_jobs=-1,
                          random_state=42,
                          l1_ratios=[1])
lr.fit(X_train, y_train);

In [626]:
pd.Series(lr.coef_.squeeze(), index=X_train_.columns)

idade                                0.484991
qt_filhos                            0.000000
estuda                              -0.276310
pratica_esportes                     0.000000
IMC                                  0.000000
estado_civil_divorciado              0.000000
estado_civil_solteiro               -0.271118
estado_civil_viuvo                   0.000000
transporte_mais_utilizado_publico    0.000000
transporte_mais_utilizado_taxi       0.000000
dtype: float64

In [627]:
train_score = lr.score(X_train, y_train)
test_score = lr.score(X_test, y_test)

print('train score:', train_score)
print('test score:', test_score)

train score: 0.6012163829449032
test score: 0.6026704361598856


In [628]:
df_imp_isna = df_imp.loc[df_imp['trabalha'].isna()]
X_isna = pd.get_dummies(df_imp_test.drop('trabalha', axis=1), drop_first=True)

In [644]:
ind_df_2.loc[X_isna.index, 'trabalha'] = lr.predict(X_isna)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [645]:
ind_df_2.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
name,767902.0,445344.091246,283478.893379,1.0,205125.25,410248.5,680133.75,1000000.0
idade,694072.0,29.997822,10.94744,0.0,22.0,29.0,37.0,124.0
qt_filhos,745721.0,0.927297,0.996882,0.0,0.0,1.0,1.0,8.0
estuda,736919.0,0.44256,0.49669,0.0,0.0,0.0,1.0,1.0
trabalha,765825.0,0.558269,0.496593,0.0,0.0,1.0,1.0,1.0
pratica_esportes,653285.0,0.600055,0.489887,0.0,0.0,1.0,1.0,1.0
IMC,680298.0,22.470468,7.071299,4.852828,17.420319,21.428833,26.375108,89.158204


In [646]:
ind_df_2.loc[X_isna.index]

Unnamed: 0,name,idade,estado_civil,qt_filhos,estuda,trabalha,pratica_esportes,transporte_mais_utilizado,IMC
144,145,38.0,casado,1.0,0.0,1.0,1.0,particular,17.358752
146,147,28.0,casado,0.0,0.0,1.0,0.0,publico,21.983078
320,321,25.0,solteiro,1.0,0.0,1.0,0.0,publico,17.515253
352,353,27.0,divorciado,0.0,0.0,1.0,1.0,publico,18.204883
400,401,40.0,divorciado,1.0,1.0,1.0,1.0,publico,26.186201
...,...,...,...,...,...,...,...,...,...
996036,996037,18.0,casado,0.0,0.0,1.0,1.0,particular,21.565478
996116,996117,30.0,divorciado,3.0,0.0,1.0,1.0,particular,18.328365
997057,997058,40.0,casado,2.0,0.0,1.0,0.0,publico,13.079058
998144,998145,18.0,viuvo,0.0,0.0,1.0,0.0,particular,21.014601
