# Tratamento dos dados

In [137]:
# imports

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# normalizacao
from sklearn import preprocessing

# Balanceamento das classes
from imblearn.combine import SMOTETomek

# Modelagem
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

random_state=5007

## Import do Dataset

In [138]:
df = pd.read_csv('../data/kag_risk_factors_cervical_cancer.csv')

print(df.shape)

df.head()

(858, 36)


Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,IUD (years),STDs,STDs (number),STDs:condylomatosis,STDs:cervical condylomatosis,STDs:vaginal condylomatosis,STDs:vulvo-perineal condylomatosis,STDs:syphilis,STDs:pelvic inflammatory disease,STDs:genital herpes,STDs:molluscum contagiosum,STDs:AIDS,STDs:HIV,STDs:Hepatitis B,STDs:HPV,STDs: Number of diagnosis,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,?,?,0,0,0,0,0,0,0,0


## Pré-Processamento

In [139]:
# substituindo ? por NAN
df.replace('?', np.NAN,inplace=True)

In [140]:
# Transforma as feature em numericas
df_processed = df.apply(pd.to_numeric, errors="ignore")

# Resultado final com as devidas alteracoes
df_processed.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 36 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Age                                 858 non-null    int64  
 1   Number of sexual partners           832 non-null    float64
 2   First sexual intercourse            851 non-null    float64
 3   Num of pregnancies                  802 non-null    float64
 4   Smokes                              845 non-null    float64
 5   Smokes (years)                      845 non-null    float64
 6   Smokes (packs/year)                 845 non-null    float64
 7   Hormonal Contraceptives             750 non-null    float64
 8   Hormonal Contraceptives (years)     750 non-null    float64
 9   IUD                                 741 non-null    float64
 10  IUD (years)                         741 non-null    float64
 11  STDs                                753 non-n

## Split em Train e Test

In [141]:
# Utilizando todas as features como preditoras
X = df_processed.drop('Biopsy', axis=1)
y = df_processed['Biopsy']

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size = 0.25, random_state=random_state)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((643, 35), (215, 35), (643,), (215,))

## Eliminação (ou não) de instâncias com missing values

Para features continuas --> preenche com a mediana (para nao ter muito impacto com outliers)

Hipotese:
 - Para features categoricas --> preenche com o valor mais frequente
 
No decorrer dos experimentos validar se a hipotese é aceita ou não.

(Uma alternativa seria imputar os valores pelo moda (valor mais frequente), que provavelmente é uma solução ruim, pois a resposta verdadeira pode estar correlacionada com a probabilidade de um valor estar ausente. Pois isso teremos um bias, pois esses valores são privados e a pessoa pode escolher não divulga-los)

In [142]:
continuous_feat = ['Number of sexual partners',
                   'First sexual intercourse',
                   'Num of pregnancies', 
                   'Smokes (years)',
                   'Smokes (packs/year)',
                   'Hormonal Contraceptives (years)',
                   'IUD (years)',
                   'STDs (number)',
                   'STDs: Time since first diagnosis',
                   'STDs: Time since last diagnosis'] 

discrete_feat = ['Smokes',
                 'Hormonal Contraceptives',
                 'IUD',
                 'STDs',
                 'STDs:condylomatosis',
                 'STDs:cervical condylomatosis',
                 'STDs:vaginal condylomatosis',
                 'STDs:vulvo-perineal condylomatosis',
                 'STDs:syphilis',
                 'STDs:pelvic inflammatory disease',
                 'STDs:genital herpes',
                 'STDs:molluscum contagiosum',
                 'STDs:AIDS',
                 'STDs:HIV',
                 'STDs:Hepatitis B',
                 'STDs:HPV']                               


In [143]:
X_train_processed = X_train.copy()

In [144]:
# preenche com a mediana
imp_median = SimpleImputer(missing_values = np.nan, strategy = 'median')
X_train_processed[continuous_feat] = imp_median.fit_transform(X_train[continuous_feat])

# preenche com o valor mais frequente
imp_most_freq = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
X_train_processed[discrete_feat] = imp_most_freq.fit_transform(X_train[discrete_feat])

X_train_processed.isnull().sum()


Age                                   0
Number of sexual partners             0
First sexual intercourse              0
Num of pregnancies                    0
Smokes                                0
Smokes (years)                        0
Smokes (packs/year)                   0
Hormonal Contraceptives               0
Hormonal Contraceptives (years)       0
IUD                                   0
IUD (years)                           0
STDs                                  0
STDs (number)                         0
STDs:condylomatosis                   0
STDs:cervical condylomatosis          0
STDs:vaginal condylomatosis           0
STDs:vulvo-perineal condylomatosis    0
STDs:syphilis                         0
STDs:pelvic inflammatory disease      0
STDs:genital herpes                   0
STDs:molluscum contagiosum            0
STDs:AIDS                             0
STDs:HIV                              0
STDs:Hepatitis B                      0
STDs:HPV                              0


## Normalização

A principio vamos testar o MinMax Scaler

In [145]:
# Aplica MinMax no grupo de treino
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

X_normalized = minmax_scale.fit_transform(X_train_processed)

In [146]:
# transforma de volta para dataframe
X_normalized = pd.DataFrame(X_normalized, columns = X_train_processed.columns.tolist())

X_normalized.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,IUD (years),STDs,STDs (number),STDs:condylomatosis,STDs:cervical condylomatosis,STDs:vaginal condylomatosis,STDs:vulvo-perineal condylomatosis,STDs:syphilis,STDs:pelvic inflammatory disease,STDs:genital herpes,STDs:molluscum contagiosum,STDs:AIDS,STDs:HIV,STDs:Hepatitis B,STDs:HPV,STDs: Number of diagnosis,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology
0,0.338028,0.037037,0.5,0.363636,0.0,0.0,0.0,1.0,0.133333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.098592,0.074074,0.318182,0.090909,1.0,0.027027,0.002703,1.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.253521,0.074074,0.409091,0.090909,0.0,0.0,0.0,1.0,0.002667,1.0,0.421053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.239437,0.037037,0.318182,0.181818,0.0,0.0,0.0,1.0,0.366667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.140845,0.0,0.409091,0.272727,0.0,0.0,0.0,1.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Balanceamento de classes

Vamos aplicar a tecnica de sampling

a partir de amostras, vamos criar novos registros  nosso conjunto de forma que cada amostra represente o target de forma mais equilibrada

Exemplos:

https://elitedatascience.com/imbalanced-classes

In [147]:
# documentacao SMOTETomek: 
# https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.combine.SMOTETomek.html

cc = SMOTETomek(random_state=random_state)
X_train_res, y_train_res = cc.fit_resample(X_normalized, y_train)

X_train_res.shape, y_train_res.shape

((1208, 35), (1208,))

Aumentamos nosso dataset de treino e teste de 643 para 1208

## Salva Dataset

In [148]:
df_train = X_train_res.join(y_train_res)
df_train.head(10)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,IUD (years),STDs,STDs (number),STDs:condylomatosis,STDs:cervical condylomatosis,STDs:vaginal condylomatosis,STDs:vulvo-perineal condylomatosis,STDs:syphilis,STDs:pelvic inflammatory disease,STDs:genital herpes,STDs:molluscum contagiosum,STDs:AIDS,STDs:HIV,STDs:Hepatitis B,STDs:HPV,STDs: Number of diagnosis,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,0.338028,0.037037,0.5,0.363636,0.0,0.0,0.0,1.0,0.133333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.098592,0.074074,0.318182,0.090909,1.0,0.027027,0.002703,1.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.253521,0.074074,0.409091,0.090909,0.0,0.0,0.0,1.0,0.002667,1.0,0.421053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
3,0.239437,0.037037,0.318182,0.181818,0.0,0.0,0.0,1.0,0.366667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.140845,0.0,0.409091,0.272727,0.0,0.0,0.0,1.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,0.15493,0.0,0.318182,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
6,0.197183,0.0,0.272727,0.181818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7,0.183099,0.074074,0.409091,0.181818,0.0,0.0,0.0,1.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0
8,0.211268,0.074074,0.272727,0.272727,1.0,0.324324,0.162162,1.0,0.233333,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.333333,0.05,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9,0.098592,0.0,0.318182,0.181818,0.0,0.0,0.0,1.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [149]:
df_train.describe()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,IUD (years),STDs,STDs (number),STDs:condylomatosis,STDs:cervical condylomatosis,STDs:vaginal condylomatosis,STDs:vulvo-perineal condylomatosis,STDs:syphilis,STDs:pelvic inflammatory disease,STDs:genital herpes,STDs:molluscum contagiosum,STDs:AIDS,STDs:HIV,STDs:Hepatitis B,STDs:HPV,STDs: Number of diagnosis,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,0.204303,0.055075,0.316188,0.214064,0.174204,0.042851,0.014823,0.702525,0.095231,0.107041,0.022435,0.120203,0.053173,0.076425,0.0,0.002483,0.076425,0.009934,0.0,0.008252,0.000828,0.0,0.035862,0.000828,0.001656,0.039792,0.148981,0.103832,0.031451,0.021437,0.031451,0.042702,0.259527,0.464587,0.180045,0.5
std,0.11567,0.052875,0.108526,0.112145,0.361432,0.126428,0.05665,0.445409,0.141751,0.288084,0.080553,0.317255,0.151794,0.259828,0.0,0.049793,0.259828,0.099213,0.0,0.078616,0.028772,0.0,0.171306,0.028772,0.040673,0.109699,0.054865,0.051794,0.163802,0.130948,0.163802,0.185679,0.415725,0.491259,0.359021,0.500207
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.112676,0.03217,0.252965,0.128005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.1832,0.037037,0.318182,0.181818,0.0,0.0,0.0,1.0,0.018178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
75%,0.273575,0.074074,0.363636,0.272727,0.0,0.0,0.0,1.0,0.153589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.543206,1.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [150]:
# salva dataset
df_train.to_csv('../data/Dados_Processados/df_train_normalized.csv')

## Voltando para escala original

In [151]:
X_train_escala_orig = pd.DataFrame(minmax_scale.inverse_transform(X_train_res))
X_train_escala_orig.columns = X_train_res.columns.tolist()

df_train_non_normalized = X_train_escala_orig.join(y_train_res)
df_train_non_normalized.to_csv('../data/Dados_Processados/df_train_non_normalized.csv')

df_train_non_normalized.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,IUD (years),STDs,STDs (number),STDs:condylomatosis,STDs:cervical condylomatosis,STDs:vaginal condylomatosis,STDs:vulvo-perineal condylomatosis,STDs:syphilis,STDs:pelvic inflammatory disease,STDs:genital herpes,STDs:molluscum contagiosum,STDs:AIDS,STDs:HIV,STDs:Hepatitis B,STDs:HPV,STDs: Number of diagnosis,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,37.0,2.0,21.0,4.0,0.0,0.0,0.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,20.0,3.0,17.0,1.0,1.0,1.0,0.1,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,31.0,3.0,19.0,1.0,0.0,0.0,0.0,1.0,0.08,1.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
3,30.0,2.0,17.0,2.0,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,23.0,1.0,19.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


## Grupo Teste

In [152]:
X_test_processed = X_test.copy()

X_test_processed[continuous_feat] = imp_median.transform(X_test[continuous_feat])
X_test_processed[discrete_feat] = imp_most_freq.transform(X_test[discrete_feat])

X_test_processed.isnull().sum()

Age                                   0
Number of sexual partners             0
First sexual intercourse              0
Num of pregnancies                    0
Smokes                                0
Smokes (years)                        0
Smokes (packs/year)                   0
Hormonal Contraceptives               0
Hormonal Contraceptives (years)       0
IUD                                   0
IUD (years)                           0
STDs                                  0
STDs (number)                         0
STDs:condylomatosis                   0
STDs:cervical condylomatosis          0
STDs:vaginal condylomatosis           0
STDs:vulvo-perineal condylomatosis    0
STDs:syphilis                         0
STDs:pelvic inflammatory disease      0
STDs:genital herpes                   0
STDs:molluscum contagiosum            0
STDs:AIDS                             0
STDs:HIV                              0
STDs:Hepatitis B                      0
STDs:HPV                              0


In [153]:
df_test_non_normalized = X_test_processed.join(y_test)

df_test_non_normalized.to_csv('../data/Dados_Processados/df_test_non_normalized.csv')

df_test_non_normalized.head(10)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,IUD (years),STDs,STDs (number),STDs:condylomatosis,STDs:cervical condylomatosis,STDs:vaginal condylomatosis,STDs:vulvo-perineal condylomatosis,STDs:syphilis,STDs:pelvic inflammatory disease,STDs:genital herpes,STDs:molluscum contagiosum,STDs:AIDS,STDs:HIV,STDs:Hepatitis B,STDs:HPV,STDs: Number of diagnosis,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
199,27,2.0,18.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4.0,3.0,0,0,0,0,0,0,0,0
803,18,3.0,17.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4.0,3.0,0,0,0,0,0,0,0,0
505,23,2.0,17.0,3.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4.0,3.0,0,0,0,0,0,0,0,0
522,16,2.0,15.0,1.0,1.0,1.0,0.2,1.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4.0,3.0,0,0,0,0,0,0,0,0
502,24,5.0,15.0,2.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4.0,3.0,0,0,0,0,0,0,0,0
403,17,3.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4.0,3.0,0,0,0,0,0,0,0,0
136,31,2.0,18.0,3.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4.0,3.0,0,0,0,0,0,0,0,0
314,21,1.0,17.0,2.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4.0,3.0,0,0,0,0,1,1,0,1
121,30,5.0,15.0,3.0,0.0,0.0,0.0,1.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4.0,3.0,0,0,0,0,0,0,0,0
740,40,1.0,20.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4.0,3.0,0,0,0,0,0,0,0,0


In [154]:
X_test_processed = minmax_scale.transform(X_test_processed)
X_test_processed = pd.DataFrame(X_test_processed, columns = X_test.columns.tolist())

In [155]:
df_test = X_test_processed.join(y_test)
df_test.head(10)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,IUD (years),STDs,STDs (number),STDs:condylomatosis,STDs:cervical condylomatosis,STDs:vaginal condylomatosis,STDs:vulvo-perineal condylomatosis,STDs:syphilis,STDs:pelvic inflammatory disease,STDs:genital herpes,STDs:molluscum contagiosum,STDs:AIDS,STDs:HIV,STDs:Hepatitis B,STDs:HPV,STDs: Number of diagnosis,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,0.197183,0.037037,0.363636,0.181818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,0.070423,0.074074,0.318182,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,0.140845,0.037037,0.318182,0.272727,0.0,0.0,0.0,1.0,0.016667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.042254,0.037037,0.227273,0.090909,1.0,0.027027,0.005405,1.0,0.002667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,0.15493,0.148148,0.227273,0.181818,0.0,0.0,0.0,1.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
5,0.056338,0.074074,0.227273,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
6,0.253521,0.037037,0.363636,0.272727,0.0,0.0,0.0,1.0,0.016667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
7,0.112676,0.0,0.318182,0.181818,0.0,0.0,0.0,1.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
8,0.239437,0.148148,0.227273,0.272727,0.0,0.0,0.0,1.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
9,0.380282,0.0,0.454545,0.636364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [156]:
df_test.describe()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,IUD (years),STDs,STDs (number),STDs:condylomatosis,STDs:cervical condylomatosis,STDs:vaginal condylomatosis,STDs:vulvo-perineal condylomatosis,STDs:syphilis,STDs:pelvic inflammatory disease,STDs:genital herpes,STDs:molluscum contagiosum,STDs:AIDS,STDs:HIV,STDs:Hepatitis B,STDs:HPV,STDs: Number of diagnosis,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
count,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,64.0
mean,0.19417,0.05702,0.32093,0.208879,0.125581,0.030257,0.013589,0.665116,0.062878,0.130233,0.026842,0.111628,0.04186,0.055814,0.0,0.004651,0.051163,0.027907,0.004651,0.0,0.0,0.0,0.023256,0.0,0.0,0.032558,0.17186,0.126744,0.018605,0.009302,0.018605,0.023256,0.04186,0.083721,0.046512,0.046875
std,0.105707,0.053433,0.135162,0.11708,0.33215,0.098904,0.060369,0.473051,0.106589,0.337345,0.096594,0.315643,0.12555,0.230098,0.0,0.068199,0.220844,0.165091,0.068199,0.0,0.0,0.0,0.151067,0.0,0.0,0.099189,0.133032,0.141069,0.135439,0.096223,0.135439,0.151067,0.200738,0.277615,0.211082,0.213042
min,0.014085,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.112676,0.037037,0.227273,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.183099,0.037037,0.318182,0.181818,0.0,0.0,0.0,1.0,0.016667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.267606,0.074074,0.363636,0.272727,0.0,0.0,0.0,1.0,0.07137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.549296,0.296296,0.863636,0.636364,1.0,0.594595,0.594595,1.0,0.733333,1.0,0.789474,1.0,0.5,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.333333,1.05,1.05,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [157]:
# salva dataset
df_test.to_csv('../data/Dados_Processados/df_test_normalized.csv')