# Cervical Cancer Risk

In [40]:
# import libraries
import pandas as pd 
import plotly_express as px 
import numpy as np

In [41]:
# read dataframe
df = pd.read_csv('datasets/kag_risk_factors_cervical_cancer.csv')

In [42]:
# look at dataframe
df.head()

Unnamed: 0,age,number_of_sexual_partners,first_sexual_intercourse,num_of_pregnancies,smokes,smokes_years,smokes_packs/year,hormonal_contraceptives,hormonal_contraceptives_years,iud,...,std_ time_since_first_diagnosis,std_time_since_last_diagnosis,dx_cancer,dx_cin,dx_hpv,dx,hinselmann,schiller,citology,biopsy
0,18,4,15,1,0,0,0,0,0,0,...,?,?,0,0,0,0,0,0,0,0
1,15,1,14,1,0,0,0,0,0,0,...,?,?,0,0,0,0,0,0,0,0
2,34,1,?,1,0,0,0,0,0,0,...,?,?,0,0,0,0,0,0,0,0
3,52,5,16,4,1,37,37,1,3,0,...,?,?,1,0,1,0,0,0,0,0
4,46,3,21,4,0,0,0,1,15,0,...,?,?,0,0,0,0,0,0,0,0


In [43]:
# look at columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 36 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   age                                858 non-null    int64 
 1   number_of_sexual_partners          858 non-null    object
 2   first_sexual_intercourse           858 non-null    object
 3   num_of_pregnancies                 858 non-null    object
 4   smokes                             858 non-null    object
 5   smokes_years                       858 non-null    object
 6   smokes_packs/year                  858 non-null    object
 7   hormonal_contraceptives            858 non-null    object
 8   hormonal_contraceptives_years      858 non-null    object
 9   iud                                858 non-null    object
 10  iud_years                          858 non-null    object
 11  stds                               858 non-null    object
 12  stds_num

In [44]:
# lookong for missing values
df.isna().sum().sum()

0

In [45]:
df.duplicated().sum()

23

In [46]:
# looking for duplicates
df[df.duplicated()]

Unnamed: 0,age,number_of_sexual_partners,first_sexual_intercourse,num_of_pregnancies,smokes,smokes_years,smokes_packs/year,hormonal_contraceptives,hormonal_contraceptives_years,iud,...,std_ time_since_first_diagnosis,std_time_since_last_diagnosis,dx_cancer,dx_cin,dx_hpv,dx,hinselmann,schiller,citology,biopsy
66,34,3,19,3,0,0,0,1,5,0,...,?,?,0,0,0,0,0,0,0,0
234,25,?,18,2,0,0,0,?,?,?,...,?,?,0,0,0,0,0,0,0,0
255,25,2,18,2,0,0,0,1,0.25,0,...,?,?,0,0,0,0,0,0,0,0
356,18,1,17,1,0,0,0,0,0,0,...,?,?,0,0,0,0,0,0,0,0
395,18,1,18,1,0,0,0,0,0,0,...,?,?,0,0,0,0,0,0,0,0
406,17,1,17,1,0,0,0,0,0,0,...,?,?,0,0,0,0,0,0,0,0
419,19,4,14,1,0,0,0,?,?,?,...,?,?,0,0,0,0,0,0,0,0
431,18,1,14,2,0,0,0,0,0,0,...,?,?,0,0,0,0,0,0,0,0
435,17,2,15,1,0,0,0,0,0,0,...,?,?,0,0,0,0,0,0,0,0
440,15,1,14,1,0,0,0,0,0,0,...,?,?,0,0,0,0,0,0,0,0


In [47]:
# replace question marks with NaN
df.replace('?', np.NaN, inplace=True)

In [48]:
# recheck for missing values
df.isna().sum()

age                                    0
number_of_sexual_partners             26
first_sexual_intercourse               7
num_of_pregnancies                    56
smokes                                13
smokes_years                          13
smokes_packs/year                     13
hormonal_contraceptives              108
hormonal_contraceptives_years        108
iud                                  117
iud_years                            117
stds                                 105
stds_number                          105
std_condylomatosis                   105
std_cervical condylomatosis          105
std_vaginal_condylomatosis           105
std_vulvo_perineal_condylomatosis    105
std_syphilis                         105
std_pelvic_inflammatory_disease      105
std_genital_herpes                   105
std_molluscum_contagiosum            105
std_aids                             105
std_hiv                              105
std_hep_B                            105
std_hpv         

In [49]:
df.columns

Index(['age', 'number_of_sexual_partners', 'first_sexual_intercourse',
       'num_of_pregnancies', 'smokes', 'smokes_years', 'smokes_packs/year',
       'hormonal_contraceptives', 'hormonal_contraceptives_years', 'iud',
       'iud_years', 'stds', 'stds_number', 'std_condylomatosis',
       'std_cervical condylomatosis', 'std_vaginal_condylomatosis',
       'std_vulvo_perineal_condylomatosis', 'std_syphilis',
       'std_pelvic_inflammatory_disease', 'std_genital_herpes',
       'std_molluscum_contagiosum', 'std_aids', 'std_hiv', 'std_hep_B',
       'std_hpv', 'std_number_of_diagnosis', 'std_ time_since_first_diagnosis',
       'std_time_since_last_diagnosis', 'dx_cancer', 'dx_cin', 'dx_hpv', 'dx',
       'hinselmann', 'schiller', 'citology', 'biopsy'],
      dtype='object')

## EDA

In [50]:
# summary statistics
df.describe()

Unnamed: 0,age,std_number_of_diagnosis,dx_cancer,dx_cin,dx_hpv,dx,hinselmann,schiller,citology,biopsy
count,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0
mean,26.820513,0.087413,0.020979,0.01049,0.020979,0.027972,0.040793,0.086247,0.051282,0.064103
std,8.497948,0.302545,0.143398,0.101939,0.143398,0.164989,0.197925,0.280892,0.220701,0.245078
min,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,84.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [51]:
# correlations
px.imshow(df.corr(), text_auto=True, aspect='auto')

In [56]:
columns = ['age', 'number_of_sexual_partners', 'first_sexual_intercourse',
       'num_of_pregnancies', 'smokes', 'smokes_years', 'smokes_packs/year', 'hormonal_contraceptives', 'hormonal_contraceptives_years', 'iud',
       'iud_years', 'stds', 'stds_number', 'std_condylomatosis',
       'std_cervical condylomatosis', 'std_vaginal_condylomatosis',
       'std_vulvo_perineal_condylomatosis', 'std_syphilis',
       'std_pelvic_inflammatory_disease', 'std_genital_herpes',
       'std_molluscum_contagiosum', 'std_aids', 'std_hiv', 'std_hep_B',
       'std_hpv', 'std_number_of_diagnosis', 'std_ time_since_first_diagnosis',
       'std_time_since_last_diagnosis', 'dx_cancer', 'dx_cin', 'dx_hpv', 'dx',
       'hinselmann', 'schiller', 'citology', 'biopsy']
for column in columns:
    px.histogram(df[column].sort_values(), title='Distribution of ' + str.upper(column).replace('_', ' '), template='ggplot2', labels={'value':str.upper(column).replace('_', ' ')}).show()

In [53]:
columns = ['age', 'number_of_sexual_partners', 'first_sexual_intercourse',
       'num_of_pregnancies', 'smokes_years', 'smokes_packs/year',
       'hormonal_contraceptives_years', 'iud_years', 'stds_number', 'std_number_of_diagnosis', 'std_ time_since_first_diagnosis',
       'std_time_since_last_diagnosis']
for column in columns:
    px.box(df[column], title='Distribution of ' + str.upper(column).replace('_', ' '), template='ggplot2', labels={'value':str.upper(column).replace('_', ' ')}).show()

## Feature Engineering

## Modeling