# Import libraries required

In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import pylab as pl
import os
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [2]:
import warnings, sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.ConvergenceWarning)
warnings.filterwarnings("ignore", category=sklearn.exceptions.DataConversionWarning)

pd.options.mode.chained_assignment = None


# Import dataset

In [3]:
orig_data = pd.read_csv("C:\\DePaulCoursework\\Winter 2019 CSC 529\\Project\\risk_factors_cervical_cancer.csv", 
                          delimiter=",", index_col=None, header=0)

In [4]:
orig_data.shape

(858, 36)

# Set Pandas output print variables so all rows and columns will print

In [5]:
num_rows = orig_data.shape[0] + 1
num_cols = orig_data.shape[1] + 1
pd.set_option('max_rows', num_rows)
pd.set_option('max_columns', num_cols)
np.set_printoptions(threshold = np.inf)

# Examine the format and column names of the dataset

In [6]:
print(orig_data.head(1))

   Age Number of sexual partners First sexual intercourse Num of pregnancies  \
0   18                       4.0                     15.0                1.0   

  Smokes Smokes (years) Smokes (packs/year) Hormonal Contraceptives  \
0    0.0            0.0                 0.0                     0.0   

  Hormonal Contraceptives (years)  IUD IUD (years) STDs STDs (number)  \
0                             0.0  0.0         0.0  0.0           0.0   

  STDs:condylomatosis STDs:cervical condylomatosis  \
0                 0.0                          0.0   

  STDs:vaginal condylomatosis STDs:vulvo-perineal condylomatosis  \
0                         0.0                                0.0   

  STDs:syphilis STDs:pelvic inflammatory disease STDs:genital herpes  \
0           0.0                              0.0                 0.0   

  STDs:molluscum contagiosum STDs:AIDS STDs:HIV STDs:Hepatitis B STDs:HPV  \
0                        0.0       0.0      0.0              0.0      0.0   

   

# Print out dataset information

In [7]:
orig_data.describe(include = "all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Age,858,,,,26.8205,8.49795,13.0,20.0,25.0,32.0,84.0
Number of sexual partners,858,13.0,2.0,272.0,,,,,,,
First sexual intercourse,858,22.0,15.0,163.0,,,,,,,
Num of pregnancies,858,12.0,1.0,270.0,,,,,,,
Smokes,858,3.0,0.0,722.0,,,,,,,
Smokes (years),858,31.0,0.0,722.0,,,,,,,
Smokes (packs/year),858,63.0,0.0,722.0,,,,,,,
Hormonal Contraceptives,858,3.0,1.0,481.0,,,,,,,
Hormonal Contraceptives (years),858,41.0,0.0,269.0,,,,,,,
IUD,858,3.0,0.0,658.0,,,,,,,


# Rename columns to more suitable names (remove spaces, etc)

In [8]:
col_names = list(orig_data.columns.values)
print(col_names)

['Age', 'Number of sexual partners', 'First sexual intercourse', 'Num of pregnancies', 'Smokes', 'Smokes (years)', 'Smokes (packs/year)', 'Hormonal Contraceptives', 'Hormonal Contraceptives (years)', 'IUD', 'IUD (years)', 'STDs', 'STDs (number)', 'STDs:condylomatosis', 'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis', 'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis', 'STDs:pelvic inflammatory disease', 'STDs:genital herpes', 'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV', 'STDs:Hepatitis B', 'STDs:HPV', 'STDs: Number of diagnosis', 'STDs: Time since first diagnosis', 'STDs: Time since last diagnosis', 'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller', 'Citology', 'Biopsy']


In [9]:
new_col_names = ['age', 'num_sex_partners', 'first_sex_int', 'num_pregnancies', 'smokes', 'smokes_yrs',  
                'smokes_pk_yrs','hormonal_contr', 'hormonal_contr_yrs', 'iud', 'iud_yrs', 'stds', 'stds_num',  
                'stds_condylomatosis', 'stds_cerv_condylomatosis', 'stds_vag_condylomatosis', 'stds_vp_condylomatosis',
                'stds_syphillus', 'stds_pelvic_inf_disease', 'stds_gen_herpes', 'stds_molluscom_contagiosum',  
                'stds_aids', 'stds_hiv', 'stds_hep_b', 'stds_hpv', 'stds_num_dx', 'stds_time_first_dx',  
                'stds_time_last_dx','dx_cancer', 'dx_cin', 'dx_hpv', 'dx', 'hinselmann', 'schiller', 'citology', 
                 'biopsy']

In [10]:
print(len(col_names))
print(len(new_col_names))

36
36


In [11]:
for i in range(len(new_col_names)):
    orig_data.rename(index=str, columns={col_names[i]: new_col_names[i]}, inplace = True)

In [12]:
print(list(orig_data.columns.values))

['age', 'num_sex_partners', 'first_sex_int', 'num_pregnancies', 'smokes', 'smokes_yrs', 'smokes_pk_yrs', 'hormonal_contr', 'hormonal_contr_yrs', 'iud', 'iud_yrs', 'stds', 'stds_num', 'stds_condylomatosis', 'stds_cerv_condylomatosis', 'stds_vag_condylomatosis', 'stds_vp_condylomatosis', 'stds_syphillus', 'stds_pelvic_inf_disease', 'stds_gen_herpes', 'stds_molluscom_contagiosum', 'stds_aids', 'stds_hiv', 'stds_hep_b', 'stds_hpv', 'stds_num_dx', 'stds_time_first_dx', 'stds_time_last_dx', 'dx_cancer', 'dx_cin', 'dx_hpv', 'dx', 'hinselmann', 'schiller', 'citology', 'biopsy']


# Declare path to use to save preprocessing analysis outputs

In [13]:
data_save_path = 'C:\\DePaulCoursework\\Winter 2019 CSC 529\\Project\\Data\\'

if not os.path.exists(data_save_path):
    os.mkdir(data_save_path)


# Save column data types and unique variable values to a text file

In [14]:
var_info_file = data_save_path + 'Orig_Variable_Info.txt'
fp = open(var_info_file, 'w')
for name in new_col_names:
    fp.write('{}:\n'.format(name))
    fp.write('Unique Values:\n')
    fp.write('{}\n'.format(orig_data[name].unique()))
    fp.write('Variable Data Type:\n')
    fp.write('{}\n'.format(orig_data[name].dtype))
    fp.write('\n')
    
fp.close()

# Replace all missing (?) values with NaN

In [15]:
orig_data.replace('?', np.NaN, inplace = True)

# Find the number of missing values per column

In [16]:
orig_data.isna().sum()

age                             0
num_sex_partners               26
first_sex_int                   7
num_pregnancies                56
smokes                         13
smokes_yrs                     13
smokes_pk_yrs                  13
hormonal_contr                108
hormonal_contr_yrs            108
iud                           117
iud_yrs                       117
stds                          105
stds_num                      105
stds_condylomatosis           105
stds_cerv_condylomatosis      105
stds_vag_condylomatosis       105
stds_vp_condylomatosis        105
stds_syphillus                105
stds_pelvic_inf_disease       105
stds_gen_herpes               105
stds_molluscom_contagiosum    105
stds_aids                     105
stds_hiv                      105
stds_hep_b                    105
stds_hpv                      105
stds_num_dx                     0
stds_time_first_dx            787
stds_time_last_dx             787
dx_cancer                       0
dx_cin        

# Remove columns with mostly missing values

In [17]:
orig_data.drop(['stds_time_first_dx', 'stds_time_last_dx'], axis = 1, inplace = True)


# Remove rows of stds column (will remove all std related columns as well)

In [18]:
orig_data.drop(orig_data[orig_data['stds'].isna()].index, axis = 0, inplace = True)

In [19]:
orig_data.isna().sum()

age                            0
num_sex_partners              14
first_sex_int                  6
num_pregnancies               47
smokes                        10
smokes_yrs                    10
smokes_pk_yrs                 10
hormonal_contr                13
hormonal_contr_yrs            13
iud                           16
iud_yrs                       16
stds                           0
stds_num                       0
stds_condylomatosis            0
stds_cerv_condylomatosis       0
stds_vag_condylomatosis        0
stds_vp_condylomatosis         0
stds_syphillus                 0
stds_pelvic_inf_disease        0
stds_gen_herpes                0
stds_molluscom_contagiosum     0
stds_aids                      0
stds_hiv                       0
stds_hep_b                     0
stds_hpv                       0
stds_num_dx                    0
dx_cancer                      0
dx_cin                         0
dx_hpv                         0
dx                             0
hinselmann

# Remove missing values from remaining categorical columns

In [20]:
orig_data.drop(orig_data[orig_data['smokes'].isna()].index, axis = 0, inplace = True)
orig_data.drop(orig_data[orig_data['hormonal_contr'].isna()].index, axis = 0, inplace = True)
orig_data.drop(orig_data[orig_data['iud'].isna()].index, axis = 0, inplace = True)

In [21]:
orig_data.isna().sum()

age                            0
num_sex_partners              14
first_sex_int                  6
num_pregnancies               41
smokes                         0
smokes_yrs                     0
smokes_pk_yrs                  0
hormonal_contr                 0
hormonal_contr_yrs             0
iud                            0
iud_yrs                        0
stds                           0
stds_num                       0
stds_condylomatosis            0
stds_cerv_condylomatosis       0
stds_vag_condylomatosis        0
stds_vp_condylomatosis         0
stds_syphillus                 0
stds_pelvic_inf_disease        0
stds_gen_herpes                0
stds_molluscom_contagiosum     0
stds_aids                      0
stds_hiv                       0
stds_hep_b                     0
stds_hpv                       0
stds_num_dx                    0
dx_cancer                      0
dx_cin                         0
dx_hpv                         0
dx                             0
hinselmann

In [22]:
orig_data.dtypes

age                            int64
num_sex_partners              object
first_sex_int                 object
num_pregnancies               object
smokes                        object
smokes_yrs                    object
smokes_pk_yrs                 object
hormonal_contr                object
hormonal_contr_yrs            object
iud                           object
iud_yrs                       object
stds                          object
stds_num                      object
stds_condylomatosis           object
stds_cerv_condylomatosis      object
stds_vag_condylomatosis       object
stds_vp_condylomatosis        object
stds_syphillus                object
stds_pelvic_inf_disease       object
stds_gen_herpes               object
stds_molluscom_contagiosum    object
stds_aids                     object
stds_hiv                      object
stds_hep_b                    object
stds_hpv                      object
stds_num_dx                    int64
dx_cancer                      int64
d

# Assign float64 data types to appropriate columns

In [23]:
orig_data['age'] = orig_data['age'].astype('float64')
orig_data['num_sex_partners'] = orig_data['num_sex_partners'].astype('float64')
orig_data['first_sex_int'] = orig_data['first_sex_int'].astype('float64')
orig_data['num_pregnancies'] = orig_data['num_pregnancies'].astype('float64')
orig_data['smokes_yrs'] = orig_data['smokes_yrs'].astype('float64')
orig_data['smokes_pk_yrs'] = orig_data['smokes_pk_yrs'].astype('float64')
orig_data['hormonal_contr_yrs'] = orig_data['hormonal_contr_yrs'].astype('float64')
orig_data['iud_yrs'] = orig_data['iud_yrs'].astype('float64')
orig_data['stds_num'] = orig_data['stds_num'].astype('float64')
orig_data['stds_num_dx'] = orig_data['stds_num_dx'].astype('float64')

# Replace missing values in continuous columns with column mean values

In [24]:
orig_data['num_sex_partners'].fillna(orig_data['num_sex_partners'].mean(), inplace = True)
orig_data['first_sex_int'].fillna(orig_data['first_sex_int'].mean(), inplace = True)
orig_data['num_pregnancies'].fillna(orig_data['num_pregnancies'].mean(), inplace = True)

In [25]:
orig_data.isna().sum()

age                           0
num_sex_partners              0
first_sex_int                 0
num_pregnancies               0
smokes                        0
smokes_yrs                    0
smokes_pk_yrs                 0
hormonal_contr                0
hormonal_contr_yrs            0
iud                           0
iud_yrs                       0
stds                          0
stds_num                      0
stds_condylomatosis           0
stds_cerv_condylomatosis      0
stds_vag_condylomatosis       0
stds_vp_condylomatosis        0
stds_syphillus                0
stds_pelvic_inf_disease       0
stds_gen_herpes               0
stds_molluscom_contagiosum    0
stds_aids                     0
stds_hiv                      0
stds_hep_b                    0
stds_hpv                      0
stds_num_dx                   0
dx_cancer                     0
dx_cin                        0
dx_hpv                        0
dx                            0
hinselmann                    0
schiller

# Assign the int64 data type to the appropriate columns

In [26]:
col_names = list(orig_data.columns.values)
for name in col_names:
    if orig_data[name].dtype == 'object':
        orig_data[name] = orig_data[name].astype('float').astype('int64')

In [27]:
orig_data.dtypes

age                           float64
num_sex_partners              float64
first_sex_int                 float64
num_pregnancies               float64
smokes                          int64
smokes_yrs                    float64
smokes_pk_yrs                 float64
hormonal_contr                  int64
hormonal_contr_yrs            float64
iud                             int64
iud_yrs                       float64
stds                            int64
stds_num                      float64
stds_condylomatosis             int64
stds_cerv_condylomatosis        int64
stds_vag_condylomatosis         int64
stds_vp_condylomatosis          int64
stds_syphillus                  int64
stds_pelvic_inf_disease         int64
stds_gen_herpes                 int64
stds_molluscom_contagiosum      int64
stds_aids                       int64
stds_hiv                        int64
stds_hep_b                      int64
stds_hpv                        int64
stds_num_dx                   float64
dx_cancer   

# Print out dataset information now that missing values are removed

In [28]:
orig_data.describe(include = "all").T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,726.0,27.258953,8.731498,13.0,21.0,26.0,33.0,84.0
num_sex_partners,726.0,2.514045,1.621349,1.0,2.0,2.0,3.0,28.0
first_sex_int,726.0,17.094444,2.830762,10.0,15.0,17.0,18.0,32.0
num_pregnancies,726.0,2.318248,1.42617,0.0,1.0,2.0,3.0,11.0
smokes,726.0,0.143251,0.35057,0.0,0.0,0.0,0.0,1.0
smokes_yrs,726.0,1.243246,4.182275,0.0,0.0,0.0,0.0,37.0
smokes_pk_yrs,726.0,0.463816,2.319839,0.0,0.0,0.0,0.0,37.0
hormonal_contr,726.0,0.641873,0.47978,0.0,0.0,1.0,1.0,1.0
hormonal_contr_yrs,726.0,2.234937,3.659374,0.0,0.0,0.5,3.0,22.0
iud,726.0,0.11157,0.315054,0.0,0.0,0.0,0.0,1.0


# Print out first row of cleaned data and data size

In [29]:
print(orig_data.head(1))

    age  num_sex_partners  first_sex_int  num_pregnancies  smokes  smokes_yrs  \
0  18.0               4.0           15.0              1.0       0         0.0   

   smokes_pk_yrs  hormonal_contr  hormonal_contr_yrs  iud  iud_yrs  stds  \
0            0.0               0                 0.0    0      0.0     0   

   stds_num  stds_condylomatosis  stds_cerv_condylomatosis  \
0       0.0                    0                         0   

   stds_vag_condylomatosis  stds_vp_condylomatosis  stds_syphillus  \
0                        0                       0               0   

   stds_pelvic_inf_disease  stds_gen_herpes  stds_molluscom_contagiosum  \
0                        0                0                           0   

   stds_aids  stds_hiv  stds_hep_b  stds_hpv  stds_num_dx  dx_cancer  dx_cin  \
0          0         0           0         0          0.0          0       0   

   dx_hpv  dx  hinselmann  schiller  citology  biopsy  
0       0   0           0         0         0     

In [30]:
orig_data.shape

(726, 34)

# Create combination target

In [31]:
orig_data['combo_target'] = 0

ndx = (orig_data.citology == 1) | (orig_data.biopsy == 1) | (orig_data.hinselmann == 1) | (orig_data.schiller == 1)
orig_data.loc[ndx, 'combo_target'] = 1


# Print out the number of class 0 and 1 values for all targets

In [32]:
print('Number of Hinselmann 0: {}'.format(sum(orig_data.hinselmann == 0)))
print('Number of Hinselmann 1: {}'.format(sum(orig_data.hinselmann == 1)))
      

Number of Hinselmann 0: 693
Number of Hinselmann 1: 33


In [33]:
print('Number of Schiller 0: '.format(sum(orig_data.schiller == 0)))
print('Number of Schiller 1: '.format(sum(orig_data.schiller == 1)))


Number of Schiller 0: 
Number of Schiller 1: 


In [34]:
print('Number of Citology 0: {}'.format(sum(orig_data.citology == 0)))
print('Number of Citology 1: {}'.format(sum(orig_data.citology == 1)))


Number of Citology 0: 686
Number of Citology 1: 40


In [35]:
print('Number of Biopsy 0: {}'.format(sum(orig_data.biopsy == 0)))
print('Number of Biopsy 1: {}'.format(sum(orig_data.biopsy == 1)))


Number of Biopsy 0: 676
Number of Biopsy 1: 50


In [36]:
print('Number of Combo Target 0: {}'.format(sum(orig_data.combo_target == 0)))
print('Number of Combo Target 1: {}'.format(sum(orig_data.combo_target == 1)))


Number of Combo Target 0: 633
Number of Combo Target 1: 93


# Create folders to save exploratory plots

In [37]:
dist_save_path = 'C:\\DePaulCoursework\\Winter 2019 CSC 529\\Project\\Distributions\\'
citology_save_path = 'C:\\DePaulCoursework\\Winter 2019 CSC 529\\Project\\Citology_Crosstabs\\'
biopsy_save_path = 'C:\\DePaulCoursework\\Winter 2019 CSC 529\\Project\\Biopsy_Crosstabs\\'
    
if not os.path.exists(dist_save_path):
    os.mkdir(dist_save_path)

if not os.path.exists(citology_save_path):
    os.mkdir(citology_save_path)

if not os.path.exists(biopsy_save_path):
    os.mkdir(biopsy_save_path)    
    

# Create plots for variable distributions

In [38]:
for name in col_names:
    if orig_data[name].dtype == 'float64':
        title_str = 'Histogram of ' + name.capitalize()
        xlab_str = name.capitalize()
        plt.hist(orig_data[name], bins = 20, alpha = 0.5, edgecolor = 'black', linewidth = 1.2)
        plt.xlabel(xlab_str)
        plt.ylabel('Count')
        plt.title(title_str)
        plt.grid(True)
        
    else:
        title_str = name.capitalize() + ' Counts'
        xlab_str = name.capitalize()
        orig_data[name].value_counts().plot(kind = 'bar')
        plt.xlabel(xlab_str)
        plt.ylabel('Count')
        plt.title(title_str)   
        
    save_path = dist_save_path + name.capitalize() + '.png'
    plt.savefig(save_path)
    plt.close()        
        

# Create cross tab plots for the citology target

In [39]:
for name in col_names:
    if orig_data[name].dtype == 'float64':
        title_str = 'Citology Versus ' + name.capitalize()
        xlab_str = name.capitalize()
        orig_data.plot(x = name, y = 'citology', kind = 'scatter')
        plt.xlabel(xlab_str)
        plt.ylabel('Citology')
        plt.title(title_str)        
    else:
        title_str = 'Crosstab of Citology and ' + name.capitalize()
        xlab_str = name.capitalize() + ' by Citology'
        obs_inc_cross = pd.crosstab(orig_data['citology'], orig_data[name])
        obs_inc_cross.plot(kind = "bar")
        plt.xlabel(xlab_str)
        plt.ylabel('Count')
        plt.title(title_str)
        plt.legend(bbox_to_anchor = (1.05, 1), loc = 2, borderaxespad = 0.)    
    
    save_path = citology_save_path + 'Citology_' + name.capitalize() + '_Crosstab.png'
    plt.savefig(save_path)
    plt.close()        
    

# Create cross tab plots for the biopsy target

In [40]:
for name in col_names:
    if orig_data[name].dtype == 'float64':
        title_str = 'Biopsy Versus ' + name.capitalize()
        xlab_str = name.capitalize()
        orig_data.plot(x = name, y = 'biopsy', kind = 'scatter')
        plt.xlabel(xlab_str)
        plt.ylabel('Biopsy')
        plt.title(title_str)        
    else:
        title_str = 'Crosstab of Biopsy and ' + name.capitalize()
        xlab_str = name.capitalize() + ' by Biopsy'
        obs_inc_cross = pd.crosstab(orig_data['biopsy'], orig_data[name])
        obs_inc_cross.plot(kind = "bar")
        plt.xlabel(xlab_str)
        plt.ylabel('Count')
        plt.title(title_str)
        plt.legend(bbox_to_anchor = (1.05, 1), loc = 2, borderaxespad = 0.)    
    
    save_path = biopsy_save_path + 'Biopsy_' + name.capitalize() + '_Crosstab.png'
    plt.savefig(save_path)
    plt.close()        
    

# Create correlation matrix of all dataset variables

In [41]:
orig_corr = orig_data.corr(method = 'pearson')
corr_path = data_save_path + 'CervicalCancerRisks_CorrelationOutput.csv'
orig_corr.to_csv(corr_path, sep = ',')


In [42]:
orig_data_path = data_save_path + 'CervicalCancerRisks_CleanedData.csv'
orig_data.to_csv(orig_data_path, sep = ',')

# Save individual targets into their own dataframes

In [43]:
indices = orig_data.index.values

In [44]:
cit_target = pd.DataFrame(orig_data['citology'], index = indices, columns = ['citology'])
biop_target = pd.DataFrame(orig_data['biopsy'], index = indices, columns = ['biopsy'])
hins_target = pd.DataFrame(orig_data['hinselmann'], index = indices, columns = ['hinselmann'])
sch_target = pd.DataFrame(orig_data['schiller'], index = indices, columns = ['schiller'])
combo_target = pd.DataFrame(orig_data['combo_target'], index = indices, columns = ['combo_target'])


In [45]:
cit_target.head(1)

Unnamed: 0,citology
0,0


# Remove target columns from original feature dataframe

In [46]:
feat_data = orig_data[:]
feat_data.drop(columns=['biopsy', 'citology', 'hinselmann', 'schiller', 'combo_target'], axis = 1, inplace = True)
feat_col_names = list(feat_data.columns.values)


In [47]:
print(feat_data.head(1))

    age  num_sex_partners  first_sex_int  num_pregnancies  smokes  smokes_yrs  \
0  18.0               4.0           15.0              1.0       0         0.0   

   smokes_pk_yrs  hormonal_contr  hormonal_contr_yrs  iud  iud_yrs  stds  \
0            0.0               0                 0.0    0      0.0     0   

   stds_num  stds_condylomatosis  stds_cerv_condylomatosis  \
0       0.0                    0                         0   

   stds_vag_condylomatosis  stds_vp_condylomatosis  stds_syphillus  \
0                        0                       0               0   

   stds_pelvic_inf_disease  stds_gen_herpes  stds_molluscom_contagiosum  \
0                        0                0                           0   

   stds_aids  stds_hiv  stds_hep_b  stds_hpv  stds_num_dx  dx_cancer  dx_cin  \
0          0         0           0         0          0.0          0       0   

   dx_hpv  dx  
0       0   0  


# Function to split data into training and validation sets, then save them

In [48]:
def create_train_val_data(x_data, y_data, test_sz = 0.2, rand_st = 1):
    x_col_names = list(x_data.columns.values)
    y_col_names = list(y_data.columns.values)
    
    data_train, data_val, target_train, target_val = train_test_split(x_data, y_data, 
                                                                      test_size = test_sz, 
                                                                      stratify = y_data,
                                                                      random_state = rand_st)
                          
    data_train_df = pd.DataFrame(data_train, columns = x_col_names)
    train_indices = data_train_df.index.values
    target_train_df = pd.DataFrame(target_train, index = train_indices, columns = y_col_names)
    
    target_name = y_col_names[0].capitalize()
    
    train_data_name = data_save_path + 'CervicalCancerRisks_' + target_name + '_TrainData.csv'
    data_train_df.to_csv(train_data_name, sep = ',')
    train_target_name = data_save_path + 'CervicalCancerRisks_' + target_name + '_TrainTarget.csv'
    target_train_df.to_csv(train_target_name, sep = ',')
    
    print('Target Variable Name: {}'.format(target_name))
    print('Number of Initial Target 0 Value: {}'.format(sum(target_train_df[y_col_names[0]] == 0)))
    print('Number of Initial Target 1 Value: {}'.format(sum(target_train_df[y_col_names[0]] == 1)))
    
    data_val_df = pd.DataFrame(data_val, columns = x_col_names)
    val_indices = data_val_df.index.values
    target_val_df = pd.DataFrame(target_val, index = val_indices, columns = y_col_names)

    val_data_name = data_save_path + 'CervicalCancerRisks_' + target_name + '_ValData.csv'
    data_val_df.to_csv(val_data_name, sep = ',')
    val_target_name = data_save_path + 'CervicalCancerRisks_' + target_name + '_ValTarget.csv'
    target_val_df.to_csv(val_target_name, sep = ',')
    
#    sm = SMOTE(random_state = rand_st)
#    data_train_sm, target_train_sm = sm.fit_resample(data_train, target_train)
    
#    data_train_sm_df = pd.DataFrame(data_train_sm, columns = x_col_names)
#    train_sm_indices = data_train_sm_df.index.values
#    target_train_sm_df = pd.DataFrame(target_train_sm, index = train_sm_indices, columns = y_col_names)
#    train_sm_df = pd.concat([data_train_sm_df, target_train_sm_df], axis = 1)
    
#    train_sm_data_name = data_save_path + 'CervicalCancerRisks_' + target_name + '_TrainData_SMOTE.csv'
#    data_train_sm_df.to_csv(train_sm_data_name, sep = ',')
#    train_sm_target_name = data_save_path + 'CervicalCancerRisks_' + target_name + '_TrainTarget_SMOTE.csv'
#    target_train_sm_df.to_csv(train_sm_target_name, sep = ',')

#    print('Number of SMOTE Target 0 Value: {}'.format(sum(target_train_sm_df[y_col_names[0]] == 0)))
#    print('Number of SMOTE Target 1 Value: {}'.format(sum(target_train_sm_df[y_col_names[0]] == 1)))
    
    return 


# Split feature and citology target, then save training and validation data sets

In [49]:
create_train_val_data(feat_data, cit_target, test_sz = 0.2, rand_st = 1)

Target Variable Name: Citology
Number of Initial Target 0 Value: 548
Number of Initial Target 1 Value: 32


# Split feature and biopsy target, then save training and validation data sets

In [50]:
create_train_val_data(feat_data, biop_target, test_sz = 0.2, rand_st = 1)

Target Variable Name: Biopsy
Number of Initial Target 0 Value: 540
Number of Initial Target 1 Value: 40


# Split feature and Hinselmann target, then save training and validation data sets

In [51]:
create_train_val_data(feat_data, hins_target, test_sz = 0.2, rand_st = 1)

Target Variable Name: Hinselmann
Number of Initial Target 0 Value: 554
Number of Initial Target 1 Value: 26


# Split feature and Schiller target, then save training and validation data sets

In [52]:
create_train_val_data(feat_data, sch_target, test_sz = 0.2, rand_st = 1)

Target Variable Name: Schiller
Number of Initial Target 0 Value: 525
Number of Initial Target 1 Value: 55


# Split feature and combination target, then save training and validation data sets

In [53]:
create_train_val_data(feat_data, combo_target, test_sz = 0.2, rand_st = 1)

Target Variable Name: Combo_target
Number of Initial Target 0 Value: 506
Number of Initial Target 1 Value: 74
