In [1]:
import numpy as np
import pandas as pd

## Pre-processing

### Student performance data set (Portuguese course, `student-por.csv`)

#### 1) Read in data set

In [2]:
# define category ordering for ordinal variables
# levels are the string codes as they appear in the CSV, listed from lowest to highest
# Medu / Fedu carry five distinct codes in this file, so level '0' has to be declared
cat_Medu = pd.CategoricalDtype(categories = ['0', '1', '2', '3', '4'], ordered = True)
cat_Fedu = pd.CategoricalDtype(categories = ['0', '1', '2', '3', '4'], ordered = True)
cat_traveltime = pd.CategoricalDtype(categories = ['1', '2', '3', '4'], ordered = True)
cat_studytime = pd.CategoricalDtype(categories = ['1', '2', '3', '4'], ordered = True)
# NOTE(review): failures is assumed to be coded '0'-'3' - the check after the read fails loudly if not
cat_failures = pd.CategoricalDtype(categories = ['0', '1', '2', '3'], ordered = True)

# define variable types
# binary / nominal vars -> unordered category
# ordinal vars          -> ordered category (dtypes defined above)
# numeric vars          -> int8
stu_dtypes = {
    'school' : 'category',
    'sex' : 'category',
    'age' : 'int8',
    'address' : 'category',
    'famsize' : 'category',
    'Pstatus' : 'category',
    'Medu' : cat_Medu,
    'Fedu' : cat_Fedu,
    'Mjob' : 'category',
    'Fjob' : 'category',
    'reason' : 'category',
    'guardian' : 'category',
    'traveltime' : cat_traveltime,
    'studytime' : cat_studytime,
    'failures' : cat_failures,
    'schoolsup' : 'category',
    'famsup' : 'category',
    'paid' : 'category',
    'activities' : 'category',
    'nursery' : 'category',
    'higher' : 'category',
    'internet' : 'category',
    'romantic' : 'category',
    'famrel' : 'int8',
    'freetime' : 'int8',
    'goout' : 'int8',
    'Dalc' : 'int8',
    'Walc' : 'int8',
    'health' : 'int8',
    'absences' : 'int8',
    'G1' : 'int8',
    'G2' : 'int8',
    'G3' : 'int8'
}

# read in student performance data set
# NOTE(review): this loads student-por.csv - confirm this is the intended course file
student = pd.read_csv('data/student/student-por.csv', sep = ';', dtype = stu_dtypes)

# read_csv turns any value outside the declared levels into NaN,
# so make sure no ordinal value was silently lost
ordinal_vars = ['Medu', 'Fedu', 'traveltime', 'studytime', 'failures']
n_undeclared = student[ordinal_vars].isna().sum()
assert not n_undeclared.any(), \
    'values outside the declared ordinal levels:\n{}'.format(n_undeclared[n_undeclared > 0])

#### 2) Inspect

In [3]:
# preview the first ten rows
student.head(n = 10)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13
5,GP,M,16,U,LE3,T,4,3,services,other,...,5,4,2,1,2,5,6,12,12,13
6,GP,M,16,U,LE3,T,2,2,other,other,...,4,4,4,1,1,3,0,13,12,13
7,GP,F,17,U,GT3,A,4,4,other,teacher,...,4,1,4,1,1,1,2,10,13,13
8,GP,M,15,U,LE3,A,3,2,services,other,...,4,2,2,1,1,1,0,15,16,17
9,GP,M,15,U,GT3,T,3,4,other,other,...,5,5,1,1,1,5,0,12,12,13


In [4]:
# list each column with its non-null count and dtype
student.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 33 columns):
school        649 non-null category
sex           649 non-null category
age           649 non-null int8
address       649 non-null category
famsize       649 non-null category
Pstatus       649 non-null category
Medu          649 non-null category
Fedu          649 non-null category
Mjob          649 non-null category
Fjob          649 non-null category
reason        649 non-null category
guardian      649 non-null category
traveltime    649 non-null category
studytime     649 non-null category
failures      649 non-null category
schoolsup     649 non-null category
famsup        649 non-null category
paid          649 non-null category
activities    649 non-null category
nursery       649 non-null category
higher        649 non-null category
internet      649 non-null category
romantic      649 non-null category
famrel        649 non-null int8
freetime      649 non-null int8
goout  

In [5]:
# summary statistics of the int8 columns, one row per variable
student.describe(include = ['int8']).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,649.0,16.744222,1.218138,15.0,16.0,17.0,18.0,22.0
famrel,649.0,3.930663,0.955717,1.0,4.0,4.0,5.0,5.0
freetime,649.0,3.180277,1.051093,1.0,3.0,3.0,4.0,5.0
goout,649.0,3.1849,1.175766,1.0,2.0,3.0,4.0,5.0
Dalc,649.0,1.502311,0.924834,1.0,1.0,1.0,2.0,5.0
Walc,649.0,2.280431,1.28438,1.0,1.0,2.0,3.0,5.0
health,649.0,3.53621,1.446259,1.0,2.0,4.0,5.0,5.0
absences,649.0,3.659476,4.640759,0.0,0.0,2.0,6.0,32.0
G1,649.0,11.399076,2.745265,0.0,10.0,11.0,13.0,19.0
G2,649.0,11.570108,2.913639,0.0,10.0,11.0,13.0,19.0


In [6]:
# count / unique / top / freq of the category columns, one row per variable
student.describe(include = ['category']).T

Unnamed: 0,count,unique,top,freq
school,649,2,GP,423
sex,649,2,F,383
address,649,2,U,452
famsize,649,2,GT3,457
Pstatus,649,2,T,569
Medu,649,5,2,186
Fedu,649,5,2,209
Mjob,649,5,other,258
Fjob,649,5,other,367
reason,649,4,course,285


#### 3) Select variables

In [7]:
# G3 serves as the target variable; the G1 and G2 columns are removed
student = student.drop(columns = ['G1', 'G2'])

#### 4) Save

In [8]:
# write the cleaned student frame to a pickle file
student.to_pickle(path = 'output/student_cleaned.pkl')

### Cervical cancer risk data set 

#### 1) Read in data set

In [9]:
# define variable types
# numeric vars   -> float (int is not possible because of the NAs)
# remaining vars -> category
can_dtypes = {
    'Age' : 'float',
    'Number of sexual partners' : 'float',
    # the column header in the CSV carries no '(age)' suffix; a key that does
    # not match a column is ignored by read_csv
    'First sexual intercourse' : 'float',
    'Num of pregnancies' : 'float',
    'Smokes' : 'category',
    'Smokes (years)' : 'float',
    'Smokes (packs/year)' : 'float',
    'Hormonal Contraceptives' : 'category',
    'Hormonal Contraceptives (years)' : 'float',
    'IUD' : 'category',
    'IUD (years)' : 'float',
    'STDs' : 'category',
    'STDs (number)' : 'float',
    'STDs:condylomatosis' : 'category',
    'STDs:cervical condylomatosis' : 'category',
    'STDs:vaginal condylomatosis' : 'category',
    'STDs:vulvo-perineal condylomatosis' : 'category',
    'STDs:syphilis' : 'category',
    'STDs:pelvic inflammatory disease' : 'category',
    'STDs:genital herpes' : 'category',
    'STDs:molluscum contagiosum' : 'category',
    'STDs:AIDS' : 'category',
    'STDs:HIV' : 'category',
    'STDs:Hepatitis B' : 'category',
    'STDs:HPV' : 'category',
    'STDs: Number of diagnosis' : 'float',
    'STDs: Time since first diagnosis' : 'float',
    'STDs: Time since last diagnosis' : 'float',
    'Dx:Cancer' : 'category',
    'Dx:CIN' : 'category',
    'Dx:HPV' : 'category',
    'Dx' : 'category',
    'Hinselmann' : 'category',
    'Schiller' : 'category',
    'Citology' : 'category',
    'Biopsy' : 'category'
}

# read in cervical cancer data set; '?' entries are parsed as missing values
cancer = pd.read_csv('data/cervical_cancer/risk_factors_cervical_cancer.csv', dtype = can_dtypes, na_values = '?')

#### 2) Inspect

In [10]:
# preview the first ten rows
cancer.head(n = 10)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18.0,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0,0,0,0,0,0,0,0
1,15.0,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0,0,0,0,0,0,0,0
2,34.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0,0,0,0,0,0,0,0
3,52.0,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,,,1,0,1,0,0,0,0,0
4,46.0,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,,,0,0,0,0,0,0,0,0
5,42.0,3.0,23.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0,0,0,0,0,0,0,0
6,51.0,3.0,17.0,6.0,1.0,34.0,3.4,0.0,0.0,1.0,...,,,0,0,0,0,1,1,0,1
7,26.0,1.0,26.0,3.0,0.0,0.0,0.0,1.0,2.0,1.0,...,,,0,0,0,0,0,0,0,0
8,45.0,1.0,20.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,1,0,1,1,0,0,0,0
9,44.0,3.0,15.0,,1.0,1.266973,2.8,0.0,0.0,,...,,,0,0,0,0,0,0,0,0


In [11]:
# list each column with its non-null count and dtype
cancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 36 columns):
Age                                   858 non-null float64
Number of sexual partners             832 non-null float64
First sexual intercourse              851 non-null float64
Num of pregnancies                    802 non-null float64
Smokes                                845 non-null category
Smokes (years)                        845 non-null float64
Smokes (packs/year)                   845 non-null float64
Hormonal Contraceptives               750 non-null category
Hormonal Contraceptives (years)       750 non-null float64
IUD                                   741 non-null category
IUD (years)                           741 non-null float64
STDs                                  753 non-null category
STDs (number)                         753 non-null float64
STDs:condylomatosis                   753 non-null category
STDs:cervical condylomatosis          753 non-null category
STDs

In [12]:
# summary statistics of the float columns, one row per variable
cancer.describe(include = ['float']).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,858.0,26.820513,8.497948,13.0,20.0,25.0,32.0,84.0
Number of sexual partners,832.0,2.527644,1.66776,1.0,2.0,2.0,3.0,28.0
First sexual intercourse,851.0,16.9953,2.803355,10.0,15.0,17.0,18.0,32.0
Num of pregnancies,802.0,2.275561,1.447414,0.0,1.0,2.0,3.0,11.0
Smokes (years),845.0,1.219721,4.089017,0.0,0.0,0.0,0.0,37.0
Smokes (packs/year),845.0,0.453144,2.22661,0.0,0.0,0.0,0.0,37.0
Hormonal Contraceptives (years),750.0,2.256419,3.764254,0.0,0.0,0.5,3.0,30.0
IUD (years),741.0,0.514804,1.943089,0.0,0.0,0.0,0.0,19.0
STDs (number),753.0,0.176627,0.561993,0.0,0.0,0.0,0.0,4.0
STDs: Number of diagnosis,858.0,0.087413,0.302545,0.0,0.0,0.0,0.0,3.0


In [13]:
# count / unique / top / freq of the category columns, one row per variable
cancer.describe(include = ['category']).T

Unnamed: 0,count,unique,top,freq
Smokes,845,2,0.0,722
Hormonal Contraceptives,750,2,1.0,481
IUD,741,2,0.0,658
STDs,753,2,0.0,674
STDs:condylomatosis,753,2,0.0,709
STDs:cervical condylomatosis,753,1,0.0,753
STDs:vaginal condylomatosis,753,2,0.0,749
STDs:vulvo-perineal condylomatosis,753,2,0.0,710
STDs:syphilis,753,2,0.0,735
STDs:pelvic inflammatory disease,753,2,0.0,752


#### 3) Select variables

In [14]:
# the Biopsy diagnosis serves as the target variable;
# the Hinselmann, Schiller and Citology columns are removed
cancer = cancer.drop(columns = ['Hinselmann', 'Schiller', 'Citology'])

#### 4) Save 

In [15]:
# write the cleaned cancer frame to a pickle file
cancer.to_pickle(path = 'output/cancer_cleaned.pkl')