### Cryotherapy Dataset Data Set
https://archive.ics.uci.edu/ml/datasets/Cryotherapy+Dataset+

| Name                | Type    | Description                                            |
|---------------------|---------|--------------------------------------------------------|
| sex                 | integer | Sex of patient: 1, 2                                   |
| age                 | integer | Age of patient: numerical value                        |
| time                | float   | Time elapsed before treatment (month): numerical value |
| number_of_warts     | integer | Number of warts: numerical value                       |
| type                | integer | Type of wart: 1, 2, 3                                  |
| area                | integer | Surface area of warts (mm^2): numerical value          |
| result_of_treatment | integer | Result of treatment: 0, 1                              |

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv("cryotherapy.csv",header=None)
le = LabelEncoder()
for label, content in df.iteritems():
    if content.dtype == "object":
        df[label] = le.fit_transform(df[label])
df1 = df.apply(pd.to_numeric, errors='coerce')
df1.fillna(0)
continuous_index =  df1.dtypes[df1.dtypes == "float64"].index.values.tolist()

In [3]:
for col in range(0,len(df1.columns)):
    print(str(col) + " " + str(len(df1[col].value_counts())))

0 2
1 24
2 36
3 12
4 3
5 22
6 2


In [14]:
df.dtypes

0      int64
1      int64
2    float64
3      int64
4      int64
5      int64
6      int64
dtype: object

In [15]:
continuous_index

[2]

In [16]:
df1

Unnamed: 0,0,1,2,3,4,5,6
0,1,35,12.00,5,1,100,0
1,1,29,7.00,5,1,96,1
2,1,50,8.00,1,3,132,0
3,1,32,11.75,7,3,750,0
4,1,67,9.25,1,1,42,0
...,...,...,...,...,...,...,...
85,2,34,12.00,3,3,95,0
86,2,20,3.50,6,1,75,1
87,2,35,8.25,8,3,100,0
88,1,24,10.75,10,1,20,1


In [17]:
df2 = pd.DataFrame()
for i in df1.dtypes.index.values.tolist():
    if(i in continuous_index):
        npa = df1.iloc[:,i].to_numpy()
        npa1 = npa[~np.isnan(npa)]
        bins = np.histogram_bin_edges(npa1)
        npa2 = np.digitize(npa, bins)
        df2.insert(loc=i,column=i,value=npa2)
    else:
        series = df1.iloc[:,i]
        df2.insert(loc=i,column=i,value=series)

In [18]:
df2

Unnamed: 0,0,1,2,3,4,5,6
0,1,35,11,5,1,100,0
1,1,29,6,5,1,96,1
2,1,50,7,1,3,132,0
3,1,32,10,7,3,750,0
4,1,67,8,1,1,42,0
...,...,...,...,...,...,...,...
85,2,34,11,3,3,95,0
86,2,20,3,6,1,75,1
87,2,35,7,8,3,100,0
88,1,24,9,10,1,20,1


In [19]:
header_names=[]
for i in range(0, len(df2.columns)-1):
    header_names.append('X'+str(i))
header_names.append('Y1')
df2.to_csv("cryotherapy-discretized.csv", index=False, header=header_names)