### Breast Cancer Wisconsin (Original) Data Set
https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

In [2]:
# Load the raw file; it is read without a header row, so columns are labelled 0..10.
df = pd.read_csv("breast-cancer-wisconsin.csv", header=None)

# Integer-encode every text (object-dtype) column in place.
# NOTE(review): LabelEncoder presumably orders string classes lexicographically,
# so a numeric-looking text column would not keep its numeric order (e.g. "10"
# sorting before "2") -- confirm this encoding is what downstream steps expect.
le = LabelEncoder()
for label, content in df.items():  # DataFrame.iteritems() was removed in pandas 2.0
    if content.dtype == "object":
        df[label] = le.fit_transform(df[label])

# Coerce everything to numeric; anything unparseable becomes NaN and is then zero-filled.
df1 = df.apply(pd.to_numeric, errors="coerce")
df1 = df1.fillna(0)  # fillna returns a new frame; the original call discarded it

# Labels of float64 columns, which the discretisation cell treats as continuous.
continuous_index = df1.dtypes[df1.dtypes == "float64"].index.values.tolist()

In [3]:
# Print "<column label> <number of distinct non-null values>" for every column.
n_columns = len(df.columns)
for col in range(n_columns):
    distinct = df1[col].nunique()
    print(f"{col} {distinct}")

0 645
1 10
2 10
3 10
4 10
5 10
6 11
7 10
8 10
9 9
10 2


In [4]:
# Display column 0 of the numeric frame.
df1.loc[:, 0]

0      1000025
1      1002945
2      1015425
3      1016277
4      1017023
        ...   
694     776715
695     841769
696     888820
697     897471
698     897471
Name: 0, Length: 699, dtype: int64

In [5]:
# Only the final expression of a cell is rendered, so the column index on the
# next line is evaluated but not shown; the cell displays column 10.
df1.columns
df1.loc[:, 10]

0      2
1      2
2      2
3      2
4      2
      ..
694    2
695    2
696    4
697    4
698    4
Name: 10, Length: 699, dtype: int64

In [3]:
# Recode the values in column 10 to a binary target: 2 -> 0, 4 -> 1.
# replace() leaves values that are not keys untouched, so re-running this cell on
# already-recoded labels is a no-op, whereas map() would turn them into NaN.
y1_mapping = {2: 0, 4: 1}
df1[10] = df1[10].replace(y1_mapping)

In [7]:
# Show the per-column dtypes of `df` (the frame loaded from the CSV, not `df1`).
df.dtypes

0     int64
1     int64
2     int64
3     int64
4     int64
5     int64
6     int64
7     int64
8     int64
9     int64
10    int64
dtype: object

In [4]:
# Show the labels of the float64 columns; an empty list means no column is binned later.
continuous_index

[]

In [8]:
# Display the numeric frame `df1`.
df1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,0,3,1,1,0
1,1002945,5,4,4,5,7,1,3,2,1,0
2,1015425,3,1,1,1,2,2,3,1,1,0
3,1016277,6,8,8,1,3,4,3,7,1,0
4,1017023,4,1,1,3,2,0,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,0
695,841769,2,1,1,1,2,0,1,1,1,0
696,888820,5,10,10,3,7,3,8,10,2,1
697,897471,4,8,6,4,3,4,10,6,1,1


In [5]:
# Build df2 from df1: columns listed in `continuous_index` are replaced by
# histogram-bin indices, every other column is copied through unchanged.
# Columns are addressed by label throughout; the original used labels as
# positional indices, which only works while the labels happen to be 0..N-1.
discretized = {}
for col in df1.columns.tolist():
    values = df1[col]
    if col in continuous_index:
        raw = values.to_numpy()
        # Bin edges are estimated from the non-NaN values only ...
        edges = np.histogram_bin_edges(raw[~np.isnan(raw)])
        # ... but every row (NaN included) is assigned a bin index.
        discretized[col] = np.digitize(raw, edges)
    else:
        discretized[col] = values

# Assemble the frame in one step instead of growing it with repeated insert().
df2 = pd.DataFrame(discretized, index=df1.index)

# Column 0 is excluded from the exported data (presumably a sample identifier -- confirm).
df2 = df2.drop(columns=0)
df2

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,5,1,1,1,2,0,3,1,1,0
1,5,4,4,5,7,1,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,0,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,0
695,2,1,1,1,2,0,1,1,1,0
696,5,10,10,3,7,3,8,10,2,1
697,4,8,6,4,3,4,10,6,1,1


In [6]:
# Show the column labels remaining in df2 after column 0 was dropped.
df2.columns

Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')

In [7]:
# Export headers: X0..X{k-1} for the feature columns, then Y1 for the last column.
header_names = ['X' + str(i) for i in range(len(df2.columns) - 1)]
header_names.append('Y1')

# Fixed seed so the oversampled file is reproducible across runs; the original
# SMOTE() call was unseeded and produced different synthetic rows every time.
SMOTE_SEED = 42
oversample = SMOTE(random_state=SMOTE_SEED)

# Features are every column but the last; the target is the last column.
cols = df2.columns
X = df2.iloc[:, :-1]
Y = df2.iloc[:, -1]
print("Number of rows before SMOTE: " + str(len(Y)))
X, Y = oversample.fit_resample(X, Y)
print("Number of rows after SMOTE: " + str(len(Y)))

# Copy before attaching the target so that adding Y1 cannot touch X.
df3 = pd.DataFrame(X).copy()
df3['Y1'] = pd.Series(Y)

# Write both the original and the oversampled data with the X*/Y1 header.
df2.to_csv("breast-cancer-wisconsin-discretized.csv", index=False, header=header_names)
df3.to_csv("breast-cancer-wisconsin-discretized-oversampled.csv", index=False, header=header_names)

Number of rows before SMOTE: 699
Number of rows after SMOTE: 916
