In [89]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.model_selection import train_test_split

In [90]:
#import the csv
df = pd.read_csv("wbc_original.csv")
# to interpolate the missing values
df.interpolate(method ='linear', limit_direction ='forward')

# Outlier Treatment
def outlier_treatment(df, feature):
    q1, q3 = np.percentile(df[feature], [25, 75])
    IQR = q3 - q1 
    lower_range = q1 - (3 * IQR) 
    upper_range = q3 + (3 * IQR)
    to_drop = df[(df[feature]<lower_range)|(df[feature]>upper_range)]
    df.drop(to_drop.index, inplace=True)

outlier_treatment(df, 'class')

In [91]:
#Cleaning the dataset + preprocessing
df = df.drop(['id'], axis=1)
X = df.drop(['class'], axis=1)
Y = df['class'].values
Xnames = X.columns
#X is normalized
# X = pd.DataFrame(normalize(X.values), columns = Xnames)
# X.head()

In [92]:
#Reducing multicollinearity
final_features = [x for x in Xnames]
p = df[Xnames].corr().values.tolist()
for i in range(len(p)):
    for j in range(i+1, len(p)):
        if abs(p[i][j]) > 0.7 and Xnames[i] in final_features:
            final_features.remove(Xnames[i])
print("\n\nFeatures before removing multicollinearity: ", Xnames)
print("\n\nFeatures after removing multicollinearity:\n", final_features)



Features before removing multicollinearity:  Index(['clump_thickness', 'size_uniformity', 'shape_uniformity',
       'marginal_adhesion', 'epithelial_size', 'bare_nucleoli',
       'bland_chromatin', 'normal_nucleoli', 'mitoses'],
      dtype='object')


Features after removing multicollinearity:
 ['clump_thickness', 'marginal_adhesion', 'epithelial_size', 'bare_nucleoli', 'bland_chromatin', 'normal_nucleoli', 'mitoses']


In [93]:
X = X[final_features]
# Splitting the dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [94]:
def calc_prior_prob(Y):
    uniq, count = np.unique(Y_train, return_counts=True)
    res = dict()
    for i,j in zip(uniq, count):
        res.update({str(i):j/Y.size})
    return res

In [95]:
calc_prior_prob(Y_train)

{'2': 0.667262969588551, '4': 0.33273703041144903}

In [100]:
df[df['clump_thickness'] == ]

Unnamed: 0,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
12,5,3,3,3,2,3,4,4,1,4
25,5,2,3,4,2,7,3,6,1,4
27,5,1,1,1,2,1,2,1,1,2
...,...,...,...,...,...,...,...,...,...,...
677,5,1,1,1,2,1,1,1,1,2
681,5,10,10,10,4,10,5,6,3,4
682,5,1,1,1,2,1,3,2,1,2
691,5,10,10,5,4,5,4,4,1,4


In [103]:
np.unique(df['clump_thickness'])

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int64),
 array([145,  50, 108,  80, 130,  34,  23,  46,  14,  69], dtype=int64))

In [110]:
for i in np.unique(df['clump_thickness']):
    df2 = df[df['clump_thickness'] == i]
    df3 = df2[df2['class'] == 2]
    print(df2.count()

     clump_thickness  size_uniformity  shape_uniformity  marginal_adhesion  \
6                  1                1                 1                  1   
10                 1                1                 1                  1   
13                 1                1                 1                  1   
24                 1                1                 1                  1   
29                 1                1                 3                  1   
..               ...              ...               ...                ...   
684                1                1                 1                  1   
685                1                1                 1                  1   
686                1                1                 1                  1   
689                1                1                 1                  1   
690                1                1                 1                  3   

     epithelial_size bare_nucleoli  bland_chromatin  normal_nuc

In [115]:
df2 = df[df['clump_thickness'] == 1]
print(df2.count())
df3 = df2[df2['class'] == 2]
print(df3.count())

clump_thickness      145
size_uniformity      145
shape_uniformity     145
marginal_adhesion    145
epithelial_size      145
bare_nucleoli        145
bland_chromatin      145
normal_nucleoli      145
mitoses              145
class                145
dtype: int64
clump_thickness      142
size_uniformity      142
shape_uniformity     142
marginal_adhesion    142
epithelial_size      142
bare_nucleoli        142
bland_chromatin      142
normal_nucleoli      142
mitoses              142
class                142
dtype: int64
