In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.model_selection import train_test_split

In [44]:
# Load the CSV file into the DataFrame `df` that the following cells clean up.
df = pd.read_csv("wbc_original.csv")

# to interpolate the missing values
def fillNaN(df):
    """Impute the missing entries of ``df`` in place with the rounded column mean.

    Every "?" cell is first turned into NaN. Each column that then contains
    at least one NaN has those NaNs replaced by the mean of its remaining
    values (computed after casting the column to float), rounded to the
    nearest integer.

    For a non-numeric column the fill value is written as a string, so the
    column keeps holding strings afterwards (no numeric conversion is applied
    to the column itself). A column that already has a numeric dtype receives
    the number. A column with no usable value at all is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place; nothing is returned.

    Raises
    ------
    ValueError
        If a column with missing entries holds a value that cannot be cast
        to float.
    """
    df.replace("?", np.nan, inplace=True)

    # isna().any() selects the columns to impute without a value_counts()
    # label lookup, which raises KeyError when a column is entirely NaN.
    has_missing = df.isna().any()
    for col in has_missing[has_missing].index:
        col_mean = df[col].astype("float").mean()
        if np.isnan(col_mean):
            # Entirely missing column: there is nothing to average.
            continue

        fill_value = round(col_mean)
        if not pd.api.types.is_numeric_dtype(df[col]):
            # Match the string representation of the existing cells.
            fill_value = str(fill_value)

        # Assign the result back to the frame: an in-place replace on the
        # df[col] selection may act on a temporary and leave df unchanged.
        df[col] = df[col].fillna(fill_value)

# Impute the missing entries of df; fillNaN modifies the frame in place.
fillNaN(df)

# Outlier Treatment
def outlier_treatment(df, feature, factor=3):
    """Drop, in place, the rows of ``df`` whose ``feature`` value is an IQR outlier.

    With Q1 and Q3 the 25th and 75th percentiles of the column and
    IQR = Q3 - Q1, a row is removed when its value is strictly below
    ``Q1 - factor * IQR`` or strictly above ``Q3 + factor * IQR``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. Rows are removed in place; nothing is returned.
    feature : str
        Name of the column the fences are computed on.
    factor : float, default 3
        Multiplier applied to the IQR to place the fences. The default
        reproduces the previously hard-coded value.
    """
    q1, q3 = np.percentile(df[feature], [25, 75])
    margin = factor * (q3 - q1)
    lower_range = q1 - margin
    upper_range = q3 + margin

    values = df[feature]
    is_outlier = (values < lower_range) | (values > upper_range)
    # Drop by index label, as before, so the caller's frame object is mutated.
    df.drop(is_outlier[is_outlier].index, inplace=True)

# Remove rows whose 'class' value falls outside the IQR fences.
# NOTE(review): 'class' is the column later used as the target Y, not a
# feature — confirm this is the column the outlier filter was meant for.
outlier_treatment(df, 'class')

In [45]:
# Cleaning the dataset + preprocessing:
# discard the 'id' column, then separate the 'class' target from the features.
df = df.drop(columns=['id'])
Y = df['class']
X = df.drop(columns=['class'])
Xnames = X.columns
# Normalisation of X is currently disabled; the lines are kept for reference.
# X = pd.DataFrame(normalize(X.values), columns = Xnames)
# X.head()

In [46]:
#Reducing multicollinearity
# Cast to float before correlating: fillNaN writes its fill values as strings,
# and depending on the pandas version corr() may skip or reject non-numeric
# columns. A matrix with fewer columns than Xnames would make the positional
# lookups below refer to the wrong features.
p = df[Xnames].astype(float).corr().values.tolist()
# Keep a feature only when it has no strong correlation (|r| > 0.7) with any
# feature listed after it, i.e. the earlier member of a correlated pair is
# the one that gets dropped.
final_features = [
    name
    for i, name in enumerate(Xnames)
    if not any(abs(r) > 0.7 for r in p[i][i + 1:])
]
print("\n\nFeatures before removing multicollinearity: ", Xnames)
print("\n\nFeatures after removing multicollinearity:\n", final_features)
X = X[final_features]



Features before removing multicollinearity:  Index(['clump_thickness', 'size_uniformity', 'shape_uniformity',
       'marginal_adhesion', 'epithelial_size', 'bare_nucleoli',
       'bland_chromatin', 'normal_nucleoli', 'mitoses'],
      dtype='object')


Features after removing multicollinearity:
 ['clump_thickness', 'marginal_adhesion', 'epithelial_size', 'bare_nucleoli', 'bland_chromatin', 'normal_nucleoli', 'mitoses']


In [47]:
# Splitting the dataset: 20% of the rows are held out as the test set,
# and random_state=0 makes the shuffle reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [48]:
def calc_prior_prob(Y):
    """Return the prior probability of every class label found in ``Y``.

    Parameters
    ----------
    Y : array-like
        Class labels (for example a pandas Series, a NumPy array or a list).

    Returns
    -------
    dict
        Maps ``str(label)`` to the fraction of entries of ``Y`` carrying that
        label. Empty when ``Y`` is empty.
    """
    # Count the labels of the argument itself rather than a notebook-level
    # variable, so the counts and the denominator always refer to the same data.
    labels = np.asarray(Y)
    uniq, count = np.unique(labels, return_counts=True)
    return {str(label): n / labels.size for label, n in zip(uniq, count)}

In [49]:
# Display the class priors for the training labels.
calc_prior_prob(Y_train)

{'2': 0.667262969588551, '4': 0.33273703041144903}

In [50]:
# Work on a copy: plain assignment would make forCPT another name for the
# same DataFrame, so adding the label column would also add it to X.
forCPT = X.copy()
forCPT['class'] = Y

In [52]:
# For every value of every column, count the rows that belong to class 2 and
# the rows that do not. Both counts are plain integers.
# Iterating a DataFrame yields its column labels, so reversing the rows first
# had no effect on the iteration order; the columns are iterated directly.
# NOTE(review): both counts are overwritten on each iteration and never
# stored — collect them if they are needed after the loop.
is_class2 = forCPT['class'] == 2
for column in forCPT.columns:
    for featvalue in np.unique(forCPT[column]):
        has_value = forCPT[column] == featvalue
        class2featcount = int((has_value & is_class2).sum())
        class4featcount = int(has_value.sum()) - class2featcount

In [28]:
# Boolean array marking the rows whose clump_thickness equals 1.
(forCPT['clump_thickness'] == 1).values

array([False, False, False, False, False, False,  True, False, False,
       False,  True, False, False,  True, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False,  True, False, False, False, False,  True, False, False,
        True, False, False,  True,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False,  True, False,  True, False, False, False,
       False, False, False, False, False, False, False, False,  True,
        True, False,  True, False, False, False, False,  True, False,
       False,  True, False,  True, False, False, False, False,  True,
       False, False,