In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.model_selection import train_test_split

In [2]:
#import the csv
df = pd.read_csv("wbc_original.csv")

# to interpolate the missing values
def fillNaN(df):
    df.replace("?", np.nan, inplace = True)
    NaNcols = list()
    for column in df.isnull().columns.values.tolist():
        if df.isnull()[column].value_counts()[0] < df.shape[0]:
            NaNcols.append(column)
    for col in NaNcols:    
        avg_norm_loss = df[col].astype("float").mean(axis=0)
        df[col].replace(np.nan, str(round(avg_norm_loss)), inplace=True) 

fillNaN(df)

# Outlier Treatment
def outlier_treatment(df, feature):
    q1, q3 = np.percentile(df[feature], [25, 75])
    IQR = q3 - q1 
    lower_range = q1 - (3 * IQR) 
    upper_range = q3 + (3 * IQR)
    to_drop = df[(df[feature]<lower_range)|(df[feature]>upper_range)]
    df.drop(to_drop.index, inplace=True)

outlier_treatment(df, 'class')

In [3]:
#Cleaning the dataset + preprocessing
df = df.drop(['id'], axis=1)
X = df.drop(['class'], axis=1)
Y = df['class']
Xnames = X.columns
#X is normalized
# X = pd.DataFrame(normalize(X.values), columns = Xnames)
# X.head()

In [4]:
#Reducing multicollinearity
final_features = [x for x in Xnames]
p = df[Xnames].corr().values.tolist()
for i in range(len(p)):
    for j in range(i+1, len(p)):
        if abs(p[i][j]) > 0.7 and Xnames[i] in final_features:
            final_features.remove(Xnames[i])
print("\n\nFeatures before removing multicollinearity: ", Xnames)
print("\n\nFeatures after removing multicollinearity:\n", final_features)
X = X[final_features]



Features before removing multicollinearity:  Index(['clump_thickness', 'size_uniformity', 'shape_uniformity',
       'marginal_adhesion', 'epithelial_size', 'bare_nucleoli',
       'bland_chromatin', 'normal_nucleoli', 'mitoses'],
      dtype='object')


Features after removing multicollinearity:
 ['clump_thickness', 'marginal_adhesion', 'epithelial_size', 'bare_nucleoli', 'bland_chromatin', 'normal_nucleoli', 'mitoses']


In [5]:
# Splitting the dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
train = X_train
train['class'] = Y_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['class'] = Y_train


In [6]:
prior = (train.groupby("class").count() / len(train)).iloc[:,1] # Estimate prior probabilities
classes = np.unique(train["class"].tolist()) # Storing all possible classes