In [66]:
import numpy as np 
import pandas as pd

In [67]:
# Load the dataset from 'cancer_data.csv' (path is relative to the working directory).
df=pd.read_csv('cancer_data.csv')

PRE-PROCESSING

In [68]:
# Show column names, non-null counts and dtypes for a first look at the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [34]:
# Count missing values in each column.
df.isnull().sum()

id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

In [69]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(569, 32)

In [70]:
# Remove the 'id' column so it is not used as a feature.
# A non-inplace drop with errors='ignore' keeps this cell re-runnable: executing
# it a second time no longer raises KeyError because 'id' is already gone.
df = df.drop(columns='id', errors='ignore')

In [71]:

# Replace the text label with a single indicator column: get_dummies one-hot
# encodes the labels and drop_first=True discards the first level's column.
# NOTE(review): presumably the labels are 'B'/'M', so the kept indicator marks 'M' — confirm against the CSV.
df['diagnosis']=pd.get_dummies(df['diagnosis'],drop_first=True)

In [72]:
# The encoded diagnosis is the target; every remaining column is a feature.
y = df['diagnosis']
X = df.drop(columns=['diagnosis'])

In [73]:
from sklearn.preprocessing import MinMaxScaler

# Pick the numeric columns from the feature matrix itself (not from df), so the
# target column can never end up in the list regardless of its dtype.
num_cols = X.select_dtypes(include=['int64', 'float64', 'int32']).columns

# Mean-impute missing values. Assigning the result back replaces the chained
# `X[col].fillna(..., inplace=True)` form, which acts on a temporary object
# under pandas copy-on-write and therefore may not modify X at all.
X[num_cols] = X[num_cols].fillna(X[num_cols].mean())

# Scale the imputed features to [0, 1]. The scaler is fitted on X so the
# imputation above is actually used (fitting on df would discard it).
# NOTE: the scaler is fitted on all rows at this point, i.e. before any
# train/test split, so held-out rows influence the min/max statistics.
minmax = MinMaxScaler()
X[num_cols] = minmax.fit_transform(X[num_cols])

In [74]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Naive Bayes


In [75]:
class NaiveBayes:
    """Naive Bayes classifier based on exact-value frequency tables.

    Every distinct value of a feature is treated as its own category: the
    likelihood tables are keyed by ``"<value>_<class>"`` and filled with
    relative frequencies observed in the training data. No smoothing is
    applied.

    Note: because values are matched exactly, features with many distinct
    values (e.g. continuous measurements) give near-unique table entries, so
    the model essentially memorises the training rows.
    """

    def __init__(self):
        # Fitted state; populated by fit().
        self.features = []
        self.likelihoods = {}   # feature -> {"<value>_<class>": P(value | class)}
        self.class_priors = {}  # class -> P(class)
        self.pred_priors = {}   # feature -> {value: P(value)}

        self.X_train = None
        self.y_train = None
        self.train_size = 0
        self.num_feats = 0

    def fit(self, X, y):
        """Build the prior and likelihood tables from the training data.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features, one column per feature.
        y : pandas.Series
            Class labels; must share X's index.
        """
        self.features = list(X.columns)
        self.X_train = X
        self.y_train = y
        self.train_size = X.shape[0]
        self.num_feats = X.shape[1]

        # Start from empty tables so that refitting on different data cannot
        # leave entries from a previous fit behind.
        self.likelihoods = {}
        self.pred_priors = {}
        outcomes = np.unique(self.y_train)
        self.class_priors = {outcome: 0 for outcome in outcomes}

        # Pre-fill every (value, class) combination with 0 so combinations
        # that never occur together in training still have an entry.
        for feature in self.features:
            self.likelihoods[feature] = {}
            self.pred_priors[feature] = {}

            for feat_val in np.unique(self.X_train[feature]):
                self.pred_priors[feature][feat_val] = 0

                for outcome in outcomes:
                    self.likelihoods[feature][str(feat_val) + '_' + str(outcome)] = 0

        self._calc_class_prior()
        self._calc_likelihoods()
        self._calc_predictor_prior()

    def _calc_class_prior(self):
        """Set P(class) to the relative frequency of each class in y_train."""
        for outcome in np.unique(self.y_train):
            outcome_count = (self.y_train == outcome).sum()
            self.class_priors[outcome] = outcome_count / self.train_size

    def _calc_likelihoods(self):
        """Set P(value | class) for every feature value observed with each class."""
        for feature in self.features:

            for outcome in np.unique(self.y_train):
                in_class = self.y_train == outcome
                outcome_count = in_class.sum()
                # Frequencies of this feature's values among rows of this class.
                value_counts = self.X_train.loc[in_class, feature].value_counts().to_dict()

                for feat_val, count in value_counts.items():
                    self.likelihoods[feature][str(feat_val) + '_' + str(outcome)] = count / outcome_count

    def _calc_predictor_prior(self):
        """Set P(value) to the relative frequency of each value of each feature."""
        for feature in self.features:
            feat_vals = self.X_train[feature].value_counts().to_dict()

            for feat_val, count in feat_vals.items():
                self.pred_priors[feature][feat_val] = count / self.train_size

    def predict(self, X):
        """Predict a class label for every row of X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Rows to classify; columns must be in the same order as in fit().

        Returns
        -------
        numpy.ndarray
            Predicted label per row.

        Raises
        ------
        RuntimeError
            If called before fit().

        Notes
        -----
        A feature value that was never seen during fit() contributes a
        likelihood of 0 instead of raising KeyError. When no class ends up
        with a positive posterior, the class with the largest prior is
        returned.
        """
        if self.y_train is None:
            raise RuntimeError("NaiveBayes.predict called before fit")

        results = []
        X = np.array(X)
        outcomes = np.unique(self.y_train)

        for query in X:
            probs_outcome = {}
            for outcome in outcomes:
                prior = self.class_priors[outcome]
                likelihood = 1
                evidence = 1

                for feat, feat_val in zip(self.features, query):
                    # .get(): unseen values have no table entry.
                    likelihood *= self.likelihoods[feat].get(str(feat_val) + '_' + str(outcome), 0)
                    evidence *= self.pred_priors[feat].get(feat_val, 0)

                # Evidence is 0 only when some value was unseen; the
                # likelihood is then 0 as well, so avoid the 0/0 division.
                if evidence > 0:
                    posterior = (likelihood * prior) / evidence
                else:
                    posterior = 0.0

                probs_outcome[outcome] = posterior

            if any(p > 0 for p in probs_outcome.values()):
                result = max(probs_outcome, key=lambda x: probs_outcome[x])
            else:
                # No class is supported by the tables: fall back to the prior.
                result = max(self.class_priors, key=lambda x: self.class_priors[x])
            results.append(result)

        return np.array(results)

In [78]:
import math
def accuracy_score(y_true, y_pred):
    """Return the percentage of matching labels, rounded to two decimals.

    Inputs are compared position by position after conversion to numpy
    arrays, so plain lists, arrays and pandas Series (whatever their index)
    are all accepted.

    NOTE: a later cell imports ``sklearn.metrics.accuracy_score``, which
    rebinds this name to a function returning a fraction in [0, 1] instead of
    a percentage.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    float
        Accuracy in percent (0-100).

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    ZeroDivisionError
        If the inputs are empty.
    """
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got {} and {}".format(
                true_arr.shape, pred_arr.shape
            )
        )

    n_correct = np.count_nonzero(pred_arr == true_arr)
    return round(float(n_correct) / float(len(true_arr)) * 100, 2)

In [79]:
nb_clf = NaiveBayes()
nb_clf.fit(X, y)

print("Train Accuracy: {}".format(accuracy_score(y, nb_clf.predict(X))))

Train Accuracy: 100.0


KNN

In [80]:
from scipy.spatial.distance import cdist
from sklearn.metrics import accuracy_score

In [81]:
class KNN:
    """Brute-force k-nearest-neighbours classifier using majority voting.

    Distances are computed with ``scipy.spatial.distance.cdist`` using its
    default metric.
    """

    def __init__(self):
        pass

    def fit(self, X, y):
        """Store the training data.

        Parameters
        ----------
        X : array-like of shape (n_train, n_features)
            Training features.
        y : array-like of shape (n_train,)
            Training labels. Converted to a numpy array so neighbours can be
            looked up by position even when a pandas Series is passed.
        """
        self.X_train = X
        self.y_train = np.asarray(y)

    def predict(self, X, k):
        """Predict a label for each row of X from its k nearest training rows.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Rows to classify.
        k : int
            Number of neighbours that vote; must be at least 1. Values larger
            than the training set size use every training row.

        Returns
        -------
        numpy.ndarray
            Predicted label per row. When several labels tie for the most
            votes, the smallest label (in ``np.unique`` order) wins.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be a positive integer, got {!r}".format(k))

        # Pairwise distances: one row per query, one column per training row.
        dists = cdist(X, self.X_train)

        # Positions of the k closest training rows for every query.
        idx = np.argsort(dists, axis=1)[:, :k]

        # Labels of those neighbours, shape (n_samples, k).
        k_nearest_labels = self.y_train[idx]

        predictions = []
        for labels in k_nearest_labels:
            unique_labels, counts = np.unique(labels, return_counts=True)
            pred_label = unique_labels[np.argmax(counts)]
            predictions.append(pred_label)

        return np.array(predictions)

In [82]:
knn = KNN()

# Labels must be a plain array so neighbours can be looked up by position.
# np.asarray (rather than `.values`) keeps this cell re-runnable: on a second
# run y_train is already an ndarray, which has no `.values` attribute.
y_train = np.asarray(y_train)
knn.fit(X_train, y_train)
# Predict the labels for the testing data
y_pred = knn.predict(X_test, k=5)
# Calculate the accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)

In [84]:
# Test-set accuracy of the KNN model; the value shown is a fraction in [0, 1], not a percentage.
print(accuracy)

0.9649122807017544
