# Kimiya Ghanai Machine Learning

## Comparing ML Classification Algorithms

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## import Breast Cancer dataset

In [2]:
df= pd.read_csv('BRCA.csv')
df.head()

Unnamed: 0,Patient_ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
0,TCGA-D8-A1XD,36.0,FEMALE,0.080353,0.42638,0.54715,0.27368,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,15-Jan-17,19-Jun-17,Alive
1,TCGA-EW-A1OX,43.0,FEMALE,-0.42032,0.57807,0.61447,-0.031505,II,Mucinous Carcinoma,Positive,Positive,Negative,Lumpectomy,26-Apr-17,09-Nov-18,Dead
2,TCGA-A8-A079,69.0,FEMALE,0.21398,1.3114,-0.32747,-0.23426,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,08-Sep-17,09-Jun-18,Alive
3,TCGA-D8-A1XR,56.0,FEMALE,0.34509,-0.21147,-0.19304,0.12427,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,25-Jan-17,12-Jul-17,Alive
4,TCGA-BH-A0BF,56.0,FEMALE,0.22155,1.9068,0.52045,-0.31199,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,06-May-17,27-Jun-19,Dead


In [3]:
df.isnull().sum()

Patient_ID             7
Age                    7
Gender                 7
Protein1               7
Protein2               7
Protein3               7
Protein4               7
Tumour_Stage           7
Histology              7
ER status              7
PR status              7
HER2 status            7
Surgery_type           7
Date_of_Surgery        7
Date_of_Last_Visit    24
Patient_Status        20
dtype: int64

In [4]:
df=df.dropna()
df.isnull().sum()

Patient_ID            0
Age                   0
Gender                0
Protein1              0
Protein2              0
Protein3              0
Protein4              0
Tumour_Stage          0
Histology             0
ER status             0
PR status             0
HER2 status           0
Surgery_type          0
Date_of_Surgery       0
Date_of_Last_Visit    0
Patient_Status        0
dtype: int64

## Normalizing data

In [5]:
x= df.drop(['Patient_Status','Patient_ID','Date_of_Surgery','Date_of_Last_Visit'],axis='columns')
y= df['Patient_Status']
df['Patient_Status'].value_counts()

Patient_Status
Alive    255
Dead      62
Name: count, dtype: int64

In [6]:
from sklearn.preprocessing import LabelEncoder
x = x.apply(LabelEncoder().fit_transform)
y = LabelEncoder().fit_transform(y)
y.shape

(317,)

## Split train\test

In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
y_test.shape

(64,)

In [8]:
x_train= np.array(x_train)
x_test= np.array(x_test)
y_train= np.array(y_train)
y_test= np.array(y_test)
y_test.shape

(64,)

## Prediction with Naive Bayes

In [9]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(x_train, y_train)
y_pred = nb.predict(x_test)

## Calculating Accuracy, Precision, Recall, Confusion Matrix

In [10]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [11]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("\nConfusion Matrix:\n", cm)

Accuracy: 0.8125
Precision: 0.0
Recall: 0.0

Confusion Matrix:
 [[52  1]
 [11  0]]


### <span style="color:red;"> Naive Bayes classification is too simple for this datset and does not detect the positive cases so I gave it 6/10 </span>

## Prediction with KNN

In [12]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)

## Calculating Accuracy, Precision, Recall, Confusion Matrix

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [14]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("\nConfusion Matrix:\n", cm)

Accuracy: 0.75
Precision: 0.14285714285714285
Recall: 0.09090909090909091

Confusion Matrix:
 [[47  6]
 [10  1]]


### <span style="color:orange;"> k-nearest neighbor classification is one of the weakest module for this dataset so i gave it 7/10</span>

## Prediction with Decision Tree

In [15]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)

## Calculating Accuracy, Precision, Recall, Confusion Matrix

In [16]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [17]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("\nConfusion Matrix:\n", cm)

Accuracy: 0.640625
Precision: 0.125
Recall: 0.18181818181818182

Confusion Matrix:
 [[39 14]
 [ 9  2]]


### <span style="color:green;"> Disicion tree classification has low recall and misses some positive cases so I gave it 8/10 </span>

## Prediction with Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=3,class_weight='balanced')
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

## Calculating Accuracy, Precision, Recall, Confusion Matrix

In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [20]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("\nConfusion Matrix:\n", cm)

Accuracy: 0.671875
Precision: 0.1875
Recall: 0.2727272727272727

Confusion Matrix:
 [[40 13]
 [ 8  3]]


### <span style="color:green;"> Random forest classification is also good for this dataset, much better than a single decision tree so I gave it 8.5/10 </span>

## Prediction with SVM

In [21]:
from sklearn import svm
svm = svm.SVC(kernel='linear',class_weight='balanced')
svm.fit(x_train, y_train)
y_pred = svm.predict(x_test)

## Calculating Accuracy, Precision, Recall, Confusion Matrix

In [22]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [23]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("\nConfusion Matrix:\n", cm)

Accuracy: 0.5
Precision: 0.2
Recall: 0.6363636363636364

Confusion Matrix:
 [[25 28]
 [ 4  7]]


### <span style="color:green;"> SVM classification is the best module for this dataset and detcets the positive cases very well so I gave it 10/10</span>

## Prediction with Logestic Regression

In [24]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=1000,class_weight='balanced')
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)

## Calculating Accuracy, Precision, Recall, Confusion Matrix

In [25]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [26]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("\nConfusion Matrix:\n", cm)

Accuracy: 0.5625
Precision: 0.20689655172413793
Recall: 0.5454545454545454

Confusion Matrix:
 [[30 23]
 [ 5  6]]


### <span style="color:green;">Logistic regression is the second of the best module because of high recall and more balanced so i gave it 9.5/10</span>

## Prediction with ANN

In [27]:
from sklearn.neural_network import MLPClassifier
ann = MLPClassifier(hidden_layer_sizes=100, max_iter=1000)
ann.fit(x_train, y_train)
y_pred = ann.predict(x_test)

## Calculating Accuracy, Precision, Recall, Confusion Matrix

In [28]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [29]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("\nConfusion Matrix:\n", cm)

Accuracy: 0.6875
Precision: 0.0
Recall: 0.0

Confusion Matrix:
 [[44  9]
 [11  0]]


### <span style="color:red;">ANN classification is the weakest algorithms for this dataset so i gave it 4/10</span>

# <span style="color:purple;"> So as we can see SVM (support vector machine) is the best one with 50% accuracy, 20% precision and 63% recall for this dataset otherwise ANN (artificial neural network) is the weakest one with 68% acuuracy which is meaningless here and 0% for both precision and recall, thus, the best option here is SVM</span>