# 1. Business Understanding
Tujuan dari analisis ini adalah:


*   Mengetahui Algoritma Machine Learning yang cocok digunakan untuk klasifikasi diagnosis penyakit Kanker
*   Algoritma Machine Learning yang digunakan adalah: Algoritma K-NN, Algoritma Naive Bayes, Algoritma Decision Tree, Algoritma Logistic Regression, Algoritma Random Forest, dan Algoritma Support Vector Machine (SVM)



# 2. Data Understanding


*   Dataset yang digunakan adalah Cancer_Data.csv
*   Dataset tersebut didapatkan melalui Kaggle
https://www.kaggle.com/datasets/erdemtaha/cancer-data



# 3. Data Preparation

In [None]:
# Mengimport seluruh module yang dibutuhkan
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report

#import LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Algoritma k-NN
from sklearn.neighbors import KNeighborsClassifier

# Algoritma Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Algoritma Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Algoritma LogisticRegression
from sklearn.linear_model import LogisticRegression

# Algoritma Random Forest
from sklearn.ensemble import RandomForestClassifier

# Algoritma Support Vector Machine (SVC)
from sklearn.svm import SVC

In [None]:
# Load the cancer dataset from its CSV file into a DataFrame.
# NOTE(review): the path is an absolute Google Drive location - presumably the
# notebook runs on Colab with Drive mounted; confirm before reusing elsewhere.

cancer = pd.read_csv('/content/drive/MyDrive/MYUU DRIVE/Cancer_Data.csv')

# Preview the first rows of the loaded data.

cancer.head()

In [None]:
# Remove the columns that are not used in the analysis: `id` and
# `Unnamed: 32`.
# The result is reassigned (instead of using inplace=True) and
# errors='ignore' is passed so that this cell can be re-run safely: a second
# execution no longer raises KeyError because the columns are already gone.
cancer = cancer.drop(columns=['id', 'Unnamed: 32'], errors='ignore')
cancer.head()

In [None]:
# Inspect the dimensions of the dataset as (rows, columns).
# The original author recorded 569 rows and 31 columns at this point.
cancer.shape

In [None]:
# List the column names that remain in the dataset.
cancer.columns

In [None]:
# Print a summary of the dataset: columns, non-null counts and dtypes.
cancer.info()

In [None]:
# Count the missing values in every column.
cancer.isna().sum()

In [None]:
# Count how many rows carry each diagnosis label
# (the column is still of object dtype here, per the original note).
cancer['diagnosis'].value_counts()

In [None]:
# Encode the nominal diagnosis labels as numbers on a copy, so the text
# labels in `cancer` stay untouched: 'B' -> 0 and 'M' -> 1.
# Series.map is used instead of DataFrame.replace because replacing strings
# with integers relies on pandas' implicit object -> int downcasting, which is
# deprecated (FutureWarning in pandas 2.2); map yields a numeric column
# directly.
# NOTE(review): map turns any label other than 'B'/'M' into NaN - assumes the
# column only holds these two values; confirm with value_counts().
cancer2 = cancer.copy()
cancer2['diagnosis'] = cancer2['diagnosis'].map({'B': 0, 'M': 1})

In [None]:
# Count the rows per diagnosis class after numeric encoding
# (int64, per the original note).
cancer2['diagnosis'].value_counts()

In [None]:
# Show the descriptive statistics of the dataset as a raw array of values
# (the row/column labels of the describe() table are dropped by `.values`).
cancer.describe().values

### Exploratory Data Analysis (EDA)



In [None]:
# Univariate analysis
# 0 untuk  Benign Cancer (B) dan 1 untuk Malignant Cancer (M)
plt.figure(figsize=(9,7))
plt.title('Jumlah Tipe Kanker')
sns.countplot(x='diagnosis', data=cancer2)
plt.xlabel('Tipe')
plt.ylabel('Jumlah')
plt.show()

In [None]:
# Bivariate analysis: side-by-side box plots of two measurements, grouped by
# diagnosis.
#   left panel  - diagnosis vs. radius_mean
#   right panel - diagnosis vs. texture_mean
for posisi, kolom in enumerate(('radius_mean', 'texture_mean'), start=1):
    plt.subplot(1, 2, posisi)
    sns.boxplot(data=cancer, x='diagnosis', y=kolom)

plt.show()

In [None]:
# Distribution of radius_mean for each diagnosis class
# (0 = benign, 1 = malignant), drawn as a density histogram with a KDE curve.
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat="density" is the replacement suggested by seaborn's migration notes
# (kde_kws cut=3 keeps the KDE tails that distplot used to draw).
plt.figure(figsize=(7, 5))
for kelas, warna, nama in ((0, "r", "Benign cancer"), (1, "g", "Malignant cancer")):
    sns.histplot(
        cancer2.radius_mean[cancer2.diagnosis == kelas],
        color=warna,
        label=nama,
        kde=True,
        stat="density",
        kde_kws={"cut": 3},
    )
plt.legend()
plt.show()

In [None]:
# Distribution of texture_mean for each diagnosis class
# (0 = benign, 1 = malignant), drawn as a density histogram with a KDE curve.
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat="density" is the replacement suggested by seaborn's migration notes
# (kde_kws cut=3 keeps the KDE tails that distplot used to draw).
plt.figure(figsize=(7, 5))
for kelas, warna, nama in ((0, "r", "Benign cancer"), (1, "g", "Malignant cancer")):
    sns.histplot(
        cancer2.texture_mean[cancer2.diagnosis == kelas],
        color=warna,
        label=nama,
        kde=True,
        stat="density",
        kde_kws={"cut": 3},
    )
plt.legend()
plt.show()

In [None]:
# Multivariate analysis: pairwise correlation between the numeric features.
# `cancer` still stores `diagnosis` as text ('B'/'M'). DataFrame.corr() raises
# on non-numeric columns in pandas >= 2.0 (older versions silently skipped
# them), so the numeric columns are selected explicitly first.
korelasi = cancer.select_dtypes(include='number').corr()
korelasi.shape

In [None]:
# Heatmap of the correlation matrix, annotated with one-decimal values and
# using a fixed colour scale from -1 to 1.
plt.figure(figsize=(15, 13))
heatmap = sns.heatmap(
    korelasi,
    vmin=-1,
    vmax=1,
    cmap="viridis",
    annot=True,
    fmt=".1f",
    annot_kws={'size': 9},
    linewidth=.5,
    cbar=True,
    square=True,
)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 12}, pad=9)

In [None]:
# Draw a histogram for every column of the dataset on one large figure.
cancer.hist(figsize=(20,20))
plt.show()

# 4. Modeling

In [None]:
# Feature matrix: every column except the `diagnosis` target.
x = cancer.drop('diagnosis', axis=1)
x.head().values

In [None]:
# Target (label) column: the diagnosis to be predicted.
y = cancer['diagnosis']
y.head().values

In [None]:
# Split the data into a training set (70%) and a test set (30%).
# random_state=0 fixes the shuffle so the split is the same on every run.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=0)

### Algoritma k-NN

In [None]:
# k-NN classifier: 5 neighbours, distance-weighted votes, Euclidean metric.
# NOTE(review): presumably `p=2` has no effect once metric='euclidean' is
# given explicitly - confirm against the scikit-learn documentation.
knn = KNeighborsClassifier(n_neighbors=5, weights='distance', p=2, metric='euclidean')

# Fit the classifier on the training split.
knn.fit(xtrain, ytrain)

In [None]:
# Classify the test split with the k-NN model
# (the original comment mistakenly said Naive Bayes).
ypred = knn.predict(xtest)

In [None]:
# Confusion matrix of the k-NN predictions against the true test labels.
print(confusion_matrix(ytest,ypred))

In [None]:
# Accuracy of the k-NN predictions on the test split.
print(accuracy_score(y_true=ytest, y_pred=ypred))

In [None]:
# Classification report for the predictions currently held in `ypred`
# (the k-NN predictions when the notebook is run top to bottom).
report = classification_report(y_true=ytest, y_pred=ypred)
print(report)

### Algoritma Naive Bayes

In [None]:
# Gaussian Naive Bayes classifier with default settings.
nb = GaussianNB()

# Fit the classifier on the training split.
nb.fit(xtrain, ytrain)

In [None]:
# Classify the test split with the Naive Bayes model.
ypred = nb.predict(xtest)

In [None]:
# Accuracy of the Naive Bayes predictions on the test split.
print(accuracy_score(y_true=ytest, y_pred=ypred))

In [None]:
# Classification report for the predictions currently held in `ypred`
# (the Naive Bayes predictions when the notebook is run top to bottom).
report = classification_report(y_true=ytest, y_pred=ypred)
print(report)

### Algoritma Decision Tree

In [None]:
# Decision Tree classifier.
# random_state is fixed (0, matching the other seeded steps in this notebook)
# so the fitted tree - and therefore the reported accuracy - is the same on
# every run; without it the result can vary between executions.
dt = DecisionTreeClassifier(random_state=0)

# Fit the classifier on the training split.
dt.fit(xtrain, ytrain)

In [None]:
# Classify the test split with the Decision Tree model.
ypred = dt.predict(xtest)

In [None]:
# Accuracy of the Decision Tree predictions on the test split.
print(accuracy_score(y_true=ytest, y_pred=ypred))

In [None]:
# Classification report for the predictions currently held in `ypred`
# (the Decision Tree predictions when the notebook is run top to bottom).
report = classification_report(y_true=ytest, y_pred=ypred)
print(report)

### Logistic Regression

In [None]:
# Logistic Regression classifier (named `regressor`, but it is used as a
# classifier throughout this notebook).
# NOTE(review): the features are not scaled and the default iteration limit is
# used - presumably the solver may emit a ConvergenceWarning here; verify.
regressor=LogisticRegression(random_state=0)

# Fit the classifier on the training split.
regressor.fit(xtrain,ytrain)

In [None]:
# Classify the test split with the Logistic Regression model.
ypred=regressor.predict(xtest)

In [None]:
# Accuracy of the Logistic Regression predictions on the test split.
print(accuracy_score(y_true=ytest, y_pred=ypred))

In [None]:
# Classification report for the predictions currently held in `ypred`
# (the Logistic Regression predictions when the notebook is run top to bottom).
report = classification_report(y_true=ytest, y_pred=ypred)
print(report)

### Random Forest

In [None]:
# Random Forest classifier.
# random_state is fixed (0, matching the other seeded steps in this notebook)
# so the fitted forest - and therefore the reported accuracy - is the same on
# every run; without it the result can vary between executions.
RF = RandomForestClassifier(random_state=0)

# Fit the classifier on the training split.
RF.fit(xtrain, ytrain)

In [None]:
# Classify the test split with the Random Forest model.
ypred = RF.predict(xtest)

In [None]:
# Accuracy of the Random Forest predictions on the test split.
print(accuracy_score(y_true=ytest, y_pred=ypred))

In [None]:
# Classification report for the predictions currently held in `ypred`
# (the Random Forest predictions when the notebook is run top to bottom).
report = classification_report(y_true=ytest, y_pred=ypred)
print(report)

### Support Vector Machine

In [None]:
# Support Vector Machine classifier with a linear kernel.
# NOTE(review): presumably random_state only matters when probability
# estimates are enabled - confirm against the scikit-learn documentation.
svc = SVC(kernel = 'linear', random_state = 0)

# Fit the classifier on the training split.
svc.fit(xtrain, ytrain)

In [None]:
# Classify the test split with the Support Vector Machine model.
ypred=svc.predict(xtest)

In [None]:
# Accuracy of the Support Vector Machine predictions on the test split.
print(accuracy_score(y_true=ytest, y_pred=ypred))

In [None]:
# Classification report for the predictions currently held in `ypred`
# (the SVM predictions when the notebook is run top to bottom).
report = classification_report(y_true=ytest, y_pred=ypred)
print(report)

# 5. Evaluation

In [None]:
# Re-evaluate every fitted model on the test split and collect the accuracy
# scores in the same order as `models`.
models = [knn, nb, dt, regressor, RF, svc]
nilai_akurasi = []

for model in models:
    ypred = model.predict(xtest)
    nilai_akurasi.append(accuracy_score(ytest, ypred))

print(nilai_akurasi)

In [None]:
# Bar chart comparing the test accuracy of the six models.
# One colour per bar, listed in the same order as the bar names below.
colors = ['yellow', 'orange', 'pink', 'blue', 'purple', 'green']

fig, ax = plt.subplots()
bars = ax.bar(['KNN', 'Naive Bayes', 'DT','LR','RF','SVM'], nilai_akurasi)

# Paint each bar with its own colour.
for bar, warna in zip(bars, colors):
    bar.set_color(warna)

# The y-axis is zoomed to 0.90-0.99 so small accuracy differences are visible.
ax.set_ylim(0.90, 0.99)
ax.set_title('Perbandingan Algoritma Machine Learning Untuk Penyakit Kanker', fontsize=15, color='black')
ax.set_xlabel("Algoritma Machine Learning", fontsize=12, color="r")
ax.set_ylabel("Nilai Akurasi", fontsize=12, color="r")
fig.tight_layout()
plt.show()

In [None]:
# Wedge labels, one per algorithm. They must follow the order in which the
# accuracies were collected in `nilai_akurasi` (knn, nb, dt, regressor, RF,
# svc); the previous list had 'Decision Tree' and 'Naive Bayes' swapped, which
# attached each of those two wedges to the wrong algorithm.
labels = ['KNN', 'Naive Bayes', 'Decision Tree', 'LR', 'RF', 'SVM']

# Pie chart of the accuracy scores.
# NOTE(review): the percentages printed by autopct are each model's share of
# the summed accuracies, not the accuracy values themselves.
plt.title('Perbandingan Algoritma Machine Learning Untuk Penyakit Kanker', fontsize=13, color='black')
plt.pie(nilai_akurasi, labels=labels, autopct='%1.2f%%')
plt.axis('equal')  # keep the pie circular
plt.show()

Kesimpulan dari analisis data di atas adalah


*   Algoritma K-NN memiliki akurasi ***94%***
*   Algoritma Naive Bayes memiliki akurasi ***92%***
*   Algoritma Decision Tree memiliki akurasi ***91%***
*   Algoritma Logistic Regression memiliki akurasi ***95.32%***
*   Algoritma Random Forest memiliki akurasi ***96%***
*   Algoritma Support Vector Machine memiliki akurasi ***95.90%***





