# **Task 1: Breast Cancer Classification**

In [73]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier

**Step 1: Load the dataset**

In [74]:
# Read the raw dataset; the relative path is resolved against the kernel's working directory.
data = pd.read_csv('data.csv')
# Preview the first rows. Note the trailing 'Unnamed: 32' column, which is empty in this preview.
data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


**Step 2: Data Cleaning**

In [75]:
# Keep eight of the "*_mean" measurements plus the diagnosis label;
# every other column of the raw file is discarded from here on.
columns_to_keep = [
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
    'smoothness_mean', 'compactness_mean', 'concavity_mean',
    'concave points_mean', 'diagnosis',
]
data = data[columns_to_keep]

# Count the missing values in each remaining column
missing_per_column = data.isnull().sum()
print(missing_per_column)

radius_mean            0
texture_mean           0
perimeter_mean         0
area_mean              0
smoothness_mean        0
compactness_mean       0
concavity_mean         0
concave points_mean    0
diagnosis              0
dtype: int64


In [76]:
# Preview the reduced frame: eight feature columns followed by the diagnosis label.
data.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,M
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,M
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,M
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,M
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,M


In [77]:
# Remove every row that contains a missing value, reporting the frame's
# shape on either side so any loss of rows is visible in the output.
shape_before = data.shape
print(f"Shape of data before dropna: {shape_before}")
data = data.dropna()
shape_after = data.shape
print(f"Shape of data after dropna: {shape_after}")


Shape of data before dropna: (569, 9)
Shape of data after dropna: (569, 9)


**Step 3: Data Preprocessing**

In [78]:
# Replace the text diagnosis labels with integers (B -> 0, M -> 1).
# The fitted encoder is kept so the mapping can be inspected or reused later.
label_encoder = LabelEncoder()
label_encoder.fit(data['diagnosis'])
data['diagnosis'] = label_encoder.transform(data['diagnosis'])

In [79]:
# Preview again to confirm that the diagnosis column now holds integers instead of 'M'/'B'.
data.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,1
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,1
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,1
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,1
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,1


In [80]:
# Target: the encoded diagnosis column. Features: every remaining column.
y = data['diagnosis']
X = data.drop(columns=['diagnosis'])

In [81]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split (and therefore every score reported below) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [82]:
# Standardise the features. The scaler learns its statistics from the
# training rows only and is then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

**Step 4: Model Building**

In [83]:
# 1. K-Nearest Neighbors (KNN), voting among the 5 closest training points
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)

# Headline accuracy, then the per-class precision / recall / F1 table
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print("KNN Accuracy:", knn_accuracy)
knn_report = classification_report(y_test, y_pred_knn)
print(knn_report)

KNN Accuracy: 0.9385964912280702
              precision    recall  f1-score   support

           0       0.96      0.94      0.95        71
           1       0.91      0.93      0.92        43

    accuracy                           0.94       114
   macro avg       0.93      0.94      0.93       114
weighted avg       0.94      0.94      0.94       114



In [84]:
# 2. Random Forest Classifier: 100 trees, seeded so the forest is reproducible.
# It is trained on the scaled features so every model sees the same inputs.
rfc = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)
y_pred_rfc = rfc.predict(X_test_scaled)

# Headline accuracy, then the per-class precision / recall / F1 table
rfc_accuracy = accuracy_score(y_test, y_pred_rfc)
print("Random Forest Accuracy:", rfc_accuracy)
rfc_report = classification_report(y_test, y_pred_rfc)
print(rfc_report)

Random Forest Accuracy: 0.9473684210526315
              precision    recall  f1-score   support

           0       0.96      0.96      0.96        71
           1       0.93      0.93      0.93        43

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



In [85]:
# 3. Logistic Regression with default settings, apart from the fixed seed
log_reg = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)
y_pred_log_reg = log_reg.predict(X_test_scaled)

# Headline accuracy, then the per-class precision / recall / F1 table
log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)
print("Logistic Regression Accuracy:", log_reg_accuracy)
log_reg_report = classification_report(y_test, y_pred_log_reg)
print(log_reg_report)

Logistic Regression Accuracy: 0.9473684210526315
              precision    recall  f1-score   support

           0       0.96      0.96      0.96        71
           1       0.93      0.93      0.93        43

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



In [88]:
# 4. Decision Tree Classifier: a single tree, seeded for reproducibility.
# It is trained on the scaled features so every model sees the same inputs.
dtc = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred_dtc = dtc.predict(X_test_scaled)

# Headline accuracy, then the per-class precision / recall / F1 table
dtc_accuracy = accuracy_score(y_test, y_pred_dtc)
print("Decision Tree Accuracy:", dtc_accuracy)
dtc_report = classification_report(y_test, y_pred_dtc)
print(dtc_report)

Decision Tree Accuracy: 0.9298245614035088
              precision    recall  f1-score   support

           0       0.94      0.94      0.94        71
           1       0.91      0.91      0.91        43

    accuracy                           0.93       114
   macro avg       0.93      0.93      0.93       114
weighted avg       0.93      0.93      0.93       114



In [87]:
# 5. Linear Regression (Note: Linear regression is typically used for regression, not classification)
# The model is fitted directly on the 0/1 diagnosis labels, so its output is a
# continuous score rather than a class; it is included only as a baseline.
lin_reg = LinearRegression()
lin_reg.fit(X_train_scaled, y_train)
y_pred_lin_reg = lin_reg.predict(X_test_scaled)

# Convert regression predictions to binary classification (0/1):
# scores above 0.5 are treated as class 1, everything else as class 0.
y_pred_lin_reg_class = np.where(y_pred_lin_reg > 0.5, 1, 0)

# Score the thresholded predictions with the same metrics as the classifiers
print("Linear Regression Accuracy:", accuracy_score(y_test, y_pred_lin_reg_class))
print(classification_report(y_test, y_pred_lin_reg_class))

Linear Regression Accuracy: 0.9298245614035088
              precision    recall  f1-score   support

           0       0.93      0.96      0.94        71
           1       0.93      0.88      0.90        43

    accuracy                           0.93       114
   macro avg       0.93      0.92      0.92       114
weighted avg       0.93      0.93      0.93       114



**Step 6: Model Comparison and Conclusion**

In [89]:
# Pair each model's display name with its test-set predictions, then derive
# the two parallel lists (names, accuracies) from that single table.
predictions_by_model = [
    ('KNN', y_pred_knn),
    ('Random Forest', y_pred_rfc),
    ('Logistic Regression', y_pred_log_reg),
    ('Decision Tree', y_pred_dtc),
    ('Linear Regression', y_pred_lin_reg_class),
]
models = [name for name, _ in predictions_by_model]
accuracies = [accuracy_score(y_test, predicted) for _, predicted in predictions_by_model]

In [90]:
# One summary line per model, with accuracy rounded to two decimals
for model, accuracy in zip(models, accuracies):
    summary_line = f"{model} Accuracy: {accuracy:.2f}"
    print(summary_line)

KNN Accuracy: 0.94
Random Forest Accuracy: 0.95
Logistic Regression Accuracy: 0.95
Decision Tree Accuracy: 0.93
Linear Regression Accuracy: 0.93


**Step 7: Take New User Input for Prediction**

In [91]:
def get_user_input():
    """Prompt for the eight tumour measurements and return them as one row.

    Each value is read with ``input()`` and parsed as a float. A blank,
    non-numeric or non-finite (``nan`` / ``inf``) reply does not abort the
    cell: a short message is printed and the same prompt is shown again, so
    values that were already entered are not lost.

    Returns:
        pd.DataFrame: a single-row frame whose columns are, in order,
        ``radius_mean``, ``texture_mean``, ``perimeter_mean``, ``area_mean``,
        ``smoothness_mean``, ``compactness_mean``, ``concavity_mean`` and
        ``concave points_mean``.
    """
    # (column name, prompt text) pairs. The order here fixes the column order
    # of the returned frame. The result is passed to scaler.transform(), so
    # the list must stay in step with the feature columns used for training;
    # adding a feature here alone is not enough.
    prompts = (
        ('radius_mean', "Radius Mean: "),
        ('texture_mean', "Texture Mean: "),
        ('perimeter_mean', "Perimeter Mean: "),
        ('area_mean', "Area Mean: "),
        ('smoothness_mean', "Smoothness Mean: "),
        ('compactness_mean', "Compactness Mean: "),
        ('concavity_mean', "Concavity Mean: "),
        ('concave points_mean', "Concave Points Mean: "),
    )

    def _read_float(prompt):
        """Repeat ``prompt`` until the reply parses as a finite float."""
        while True:
            reply = input(prompt)
            try:
                value = float(reply)
            except ValueError:
                print(f"'{reply}' is not a number; please try again.")
                continue
            # float() accepts 'nan' and 'inf', which are not usable measurements
            if np.isfinite(value):
                return value
            print(f"'{reply}' is not a finite number; please try again.")

    print("Enter the following features for breast cancer diagnosis prediction:")
    features = {column: _read_float(prompt) for column, prompt in prompts}
    return pd.DataFrame([features])

# Collect one observation from the user and pass it through the scaler that
# was fitted on the training data, so it is on the same scale as the inputs
# the models were trained with.
user_input = get_user_input()
user_input_scaled = scaler.transform(user_input)


def _to_diagnosis(prediction):
    """Map a one-element prediction array (1 or 0) to its diagnosis name."""
    if prediction[0] == 1:
        return "Malignant"
    return "Benign"


# Step 8: Run the same scaled observation through each fitted classifier
# (KNN, random forest, logistic regression, decision tree) and name the result.
prediction_knn = knn.predict(user_input_scaled)
diagnosis_knn = _to_diagnosis(prediction_knn)

prediction_rfc = rfc.predict(user_input_scaled)
diagnosis_rfc = _to_diagnosis(prediction_rfc)

prediction_log_reg = log_reg.predict(user_input_scaled)
diagnosis_log_reg = _to_diagnosis(prediction_log_reg)

prediction_dtc = dtc.predict(user_input_scaled)
diagnosis_dtc = _to_diagnosis(prediction_dtc)

# Display the predictions, one line per model
print(f"KNN Prediction: {diagnosis_knn}")
print(f"Random Forest Prediction: {diagnosis_rfc}")
print(f"Logistic Regression Prediction: {diagnosis_log_reg}")
print(f"Decision Tree Prediction: {diagnosis_dtc}")

Enter the following features for breast cancer diagnosis prediction:
Radius Mean: 17.99
Texture Mean: 10.38
Perimeter Mean: 122.8
Area Mean: 1001
Smoothness Mean: 0.1184
Compactness Mean: 0.2776
Concavity Mean: 0.3001
Concave Points Mean: 0.1471
KNN Prediction: Malignant
Random Forest Prediction: Malignant
Logistic Regression Prediction: Malignant
Decision Tree Prediction: Malignant
