In [3]:

# =================================================================
# 1. Data Collection & Loading
# =================================================================

import numpy as np
import pandas as pd
# The sklearn.datasets loader is no longer used; the data is read from the uploaded CSV instead.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

print("1. Libraries Loaded Successfully.")

# --- Load the Dataset from CSV ---
# File name of the uploaded CSV; a relative path, so it is resolved against
# the current working directory.
DATASET_PATH = 'data (1).csv'

try:
    # Keep only the read inside the try so that no other failure can be
    # reported as a missing file.
    data_frame = pd.read_csv(DATASET_PATH)
except FileNotFoundError:
    print(f"Error: Dataset not found at {DATASET_PATH}. Please ensure the file is correctly named and located.")
    # Abort with a failing status. `exit()` is a helper that the `site`
    # module installs for interactive sessions; raising SystemExit directly
    # does not depend on it and signals the failure with a non-zero code.
    raise SystemExit(1) from None
else:
    print(f"Data collected and loaded from CSV: {DATASET_PATH}")


# =================================================================
# 2. Exploratory Data Analysis (EDA)
# =================================================================

print("\n=================================================================")
print("2. Exploratory Data Analysis (EDA)")
print("=================================================================")

# Drop the identifier column and the 'Unnamed: 32' column (noted in the
# original notebook as containing only NaNs). errors='ignore' makes this a
# no-op for any of the two columns that is not present.
data_frame = data_frame.drop(columns=['id', 'Unnamed: 32'], errors='ignore')
print("Dropped 'id' and 'Unnamed: 32' columns.")

# Rename the target column for consistency and convert text labels to binary
# M (Malignant) -> 0, B (Benign) -> 1
LABEL_ENCODING = {'M': 0, 'B': 1}
data_frame = data_frame.rename(columns={'diagnosis': 'label'})
encoded_labels = data_frame['label'].map(LABEL_ENCODING)

# Series.map() turns every value missing from the mapping into NaN. Fail
# loudly here instead of letting NaN targets reach the split/training steps.
unmapped_rows = encoded_labels.isnull()
if unmapped_rows.any():
    unexpected_values = sorted(
        data_frame.loc[unmapped_rows, 'label'].astype(str).unique().tolist()
    )
    raise ValueError(
        f"Unexpected diagnosis values (expected 'M' or 'B'): {unexpected_values}"
    )
data_frame['label'] = encoded_labels

# --- Peek at both ends of the table ---
print("\n2.1 First five rows of the DataFrame:")
first_rows = data_frame.head()
print(first_rows)

print("\n2.2 Last five rows of the DataFrame (with 'label'):")
last_rows = data_frame.tail()
print(last_rows)

# --- Structure of the table ---
print("\n2.3 Dataset dimensions (.shape):")
table_shape = data_frame.shape
print(table_shape)

print("\n2.4 Data types and non-null values (.info()):")
data_frame.info()  # .info() writes its report itself, so no print() here

# --- Missing values ---
# Only columns with at least one NaN are listed, so an empty Series here
# means that no column has missing values.
print("\n2.5 Missing values check (.isnull().sum()):")
nan_counts = data_frame.isnull().sum()
print(nan_counts[nan_counts > 0])

# --- Descriptive statistics ---
print("\n2.6 Summary statistics (.describe()):")
summary_stats = data_frame.describe()
print(summary_stats)

# --- Class balance of the target ---
# Label 0: Malignant (Cancerous), Label 1: Benign (Non-Cancerous)
print("\n2.7 Target variable distribution (.value_counts()):")
label_counts = data_frame['label'].value_counts()
print(label_counts)
print("\nInterpretation: The dataset is slightly imbalanced, with more Benign (1) cases than Malignant (0) cases.")


# =================================================================
# 3. Data Preprocessing
# =================================================================

print("\n=================================================================")
print("3. Data Preprocessing")
print("=================================================================")

# Target vector (Y): the encoded 'label' column.
Y = data_frame['label']
# Feature matrix (X): every column except 'label'.
X = data_frame.drop(columns=['label'])

print(f"Features (X) shape: {X.shape}")
print(f"Target (Y) shape: {Y.shape}")
print("Features and target separated.")


# =================================================================
# 4. Splitting the Dataset
# =================================================================

print("\n=================================================================")
print("4. Splitting the Dataset")
print("=================================================================")

# Hold out 20% of the rows for testing and train on the remaining 80%.
# - random_state fixes the shuffle so the split is reproducible.
# - stratify=Y keeps the 0/1 class proportions the same in both subsets.
split_options = {
    'test_size': 0.2,
    'random_state': 2,
    'stratify': Y,
}
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)

print(f"Original data size: {X.shape}")
print(f"Training data size (80%): {X_train.shape}")
print(f"Testing data size (20%): {X_test.shape}")


# =================================================================
# 5. Model Training (Logistic Regression)
# =================================================================

print("\n=================================================================")
print("5. Model Training")
print("=================================================================")

# Logistic regression classifier. The features are not scaled, so the
# iteration cap is raised to give the solver room to converge.
model = LogisticRegression(max_iter=10_000)
print("Logistic Regression model initialized.")

# Fit on the training split only; the test split stays unseen until evaluation.
model.fit(X=X_train, y=Y_train)
print("Model training complete.")


# =================================================================
# 6. Model Evaluation
# =================================================================

print("\n=================================================================")
print("6. Model Evaluation (Accuracy Score)")
print("=================================================================")

# 6.1 Training split: predict on the rows the model was fitted on.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(
    y_true=Y_train,
    y_pred=X_train_prediction,
)
print('Accuracy on training data = ', training_data_accuracy)

# 6.2 Test split: predict on the held-out rows.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(
    y_true=Y_test,
    y_pred=X_test_prediction,
)
print('Accuracy on test data = ', test_data_accuracy)

# Reading the two numbers: similar, high accuracy on both splits suggests the
# model generalizes; a training score far above the test score would point to
# overfitting.


# =================================================================
# 7. Building a Predictive System
# =================================================================

print("\n=================================================================")
print("7. Building a Predictive System")
print("=================================================================")

# Sample data point: the first row of the uploaded CSV, which is a Malignant
# case (M -> 0). The values must follow the column order of X.
input_data = (17.99, 10.38, 122.8, 1001, 0.1184, 0.2776, 0.3001, 0.1471, 0.2419, 0.07871, 1.095, 0.9053, 8.589, 153.4, 0.006399, 0.04904, 0.05373, 0.01587, 0.03003, 0.006193, 25.38, 17.33, 184.6, 2019, 0.1622, 0.6656, 0.7119, 0.2654, 0.4601, 0.1189)

# Convert the input data into a NumPy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshape to a single instance: 1 row, one column per value
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# The model was fitted on a DataFrame, so it remembers the feature names.
# Predicting on a bare array makes scikit-learn warn that X has no valid
# feature names and skips the column check. Wrapping the sample in a
# DataFrame with the training columns avoids the warning, and the DataFrame
# constructor raises ValueError if the number of values does not match the
# number of features.
input_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)

# Predict the output using the trained model
prediction = model.predict(input_frame)
print(f"Prediction result array: {prediction}")

# Output the result (0 -> Malignant, 1 -> Benign, as encoded in step 2)
if prediction[0] == 0:
    print('The Breast Cancer is Predicted as Malignant (Cancerous)')
else:
    print('The Breast Cancer is Predicted as Benign (Non-Cancerous)')

# Note: This specific input data should ideally predict 0 (Malignant).


# =================================================================
# Submission Deliverables Summary
# =================================================================
print("\n=================================================================")
print("Submission Deliverables Summary")
print("=================================================================")
# Report the dataset through DATASET_PATH so this line cannot drift from the
# file that was actually loaded.
print(f"1. .ipynb notebook: Code completed above, using {DATASET_PATH}.")
# Accuracies are shown with four decimal places.
print(f"2. Training Accuracy: {training_data_accuracy:.4f}")
print(f"3. Testing Accuracy: {test_data_accuracy:.4f}")
print("4. Predictive System Demonstration: Successfully executed in Step 7.")


1. Libraries Loaded Successfully.
Data collected and loaded from CSV: data (1).csv

2. Exploratory Data Analysis (EDA)
Dropped 'id' and 'Unnamed: 32' columns.

2.1 First five rows of the DataFrame:
   label  radius_mean  texture_mean  perimeter_mean  area_mean  \
0      0        17.99         10.38          122.80     1001.0   
1      0        20.57         17.77          132.90     1326.0   
2      0        19.69         21.25          130.00     1203.0   
3      0        11.42         20.38           77.58      386.1   
4      0        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030  

