### Breast Cancer Campaign

In [None]:
import os

import kagglehub
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Step 1: Download the dataset
# dataset_download returns the local location of the downloaded files, which is
# printed so the user can inspect what was fetched.
path = kagglehub.dataset_download("uciml/breast-cancer-wisconsin-data")
print("Path to dataset files:", path)

# Step 2: Read the dataset
# Build the CSV path with os.path.join instead of string concatenation so the
# separator is handled correctly whatever form `path` is returned in.
data_path = os.path.join(path, "data.csv")  # Update with the actual dataset path if needed
df = pd.read_csv(data_path)

# Check for missing values and inspect the dataset
print("Initial dataset shape:", df.shape)
print("Missing values per column before cleaning:")
print(df.isnull().sum())

# Drop columns with all missing values
df = df.dropna(axis=1, how='all')
print("Dataset shape after dropping columns with all missing values:", df.shape)

# Never impute the target: filling a missing 'diagnosis' with the most common
# label would fabricate ground truth. Rows without a label are dropped instead,
# and the index is rebuilt so it stays a contiguous 0..n-1 range for any later
# index-based alignment.
if 'diagnosis' in df.columns:
    unlabeled = df['diagnosis'].isnull()
    if unlabeled.any():
        print("Dropping rows with a missing 'diagnosis' label:", int(unlabeled.sum()))
        df = df.loc[~unlabeled].reset_index(drop=True)

# Separate numeric and non-numeric columns
numeric_cols = df.select_dtypes(include=['number']).columns
non_numeric_cols = df.select_dtypes(exclude=['number']).columns

# Fill missing values in numeric columns with the column mean
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Fill missing values in non-numeric columns (e.g., categorical) with the mode
for col in non_numeric_cols:
    modes = df[col].mode()
    # mode() is empty when the column holds no non-missing value at all; there
    # is nothing sensible to fill with in that case, so leave the column as is.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

print("Dataset shape after handling missing values:", df.shape)

# Check if DataFrame is empty
if df.empty:
    raise ValueError("The dataset is empty after handling missing values. Please verify the dataset.")

# Step 3: Drop unnecessary columns
# 'id' is presumably a record identifier rather than a measurement, so it is
# removed before scaling and plotting.
if 'id' in df.columns:
    df = df.drop(columns=['id'])

# Confirming columns after dropping unnecessary ones
print("Columns after dropping unnecessary ones:")
print(df.columns)

# Step 4: Preprocessing
# Encoding the target column: fail fast if it is absent, otherwise replace the
# labels with integer codes (label_encoder.classes_ keeps the original labels).
if 'diagnosis' not in df.columns:
    raise ValueError("The 'diagnosis' column is missing from the dataset.")
label_encoder = LabelEncoder()
df['diagnosis'] = label_encoder.fit_transform(df['diagnosis'])

# Prepare features and target
features = df.drop(columns=['diagnosis'])
target = df['diagnosis']

# Check if features are empty
if features.empty:
    raise ValueError("No features remain after preprocessing. Please verify the dataset.")

# Scaling features
# fit_transform returns a bare array, so the original row index is passed back
# in explicitly. Without it the new frame gets a fresh 0..n-1 index and the
# index-aligned assignment of `target` below would produce NaN or misassigned
# labels whenever `df` does not carry a default index.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(features),
    columns=features.columns,
    index=features.index,
)

# Re-add the target column (aligned on the shared row index)
df_scaled['diagnosis'] = target

# Output the refined data to a CSV file
df_scaled.to_csv("data_refined.csv", index=False)
print("Preprocessing complete. Refined data saved as 'data_refined.csv'.")

# Step 5: Visualization
# Pair plots
# A pair plot draws one panel per pair of variables, so the grid grows
# quadratically with the number of feature columns. Plotting every feature is
# slow and unreadable, so the grid is limited to the first few features; edit
# `pairplot_features` to inspect a different subset.
pairplot_features = list(features.columns[:5])
sns.pairplot(df_scaled, vars=pairplot_features, hue='diagnosis')
plt.show()

# Correlation matrix heatmap
# The figure grows with the number of columns so the per-cell annotations stay
# legible; small tables keep the original 10x8 size.
correlations = df_scaled.corr()
n_corr_cols = len(correlations.columns)
plt.figure(figsize=(max(10, 0.6 * n_corr_cols), max(8, 0.5 * n_corr_cols)))
sns.heatmap(correlations, annot=True, fmt='.2f', cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

# Box plots for features: one figure per feature, split by encoded diagnosis
for column in features.columns:
    plt.figure(figsize=(8, 4))
    sns.boxplot(x=df_scaled['diagnosis'], y=df_scaled[column])
    plt.title(f"Box Plot of {column} by Diagnosis")
    plt.show()

# Violin plots for features: same layout as the box plots, showing the
# distribution shape of each feature per diagnosis class
for column in features.columns:
    plt.figure(figsize=(8, 4))
    sns.violinplot(x=df_scaled['diagnosis'], y=df_scaled[column])
    plt.title(f"Violin Plot of {column} by Diagnosis")
    plt.show()
