In [None]:
import pandas as pd

# Location of the raw CSV export. Kept in one named constant so the path is
# easy to spot and change when the notebook runs in a different environment.
DATA_PATH = "/content/Classification of Breast Cancer/data (2).csv"

# Load the dataset
df = pd.read_csv(DATA_PATH)

# Display the first 5 rows
print("First 5 rows of the dataset:")
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

# Display column information
print("\nColumn information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

First 5 rows of the dataset:
| id       | diagnosis   | radius_mean   | texture_mean   | perimeter_mean   | area_mean   | smoothness_mean   | compactness_mean   | concavity_mean   | concave points_mean   | symmetry_mean   | fractal_dimension_mean   | radius_se   | texture_se   | perimeter_se   | area_se   | smoothness_se   | compactness_se   | concavity_se   | concave points_se   | symmetry_se   | fractal_dimension_se   | radius_worst   | texture_worst   | perimeter_worst   | area_worst   | smoothness_worst   | compactness_worst   | concavity_worst   | concave points_worst   | symmetry_worst   | fractal_dimension_worst   | Unnamed: 32   |
|:---------|:------------|:--------------|:---------------|:-----------------|:------------|:------------------|:-------------------|:-----------------|:----------------------|:----------------|:-------------------------|:------------|:-------------|:---------------|:----------|:----------------|:-----------------|:---------------|:-------------------

In [None]:
# Remove the two columns that are not needed for the analysis
df_cleaned = df.drop(columns=['id', 'Unnamed: 32'])

# Class balance of the 'diagnosis' target: absolute counts and percentage shares
diagnosis_counts = df_cleaned['diagnosis'].value_counts()
diagnosis_proportions = (df_cleaned['diagnosis'].value_counts(normalize=True) * 100).round(2)

# Print both summaries as left-aligned markdown tables, each under its heading
for heading, table in (
    ("Diagnosis counts:", diagnosis_counts),
    ("\nDiagnosis proportions (%):", diagnosis_proportions),
):
    print(heading)
    print(table.to_markdown(numalign="left", stralign="left"))

# Bar chart of the diagnosis counts
import altair as alt

# Two-column frame ('diagnosis', 'count') used as the chart's data source
diagnosis_df = diagnosis_counts.rename_axis('diagnosis').reset_index(name='count')

# Encodings shared by the bar layer and the label layer
shared_encodings = dict(
    x=alt.X('diagnosis', axis=alt.Axis(title='Diagnosis (B=Benign, M=Malignant)')),
    y=alt.Y('count', axis=alt.Axis(title='Count')),
    tooltip=['diagnosis', 'count'],
)
base = alt.Chart(diagnosis_df).encode(**shared_encodings)

# Bar layer, colored per diagnosis
chart = base.mark_bar().encode(
    color=alt.Color('diagnosis', legend=alt.Legend(title="Diagnosis")),
)

# Label layer: the count written just above each bar (dy=-5 nudges it upward),
# always in black regardless of the bar color
label_mark = dict(align='center', baseline='bottom', dy=-5)
text = base.mark_text(**label_mark).encode(
    text=alt.Text('count', format=',.0f'),
    color=alt.value('black'),
)

# Stack the two layers, add a title, and enable interactive viewing
final_chart = (
    alt.layer(chart, text)
    .properties(title='Distribution of Cancer Diagnosis (B=Benign, M=Malignant)')
    .interactive()
)

# Write the chart specification to disk
final_chart.save('diagnosis_distribution.json')

Diagnosis counts:
| diagnosis   | count   |
|:------------|:--------|
| B           | 357     |
| M           | 212     |

Diagnosis proportions (%):
| diagnosis   | proportion   |
|:------------|:-------------|
| B           | 62.74        |
| M           | 37.26        |


In [None]:
# Select columns ending with '_mean'
mean_cols = [col for col in df_cleaned.columns if col.endswith('_mean')]

# Descriptive statistics of the mean features, one row per diagnosis group and
# one (feature, statistic) column pair per describe() output.
# describe() on the grouped frame already produces this layout, so no
# stack()/unstack() round trip is applied: moving the statistic level into the
# index and straight back leaves the table unchanged while going through
# pandas' legacy stack implementation, which emits a FutureWarning on
# pandas >= 2.1.
descriptive_stats = df_cleaned.groupby('diagnosis')[mean_cols].describe()

print("Descriptive statistics of mean features, grouped by diagnosis:")
# Use .round(2) for better readability
print(descriptive_stats.round(2).to_markdown(numalign="left", stralign="left"))

Descriptive statistics of mean features, grouped by diagnosis:
| diagnosis   | ('radius_mean', 'count')   | ('radius_mean', 'mean')   | ('radius_mean', 'std')   | ('radius_mean', 'min')   | ('radius_mean', '25%')   | ('radius_mean', '50%')   | ('radius_mean', '75%')   | ('radius_mean', 'max')   | ('texture_mean', 'count')   | ('texture_mean', 'mean')   | ('texture_mean', 'std')   | ('texture_mean', 'min')   | ('texture_mean', '25%')   | ('texture_mean', '50%')   | ('texture_mean', '75%')   | ('texture_mean', 'max')   | ('perimeter_mean', 'count')   | ('perimeter_mean', 'mean')   | ('perimeter_mean', 'std')   | ('perimeter_mean', 'min')   | ('perimeter_mean', '25%')   | ('perimeter_mean', '50%')   | ('perimeter_mean', '75%')   | ('perimeter_mean', 'max')   | ('area_mean', 'count')   | ('area_mean', 'mean')   | ('area_mean', 'std')   | ('area_mean', 'min')   | ('area_mean', '25%')   | ('area_mean', '50%')   | ('area_mean', '75%')   | ('area_mean', 'max')   | ('smoothness_mean', 'count') 

  descriptive_stats = df_cleaned.groupby('diagnosis')[mean_cols].describe().stack().unstack(level=1)


In [None]:
import altair as alt

# Box plot of 'radius_mean' with one box per 'diagnosis' group.
# Channel encodings are collected first so the chart pipeline below stays short.
boxplot_encodings = {
    'x': alt.X('diagnosis', title='Diagnosis (B=Benign, M=Malignant)'),
    'y': alt.Y('radius_mean', title='Radius (Mean)'),
    'color': 'diagnosis',
}

# extent='min-max' is passed through to the boxplot mark unchanged
chart_radius = (
    alt.Chart(df_cleaned)
    .mark_boxplot(extent='min-max')
    .encode(**boxplot_encodings)
    .properties(title='Distribution of Mean Radius by Diagnosis')
    .interactive()
)

# Write the chart specification to disk
chart_radius.save('radius_mean_boxplot.json')

In [None]:
# Compare per-diagnosis mean values of selected features with faceted bar charts
import altair as alt

# Five of the '*_mean' features chosen for the comparison
top_features = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean']

# Mean of each selected feature within each diagnosis group (one row per diagnosis)
comparison_data = df_cleaned.groupby('diagnosis')[top_features].mean().reset_index()

# Long format for plotting: one row per (diagnosis, feature) with its mean in 'mean_value'
comparison_melted = comparison_data.melt(id_vars='diagnosis', var_name='feature', value_name='mean_value')

# Faceted bar chart: one panel per feature, one bar per diagnosis.
# The diagnosis goes on x because the feature is already the facet; with the
# feature on x and the diagnosis only on color, the two diagnosis bars would be
# stacked on top of each other and the bar height would be the sum of two means.
bar_chart = alt.Chart(comparison_melted).mark_bar().encode(
    x=alt.X('diagnosis:N', axis=alt.Axis(title='Diagnosis')),
    y=alt.Y('mean_value:Q', axis=alt.Axis(title='Mean Value')),
    color=alt.Color('diagnosis:N', legend=alt.Legend(title='Diagnosis')),
    column=alt.Column('feature:N', header=alt.Header(title=None)),
    tooltip=['diagnosis', 'feature', alt.Tooltip('mean_value:Q', format='.2f')]
).properties(
    width=120,
    height=300,
    title='Comparison of Mean Feature Values by Diagnosis'
).resolve_scale(
    # Give every feature panel its own y-axis so a feature with large values
    # does not flatten the bars of the others.
    # NOTE(review): assumes the selected features differ widely in magnitude
    # (e.g. area vs. smoothness) - confirm against the descriptive statistics.
    y='independent'
).configure_axis(
    labelFontSize=10,
    titleFontSize=12
)

# Alternative styling of the same comparison (this is the chart that is displayed)
grouped_bar = alt.Chart(comparison_melted).mark_bar().encode(
    x=alt.X('diagnosis:N', axis=alt.Axis(title='Diagnosis (B=Benign, M=Malignant)')),
    y=alt.Y('mean_value:Q', axis=alt.Axis(title='Mean Value')),
    color=alt.Color('diagnosis:N', legend=alt.Legend(title='Diagnosis')),
    column=alt.Column('feature:N', header=alt.Header(titleOrient='bottom', labelAngle=-45)),
    tooltip=['diagnosis', 'feature', alt.Tooltip('mean_value:Q', format='.2f')]
).properties(
    width=100,
    height=300,
    title='Mean Feature Values: Benign vs Malignant'
).resolve_scale(
    # Independent y-axis per feature panel, for the same reason as above
    y='independent'
)
# Scale-bound pan/zoom (.interactive()) is intentionally not enabled here.
# NOTE(review): a scale-bound selection appears to be shared across facet
# panels, which would pull the independent y-axes back onto one domain on the
# first zoom - confirm before re-enabling. Tooltips work without it.

# Display the grouped bar chart
grouped_bar