In [26]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer

# Load the "Simulator" data: the scikit-learn breast cancer dataset, with one
# DataFrame column per feature.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)

# Build the numeric-code -> label mapping from the dataset's own
# `target_names` rather than hard-coding it, so the labels cannot be swapped
# by mistake. This reproduces the original mapping: 0 = Malignant, 1 = Benign.
diagnosis_labels = {
    code: str(name).capitalize()
    for code, name in enumerate(data.target_names)
}

# Attach a readable diagnosis label to every sample (single assignment, so the
# column never temporarily holds raw integer codes).
df['diagnosis'] = pd.Series(data.target, index=df.index).map(diagnosis_labels)



# Mean of the 'mean radius' column within each diagnosis group.
print('Average Mean Radius for each Diagnosis Group')
display(df['mean radius'].groupby(df['diagnosis']).mean())


# Optional visual comparison of the groups (currently disabled):
# sns.boxplot(x='diagnosis', y='mean radius', data=df)

# Made-up lookup table pairing three feature names with placeholder
# "Gene Codes".
dict_data = dict([
    ('Feature_Name', ['mean radius', 'mean texture', 'mean perimeter']),
    ('Gene_Code', ['GENE_A', 'GENE_B', 'GENE_C']),
])
df_dict = pd.DataFrame.from_dict(dict_data)
display(df_dict)

# Visual spacer in the cell output: a heading followed by five divider lines.
print("Give Me Space")
print(*["-------"] * 5, sep="\n")

# Reshape the frame from wide to long format:
#   id_vars    -> column(s) kept as identifiers on every row (diagnosis)
#   var_name   -> name of the new column that holds the former column headers
#   value_name -> name of the new column that holds the numbers
df_melted = pd.melt(
    df,
    id_vars=['diagnosis'],
    var_name='Feature_Name',
    value_name='Expression_Value',
)

print("--- Melted Data ---")
display(df_melted.head())

# Attach the gene code to each melted row with a left join on 'Feature_Name'.
# how='left' keeps every melted row; features with no entry in df_dict end up
# with a missing Gene_Code. validate='many_to_one' makes pandas raise a
# MergeError if df_dict ever contains a duplicated Feature_Name, which would
# otherwise silently duplicate rows in the result.
merged_df = df_melted.merge(
    df_dict,
    on='Feature_Name',
    how='left',
    validate='many_to_one',
)

print("--- Merged Data ---")
display(merged_df.head())

# Keep only the rows whose diagnosis is 'Benign'; the original row index is
# preserved, so it does not start at 0.
print("-------")
benign_data = merged_df[merged_df['diagnosis'] == 'Benign']
display(benign_data.head())

Average Mean Radius for each Diagnosis Group


diagnosis
Benign       12.146524
Malignant    17.462830
Name: mean radius, dtype: float64

Unnamed: 0,Feature_Name,Gene_Code
0,mean radius,GENE_A
1,mean texture,GENE_B
2,mean perimeter,GENE_C


Give Me Space
-------
-------
-------
-------
-------
--- Melted Data ---


Unnamed: 0,diagnosis,Feature_Name,Expression_Value
0,Malignant,mean radius,17.99
1,Malignant,mean radius,20.57
2,Malignant,mean radius,19.69
3,Malignant,mean radius,11.42
4,Malignant,mean radius,20.29


--- Merged Data ---


Unnamed: 0,diagnosis,Feature_Name,Expression_Value,Gene_Code
0,Malignant,mean radius,17.99,GENE_A
1,Malignant,mean radius,20.57,GENE_A
2,Malignant,mean radius,19.69,GENE_A
3,Malignant,mean radius,11.42,GENE_A
4,Malignant,mean radius,20.29,GENE_A


-------


Unnamed: 0,diagnosis,Feature_Name,Expression_Value,Gene_Code
19,Benign,mean radius,13.54,GENE_A
20,Benign,mean radius,13.08,GENE_A
21,Benign,mean radius,9.504,GENE_A
37,Benign,mean radius,13.03,GENE_A
46,Benign,mean radius,8.196,GENE_A
