In [105]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from minisom import MiniSom

from sklearn.preprocessing import MinMaxScaler

In [84]:
# read the dataset CSV into a DataFrame
# NOTE(review): this is an absolute, machine-specific path; repoint it before running elsewhere
data_file_path = '/Users/asmakhimani/Downloads/systemic lupus_Dataset_Team_ML.csv'
data = pd.read_csv(filepath_or_buffer=data_file_path)

In [85]:
# show the first five rows as a quick sanity check of the load
print(data.head(n=5))

        ID  Ancestry_African  Ancestry_Any other Asian background  \
0  5713894               0.0                                  0.0   
1  2048882               0.0                                  0.0   
2  1748132               0.0                                  0.0   
3  3772083               0.0                                  0.0   
4  4376107               0.0                                  0.0   

   Ancestry_Any other Black background  Ancestry_Any other mixed background  \
0                                  0.0                                  0.0   
1                                  0.0                                  0.0   
2                                  0.0                                  0.0   
3                                  0.0                                  0.0   
4                                  0.0                                  0.0   

   Ancestry_Any other white background  Ancestry_Asian or Asian British  \
0                                  

In [86]:
# keep only the rows whose British-ancestry indicator equals 1
british_mask = data['Ancestry_British'].eq(1)
data = data[british_mask]

In [87]:
# number of British-ancestry patients (sum of the indicator column)
british_flags = data.loc[:, 'Ancestry_British']
int(british_flags.sum())

442574

In [88]:
# discard every row that has a missing value in any column
data = data.dropna(axis=0, how='any')

In [89]:
# Mapping from raw dataset column names to readable names, built group by group.
# Insertion order matters: the keys are later used to select (and therefore order) columns.

# age, BMI and sex indicators
demographic_renames = {
    'f.21022.0.0': 'Age',
    'f.21001.0.0': 'BMI',
    'f.31.0.0_Female': 'Female',
    'f.31.0.0_Male': 'Male',
}

# smoking status indicators
smoking_renames = {
    'f.20116.0.0_Current': 'Current Smoker',
    'f.20116.0.0_Previous': 'Former Smoker',
    'f.20116.0.0_Never': 'Never Smoker',
    'f.20116.0.0_Prefer not to answer': 'Unknown Smoking Status',
}

# stressful life event indicators
stress_renames = {
    'f.6145_Death of a close relative': 'Death of Family Member',
    'f.6145_Serious illness, injury or assault to yourself': 'Serious illness, injury or assault to yourself',
    'f.6145_Serious illness, injury or assault of a close relative': 'Serious illness, injury or assault of a close relative',
    'f.6145_Financial difficulties': 'Financial difficulties',
    'f.6145_Marital separation/divorce': 'Marital separation/divorce',
    'f.6145_Death of a spouse or partner': 'Death of a spouse or partner',
    'f.6145_None of the above': 'No Stress',
    'f.6145_Prefer not to answer': 'Unknown Stress Status',
}

# alcohol intake frequency indicators
alcohol_renames = {
    'f.1558.0.0_Daily or almost daily': 'Daily Alcohol Consumption',
    'f.1558.0.0_Never': 'No Alcohol Consumption',
    'f.1558.0.0_Once or twice a week': 'Once or Twice a Week Alcohol Consumption',
    'f.1558.0.0_One to three times a month': 'One to Three Times a Month Alcohol Consumption',
    'f.1558.0.0_Prefer not to answer': 'Unknown Alcohol Consumption',
    'f.1558.0.0_Special occasions only': 'Occasional Alcohol Consumption',
    'f.1558.0.0_Three or four times a week': 'Three or Four Times a week Alcohol Consumption',
}

# physical activity columns
activity_renames = {
    'f.904.0.0': 'Number of Days of Physical Activity',
    'f.904.0.0_Do not know': 'Not known Physical Activity Status',
    'f.904.0.0_Prefer not to answer': 'Not Answered Physical Activity Status',
}

# parental illness indicators
parental_illness_renames = {
    'f.20107_Father_Bowel cancer': 'Father has Bowel Cancer',
    'f.20107_Father_Diabetes': 'Father has Diabetes',
    'f.20110_Mother_Bowel cancer': 'Mother has Bowel Cancer',
    'f.20110_Mother_Diabetes': 'Mother has Diabetes',
}

column_renames = {
    **demographic_renames,
    **smoking_renames,
    **stress_renames,
    **alcohol_renames,
    **activity_renames,
    **parental_illness_renames,
}

In [90]:
# restrict the frame to the mapped source columns, then give them their readable names
columns_to_keep = column_renames.keys()
data = data[columns_to_keep].rename(columns=column_renames)
print(data.head(n=5))

    Age    BMI  Female  Male  Current Smoker  Former Smoker  Never Smoker  \
0  60.0  34.31     0.0   1.0             0.0            0.0           1.0   
1  54.0  22.62     1.0   0.0             0.0            0.0           1.0   
2  64.0  23.18     1.0   0.0             0.0            0.0           1.0   
3  47.0  23.54     0.0   1.0             0.0            0.0           1.0   
5  68.0  24.65     1.0   0.0             0.0            0.0           1.0   

   Unknown Smoking Status  Death of Family Member  \
0                     0.0                     1.0   
1                     0.0                     1.0   
2                     0.0                     0.0   
3                     0.0                     0.0   
5                     0.0                     1.0   

   Serious illness, injury or assault to yourself  ...  \
0                                             0.0  ...   
1                                             0.0  ...   
2                                           

In [91]:
# combine smoking related columns into a single feature
def combine_smoking_status(row):
    """Collapse the smoking indicator columns of one row into a single code.

    Parameters
    ----------
    row : mapping-like
        Must support ``row[name]`` for 'Current Smoker', 'Former Smoker'
        and 'Never Smoker'.

    Returns
    -------
    int
        3 for current, 2 for former, 1 for never, and 0 when none of the
        three indicators equals 1 (unknown or prefer not to answer).
    """
    # Checked in this order; the first indicator equal to 1 decides the code.
    coded_indicators = (
        ('Current Smoker', 3),
        ('Former Smoker', 2),
        ('Never Smoker', 1),
    )
    for indicator, code in coded_indicators:
        if row[indicator] == 1:
            return code
    return 0  # unknown or prefer not to answer

# evaluate the smoking code once per row and store it as a new column
data['Smoking_Status'] = data.apply(combine_smoking_status, axis='columns')

In [92]:
# stress feature 

def combine_stress_binary(row):
    """Reduce the stressful-life-event indicators of one row to a binary flag.

    Parameters
    ----------
    row : mapping-like
        Must support ``row[name]`` for the six stress-event columns listed
        below.

    Returns
    -------
    int
        1 if at least one stress-event indicator equals 1, otherwise 0.
        Rows with no event marked are treated as "no stress" whether they
        answered "none of the above" or gave no usable answer, so a single
        check covers both cases.
    """
    stress_event_columns = (
        'Death of Family Member',
        'Serious illness, injury or assault to yourself',
        'Serious illness, injury or assault of a close relative',
        'Financial difficulties',
        'Marital separation/divorce',
        'Death of a spouse or partner',
    )
    # any() short-circuits on the first marked event, like the original `or` chain
    return int(any(row[column] == 1 for column in stress_event_columns))
    
# evaluate the binary stress flag once per row and store it as a new column
data['Stress'] = data.apply(combine_stress_binary, axis='columns')

In [93]:
# Combine alcohol consumption columns into a single feature
def combine_alcohol(row):
    """Collapse the alcohol-frequency indicator columns of one row into a code.

    Parameters
    ----------
    row : mapping-like
        Must support ``row[name]`` for the six alcohol-frequency columns
        listed below.

    Returns
    -------
    int
        5 (daily) down to 0 (no consumption) for the first indicator that
        equals 1, or -1 as a missing-value marker when none does.
    """
    # Checked from most to least frequent; the first indicator equal to 1 decides.
    frequency_codes = (
        ('Daily Alcohol Consumption', 5),
        ('Three or Four Times a week Alcohol Consumption', 4),
        ('Once or Twice a Week Alcohol Consumption', 3),
        ('One to Three Times a Month Alcohol Consumption', 2),
        ('Occasional Alcohol Consumption', 1),
        ('No Alcohol Consumption', 0),
    )
    for indicator, code in frequency_codes:
        if row[indicator] == 1:
            return code
    return -1  # Unknown: missing value marker

# evaluate the alcohol-frequency code once per row and store it as a new column
data['Alcohol_Consumption'] = data.apply(combine_alcohol, axis='columns')

In [94]:
# physical activity feature: copy the days-of-activity column, using -1 wherever a value is missing
data['Physical_Activity'] = data['Number of Days of Physical Activity'].fillna(value=-1)

In [95]:
# family history feature 
data['Family_History'] = (
    (data['Father has Bowel Cancer'] == 1) | 
    (data['Father has Diabetes'] == 1) | 
    (data['Mother has Bowel Cancer'] == 1) | 
    (data['Mother has Diabetes'] == 1)
).astype(int)

In [96]:
# keep only the combined features, plus Male/Female which are still read when Gender is built
engineered_columns = ['Age', 'BMI', 'Female', 'Male', 'Smoking_Status', 'Stress',
                      'Alcohol_Consumption', 'Physical_Activity', 'Family_History']
# .copy() yields an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning
data = data[engineered_columns].copy()

# verify features
print(data.head())


    Age    BMI  Female  Male  Smoking_Status  Stress  Alcohol_Consumption  \
0  60.0  34.31     0.0   1.0               1       1                    0   
1  54.0  22.62     1.0   0.0               1       1                    2   
2  64.0  23.18     1.0   0.0               1       0                    4   
3  47.0  23.54     0.0   1.0               1       0                    3   
5  68.0  24.65     1.0   0.0               1       1                    3   

   Physical_Activity  Family_History  
0                0.0               1  
1                0.0               1  
2                5.0               0  
3                3.0               0  
5                3.0               0  


In [97]:
# fold Male/Female into one Gender column (male as 1, female as 0), then drop the two originals
data = data.assign(Gender=data['Male'].astype(int)).drop(columns=['Male', 'Female'])

In [101]:
# final model inputs, in the column order used from here on
combined_features = ['Age', 'BMI', 'Gender', 'Smoking_Status', 'Stress', 'Alcohol_Consumption', 
                     'Physical_Activity', 'Family_History']
# .copy() makes the subset an independent frame; without it, writing values back
# into these columns later can raise pandas' SettingWithCopyWarning
data = data[combined_features].copy()

# verify features
print(data.head())

    Age    BMI  Gender  Smoking_Status  Stress  Alcohol_Consumption  \
0  60.0  34.31       1               1       1                    0   
1  54.0  22.62       0               1       1                    2   
2  64.0  23.18       0               1       0                    4   
3  47.0  23.54       1               1       0                    3   
5  68.0  24.65       0               1       1                    3   

   Physical_Activity  Family_History  
0                0.0               1  
1                0.0               1  
2                5.0               0  
3                3.0               0  
5                3.0               0  


In [102]:
# min-max scale every model input column and write the scaled values back in place
# NOTE(review): Alcohol_Consumption encodes "unknown" as -1, so after scaling unknown
# becomes the lowest level and "no consumption" sits above it (0 -> 0.166667 in the
# printed rows); confirm this ordering is intended before training.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data[combined_features])
data[combined_features] = scaled_values
print(data.head(n=5))

        Age       BMI  Gender  Smoking_Status  Stress  Alcohol_Consumption  \
0  0.628571  0.354699     1.0        0.333333     1.0             0.166667   
1  0.457143  0.167839     0.0        0.333333     1.0             0.500000   
2  0.742857  0.176790     0.0        0.333333     0.0             0.833333   
3  0.257143  0.182545     1.0        0.333333     0.0             0.666667   
5  0.857143  0.200288     0.0        0.333333     1.0             0.666667   

   Physical_Activity  Family_History  
0           0.000000             1.0  
1           0.000000             1.0  
2           0.714286             0.0  
3           0.428571             0.0  
5           0.428571             0.0  


## SOM Implementation