
# 🩸 Aura Flow: Menstrual Health Modeling Notebook 🩸
## **Predicting Period Pain Levels with Machine Learning**
This notebook builds **multiple machine learning models** to classify period pain levels, ensuring continuity with the **preprocessed dataset** from the previous analysis.


In [548]:

# Step 1: Load Preprocessed Dataset
import pandas as pd

# Load the dataset 
file_path = "cleaned_menstrual_health_data.csv"
df = pd.read_csv(file_path)


df.head()


Unnamed: 0,StartDate,EndDate,Status,Progress,Duration (in seconds),Finished,RecordedDate,DistributionChannel,UserLanguage,Age_Group,...,42,43,44,45,46,47,48,49,Q41,Q43
0,2023-02-07 17:11:00,2023-02-07 17:18:00,Imported,100,410,True,2023-03-23 08:38:00,qr,EN,22-23,...,Dissatisfied,Satisfied,Satisfied,Satisfied,Satisfied,Very satisfied,Very satisfied,Quite often,Yes,Yes
1,2023-02-08 12:20:00,2023-02-08 12:25:00,Imported,100,334,True,2023-03-23 08:38:00,qr,EN,20-21,...,Very satisfied,Very satisfied,Satisfied,Satisfied,Satisfied,Satisfied,Very satisfied,Quite often,Yes,Yes
2,2023-02-07 15:20:00,2023-02-07 15:26:00,Imported,100,382,True,2023-03-23 08:38:00,qr,EN,18-19,...,Satisfied,Satisfied,Very satisfied,Very satisfied,Very satisfied,Satisfied,Satisfied,Seldom,No,No
3,2023-02-08 13:13:00,2023-02-08 13:17:00,Imported,100,288,True,2023-03-23 08:38:00,qr,EN,22-23,...,Dissatisfied,Satisfied,Dissatisfied,Satisfied,Satisfied,Very satisfied,Very satisfied,Very often,No,No
4,2023-02-07 19:17:00,2023-02-07 19:25:00,Imported,100,464,True,2023-03-23 08:38:00,qr,EN,20-21,...,Very satisfied,Very satisfied,Very satisfied,Very satisfied,Satisfied,Very satisfied,Very satisfied,Seldom,Yes,Yes


In [549]:

# Step 2: Define Features and Target Variable
from sklearn.model_selection import train_test_split

# Define target variable
target = "Period_Pain_Level"


features = ["Work_School_Impact_Days", "Concern_Level_About_Bleeding", 
            "Personal_Income", "Missed_Work_School_Days", "Heavy_Bleeding_Indicators"]

# Split data into training and testing sets
X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data successfully split into training and testing sets.")


Data successfully split into training and testing sets.


In [550]:
# Check data types of features
print(X_train.dtypes)

# Identify non-numeric columns
non_numeric_cols = X_train.select_dtypes(exclude=['number']).columns
print("\nNon-numeric columns:", non_numeric_cols)


Work_School_Impact_Days         object
Concern_Level_About_Bleeding    object
Personal_Income                 object
Missed_Work_School_Days         object
Heavy_Bleeding_Indicators       object
dtype: object

Non-numeric columns: Index(['Work_School_Impact_Days', 'Concern_Level_About_Bleeding',
       'Personal_Income', 'Missed_Work_School_Days',
       'Heavy_Bleeding_Indicators'],
      dtype='object')


In [551]:
# Display unique values for each categorical column
for col in ['Work_School_Impact_Days', 'Concern_Level_About_Bleeding', 
            'Personal_Income', 'Missed_Work_School_Days', 'Heavy_Bleeding_Indicators']:
    print(f"\nUnique values in {col}:")
    print(df[col].unique())



Unique values in Work_School_Impact_Days:
['1-3 days' '4-8 days'
 'Never, my bleeding does not affect my work or school' '9-12 days'
 'I am currently not working or attending school outside of the home'
 '13 days or more']

Unique values in Concern_Level_About_Bleeding:
['2' '3' '4' '0' '5' '7' '1' '6' '10' '8' '9' 'Unknown']

Unique values in Personal_Income:
['20,000-34,999' 'Under 20,000' '35,000- 49,999' '75,000-99,999'
 '50,000-74,999' 'Over 100,000' 'Unknown']

Unique values in Missed_Work_School_Days:
['Never, my bleeding does not affect my work or school' '1-3 days'
 '4-8 days' '13 days or more'
 'I am currently not working or attending school outside of the home'
 '9-12 days']

Unique values in Heavy_Bleeding_Indicators:
['Need to use double sanitary protection to control your menstrual flow'
 'Soak through one or more sanitary pads or tampons every hour for several consecutive hours,Need to use double sanitary protection to control your menstrual flow,Bleed for longer than a

In [552]:
import numpy as np

# Define mappings
work_school_mapping = {
    "Never, my bleeding does not affect my work or school": 0,
    "I am currently not working or attending school outside of the home": -1,  # Special case
    "1-3 days": 2,
    "4-8 days": 6,
    "9-12 days": 10.5,
    "13 days or more": 13
}

# Apply mapping
df["Work_School_Impact_Days"] = df["Work_School_Impact_Days"].map(work_school_mapping)
df["Missed_Work_School_Days"] = df["Missed_Work_School_Days"].map(work_school_mapping)


In [553]:
df["Concern_Level_About_Bleeding"] = df["Concern_Level_About_Bleeding"].replace("Unknown", np.nan).astype(float)


In [554]:
income_mapping = {
    "Under 20,000": 10000,
    "20,000-34,999": 27500,
    "35,000- 49,999": 42500,
    "50,000-74,999": 62500,
    "75,000-99,999": 87500,
    "Over 100,000": 110000,
    "Unknown": np.nan
}

df["Personal_Income"] = df["Personal_Income"].map(income_mapping)


In [555]:
# List of symptoms to extract
symptoms = [
    "Need to use double sanitary protection",
    "Soak through one or more sanitary pads or tampons every hour",
    "Bleed for longer than a week",
    "Pass blood clots larger than a quarter",
    "Restrict daily activities due to heavy menstrual flow",
    "Need to wake up to change sanitary protection during the night"
]


for symptom in symptoms:
    df[symptom] = df["Heavy_Bleeding_Indicators"].apply(lambda x: 1 if symptom in str(x) else 0)


df.drop(columns=["Heavy_Bleeding_Indicators"], inplace=True)


In [556]:
print(df.dtypes)  
print(df.isnull().sum())  


StartDate                                                         object
EndDate                                                           object
Status                                                            object
Progress                                                           int64
Duration (in seconds)                                              int64
                                                                   ...  
Soak through one or more sanitary pads or tampons every hour       int64
Bleed for longer than a week                                       int64
Pass blood clots larger than a quarter                             int64
Restrict daily activities due to heavy menstrual flow              int64
Need to wake up to change sanitary protection during the night     int64
Length: 71, dtype: object
StartDate                                                         0
EndDate                                                           0
Status                             

In [557]:
# Remove unnecessary text-based columns
df = df.drop(columns=["StartDate", "EndDate", "Status"], errors="ignore")

print("Unnecessary columns removed!")


Unnecessary columns removed!


In [558]:
# Find any remaining non-numeric columns
non_numeric_cols = df.select_dtypes(exclude=["number"]).columns

print("Remaining Non-Numeric Columns:", non_numeric_cols)


Remaining Non-Numeric Columns: Index(['Finished', 'RecordedDate', 'DistributionChannel', 'UserLanguage',
       'Age_Group', 'Gender_Identity', 'Q68', 'Race', 'Health_Insurance',
       'Primary_Insurance', '5_4_TEXT', 'Period_Flow_Description',
       'Period_Pain_Level', 'Avoided_Social_Activities', 'Chronic_Pelvic_Pain',
       'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26', '17', '19',
       '20', '21', '22', '23', '24', '25', '27', '28', '29', '30', '31', '32',
       '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44',
       '45', '46', '47', '48', '49', 'Q41', 'Q43'],
      dtype='object')


In [559]:
print("Current Columns in DataFrame:")
print(df.columns.tolist())


Current Columns in DataFrame:
['Progress', 'Duration (in seconds)', 'Finished', 'RecordedDate', 'DistributionChannel', 'UserLanguage', 'Age_Group', 'Gender_Identity', 'Q68', 'Race', 'Health_Insurance', 'Primary_Insurance', '5_4_TEXT', 'Personal_Income', 'Period_Flow_Description', 'Period_Pain_Level', 'Work_School_Impact_Days', 'Missed_Work_School_Days', 'Avoided_Social_Activities', 'Concern_Level_About_Bleeding', 'Chronic_Pelvic_Pain', 'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26', '17', '19', '20', '21', '22', '23', '24', '25', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', 'Q41', 'Q43', 'Need to use double sanitary protection', 'Soak through one or more sanitary pads or tampons every hour', 'Bleed for longer than a week', 'Pass blood clots larger than a quarter', 'Restrict daily activities due to heavy menstrual flow', 'Need to wake up to change sanitary protection during the night']


In [560]:
# Check column data types
print(df.dtypes)

# Check for missing values
print(df.isnull().sum())


Progress                                                           int64
Duration (in seconds)                                              int64
Finished                                                            bool
RecordedDate                                                      object
DistributionChannel                                               object
                                                                   ...  
Soak through one or more sanitary pads or tampons every hour       int64
Bleed for longer than a week                                       int64
Pass blood clots larger than a quarter                             int64
Restrict daily activities due to heavy menstrual flow              int64
Need to wake up to change sanitary protection during the night     int64
Length: 68, dtype: object
Progress                                                          0
Duration (in seconds)                                             0
Finished                           

In [561]:
# Find remaining object columns
non_numeric_cols = df.select_dtypes(include=['object']).columns
print("Non-Numeric Columns:", non_numeric_cols.tolist())


Non-Numeric Columns: ['RecordedDate', 'DistributionChannel', 'UserLanguage', 'Age_Group', 'Gender_Identity', 'Q68', 'Race', 'Health_Insurance', 'Primary_Insurance', '5_4_TEXT', 'Period_Flow_Description', 'Period_Pain_Level', 'Avoided_Social_Activities', 'Chronic_Pelvic_Pain', 'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26', '17', '19', '20', '21', '22', '23', '24', '25', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', 'Q41', 'Q43']


In [562]:
# Drop unnecessary columns that don't help prediction
drop_cols = ["RecordedDate", "5_4_TEXT", "Q68"]
df.drop(columns=drop_cols, inplace=True, errors="ignore")

print("Removed unnecessary columns.")


Removed unnecessary columns.


In [563]:
# Convert categorical variables into numbers
categorical_cols = [
    "DistributionChannel", "UserLanguage", "Age_Group", "Gender_Identity", "Race",
    "Health_Insurance", "Primary_Insurance", "Period_Flow_Description"
]

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print("Converted categorical columns to numeric values.")


Converted categorical columns to numeric values.


In [564]:
# Convert Yes/No answers to binary (1/0)
binary_cols = ["Avoided_Social_Activities", "Chronic_Pelvic_Pain"]

for col in binary_cols:
    df[col] = df[col].map({"Yes": 1, "No": 0})

print("Converted Yes/No columns to binary (1/0).")


Converted Yes/No columns to binary (1/0).


In [565]:
# Convert survey responses to numeric (if they are not already)
survey_cols = [col for col in df.columns if col.startswith("Q") or col.isdigit()]
df[survey_cols] = df[survey_cols].apply(pd.to_numeric, errors="coerce")

print("Converted survey response columns to numbers.")


Converted survey response columns to numbers.


In [566]:
# Check if everything is now numeric
print(df.dtypes)


Progress                                int64
Duration (in seconds)                   int64
Finished                                 bool
Personal_Income                       float64
Period_Pain_Level                      object
                                       ...   
Primary_Insurance_Unknown                bool
Period_Flow_Description_Light            bool
Period_Flow_Description_Moderate         bool
Period_Flow_Description_Very Heavy       bool
Period_Flow_Description_Very Light       bool
Length: 124, dtype: object


In [567]:
print(df["Period_Pain_Level"].unique())


['Slight pain' 'Severe pain' 'Moderate pain' 'No pain' 'Unknown']


In [568]:
pain_mapping = {
    "No pain": 0,
    "Slight pain": 1,
    "Moderate pain": 2,
    "Severe pain": 3,
    "Unknown": np.nan  # Handle "Unknown" as missing data
}

df["Period_Pain_Level"] = df["Period_Pain_Level"].map(pain_mapping)

# Fill any missing values in the target column
df["Period_Pain_Level"].fillna(df["Period_Pain_Level"].median(), inplace=True)

print("Period_Pain_Level converted to numeric values!")


Period_Pain_Level converted to numeric values!


In [569]:
print(df["Period_Pain_Level"].unique())  
print(df.dtypes["Period_Pain_Level"])  


[1. 3. 2. 0.]
float64


In [570]:
# Count missing values in each column
print(df.isnull().sum())


Progress                              0
Duration (in seconds)                 0
Finished                              0
Personal_Income                       1
Period_Pain_Level                     0
                                     ..
Primary_Insurance_Unknown             0
Period_Flow_Description_Light         0
Period_Flow_Description_Moderate      0
Period_Flow_Description_Very Heavy    0
Period_Flow_Description_Very Light    0
Length: 124, dtype: int64


In [571]:

df["Personal_Income"].fillna(df["Personal_Income"].median(), inplace=True)

# Confirm missing values are gone
print("Total missing values left in dataset:", df.isnull().sum().sum())  # Should print "0"


Total missing values left in dataset: 22985


In [572]:
# Display columns with missing values
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts > 0])  


Avoided_Social_Activities       545
Concern_Level_About_Bleeding      5
Chronic_Pelvic_Pain              90
Q19                             545
Q20                             545
Q21                             545
Q22                             545
Q23                             545
Q24                             545
Q25                             545
Q26                             545
17                              545
19                              545
20                              545
21                              545
22                              545
23                              545
24                              545
25                              545
27                              545
28                              545
29                              545
30                              545
31                              545
32                              545
33                              545
34                              545
35                          

In [573]:
df.fillna(df.median(), inplace=True)
print("Filled missing values with median.")


Filled missing values with median.


In [574]:
print("Total missing values left in dataset:", df.isnull().sum().sum())  


Total missing values left in dataset: 22890


In [575]:
# Display only columns with missing values
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts > 0])  

Avoided_Social_Activities    545
Q19                          545
Q20                          545
Q21                          545
Q22                          545
Q23                          545
Q24                          545
Q25                          545
Q26                          545
17                           545
19                           545
20                           545
21                           545
22                           545
23                           545
24                           545
25                           545
27                           545
28                           545
29                           545
30                           545
31                           545
32                           545
33                           545
34                           545
35                           545
36                           545
37                           545
38                           545
39                           545
40        

In [576]:
# Convert numeric columns to float
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='ignore')

print("Converted all columns to their correct data types.")


Converted all columns to their correct data types.


In [577]:
# Display only columns with missing values
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts > 0]) 


Avoided_Social_Activities    545
Q19                          545
Q20                          545
Q21                          545
Q22                          545
Q23                          545
Q24                          545
Q25                          545
Q26                          545
17                           545
19                           545
20                           545
21                           545
22                           545
23                           545
24                           545
25                           545
27                           545
28                           545
29                           545
30                           545
31                           545
32                           545
33                           545
34                           545
35                           545
36                           545
37                           545
38                           545
39                           545
40        

In [578]:
# Identify numeric columns
numeric_cols = df.select_dtypes(include=['number']).columns
print("Numeric Columns:", numeric_cols)


Numeric Columns: Index(['Progress', 'Duration (in seconds)', 'Personal_Income',
       'Period_Pain_Level', 'Work_School_Impact_Days',
       'Missed_Work_School_Days', 'Avoided_Social_Activities',
       'Concern_Level_About_Bleeding', 'Chronic_Pelvic_Pain', 'Q19', 'Q20',
       'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26', '17', '19', '20', '21', '22',
       '23', '24', '25', '27', '28', '29', '30', '31', '32', '33', '34', '35',
       '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47',
       '48', '49', 'Q41', 'Q43', 'Need to use double sanitary protection',
       'Soak through one or more sanitary pads or tampons every hour',
       'Bleed for longer than a week',
       'Pass blood clots larger than a quarter',
       'Restrict daily activities due to heavy menstrual flow',
       'Need to wake up to change sanitary protection during the night'],
      dtype='object')


In [579]:
# Fill missing values in all numeric columns with their median
df.fillna(df.median(numeric_only=True), inplace=True)

print("Filled missing values in all numeric columns with the median.")


Filled missing values in all numeric columns with the median.


In [580]:
# Show only columns with missing values
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts > 0])  


Avoided_Social_Activities    545
Q19                          545
Q20                          545
Q21                          545
Q22                          545
Q23                          545
Q24                          545
Q25                          545
Q26                          545
17                           545
19                           545
20                           545
21                           545
22                           545
23                           545
24                           545
25                           545
27                           545
28                           545
29                           545
30                           545
31                           545
32                           545
33                           545
34                           545
35                           545
36                           545
37                           545
38                           545
39                           545
40        

In [581]:
# Convert all these columns to numeric type (forcing conversion)
missing_cols = [
    "Avoided_Social_Activities", "Q19", "Q20", "Q21", "Q22", "Q23", "Q24", "Q25", "Q26", "17", "19", "20", "21",
    "22", "23", "24", "25", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "40",
    "41", "42", "43", "44", "45", "46", "47", "48", "49", "Q41", "Q43"
]


df[missing_cols] = df[missing_cols].apply(pd.to_numeric, errors='coerce')

print("Forced all missing-value columns to numeric type.")


Forced all missing-value columns to numeric type.


In [582]:
# Fill missing values with the median (since they are now numeric)
df[missing_cols] = df[missing_cols].fillna(df[missing_cols].median())

print("Successfully filled missing values in all previously stuck columns!")


Successfully filled missing values in all previously stuck columns!


In [583]:
print("Total missing values left in dataset:", df.isnull().sum().sum())  


Total missing values left in dataset: 22890


In [584]:
# Find percentage of missing values in each column
missing_percentages = (df.isnull().sum() / len(df)) * 100

# Show columns with more than 10% missing values
print(missing_percentages[missing_percentages > 10])


Avoided_Social_Activities    100.0
Q19                          100.0
Q20                          100.0
Q21                          100.0
Q22                          100.0
Q23                          100.0
Q24                          100.0
Q25                          100.0
Q26                          100.0
17                           100.0
19                           100.0
20                           100.0
21                           100.0
22                           100.0
23                           100.0
24                           100.0
25                           100.0
27                           100.0
28                           100.0
29                           100.0
30                           100.0
31                           100.0
32                           100.0
33                           100.0
34                           100.0
35                           100.0
36                           100.0
37                           100.0
38                  

In [585]:
# Drop all columns that are 100% missing
df.dropna(axis=1, how="all", inplace=True)

print("Dropped all columns with 100% missing values.")


Dropped all columns with 100% missing values.


In [586]:
print("Total missing values left in dataset:", df.isnull().sum().sum())  


Total missing values left in dataset: 0


In [587]:
# Check if any column in X_train contains text
print(X_train.dtypes)


non_numeric_cols = X_train.select_dtypes(exclude=['number']).columns
print("\nNon-numeric columns:", non_numeric_cols)


Work_School_Impact_Days         object
Concern_Level_About_Bleeding    object
Personal_Income                 object
Missed_Work_School_Days         object
Heavy_Bleeding_Indicators       object
dtype: object

Non-numeric columns: Index(['Work_School_Impact_Days', 'Concern_Level_About_Bleeding',
       'Personal_Income', 'Missed_Work_School_Days',
       'Heavy_Bleeding_Indicators'],
      dtype='object')


In [588]:
# Convert all categorical columns into numeric format (if needed)
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
for col in non_numeric_cols:
    X_train[col] = label_encoder.fit_transform(X_train[col])
    X_test[col] = label_encoder.transform(X_test[col])  

print("Reapplied encoding to categorical columns.")


ValueError: y contains previously unseen labels: 'Soak through one or more sanitary pads or tampons every hour for several consecutive hours,Pass blood clots larger than a quarter,Restrict daily activities due to heavy menstrual flow'

In [589]:
# Ensure categorical columns are correctly identified
non_numeric_cols = X_train.select_dtypes(exclude=['number']).columns

# Apply One-Hot Encoding to both train and test sets
X_train = pd.get_dummies(X_train, columns=non_numeric_cols, drop_first=True)
X_test = pd.get_dummies(X_test, columns=non_numeric_cols, drop_first=True)


X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

print("Successfully applied One-Hot Encoding. Both train and test sets now have matching columns.")


Successfully applied One-Hot Encoding. Both train and test sets now have matching columns.


In [590]:
# Train a new Random Forest model
rf_model = RandomForestClassifier(n_estimators=300, class_weight="balanced", random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluate performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Improved Random Forest Accuracy: {accuracy_rf:.2f}\n")

print("Improved Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))



ValueError: could not convert string to float: 'Unknown'

In [591]:
# Identify non-numeric columns in X_test
non_numeric_cols = X_test.select_dtypes(exclude=['number']).columns
print("\nNon-numeric columns in X_test:", non_numeric_cols)



Non-numeric columns in X_test: Index(['Heavy_Bleeding_Indicators'], dtype='object')


In [592]:
# Define the list of possible symptoms
symptoms = [
    "Need to use double sanitary protection",
    "Soak through one or more sanitary pads or tampons every hour",
    "Bleed for longer than a week",
    "Pass blood clots larger than a quarter",
    "Restrict daily activities due to heavy menstrual flow",
    "Need to wake up to change sanitary protection during the night"
]

# Create new binary columns for each symptom
for symptom in symptoms:
    X_train[symptom] = X_train["Heavy_Bleeding_Indicators"].apply(lambda x: 1 if symptom in str(x) else 0)
    X_test[symptom] = X_test["Heavy_Bleeding_Indicators"].apply(lambda x: 1 if symptom in str(x) else 0)

# Drop the original text column
X_train.drop(columns=["Heavy_Bleeding_Indicators"], inplace=True)
X_test.drop(columns=["Heavy_Bleeding_Indicators"], inplace=True)

print("Converted `Heavy_Bleeding_Indicators` into separate numeric columns.")


Converted `Heavy_Bleeding_Indicators` into separate numeric columns.


In [593]:
# Check for any remaining non-numeric columns
non_numeric_cols_after = X_test.select_dtypes(exclude=['number']).columns
print("\nRemaining non-numeric columns in X_test:", non_numeric_cols_after)



Remaining non-numeric columns in X_test: Index([], dtype='object')


In [594]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train a new Random Forest model
rf_model = RandomForestClassifier(n_estimators=300, class_weight="balanced", random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluate performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Improved Random Forest Accuracy: {accuracy_rf:.2f}\n")

print("Improved Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))


Improved Random Forest Accuracy: 0.35

Improved Random Forest Classification Report:
               precision    recall  f1-score   support

Moderate pain       0.41      0.24      0.30        46
      No pain       0.17      0.67      0.27         3
  Severe pain       0.33      0.38      0.35        24
  Slight pain       0.37      0.44      0.41        36

     accuracy                           0.35       109
    macro avg       0.32      0.43      0.33       109
 weighted avg       0.37      0.35      0.35       109



In [595]:
from imblearn.over_sampling import SMOTE


smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print("Applied SMOTE to balance class distribution in training data.")


ValueError: Expected n_neighbors <= n_samples,  but n_samples = 1, n_neighbors = 6

In [596]:

print(y_train.value_counts())


Period_Pain_Level
Moderate pain    180
Slight pain      151
Severe pain       78
No pain           26
Unknown            1
Name: count, dtype: int64


In [597]:
# Drop "Unknown" from y_train and corresponding X_train rows
mask = y_train != "Unknown"
X_train = X_train[mask]
y_train = y_train[mask]

print("Removed the 'Unknown' class from training data.")


Removed the 'Unknown' class from training data.


In [598]:
from imblearn.over_sampling import SMOTE


smote = SMOTE(random_state=42, k_neighbors=1)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print("Applied SMOTE successfully with adjusted k_neighbors.")


Applied SMOTE successfully with adjusted k_neighbors.


In [599]:
# Train the optimized Random Forest model
rf_model = RandomForestClassifier(n_estimators=300, class_weight="balanced", random_state=42)
rf_model.fit(X_train_balanced, y_train_balanced)

# Make predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluate performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Improved Random Forest Accuracy: {accuracy_rf:.2f}\n")

print("Improved Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))


Improved Random Forest Accuracy: 0.32

Improved Random Forest Classification Report:
               precision    recall  f1-score   support

Moderate pain       0.46      0.28      0.35        46
      No pain       0.07      0.33      0.11         3
  Severe pain       0.36      0.33      0.35        24
  Slight pain       0.30      0.36      0.33        36

     accuracy                           0.32       109
    macro avg       0.30      0.33      0.28       109
 weighted avg       0.38      0.32      0.34       109



In [600]:
# Train Random Forest without SMOTE, using class_weight instead
rf_model = RandomForestClassifier(n_estimators=300, class_weight="balanced", random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluate performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy (No SMOTE): {accuracy_rf:.2f}\n")

print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy (No SMOTE): 0.34

Random Forest Classification Report:
               precision    recall  f1-score   support

Moderate pain       0.41      0.24      0.30        46
      No pain       0.15      0.67      0.25         3
  Severe pain       0.33      0.38      0.35        24
  Slight pain       0.36      0.42      0.38        36

     accuracy                           0.34       109
    macro avg       0.31      0.42      0.32       109
 weighted avg       0.37      0.34      0.34       109



In [601]:
feature_importances = pd.Series(rf_model.feature_importances_, index=X_train.columns)
feature_importances = feature_importances.sort_values(ascending=False)
print("Feature Importance Ranking:\n")
print(feature_importances.head(20))  


Feature Importance Ranking:

Concern_Level_About_Bleeding                                      0.444277
Work_School_Impact_Days                                           0.210514
Personal_Income                                                   0.200079
Missed_Work_School_Days                                           0.145130
Need to use double sanitary protection                            0.000000
Soak through one or more sanitary pads or tampons every hour      0.000000
Bleed for longer than a week                                      0.000000
Pass blood clots larger than a quarter                            0.000000
Restrict daily activities due to heavy menstrual flow             0.000000
Need to wake up to change sanitary protection during the night    0.000000
dtype: float64


In [602]:
# Keep only the top features (remove features with 0 importance)
top_features = [
    "Concern_Level_About_Bleeding",
    "Work_School_Impact_Days",
    "Personal_Income",
    "Missed_Work_School_Days"
]

X_train_filtered = X_train[top_features]
X_test_filtered = X_test[top_features]

print("Removed non-informative features. Using only the most important ones.")


Removed non-informative features. Using only the most important ones.


In [603]:
# Train Random Forest with filtered features
rf_model = RandomForestClassifier(n_estimators=300, class_weight="balanced", random_state=42)
rf_model.fit(X_train_filtered, y_train)  # Train with filtered dataset

# Make predictions
y_pred_rf = rf_model.predict(X_test_filtered)  
# Evaluate performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Improved Random Forest Accuracy: {accuracy_rf:.2f}\n")

print("Improved Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))


Improved Random Forest Accuracy: 0.35

Improved Random Forest Classification Report:
               precision    recall  f1-score   support

Moderate pain       0.42      0.24      0.31        46
      No pain       0.15      0.67      0.25         3
  Severe pain       0.33      0.38      0.35        24
  Slight pain       0.37      0.44      0.41        36

     accuracy                           0.35       109
    macro avg       0.32      0.43      0.33       109
 weighted avg       0.38      0.35      0.35       109



In [604]:
from xgboost import XGBClassifier

# Train XGBoost with filtered features
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42)
xgb_model.fit(X_train_filtered, y_train)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test_filtered)

# Evaluate performance
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb:.2f}\n")

print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3], got ['Moderate pain' 'No pain' 'Severe pain' 'Slight pain']

In [605]:
!pip install xgboost




In [606]:
import xgboost
print("XGBoost is installed and ready to use!")


XGBoost is installed and ready to use!


In [607]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train XGBoost with filtered features
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42)
xgb_model.fit(X_train_filtered, y_train)  # Train on your dataset

# Make predictions
y_pred_xgb = xgb_model.predict(X_test_filtered)

# Evaluate performance
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb:.2f}\n")

print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3], got ['Moderate pain' 'No pain' 'Severe pain' 'Slight pain']

In [608]:
from sklearn.preprocessing import LabelEncoder

# Initialize label encoder
label_encoder = LabelEncoder()

# Convert y_train and y_test to numeric values
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

print("Converted `y_train` and `y_test` to numeric values:")
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))  


Converted `y_train` and `y_test` to numeric values:
{'Moderate pain': 0, 'No pain': 1, 'Severe pain': 2, 'Slight pain': 3}


In [609]:
# Train XGBoost with encoded labels
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42)
xgb_model.fit(X_train_filtered, y_train_encoded)  # Use encoded labels

# Make predictions
y_pred_xgb = xgb_model.predict(X_test_filtered)

# Evaluate performance
accuracy_xgb = accuracy_score(y_test_encoded, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb:.2f}\n")

print("XGBoost Classification Report:")
print(classification_report(y_test_encoded, y_pred_xgb))


XGBoost Accuracy: 0.40

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.46      0.49        46
           1       0.00      0.00      0.00         3
           2       0.35      0.25      0.29        24
           3       0.38      0.47      0.42        36

    accuracy                           0.40       109
   macro avg       0.31      0.29      0.30       109
weighted avg       0.42      0.40      0.41       109



Parameters: { "use_label_encoder" } are not used.



In [610]:
# Get class distribution in y_train_encoded
import numpy as np
from collections import Counter

class_counts = dict(Counter(y_train_encoded))
total_samples = sum(class_counts.values())


scale_weights = {k: total_samples/v for k, v in class_counts.items()}
scale_pos_weight = [scale_weights[i] for i in sorted(scale_weights.keys())]

xgb_model_balanced = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss",
                                   scale_pos_weight=scale_pos_weight, random_state=42)
xgb_model_balanced.fit(X_train_filtered, y_train_encoded)


y_pred_balanced = xgb_model_balanced.predict(X_test_filtered)

accuracy_balanced = accuracy_score(y_test_encoded, y_pred_balanced)
print(f"XGBoost Accuracy (with Class Weighting): {accuracy_balanced:.2f}\n")

print("XGBoost Classification Report (with Class Weighting):")
print(classification_report(y_test_encoded, y_pred_balanced))


XGBoost Accuracy (with Class Weighting): 0.40

XGBoost Classification Report (with Class Weighting):
              precision    recall  f1-score   support

           0       0.53      0.46      0.49        46
           1       0.00      0.00      0.00         3
           2       0.35      0.25      0.29        24
           3       0.38      0.47      0.42        36

    accuracy                           0.40       109
   macro avg       0.31      0.29      0.30       109
weighted avg       0.42      0.40      0.41       109



Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



In [611]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for tuning
param_grid = {
    "n_estimators": [100, 300, 500],  
    "learning_rate": [0.01, 0.05, 0.1],  
    "max_depth": [3, 5, 7]
}

grid_search = GridSearchCV(XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42), 
                           param_grid, cv=3, scoring="accuracy", n_jobs=-1)

grid_search.fit(X_train_filtered, y_train_encoded)

best_xgb = grid_search.best_estimator_
best_xgb.fit(X_train_filtered, y_train_encoded)


y_pred_best_xgb = best_xgb.predict(X_test_filtered)


accuracy_best_xgb = accuracy_score(y_test_encoded, y_pred_best_xgb)
print(f"Best Tuned XGBoost Accuracy: {accuracy_best_xgb:.2f}\n")

print("Best Tuned XGBoost Classification Report:")
print(classification_report(y_test_encoded, y_pred_best_xgb))


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Best Tuned XGBoost Accuracy: 0.50

Best Tuned XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.72      0.59        46
           1       0.00      0.00      0.00         3
           2       0.57      0.33      0.42        24
           3       0.50      0.39      0.44        36

    accuracy                           0.50       109
   macro avg       0.39      0.36      0.36       109
weighted avg       0.50      0.50      0.49       109



In [613]:
# Keep only the top 4 important features
top_features = ["Work_School_Impact_Days", "Missed_Work_School_Days", 
                "Concern_Level_About_Bleeding", "Personal_Income"]

# Create new datasets
X_train_optimized = X_train_filtered[top_features]
X_test_optimized = X_test_filtered[top_features]

print(f"Keeping only top {len(top_features)} features.")

# Retrain XGBoost with optimized features
best_xgb.fit(X_train_optimized, y_train_encoded)

# Make predictions
y_pred_optimized = best_xgb.predict(X_test_optimized)

# Evaluate performance
accuracy_optimized = accuracy_score(y_test_encoded, y_pred_optimized)
print(f"Optimized XGBoost Accuracy (Top Features Only): {accuracy_optimized:.2f}\n")

print(" Optimized XGBoost Classification Report:")
print(classification_report(y_test_encoded, y_pred_optimized))


Keeping only top 4 features.
Optimized XGBoost Accuracy (Top Features Only): 0.50

 Optimized XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.72      0.59        46
           1       0.00      0.00      0.00         3
           2       0.57      0.33      0.42        24
           3       0.50      0.39      0.44        36

    accuracy                           0.50       109
   macro avg       0.39      0.36      0.36       109
weighted avg       0.50      0.50      0.49       109



Parameters: { "use_label_encoder" } are not used.



In [614]:
import joblib

# Save the best XGBoost model
joblib.dump(best_xgb, "final_xgboost_model.pkl")

print("The best model has been saved! You can now use it anytime.")


The best model has been saved! You can now use it anytime.


In [615]:
# Print accuracy comparison
print("\nModel Accuracy Summary:")
print(f"Random Forest Accuracy: {accuracy_rf:.2f}")
print(f"Logistic Regression Accuracy: {accuracy_log:.2f}")
print(f"XGBoost (Optimized): {accuracy_optimized:.2f}")  


model_accuracies = {
    "Random Forest": accuracy_rf,
    "Logistic Regression": accuracy_log,
    "XGBoost (Optimized)": accuracy_optimized  
}


best_model_name = max(model_accuracies, key=model_accuracies.get)
best_model_accuracy = model_accuracies[best_model_name]


print(f"\n**Best Model:** {best_model_name} with {best_model_accuracy:.2f} accuracy")



Model Accuracy Summary:
Random Forest Accuracy: 0.35
Logistic Regression Accuracy: 0.25
XGBoost (Optimized): 0.50

**Best Model:** XGBoost (Optimized) with 0.50 accuracy


In [616]:
import pandas as pd

# Load the saved model
loaded_model = joblib.load("final_xgboost_model.pkl")

# Example: New data for prediction
new_data = pd.DataFrame({
    "Work_School_Impact_Days": [3, 1, 5],  
    "Missed_Work_School_Days": [2, 0, 3],
    "Concern_Level_About_Bleeding": [4, 1, 3],
    "Personal_Income": [50000, 30000, 75000]
})


predictions = loaded_model.predict(new_data)


label_mapping = {0: "Moderate pain", 1: "No pain", 2: "Severe pain", 3: "Slight pain"}
predicted_labels = [label_mapping[p] for p in predictions]

print("🩸 **Predicted Pain Levels:**", predicted_labels)


🩸 **Predicted Pain Levels:** ['Severe pain', 'Moderate pain', 'Slight pain']
