In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Read the pre-split train/test tables (paths are relative to the working directory).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('pcos_train.csv', 'pcos_test.csv')
)

# Separate the label column from the predictor columns in each split.
y_train = train_df['PCOS_diagnosis']
X_train = train_df.drop(columns='PCOS_diagnosis')
y_test = test_df['PCOS_diagnosis']
X_test = test_df.drop(columns='PCOS_diagnosis')

# Standardize the predictors. The scaler is fit on the training split only and
# then applied to both splits, so no test-set statistics enter the transform.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression configured with fixed hyperparameters
# (presumably selected by an earlier tuning step that is not shown here).
best_log_reg = LogisticRegression(
    penalty='l2',
    C=0.1,
    solver='liblinear',
    max_iter=1000,
)
best_log_reg.fit(X_train_scaled, y_train)

# First row of coef_: one weight per input column, in X_train column order.
coefficients = best_log_reg.coef_[0]

# Build the importance table in one chain: signed coefficient, its magnitude
# as the importance score, then rank from largest to smallest magnitude.
log_reg_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Coefficient': coefficients})
    .assign(Importance=lambda frame: frame['Coefficient'].abs())
    .sort_values(by='Importance', ascending=False)
)

# Emit the heading and the ranked table on consecutive lines.
print("Feature Importance - Logistic Regression:", log_reg_importance_df, sep="\n")

# Random Forest configured with fixed hyperparameters
# (presumably selected by an earlier tuning step that is not shown here).
# random_state is pinned so that the fitted forest, and therefore the
# importance ranking printed below, is identical on every
# Restart Kernel -> Run All. Without it the estimator draws fresh randomness
# on each run and the ranking of close features can change between runs.
best_rf_clf = RandomForestClassifier(
    bootstrap=False,
    max_depth=30,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=500,
    random_state=42,
)

# Fit the Random Forest model on the standardized training features
best_rf_clf.fit(X_train_scaled, y_train)

# One importance score per input column, in X_train column order
feature_importances = best_rf_clf.feature_importances_

# Pair each score with its column name
rf_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
})

# Rank features from most to least important
rf_importance_df = rf_importance_df.sort_values(by='Importance', ascending=False)

# Display the feature importance for Random Forest
print("\nFeature Importance - Random Forest:")
print(rf_importance_df)

Feature Importance - Logistic Regression:
                    Feature  Coefficient  Importance
10     have_regular_periods    -1.389456    1.389456
16           blood_group_O+    -1.019281    1.019281
14           blood_group_B+    -0.877373    0.877373
12           blood_group_A+    -0.756592    0.756592
13           blood_group_A-    -0.280894    0.280894
3   have_ex_bodyhair_growth     0.238105    0.238105
8             have_exercise    -0.218535    0.218535
17           blood_group_O-    -0.214868    0.214868
15           blood_group_B-    -0.206602    0.206602
5            have_hair_loss     0.188433    0.188433
0                       age    -0.140343    0.140343
1          period_intervial    -0.066287    0.066287
9          have_mood_swings     0.061227    0.061227
11              period_last     0.057962    0.057962
2          have_weight_gain     0.054077    0.054077
6              have_pimples    -0.048886    0.048886
18                      BMI     0.042827    0.042827
4   