In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
from scipy.stats import chi2_contingency, pointbiserialr
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB

In [None]:
hair = pd.read_csv('Predict Hair Fall.csv')

In [None]:
hair.head()

In [None]:
hair.info()

At the beginning we can remove the `Id` column, since it is not useful for our analysis.

In [None]:
# Normalise the column names: trim surrounding whitespace and use underscores instead of spaces.
hair.columns = [column_name.strip().replace(' ', '_') for column_name in hair.columns]
# Shorten the one name that ends up with underscores around the ampersand.
hair = hair.rename(columns={'Medications_&_Treatments': 'Medications&Treatments'})

In [None]:
hair.drop('Id', axis = 1, inplace = True)

In [None]:
hair.info()

In [None]:
hair.describe().T

We don't have any null values, so no missing-value handling is required.

In [None]:
hair.isna().sum()

In [None]:
hair[hair.duplicated()]

In [None]:
hair.drop_duplicates(inplace=True, ignore_index=True)

In [None]:
hair.duplicated().sum()

In [None]:
counts =  hair['Hair_Loss'].value_counts()

In [None]:
sns.set_style("whitegrid")
# Recompute the class counts locally instead of relying on the global `counts`,
# which another cell in this notebook rebinds inside a loop.
hair_loss_counts = hair['Hair_Loss'].value_counts()
# value_counts() sorts by frequency, so build each legend entry from the class value
# of its wedge rather than hard-coding the order.
# NOTE(review): assumes Hair_Loss is coded 1 = bald, 0 = not bald — confirm against the dataset description.
legend_labels = ['Person is bald' if value == 1 else 'Person is not bald'
                 for value in hair_loss_counts.index]
plt.pie(hair_loss_counts, autopct='%1.1f%%', colors=['cornflowerblue', 'darksalmon'],
        textprops={'fontsize': 12})
plt.legend(labels=legend_labels, bbox_to_anchor=(1.2, 1.0))

In [None]:
canvas = sns.FacetGrid(hair, col='Hair_Loss')
canvas.map(sns.histplot, 'Age', bins = 12, color = 'thistle', kde = True)

In [None]:
plt.figure(figsize = (8,5))
sns.countplot(hair, x= 'Stress', hue = 'Hair_Loss', palette = 'Paired')

In [None]:
plt.figure(figsize=(10, 7))
sns.countplot(hair, y = 'Medical_Conditions', hue = 'Hair_Loss', palette = 'muted')

In [None]:
plt.figure(figsize=(10, 7))
sns.countplot(hair, y = 'Medications&Treatments', hue = 'Hair_Loss', palette = 'muted')

In [None]:
plt.figure(figsize=(10, 7))
sns.countplot(hair, y = 'Nutritional_Deficiencies', hue = 'Hair_Loss', palette = 'muted')

In [None]:
categorical = ['Genetics', 'Hormonal_Changes', 'Poor_Hair_Care_Habits', 'Environmental_Factors', 'Smoking', 'Weight_Loss']

# One pie per binary feature, laid out on a 2 x 3 grid.
fig, axes = plt.subplots(2, 3, figsize=(12, 10))

for ax, category in zip(axes.flatten(), categorical):
    # Use a dedicated name: rebinding the global `counts` here would silently change
    # what the Hair_Loss pie cell draws if it is re-run afterwards.
    # sort_index() fixes the wedge order, so a given answer always gets the same colour
    # in every pie instead of depending on which answer is more frequent.
    category_counts = hair[category].value_counts().sort_index()
    ax.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%', colors=['skyblue', 'pink'])
    ax.set_title(category.replace('_', ' '))

In [None]:
hair.info()

In [None]:
hair.apply(lambda x: x.unique())

Let's convert all Yes/No columns to binary values (1/0).

In [None]:
binary_columns = ['Genetics', 'Hormonal_Changes', 'Poor_Hair_Care_Habits', 'Environmental_Factors', 'Smoking', 'Weight_Loss']

def change_colums(df, columns):
    """Convert 'Yes'/'No' columns of ``df`` to 1/0 in place.

    The conversion is idempotent: values that are already 1/0 are kept, so
    re-running the cell does not wipe the columns to NaN. Missing values stay
    missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    columns : iterable of str
        Names of the columns to convert.

    Raises
    ------
    ValueError
        If a column holds a non-null value other than 'Yes', 'No', 1 or 0.
        The check runs before the column is overwritten, so the offending
        column is left untouched.
    """
    yes_no_to_binary = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
    for column in columns:
        converted = df[column].map(yes_no_to_binary)
        # Values that were present before mapping but are NaN afterwards had no mapping entry.
        unmapped = converted.isna() & df[column].notna()
        if unmapped.any():
            unexpected = df.loc[unmapped, column].unique().tolist()
            raise ValueError(f"Column {column!r} contains values other than Yes/No: {unexpected}")
        df[column] = converted
change_colums(hair,binary_columns)

In [None]:
hair.apply(lambda x: x.unique())

Encode the ordinal column `Stress` as integers that preserve its order (Low < Moderate < High).

In [None]:
hair['Stress'] = hair['Stress'].map({'Low': 1, 'Moderate' : 2, 'High' : 3})

In [None]:
hair.apply(lambda x: x.unique())

In [None]:
hair['Medical_Conditions'].value_counts()

In [None]:
hair['Medications&Treatments'].value_counts()

In [None]:
hair['Nutritional_Deficiencies'].value_counts()

In [None]:
# One-hot encode the multi-level categorical columns.
# NOTE(review): drop_first=True already removes one level per column, and a later cell
# additionally removes every 'No Data' column, so two levels per variable end up folded
# into the baseline — confirm this is intended.
multi_level_columns = ['Nutritional_Deficiencies', 'Medications&Treatments', 'Medical_Conditions']
hair = pd.get_dummies(hair, columns=multi_level_columns, drop_first=True)

In [None]:
hair.columns

In [None]:
hair = hair.loc[:, ~hair.columns.str.contains('No Data')]

In [None]:
hair.columns

In [None]:
hair.info()

In [None]:
hair.head()

In [None]:
# Chi-square test of independence between the target and every feature except Age.
ref = 'Hair_Loss'
selected = hair.drop(columns=['Age', 'Hair_Loss'])
result_list = []

# The target was dropped from `selected` above, so every remaining column is tested.
for feature_name, feature_values in selected.items():
    table = pd.crosstab(hair[ref], feature_values)
    statistic, p_val, _dof, _expected = chi2_contingency(table)
    result_list.append({'Zmienna': feature_name, 'Chi2': statistic, 'p-value': p_val})

result_df = pd.DataFrame(result_list)

In [None]:
def highlight(s, threshold=0.25):
    """Row styler for ``Styler.apply(..., axis=1)``: shade rows with a small p-value.

    Parameters
    ----------
    s : pandas.Series
        One row of the results table; must contain a ``'p-value'`` entry.
    threshold : float, default 0.25
        Rows whose p-value is strictly below this value are highlighted.

    Returns
    -------
    list of str
        One CSS string per cell in the row. Rows that are not highlighted get
        empty strings, so every row returns a list of the same length instead
        of ``None``.
    """
    css = 'background-color: skyblue' if s['p-value'] < threshold else ''
    return [css] * len(s)

In [None]:
result_df.sort_values(by = 'p-value').style.apply(highlight, axis=1)

In [None]:
corr, p_value = pointbiserialr(hair['Hair_Loss'], hair['Age'])
print(f'correlation = {corr}, p_value = {p_value}')

In [None]:
X = hair.drop('Hair_Loss', axis = 1)
y = hair['Hair_Loss']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [None]:
print(X_train.shape, X_test.shape)

## Our data are quite balanced

In [None]:
print(y_train.value_counts())
print(y_test.value_counts())

In [None]:
# Work on explicit copies: the frames returned by train_test_split are derived from X,
# and assigning a column to them directly can raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training data only, then apply the same scaling to both splits
# so no information from the test set leaks into the transformation.
scaler = MinMaxScaler().fit(X_train[['Age']])
X_train['Age'] = scaler.transform(X_train[['Age']]).ravel()
X_test['Age'] = scaler.transform(X_test[['Age']]).ravel()

In [None]:
def ModelFinder(X_train, y_train, estimator, param_grid, cv=10, scoring='accuracy',
                min_features_to_select=5, n_jobs=-1):
    """Select features with RFECV (when possible) and tune hyper-parameters with GridSearchCV.

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    estimator
        Unfitted scikit-learn estimator. It is fitted once on the training data
        to find out whether it exposes ``coef_`` or ``feature_importances_``.
    param_grid : dict
        Grid passed to ``GridSearchCV``.
    cv : int, default 10
        Number of cross-validation folds used by both RFECV and GridSearchCV.
    scoring : str, default 'accuracy'
        Scoring used by both RFECV and GridSearchCV.
    min_features_to_select : int, default 5
        Smallest feature subset RFECV is allowed to evaluate.
    n_jobs : int, default -1
        Parallelism used by both RFECV and GridSearchCV.

    Returns
    -------
    dict
        ``'n_features'`` (number of columns used in the grid search, or "N/A" when
        the estimator cannot be used with RFECV), ``'accuracy_score'`` (best
        cross-validated score), ``'best_params'``, and — only when the estimator
        supports RFECV — ``'selected_features_indices'`` and
        ``'selected_features_mask'`` ("N/A" if RFECV itself failed).
    """
    # RFECV ranks features through coef_ / feature_importances_, which only exist on a
    # fitted model. Fit once and reuse the answer instead of refitting for every check.
    fitted = estimator.fit(X_train, y_train)
    supports_rfecv = hasattr(fitted, 'coef_') or hasattr(fitted, 'feature_importances_')

    selector = None
    X_train_transformed = X_train

    if supports_rfecv:
        try:
            candidate = RFECV(estimator=estimator, step=1, cv=cv, scoring=scoring, n_jobs=n_jobs,
                              min_features_to_select=min_features_to_select, verbose=2)
            X_train_transformed = candidate.fit_transform(X_train, y_train)
            selector = candidate
        except Exception as e:
            # Best effort: if feature selection fails, fall back to the full feature set.
            print(f"An error occurred during RFECV: {e}")
            selector = None
            X_train_transformed = X_train
        else:
            mean_scores = selector.cv_results_['mean_test_score']
            # With step=1 the i-th score belongs to (min_features_to_select + i) features,
            # so the axis must start at min_features_to_select rather than at 1.
            feature_counts = range(min_features_to_select, min_features_to_select + len(mean_scores))
            fig, ax = plt.subplots()
            ax.plot(feature_counts, mean_scores)
            ax.set_title("RFECV - Number of Features vs. Accuracy")
            ax.set_xlabel("Number of Features")
            ax.set_ylabel("Cross-Validation Score (Accuracy)")
            plt.show()
    else:
        # If the estimator doesn't support RFECV directly, use the full feature set
        print("The estimator does not directly support RFECV. Using the full feature set.")

    # GridSearchCV on the (possibly reduced) feature set
    grid_search = GridSearchCV(estimator=estimator, param_grid=param_grid, scoring=scoring, cv=cv, n_jobs=n_jobs)
    grid_search.fit(X_train_transformed, y_train)

    # Results
    results = {
        'n_features': X_train_transformed.shape[1] if supports_rfecv else "N/A",
        'accuracy_score': grid_search.best_score_,
        'best_params': grid_search.best_params_,
    }
    if supports_rfecv:
        results.update({
            'selected_features_indices': selector.get_support(indices=True) if selector is not None else "N/A",
            'selected_features_mask': selector.support_ if selector is not None else "N/A",
        })

    return results

In [None]:
def printInformation(X_train, y_train, X_test, y_test, model):
    """Report how a fitted classifier performs on the training and test data.

    Prints a classification report for each split, draws both confusion
    matrices side by side, and plots train/test ROC curves with their AUC.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Test features and labels.
    model
        Fitted classifier exposing ``predict`` and ``classes_``.
        NOTE(review): the ROC part assumes a binary target whose positive class
        is ``model.classes_[1]`` — confirm if the target ever changes.
    """
    def _positive_class_scores(features):
        # A ROC curve needs continuous scores; hard 0/1 predictions collapse it to a
        # single operating point. Prefer probabilities, then the decision function,
        # and only fall back to hard labels when the model offers neither.
        if hasattr(model, 'predict_proba'):
            return model.predict_proba(features)[:, 1]
        if hasattr(model, 'decision_function'):
            return model.decision_function(features)
        return model.predict(features)

    test_predictions = model.predict(X_test)
    train_predictions = model.predict(X_train)

    print('Information for test data')
    print(classification_report(y_test, test_predictions))
    print('Information for train data')
    print(classification_report(y_train, train_predictions))

    # Confusion matrices: training data on the left, test data on the right.
    cm_train = confusion_matrix(y_train, train_predictions)
    cm_test = confusion_matrix(y_test, test_predictions)
    fig, ax = plt.subplots(1, 2, figsize=(10, 4))
    ConfusionMatrixDisplay(confusion_matrix=cm_train, display_labels=model.classes_).plot(ax=ax[0])
    ConfusionMatrixDisplay(confusion_matrix=cm_test, display_labels=model.classes_).plot(ax=ax[1])
    ax[0].set_title('Training Data')
    ax[1].set_title('Testing Data')
    fig.suptitle('Confusion Matrix')

    # ROC curves computed from scores rather than from predicted labels.
    fpr_train, tpr_train, _ = roc_curve(y_train, _positive_class_scores(X_train))
    roc_auc_train = auc(fpr_train, tpr_train)

    fpr_test, tpr_test, _ = roc_curve(y_test, _positive_class_scores(X_test))
    roc_auc_test = auc(fpr_test, tpr_test)
    fig, ax_roc = plt.subplots(figsize=(5, 5))

    ax_roc.plot(fpr_train, tpr_train, label=f'Train ROC curve (area = {roc_auc_train:.2f})', color='blue', lw=2)
    ax_roc.plot(fpr_test, tpr_test, label=f'Test ROC curve (area = {roc_auc_test:.2f})', color='darkorange', lw=2)

    # Diagonal = performance of a random classifier.
    ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax_roc.set_xlim([0.0, 1.0])
    ax_roc.set_ylim([0.0, 1.05])
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title('Receiver Operating Characteristic (ROC)')
    ax_roc.legend(loc="lower right")

    plt.show()

In [None]:
# Logarithmic grid for the inverse regularisation strength C; each power of ten from
# 1e-4 to 1e2 appears exactly once (the original list repeated 0.001 and skipped 0.01).
results_logreg = ModelFinder(X_train, y_train, LogisticRegression(max_iter=500, random_state = 101), {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]})

In [None]:
results_logreg

In [None]:
X_train_selected = X_train.iloc[:,results_logreg['selected_features_indices'] ]
X_test_selected  = X_test.iloc[:,results_logreg['selected_features_indices'] ]
X_train_selected.columns

In [None]:
# Fit the final model once, with the same max_iter as the estimator used in the search
# and with the hyper-parameters the grid search actually selected (instead of a value
# copied by hand, which goes stale whenever the search is re-run).
logreg = LogisticRegression(max_iter=500, random_state=101, **results_logreg['best_params']).fit(X_train_selected, y_train)

In [None]:
 printInformation(X_train_selected, y_train, X_test_selected, y_test, logreg)

In [None]:
results_tree = ModelFinder(X_train, y_train, DecisionTreeClassifier(random_state= 101),
                        { 'max_depth': [2 * x for x in range(2,8)], 
                         'criterion':['gini','entropy']})

In [None]:
results_tree

In [None]:
X_train_selected = X_train.iloc[:,results_tree['selected_features_indices'] ]
X_test_selected  = X_test.iloc[:,results_tree['selected_features_indices'] ]
X_train_selected.columns

In [None]:
tree = DecisionTreeClassifier(random_state= 101, criterion = 'entropy', max_depth= 8).fit(X_train_selected,y_train)

In [None]:
printInformation(X_train_selected, y_train, X_test_selected, y_test, tree)

In [None]:
results_forest = ModelFinder(X_train, y_train, RandomForestClassifier(random_state=101),
                            {'n_estimators': [5, 10, 50, 100, 200],  
                             'max_depth': [2 * x for x in range(2,8)],  
                             'criterion' :['gini', 'entropy']
                            })

In [None]:
results_forest

In [None]:
X_train_selected = X_train.iloc[:,results_forest['selected_features_indices'] ]
X_test_selected = X_test.iloc[:,results_forest['selected_features_indices'] ]
X_train_selected.columns

In [None]:
forest = RandomForestClassifier(random_state=101, max_depth= 14, criterion= 'entropy',  n_estimators = 5).fit(X_train_selected,y_train)

In [None]:
printInformation(X_train_selected, y_train, X_test_selected, y_test, forest)

In [None]:
# Hyper-parameter search space for gradient boosting.
gbc_param_grid = {
    'n_estimators': [5, 10, 50, 100, 200],            # number of trees
    'max_depth': list(range(4, 16, 2)),               # maximum tree depth: 4, 6, ..., 14
    'criterion': ['friedman_mse', 'squared_error'],
    'learning_rate': [0.025, 0.05, 0.1, 0.2],
}
results_gbc = ModelFinder(X_train, y_train, GradientBoostingClassifier(random_state=101), gbc_param_grid)

In [None]:
results_gbc

In [None]:
X_train_selected = X_train.iloc[:,results_gbc['selected_features_indices'] ]
X_test_selected = X_test.iloc[:,results_gbc['selected_features_indices'] ]
X_train_selected.columns

In [None]:
gbc = GradientBoostingClassifier(random_state=101, max_depth= 4, criterion= 'friedman_mse',  n_estimators = 200, learning_rate = 0.1).fit(X_train_selected,y_train)

In [None]:
printInformation(X_train_selected, y_train, X_test_selected, y_test, gbc)

In [None]:
# Hyper-parameter search space for k-nearest neighbours.
knc_param_grid = {
    'weights': ['uniform', 'distance'],               # how the neighbours' votes are weighted
    'n_neighbors': [5, 7, 9, 11, 15, 33, 66, 99],
    'metric': ['euclidean', 'manhattan'],
}
results_knc = ModelFinder(X_train, y_train, KNeighborsClassifier(), knc_param_grid)

In [None]:
results_knc

In [None]:
knc = KNeighborsClassifier(n_neighbors= 9, weights='distance', metric = 'manhattan').fit(X_train, y_train)

In [None]:
printInformation(X_train, y_train, X_test, y_test, knc)

In [None]:
results_svc = ModelFinder(X_train, y_train, SVC(random_state=101),
             {'C': [0.1, 1, 10, 100],  
              'gamma': [1, 0.1, 0.01, 0.001],  
              'kernel': ['rbf', 'poly', 'sigmoid']
             })

In [None]:
results_svc

In [None]:
svc = SVC(random_state=101, C = 10, gamma = 0.1, kernel='sigmoid').fit(X_train, y_train) 

In [None]:
printInformation(X_train, y_train, X_test, y_test, svc)

In [None]:
results_naive = ModelFinder(X_train, y_train, BernoulliNB(),
               {'alpha': [0.01, 0.1, 1.0, 10.0], 
                'binarize': [0.0, 0.1, 0.2],  
               })


In [None]:
results_naive

In [None]:
naive = BernoulliNB( alpha = 0.1, binarize= 0.1).fit(X_train, y_train)

In [None]:
printInformation(X_train, y_train, X_test, y_test, naive)