In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
hair = pd.read_csv('Predict Hair Fall.csv')

In [None]:
hair.head()

In [None]:
hair.info()

At the beginning we can remove the Id column, as it is not useful for our analysis.

In [None]:
# Normalise the headers: trim surrounding whitespace and use underscores instead of spaces.
hair.columns = [label.strip().replace(' ', '_') for label in hair.columns]
# Remove the underscores around the ampersand in this one header.
hair = hair.rename(columns={'Medications_&_Treatments': 'Medications&Treatments'})

In [None]:
# Remove the Id column before the analysis.
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
hair = hair.drop(columns='Id', errors='ignore')

In [None]:
hair.info()

In [None]:
hair.describe().T

We don't have any null values, so no missing-value handling is required.

In [None]:
hair.isna().sum()

In [None]:
hair[hair.duplicated()]

In [None]:
# Remove repeated rows and renumber the index from 0.
hair = hair.drop_duplicates(ignore_index=True)

In [None]:
hair.duplicated().sum()

In [None]:
# Number of non-null Hair_Loss entries for each Hair_Loss value (indexed by that value).
count = hair.groupby('Hair_Loss')['Hair_Loss'].count()

In [None]:
# Share of each Hair_Loss value as a percentage of all rows, rounded to two decimals.
n_rows = len(hair)
is_not_bold, is_bold = (round(count[label] / n_rows * 100, 2) for label in (0, 1))

In [None]:
sns.set_style("whitegrid")
# Slice order matches the legend order below: Hair_Loss == 1 first, then Hair_Loss == 0.
plt.pie([is_bold, is_not_bold], autopct='%1.1f%%', colors=['cornflowerblue', 'darksalmon'],
        textprops={'fontsize': 12})
plt.legend(labels=['Person is bald', 'Person is not bald'], bbox_to_anchor=(1.2, 1.0))
plt.title('Share of people with hair loss')
# Explicit show() renders the figure without echoing the last object's repr.
plt.show()

In [None]:
canvas = sns.FacetGrid(hair, col='Hair_Loss')
canvas.map(sns.histplot, 'Age', bins = 12, color = 'thistle', kde = True)

In [None]:
plt.figure(figsize = (8,5))
sns.countplot(hair, x= 'Stress', hue = 'Hair_Loss', palette = 'Paired')

In [None]:
hair.head()

In [None]:
categorical = ['Genetics', 'Hormonal_Changes', 'Poor_Hair_Care_Habits', 'Environmental_Factors', 'Smoking', 'Weight_Loss']

In [None]:
plt.figure(figsize=(10, 7))
sns.countplot(hair, y = 'Medical_Conditions', hue = 'Hair_Loss', palette = 'muted')

In [None]:
plt.figure(figsize=(10, 7))
sns.countplot(hair, y = 'Medications&Treatments', hue = 'Hair_Loss', palette = 'muted')

In [None]:
plt.figure(figsize=(10, 7))
sns.countplot(hair, y = 'Nutritional_Deficiencies', hue = 'Hair_Loss', palette = 'muted')

In [None]:
categorical = ['Genetics', 'Hormonal_Changes', 'Poor_Hair_Care_Habits', 'Environmental_Factors', 'Smoking', 'Weight_Loss']

# Figure layout: 2 rows x 3 columns, one pie chart per column in `categorical`.
fig, axes = plt.subplots(2, 3, figsize=(12, 10))

# Draw one pie chart per category.
for ax, category in zip(axes.flatten(), categorical):
    # value_counts() orders by frequency, which would pair the two colours with
    # different labels from one pie to the next; sorting by label keeps the
    # colour-to-label mapping the same in every chart.
    counts = hair[category].value_counts().sort_index()
    ax.pie(counts, labels=counts.index, autopct='%1.1f%%', colors=['skyblue', 'pink'])
    ax.set_title(category.replace('_', ' '))

plt.tight_layout()
plt.show()

In [None]:
hair.info()

In [None]:
hair.apply(lambda x: x.unique())

Let's convert all Yes/No columns to binary (1/0).

In [None]:
binary_columns = ['Genetics', 'Hormonal_Changes', 'Poor_Hair_Care_Habits', 'Environmental_Factors', 'Smoking', 'Weight_Loss']

def change_colums(df, columns):
    """Encode Yes/No columns of ``df`` as 1/0, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten.
    columns : iterable of str
        Names of the columns to encode.

    Notes
    -----
    'Yes' becomes 1 and 'No' becomes 0. Values that are already 1 or 0 are
    kept, so calling the function again on an encoded frame (for example when
    a notebook cell is re-run) leaves it unchanged. Any other value becomes NaN.
    """
    # Identity entries for 1 and 0 make the mapping idempotent.
    yes_no_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
    for column in columns:
        df[column] = df[column].map(yes_no_codes)
change_colums(hair,binary_columns)

In [None]:
hair.apply(lambda x: x.unique())

Modification of the ordered column – Stress

In [None]:
# Ordinal encoding of Stress: Low < Moderate < High.
# The already-encoded values map to themselves, so re-running this cell does
# not turn the column into NaN.
stress_levels = {'Low': 1, 'Moderate': 2, 'High': 3, 1: 1, 2: 2, 3: 3}
hair['Stress'] = hair['Stress'].map(stress_levels)

In [None]:
hair.apply(lambda x: x.unique())

In [None]:
hair['Medical_Conditions'].value_counts()

In [None]:
hair['Medications&Treatments'].value_counts()

In [None]:
hair['Nutritional_Deficiencies'].value_counts()

In [None]:
# One-hot encode the multi-valued categorical columns; drop_first removes the
# first level of each encoded column.
one_hot_columns = ['Nutritional_Deficiencies', 'Medications&Treatments', 'Medical_Conditions']
hair = pd.get_dummies(hair, columns=one_hot_columns, drop_first=True)

In [None]:
hair.columns

In [None]:
# Keep every column whose name does not contain 'No Data'.
kept_columns = [name for name in hair.columns if 'No Data' not in name]
hair = hair[kept_columns]

In [None]:
hair.columns

In [None]:
hair.info()

In [None]:
hair.head()

In [None]:
from sklearn.model_selection import train_test_split 

In [None]:
# Separate the target column from the feature columns.
target_column = 'Hair_Loss'
y = hair[target_column]
X = hair.drop(columns=target_column)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [None]:
print(X_train.shape, X_test.shape)

In [None]:
from sklearn import preprocessing

In [None]:
# Fit the scaler on the training split only, then apply the same transform to
# the test split.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with C=0.1 and a fixed random_state, fitted on the
# scaled training split.
logreg = LogisticRegression(C=0.1, random_state=101)
logreg = logreg.fit(X_train, y_train)

In [None]:
predictions = logreg.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
print(classification_report(y_test, predictions))

In [None]:
# Score the model on the training split for comparison with the test report.
# A separate variable is used so the test-set `predictions` stay intact for
# any cell that reads them afterwards.
train_predictions = logreg.predict(X_train)
print(classification_report(y_train, train_predictions))

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score
import numpy as np

def forward_selection_with_gridsearch(model, param_grid, X, y, cv=5):
    """Greedy forward feature selection with a grid search at every step.

    Starting from an empty feature set, each remaining column is tried in
    turn: a ``GridSearchCV`` is fitted on the already selected columns plus
    the candidate, and the candidate with the highest cross-validated score
    is added. The loop stops when no candidate improves on the best score so
    far or when every column has been selected.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``GridSearchCV``.
    param_grid : dict or list of dict
        Parameter grid passed to ``GridSearchCV``.
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Objects exposing ``to_numpy()`` (e.g. a pandas
        DataFrame) are converted so that positional column indexing works.
    y : array-like
        Target values.
    cv : int, default=5
        Value forwarded to the ``cv`` argument of ``GridSearchCV``.

    Returns
    -------
    best_model : estimator or None
        ``best_estimator_`` of the grid search for the final feature set
        (``None`` if ``X`` has no columns).
    best_features : list of int
        Selected column positions, in the order they were added.
    """
    # `X[:, cols]` below is positional indexing, which a DataFrame does not
    # support; arrays and sparse matrices are passed through untouched.
    if hasattr(X, "to_numpy"):
        X = X.to_numpy()

    # Start below any attainable score so that scorers returning negative
    # values can still select features.
    best_score = -np.inf
    best_model = None
    best_features = []
    available_features = list(range(X.shape[1]))
    while available_features:
        candidates = []
        for feature in available_features:
            features_to_test = best_features + [feature]
            X_subset = X[:, features_to_test]

            grid_search = GridSearchCV(model, param_grid, cv=cv)
            grid_search.fit(X_subset, y)
            # best_score_ is the mean cross-validated score of the winning
            # parameter set, so a second cross-validation pass is not needed.
            candidates.append(
                (grid_search.best_score_, feature, grid_search.best_estimator_)
            )

        # Highest score wins; ties go to the larger column index.
        score, feature_to_add, estimator = max(candidates, key=lambda c: (c[0], c[1]))
        if score <= best_score:
            break

        best_score, best_model = score, estimator
        best_features.append(feature_to_add)
        available_features.remove(feature_to_add)
        print(f"Feature {feature_to_add} added, score: {best_score}")

    print(f"Best score: {best_score}")
    print(f"Selected features: {best_features}")
    return best_model, best_features

# Example usage with a model and data of your choice
# model = YourModel()
# param_grid = {'param1': [values], 'param2': [values]}
# X = input_data
# y = labels
# best_model, best_features = forward_selection_with_gridsearch(model, param_grid, X, y)
