**Importing Libraries**

In [15]:
import pandas as pd
import numpy as np
from glob import glob
import os
import cv2
import matplotlib.pylab as plt
from skimage.io import imread
from skimage.transform import resize
from skimage.color import rgb2gray,rgba2rgb

In [16]:
input_dir = '/kaggle/input/hair-type-splitted/train_set'
categories = ['Curly Hair', 'Straight Hair', 'Wavy Hair']
data = []
labels = []
target_size = (15, 15)
for category_idx, category in enumerate(categories):
    category_path = os.path.join(input_dir, category)
    for file in os.listdir(category_path):
        img_path = os.path.join(category_path, file)
        try:
            img = imread(img_path)
            if len(img.shape) == 3 and img.shape[2] == 4:  
                img = rgba2rgb(img) 
            if len(img.shape) == 3:  
                img = rgb2gray(img)

            img_resized = resize(img, target_size)
            data.append(img_resized.flatten())
            labels.append(category_idx)
        except Exception as e:
            print(f"Error processing {img_path}: {e}")
            continue

data = np.array(data)
labels = np.array(labels)

In [17]:
data

array([[0.43515261, 0.45782736, 0.46780252, ..., 0.44330161, 0.29881276,
        0.2914952 ],
       [0.98930154, 0.98699554, 0.98432146, ..., 0.14229922, 0.14942656,
        0.14828445],
       [0.97028092, 0.97181066, 0.97279248, ..., 0.90833367, 0.85114813,
        0.60820932],
       ...,
       [0.56539861, 0.54674647, 0.52259801, ..., 0.33927923, 0.25844749,
        0.15367959],
       [0.16251149, 0.15428063, 0.15245773, ..., 0.27342587, 0.22040636,
        0.2151701 ],
       [0.80431694, 0.80696752, 0.81665552, ..., 0.614681  , 0.61595323,
        0.62979654]])

In [18]:
print(data.shape)
print(labels.shape)

(694, 225)
(694,)


In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

scaler = StandardScaler()
train_data = scaler.fit_transform(data)

random_forest = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='log2',
    max_depth=20
)
random_forest.fit(train_data, labels)

In [20]:
train_data

array([[-0.73609373, -0.64666223, -0.5662971 , ..., -0.13962273,
        -0.75308063, -0.83094212],
       [ 1.21809772,  1.2534437 ,  1.29895763, ..., -1.24498799,
        -1.29188111, -1.33982359],
       [ 1.15102202,  1.19891876,  1.25732412, ...,  1.56810554,
         1.23906174,  0.29446183],
       ...,
       [-0.2767847 , -0.32737673, -0.36841945, ..., -0.52162213,
        -0.89866853, -1.32065262],
       [-1.69755538, -1.73661995, -1.7050711 , ..., -0.76345415,
        -1.03587386, -1.10215381],
       [ 0.56575446,  0.60700971,  0.69348187, ...,  0.48973052,
         0.39076966,  0.37116932]])

**Hyperparameter Tuning**

In [None]:
from sklearn.model_selection import RandomizedSearchCV
scaler = StandardScaler()
train_data1 = scaler.fit_transform(data)
param_dist = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt','log2',0.5,0.75]
}
random_forest1 = RandomForestClassifier(random_state=42)

random_search = RandomizedSearchCV(estimator=random_forest1, param_distributions=param_dist,
                                   n_iter=20, cv=20, random_state=42, n_jobs=-1, verbose=1, scoring='accuracy')

random_search.fit(train_data1, labels)
print(f"Best parameters: {random_search.best_params_}")
print(f"Best cross-validation accuracy: {random_search.best_score_}")

Fitting 20 folds for each of 20 candidates, totalling 400 fits


**Testing**

In [21]:
input_dir = '/kaggle/input/hair-type-splitted/test_set'
categories = ['Curly Hair', 'Straight Hair', 'Wavy Hair']
data1 = []
labels1 = []
target_size = (15, 15)
for category_idx, category in enumerate(categories):
    category_path = os.path.join(input_dir, category)
    for file in os.listdir(category_path):
        img_path = os.path.join(category_path, file)
        try:
            img = imread(img_path)
            if len(img.shape) == 3 and img.shape[2] == 4:  
                img = rgba2rgb(img) 
            if len(img.shape) == 3:  
                img = rgb2gray(img)

            img_resized = resize(img, target_size)
            data1.append(img_resized.flatten())
            labels1.append(category_idx)
        except Exception as e:
            print(f"Error processing {img_path}: {e}")
            continue

data1 = np.array(data1)
labels1 = np.array(labels1)

Error processing /kaggle/input/hair-type-splitted/test_set/Curly Hair/rs_1080x1080-200330130638-1080-ariana-grande-curly-hair-instagram-am-033020.gif: output_shape length cannot be smaller than the image number of dimensions


In [22]:
test_data = scaler.transform(data1)
test_data

array([[ 0.6944291 ,  0.59155919,  0.57666177, ...,  1.67971191,
         1.02796254,  0.88481128],
       [ 1.25578123,  1.29629032,  1.34523111, ...,  1.90100997,
         1.77593202,  1.68664095],
       [ 0.65278642,  0.14796596, -1.00709026, ...,  0.5969365 ,
         0.17859507,  0.31304641],
       ...,
       [-0.92603968, -0.94278146, -0.91101195, ..., -0.52811365,
        -0.87944604, -1.21815688],
       [ 0.2482902 ,  0.35766727,  0.50202962, ..., -1.20204045,
        -1.18165655, -1.18905338],
       [-0.45666179, -0.48763157, -0.37134223, ..., -1.31182354,
        -1.38959597, -1.44738698]])

In [23]:
test_predictions = random_forest.predict(test_data)
test_accuracy = accuracy_score(labels1, test_predictions)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

Test Accuracy: 64.55%


***Cross Validation***

In [24]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [25]:
def evaluate_model(model, train_data, train_labels, cv_folds=5):
    cv_scores = cross_val_score(model, train_data, train_labels, cv=cv_folds)
    print(f"{model.__class__.__name__} Cross-validation scores: {cv_scores}")
    print(f"Mean cross-validation accuracy: {np.mean(cv_scores) * 100:.2f}%")

In [26]:
models = [
    SVC(kernel='linear', C=1),
    RandomForestClassifier(n_estimators=100, random_state=42),
    KNeighborsClassifier(n_neighbors=5),
    LogisticRegression(max_iter=500)
]

for model in models:
    evaluate_model(model, train_data,labels)

SVC Cross-validation scores: [0.61870504 0.50359712 0.50359712 0.62589928 0.51449275]
Mean cross-validation accuracy: 55.33%
RandomForestClassifier Cross-validation scores: [0.61151079 0.58992806 0.65467626 0.64028777 0.5942029 ]
Mean cross-validation accuracy: 61.81%
KNeighborsClassifier Cross-validation scores: [0.51079137 0.41007194 0.48920863 0.50359712 0.51449275]
Mean cross-validation accuracy: 48.56%
LogisticRegression Cross-validation scores: [0.50359712 0.5323741  0.51798561 0.58992806 0.52173913]
Mean cross-validation accuracy: 53.31%


In [27]:
import joblib
joblib.dump(random_forest,'hairtype_model.pkl')
print("File made successfully")

File made successfully
