In [2]:
pip install scikit-learn scikit-image pandas


Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install imbalanced-learn


Collecting imbalanced-learn
  Using cached imbalanced_learn-0.12.2-py3-none-any.whl (257 kB)
Collecting joblib>=1.1.1
  Using cached joblib-1.3.2-py3-none-any.whl (302 kB)
Installing collected packages: joblib, imbalanced-learn
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.0
    Uninstalling joblib-1.1.0:
      Successfully uninstalled joblib-1.1.0
Successfully installed imbalanced-learn-0.12.2 joblib-1.3.2
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import numpy as np
import os
from skimage import io, color, feature, transform
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE

In [5]:
# Chargement des métadonnées
X = pd.read_csv("C:\\Users\\user\\OneDrive\\DATASIENCETEST\\PROJET\\Data\\Update\\X_train_update.csv", index_col=0)
y = pd.read_csv("C:\\Users\\user\\OneDrive\\DATASIENCETEST\\PROJET\\Data\\Update\\Y_train_CVw08PX.csv", index_col=0).squeeze().map(str)
X['image_path'] = X.apply(lambda row: os.path.join("C:/Users/user/OneDrive/DATASIENCETEST/PROJET/Data/Update/images/image_train", f'image_{row.imageid}_product_{row.productid}.jpg'), axis=1)

In [6]:
display(X)

Unnamed: 0,designation,description,productid,imageid,image_path
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,C:/Users/user/OneDrive/DATASIENCETEST/PROJET/D...
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,C:/Users/user/OneDrive/DATASIENCETEST/PROJET/D...
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,C:/Users/user/OneDrive/DATASIENCETEST/PROJET/D...
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,C:/Users/user/OneDrive/DATASIENCETEST/PROJET/D...
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,C:/Users/user/OneDrive/DATASIENCETEST/PROJET/D...
...,...,...,...,...,...
84911,The Sims [ Import Anglais ],,206719094,941495734,C:/Users/user/OneDrive/DATASIENCETEST/PROJET/D...
84912,Kit piscine acier NEVADA déco pierre Ø 3.50m x...,<b>Description complète :</b><br />Kit piscine...,3065095706,1188462883,C:/Users/user/OneDrive/DATASIENCETEST/PROJET/D...
84913,Journal Officiel De La Republique Francaise N°...,,440707564,1009325617,C:/Users/user/OneDrive/DATASIENCETEST/PROJET/D...
84914,Table Basse Bois De Récupération Massif Base B...,<p>Cette table basse a un design unique et con...,3942400296,1267353403,C:/Users/user/OneDrive/DATASIENCETEST/PROJET/D...


In [7]:
# Fonction d'extraction des caractéristiques HOG
def extract_hog_features(image_path):
    image = io.imread(image_path)
    image_gray = color.rgb2gray(image)
    image_resized = transform.resize(image_gray, (128, 64), anti_aliasing=True)
    hog_features = feature.hog(image_resized, pixels_per_cell=(16, 16),
                               cells_per_block=(1, 1), visualize=False)
    return hog_features

# Création du dataset
def create_dataset(X, y):
    features_list = []
    for image_path in X['image_path']:
        features = extract_hog_features(image_path)
        features_list.append(features)
    return np.array(features_list), y.values

In [8]:
# Réduction pour des raisons de démonstration
sample_X = X.head(50000)
sample_y = y.head(50000)

X_features, y_labels = create_dataset(sample_X,sample_y)

In [9]:
# Compter le nombre d'échantillons par classe
unique, counts = np.unique(y_labels, return_counts=True)
counts_dict = dict(zip(unique, counts))

display(counts_dict)

{'10': 1794,
 '1140': 1584,
 '1160': 2244,
 '1180': 455,
 '1280': 2853,
 '1281': 1193,
 '1300': 2977,
 '1301': 454,
 '1302': 1476,
 '1320': 1963,
 '1560': 3054,
 '1920': 2579,
 '1940': 440,
 '2060': 2997,
 '2220': 490,
 '2280': 2860,
 '2403': 2794,
 '2462': 845,
 '2522': 2889,
 '2582': 1495,
 '2583': 5971,
 '2585': 1483,
 '2705': 1648,
 '2905': 516,
 '40': 1472,
 '50': 994,
 '60': 480}

In [10]:
# Rééquilibrage des classes avec SMOTE
smote = SMOTE()
X_res, y_res = smote.fit_resample(X_features, y_labels)

# Division en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)


In [11]:
# Recalcul des poids de classe après rééquilibrage
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=np.unique(y_train),
                                                  y=y_train)
class_weights_dict = {class_label: weight for class_label, weight in zip(np.unique(y_train), class_weights)}

display(class_weights_dict)

{'10': 0.9966154345457496,
 '1140': 0.9889809063722107,
 '1160': 1.0105305220600334,
 '1180': 0.9970314710452469,
 '1280': 0.9914441215811078,
 '1281': 1.0161195017608546,
 '1300': 0.9995350026737346,
 '1301': 1.0050026883605676,
 '1302': 0.9947475588875005,
 '1320': 0.9989079418188578,
 '1560': 1.0020511386150153,
 '1920': 0.995162037037037,
 '1940': 1.0086101726726726,
 '2060': 0.9930930930930931,
 '2220': 1.0043687505840575,
 '2280': 0.9939196374901743,
 '2403': 1.0071216061095885,
 '2462': 1.0041576156774812,
 '2522': 0.9964075464701246,
 '2582': 1.0050026883605676,
 '2583': 0.9955768607290074,
 '2585': 1.0001628512935046,
 '2705': 0.9999534807991999,
 '2905': 1.00394656952034,
 '40': 0.9883670138170448,
 '50': 1.001420917773119,
 '60': 1.0028926680196888}

In [None]:
# Optimisation des hyperparamètres avec RandomizedSearchCV

from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'C': [1, 10],
    'gamma': ['scale'],
    'kernel': ['linear', 'rbf']
}

model = SVC(class_weight=class_weights_dict)

# Configuration pour RandomizedSearchCV
random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=4, cv=3, verbose=1, n_jobs=-1, scoring='accuracy')
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [None]:
# Affichage des meilleurs paramètres et de la meilleure score
print("Meilleurs paramètres:", random_search.best_params_)
print("Meilleur score:", random_search.best_score_)

In [None]:

# Évaluation sur l'ensemble de test
y_pred = random_search.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))