In [2]:
import joblib
import os
import shutil
import random
import pandas as pd
import cv2
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import Dataset, DataLoader
import torch

In [None]:
!unzip /kaggle/input/ifood-2019-fgvc6/train_set.zip -d train_set
!unzip /kaggle/input/ifood-2019-fgvc6/val_set.zip -d test_set

In [6]:
from torchvision import transforms
from PIL import Image
import os

# Function to limit images per class
def limit_images_per_class(labels_df, source_dir, dest_dir, limit=200, seed=None):
    """Copy a random subset of at most ``limit`` images per class into ``dest_dir``.

    Images are grouped by the ``label`` column of ``labels_df``; each group is
    copied into the sub-folder ``dest_dir/<label>``.

    Args:
        labels_df: DataFrame with an ``img_name`` column (file names relative to
            ``source_dir``) and a ``label`` column.
        source_dir: Directory that contains the original images.
        dest_dir: Directory that receives one sub-folder per label. It and the
            sub-folders are created when missing.
        limit: Maximum number of images copied per label.
        seed: Optional seed. When given, the same images are selected on every
            call; when ``None`` the module-level ``random`` state is used.

    Raises:
        FileNotFoundError: If a selected image does not exist in ``source_dir``
            (raised by ``shutil.copy``).
    """
    # A private generator keeps a seeded call from disturbing the global random state.
    rng = random if seed is None else random.Random(seed)

    # exist_ok=True makes the call safe to repeat on an existing output tree.
    os.makedirs(dest_dir, exist_ok=True)

    for label, group in labels_df.groupby('label'):
        dest_class_dir = os.path.join(dest_dir, str(label))
        os.makedirs(dest_class_dir, exist_ok=True)

        images = group['img_name'].tolist()
        # Classes with fewer than `limit` images are copied in full.
        selected_images = rng.sample(images, min(limit, len(images)))

        for image_name in selected_images:
            shutil.copy(
                os.path.join(source_dir, image_name),
                os.path.join(dest_class_dir, image_name),
            )
            
            
# Load the CSV files
# The "test" labels come from val_labels.csv: the notebook treats the
# validation archive as its test split.
train_labels = pd.read_csv('/kaggle/input/csv-files/train_labels.csv')
test_labels = pd.read_csv('/kaggle/input/csv-files/val_labels.csv')
# Define source and destination directories
# Source folders are the extracted archives; the limited_* folders receive the
# per-class subsets written by limit_images_per_class.
train_source_dir = '/kaggle/working/train_set/train_set'
test_source_dir = '/kaggle/working/test_set/val_set'
limited_train_dir = '/kaggle/working/limited_train_set2'
limited_test_dir = '/kaggle/working/limited_test_set2'

# NOTE(review): the block below is a bare string literal, so the two
# limit_images_per_class calls inside it do NOT run. Because it is the last
# expression of the cell, Jupyter echoes it as the cell's output.
# Its "100 images per class" remark does not match the limit=200 default of
# limit_images_per_class - confirm which figure is intended.
"""# Limit images per class for train and test sets
limit_images_per_class(train_labels, train_source_dir, limited_train_dir)
limit_images_per_class(test_labels, test_source_dir, limited_test_dir)
# Now train data has 100 images per class, validation has different number of images per class max 100"""


'# Limit images per class for train and test sets\nlimit_images_per_class(train_labels, train_source_dir, limited_train_dir)\nlimit_images_per_class(test_labels, test_source_dir, limited_test_dir)\n# Now train data has 100 images per class, validation has different number of images per class max 100'

In [4]:
# Create train, test and validation data
# Directories
train_dir = '/kaggle/working/limited_train_set2'  # Your directory containing 251 folders for training
val_dir = '/kaggle/working/limited_val_set2'  # Directory to save validation data
test_dir = '/kaggle/working/limited_test_set2'  # Your directory containing 251 folders for testing
# Create validation directory.
# exist_ok=True keeps the cell re-runnable: a plain os.makedirs raises
# FileExistsError once the directory exists.
os.makedirs(val_dir, exist_ok=True)

In [5]:
import os
import shutil
import random
from sklearn.model_selection import train_test_split

# Function to split data into training and validation
def split_train_val(train_dir, val_dir, val_ratio=0.2):
    """Move a share of every class folder from ``train_dir`` into ``val_dir``.

    For each class sub-folder of ``train_dir`` a ``val_ratio`` fraction of its
    files is chosen with ``train_test_split(..., random_state=42)`` and moved to
    the sub-folder of the same name under ``val_dir``.

    Args:
        train_dir: Directory with one sub-folder per class. It is modified in
            place: the selected files are moved out of it.
        val_dir: Directory that receives the validation files. A sub-folder is
            created for every class, even when no file is moved into it.
        val_ratio: Fraction of each class moved to validation.
    """
    # Sorted listings make the fixed random_state select the same files on
    # every machine; os.listdir returns entries in arbitrary order.
    for class_name in sorted(os.listdir(train_dir)):
        class_dir = os.path.join(train_dir, class_name)
        if not os.path.isdir(class_dir):
            # Stray files next to the class folders are not classes.
            continue

        # Always create the validation folder so train and validation keep
        # the same set of class folders.
        val_class_dir = os.path.join(val_dir, class_name)
        os.makedirs(val_class_dir, exist_ok=True)

        images = sorted(os.listdir(class_dir))
        if len(images) < 2:
            # train_test_split raises ValueError when one side of the split
            # would be empty; leave such classes entirely in training.
            continue

        _, val_images = train_test_split(images, test_size=val_ratio, random_state=42)

        for image in val_images:
            shutil.move(os.path.join(class_dir, image), os.path.join(val_class_dir, image))
# Perform the split
# NOTE: split_train_val moves files out of train_dir, so running this cell
# again moves a further share of the remaining training images.
split_train_val(train_dir, val_dir)

In [7]:
import cv2
import numpy as np
import os
import random
import shutil
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Function to limit images per class
def limit_images_per_class(data_dir, limit=200, seed=None):
    """Collect up to ``limit`` randomly chosen image paths per class folder.

    Class indices are the positions of the class folders in *sorted* order.
    Directories that contain the same folder names therefore get the same
    label ids, which raw ``os.listdir`` order does not guarantee.

    Note: this definition replaces the earlier function of the same name that
    copies files based on a labels DataFrame.

    Args:
        data_dir: Directory with one sub-folder per class.
        limit: Maximum number of images returned per class.
        seed: Optional seed for a reproducible selection. When ``None`` the
            module-level ``random`` state is used.

    Returns:
        Tuple ``(image_paths, labels, class_names)``. ``labels[i]`` is the index
        into ``class_names`` of the class that ``image_paths[i]`` belongs to.
    """
    rng = random if seed is None else random.Random(seed)

    # Only directories are classes; sorting fixes the name -> index mapping.
    class_names = sorted(
        name for name in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, name))
    )

    limited_image_paths = []
    labels = []
    for class_idx, class_name in enumerate(class_names):
        class_dir = os.path.join(data_dir, class_name)
        # Sorted so that a seeded call always samples from the same sequence.
        images = sorted(os.listdir(class_dir))
        selected_images = rng.sample(images, min(limit, len(images)))

        limited_image_paths.extend(
            os.path.join(class_dir, img_name) for img_name in selected_images
        )
        labels.extend([class_idx] * len(selected_images))

    return limited_image_paths, labels, class_names

# Function to extract SIFT features from images
def extract_sift_features(image_paths):
    """Compute SIFT descriptors for every readable image.

    Each image is loaded as grayscale and resized to 256x256 before detection.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        List of descriptor arrays, one per image that could be read and
        yielded at least one descriptor. Images that fail either condition are
        skipped, so the list can be shorter than ``image_paths`` and is not
        index-aligned with it.
    """
    sift = cv2.SIFT_create()
    descriptors_list = []
    for image_path in image_paths:
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            # cv2.imread returns None instead of raising when a file is
            # missing or cannot be decoded; cv2.resize would fail on it.
            print(f"Skipping unreadable image: {image_path}")
            continue
        image = cv2.resize(image, (256, 256))  # Resize for consistency
        _, descriptors = sift.detectAndCompute(image, None)
        if descriptors is not None:
            descriptors_list.append(descriptors)
    return descriptors_list

# Create the Bag of Words model
def create_bow(descriptors_list, num_clusters):
    """Cluster descriptors into a visual vocabulary of ``num_clusters`` words.

    Args:
        descriptors_list: Iterable of descriptor arrays; ``None`` entries are
            ignored.
        num_clusters: Cluster count passed to ``cv2.BOWKMeansTrainer``.

    Returns:
        The vocabulary returned by the trainer's ``cluster()`` call.
    """
    trainer = cv2.BOWKMeansTrainer(num_clusters)
    usable_descriptors = (entry for entry in descriptors_list if entry is not None)
    for entry in usable_descriptors:
        trainer.add(entry)
    return trainer.cluster()


# Create histograms of visual words
def create_histograms(image_paths, sift, bow_extractor, image_size=None):
    """Build one bag-of-visual-words histogram per image path.

    The result always has exactly one row per entry of ``image_paths`` so it
    stays index-aligned with the label list. Images that cannot be read, or
    that yield no descriptors, contribute an all-zero histogram.

    Args:
        image_paths: Sequence of image file paths.
        sift: Feature detector used to find keypoints
            (``detectAndCompute`` is called on it).
        bow_extractor: Extractor with a vocabulary already set; its
            ``compute`` and ``getVocabulary`` methods are used.
        image_size: Optional ``(width, height)`` to resize every image to
            before detection. ``None`` (default) keeps the original size.
            NOTE(review): ``extract_sift_features`` resizes to (256, 256);
            pass the same size here if the vocabulary was built from its
            output - confirm.

    Returns:
        ``np.ndarray`` built from the list of per-image histograms.
    """
    histograms = []
    for image_path in image_paths:
        histogram = None
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            # cv2.imread returns None for missing/undecodable files.
            print(f"Unreadable image, using an all-zero histogram: {image_path}")
        else:
            if image_size is not None:
                image = cv2.resize(image, image_size)
            keypoints, descriptors = sift.detectAndCompute(image, None)
            if descriptors is not None:
                histogram = bow_extractor.compute(image, keypoints)

        if histogram is None:
            # Placeholder row keeps the output aligned with image_paths.
            histogram = np.zeros((1, bow_extractor.getVocabulary().shape[0]), dtype=np.float32)
        histograms.append(histogram)
    return np.array(histograms)



# Progress marker: printed once every helper definition above has executed.
print("func def ends")

func def ends


In [None]:

# Load and limit image paths and labels
# NOTE: train_labels and test_labels are also the names of the label
# DataFrames read from CSV earlier in the notebook; this cell rebinds them to
# per-image integer label lists.
# NOTE(review): label ids are assigned separately for each directory by
# limit_images_per_class, so the three directories must contain the same
# class folders for the ids to agree - confirm.
train_image_paths, train_labels, class_names = limit_images_per_class(train_dir)
val_image_paths, val_labels, _ = limit_images_per_class(val_dir)
test_image_paths, test_labels, _ = limit_images_per_class(test_dir)


In [23]:
import joblib
import os
import numpy as np

# Paths to save the BoW vocabulary and histograms
# These are relative file names, so the files are read from / written to the
# kernel's current working directory.
vocabulary_path = 'bow_vocabulary.pkl'
train_histograms_path = 'train_histograms.pkl'
val_histograms_path = 'val_histograms.pkl'
test_histograms_path = 'test_histograms.pkl'

# Function to load or create and save the BoW vocabulary
def load_or_create_bow(descriptors_list, num_clusters, bow_path):
    """Return the BoW vocabulary, using ``bow_path`` as an on-disk cache.

    A cached vocabulary is reused only when its number of words matches
    ``num_clusters``; otherwise it is rebuilt and the cache file is replaced.

    Args:
        descriptors_list: Descriptor arrays used when the vocabulary has to be
            built; ignored when a matching cache exists.
        num_clusters: Requested vocabulary size.
        bow_path: Cache file read with ``joblib.load`` / written with
            ``joblib.dump``.

    Returns:
        The vocabulary (assumed to hold one row per visual word, as produced
        by ``create_bow`` - TODO confirm).
    """
    if os.path.exists(bow_path):
        # joblib.load unpickles the file and can execute arbitrary code:
        # only point bow_path at files you created yourself.
        vocabulary = joblib.load(bow_path)
        if len(vocabulary) == num_clusters:
            print(f"BoW vocabulary loaded from {bow_path}")
            return vocabulary
        # Stale cache: it was built with a different cluster count.
        print(f"Cached BoW vocabulary at {bow_path} has {len(vocabulary)} words, "
              f"expected {num_clusters}; rebuilding")

    vocabulary = create_bow(descriptors_list, num_clusters)
    joblib.dump(vocabulary, bow_path)
    print(f"BoW vocabulary saved to {bow_path}")
    return vocabulary

# Function to load or create and save histograms
def load_or_create_histograms(image_paths, sift, bow_extractor, histograms_path):
    """Return BoW histograms for ``image_paths``, cached at ``histograms_path``.

    A cached array is reused only when it holds one entry per image path;
    otherwise the histograms are recomputed and the cache file is replaced.
    The check compares lengths only - it cannot detect a cache built from a
    different image list of the same size.

    Args:
        image_paths: Image file paths, in the same order as their labels.
        sift: Feature detector forwarded to ``create_histograms``.
        bow_extractor: BoW extractor forwarded to ``create_histograms``.
        histograms_path: Cache file read with ``joblib.load`` / written with
            ``joblib.dump``.

    Returns:
        The histograms as returned by ``create_histograms`` (or the cache).
    """
    if os.path.exists(histograms_path):
        # joblib.load unpickles the file and can execute arbitrary code:
        # only point histograms_path at files you created yourself.
        histograms = joblib.load(histograms_path)
        if len(histograms) == len(image_paths):
            print(f"Histograms loaded from {histograms_path}")
            return histograms
        # Stale cache: rows would no longer line up with the labels.
        print(f"Cached histograms at {histograms_path} hold {len(histograms)} rows, "
              f"expected {len(image_paths)}; recomputing")

    histograms = create_histograms(image_paths, sift, bow_extractor)
    joblib.dump(histograms, histograms_path)
    print(f"Histograms saved to {histograms_path}")
    return histograms



In [None]:
# Create or load BoW vocabulary
# NOTE(review): train_descriptors_list is not assigned in any cell visible in
# this notebook; presumably it should be
# extract_sift_features(train_image_paths) - confirm. load_or_create_bow only
# needs it when the vocabulary has to be built rather than loaded.
num_clusters = 100
vocabulary = load_or_create_bow(train_descriptors_list, num_clusters, vocabulary_path)

In [24]:
"""
# Create or load histograms of visual words
train_histograms = load_or_create_histograms(train_descriptors_list, vocabulary, train_histograms_path)
val_histograms = load_or_create_histograms(val_descriptors_list, vocabulary, val_histograms_path)
test_histograms = load_or_create_histograms(test_descriptors_list, vocabulary, test_histograms_path) """

# Create BoW extractor
sift = cv2.SIFT_create()
# algorithm=1 presumably selects FLANN's KD-tree index, here with 5 trees -
# TODO confirm against OpenCV's FLANN index constants.
flann_params = dict(algorithm=1, trees=5)
flann_matcher = cv2.FlannBasedMatcher(flann_params, {})
# The extractor uses the matcher to assign each SIFT descriptor of an image
# to a word of the vocabulary set below.
bow_extractor = cv2.BOWImgDescriptorExtractor(sift, flann_matcher)
bow_extractor.setVocabulary(vocabulary)

# Create or load histograms of visual words
# Each call reuses the cache file at the given path when it exists; otherwise
# it computes the histograms from the images and writes the file.
train_histograms = load_or_create_histograms(train_image_paths, sift, bow_extractor, train_histograms_path)
val_histograms = load_or_create_histograms(val_image_paths, sift, bow_extractor, val_histograms_path)
test_histograms = load_or_create_histograms(test_image_paths, sift, bow_extractor, test_histograms_path)

Histograms saved to train_histograms.pkl
Histograms saved to val_histograms.pkl
Histograms saved to test_histograms.pkl


In [9]:
# Function to load histograms
def load_histograms(histograms_path):
    """Load previously saved histograms from ``histograms_path``.

    Args:
        histograms_path: File written earlier with ``joblib.dump``.

    Returns:
        The object stored in the file.

    Raises:
        FileNotFoundError: If ``histograms_path`` does not exist.
    """
    # Guard clause: fail early with the path in the message.
    if not os.path.exists(histograms_path):
        raise FileNotFoundError(f"Histograms file {histograms_path} not found.")

    loaded = joblib.load(histograms_path)
    print(f"Histograms loaded from {histograms_path}")
    return loaded

# Point the histogram paths at files in a Kaggle input dataset.
# NOTE: this rebinds the *_histograms_path names that an earlier cell set to
# local file names, and the loads below replace any histograms already held in
# train_histograms / val_histograms / test_histograms.
train_histograms_path = "/kaggle/input/sift-bow-hisogram/train_histograms.pkl"
val_histograms_path = "/kaggle/input/sift-bow-hisogram/val_histograms.pkl"
test_histograms_path = "/kaggle/input/sift-bow-hisogram/test_histograms.pkl"

# Load histograms for train, validation, and test datasets
# load_histograms raises FileNotFoundError when a file is missing.
train_histograms = load_histograms(train_histograms_path)
val_histograms = load_histograms(val_histograms_path)
test_histograms = load_histograms(test_histograms_path)

Histograms loaded from /kaggle/input/sift-bow-hisogram/train_histograms.pkl
Histograms loaded from /kaggle/input/sift-bow-hisogram/val_histograms.pkl
Histograms loaded from /kaggle/input/sift-bow-hisogram/test_histograms.pkl


In [10]:
# Ensure histograms are in the correct shape:
# keep the first axis (one entry per image) and flatten everything else.
train_histograms, val_histograms, test_histograms = (
    hist.reshape(hist.shape[0], -1)
    for hist in (train_histograms, val_histograms, test_histograms)
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Define the parameter grid
param_grid = {
    'C': [1, 10, 100],
    'gamma': ['scale', 'auto', 0.01],
    'kernel': ['rbf', 'sigmoid']
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(train_histograms, train_labels)

# Output the best parameters and best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_}")

# GridSearchCV refits the best configuration on the full training data
# (refit=True is the default), so best_estimator_ is used without another fit.
best_svm = grid_search.best_estimator_

# Evaluate the model on validation and test sets
val_predictions = best_svm.predict(val_histograms)
test_predictions = best_svm.predict(test_histograms)

# zero_division=0 reports 0 for classes with no predicted/true samples, which
# is the value the default setting uses as well, but without emitting an
# undefined-metric warning for each such class.
val_accuracy = accuracy_score(val_labels, val_predictions)
val_precision = precision_score(val_labels, val_predictions, average='weighted', zero_division=0)
val_recall = recall_score(val_labels, val_predictions, average='weighted', zero_division=0)
val_f1 = f1_score(val_labels, val_predictions, average='weighted', zero_division=0)

test_accuracy = accuracy_score(test_labels, test_predictions)
test_precision = precision_score(test_labels, test_predictions, average='weighted', zero_division=0)
test_recall = recall_score(test_labels, test_predictions, average='weighted', zero_division=0)
test_f1 = f1_score(test_labels, test_predictions, average='weighted', zero_division=0)

print(f"Validation Accuracy: {val_accuracy}")
print(f"Validation Precision: {val_precision}")
print(f"Validation Recall: {val_recall}")
print(f"Validation F1-score: {val_f1}")

print(f"Test Accuracy: {test_accuracy}")
print(f"Test Precision: {test_precision}")
print(f"Test Recall: {test_recall}")
print(f"Test F1-score: {test_f1}")


In [None]:
# Train the final model with the best parameters
# StratifiedKFold is used below but is not imported by any earlier cell.
from sklearn.model_selection import StratifiedKFold

# Combine train and validation data
combined_histograms = np.vstack((train_histograms, val_histograms))
combined_labels = np.hstack((train_labels, val_labels))

best_svm = SVC(C=1.0, gamma='scale', kernel='rbf')  # enter the final (best) params

# Train the SVM model with cross-validation to plot the performance.
# The scaler is fitted on each fold's training part only, so statistics of the
# held-out part never influence the features it is scored on.
cv = StratifiedKFold(n_splits=5)
val_losses = []

for train_index, val_index in cv.split(combined_histograms, combined_labels):
    fold_scaler = StandardScaler().fit(combined_histograms[train_index])
    X_train = fold_scaler.transform(combined_histograms[train_index])
    X_val = fold_scaler.transform(combined_histograms[val_index])
    y_train, y_val = combined_labels[train_index], combined_labels[val_index]

    best_svm.fit(X_train, y_train)
    val_score = best_svm.score(X_val, y_val)
    val_loss = 1 - val_score  # Loss is 1 - accuracy
    val_losses.append(val_loss)

# Plot the validation loss
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(val_losses) + 1), val_losses, marker='o', linestyle='-', color='b')
plt.title('Validation Loss Across Cross-Validation Folds')
plt.xlabel('Fold')
plt.ylabel('Validation Loss')
plt.ylim([0, 1])
plt.grid()
plt.show()

# Scale with statistics of the combined train + validation data.
# The scaled arrays get their own names: overwriting test_histograms would
# scale it a second time whenever this cell is run again.
scaler = StandardScaler().fit(combined_histograms)
combined_histograms_scaled = scaler.transform(combined_histograms)
test_histograms_scaled = scaler.transform(test_histograms)

# Train the best model on the combined training and validation data
best_svm.fit(combined_histograms_scaled, combined_labels)

# Save the final model
joblib.dump(best_svm, 'best_svm_model_final.pkl')
print("Final model saved as 'best_svm_model_final.pkl'")

# The same model is also written as 'best_svm_model.pkl'.
joblib.dump(best_svm, 'best_svm_model.pkl')
print("Final model saved as 'best_svm_model.pkl'")

# The model expects scaled features, so the fitted scaler is saved next to it.
joblib.dump(scaler, 'best_svm_scaler.pkl')
print("Feature scaler saved as 'best_svm_scaler.pkl'")

# Evaluate the final model on the test set
test_accuracy = best_svm.score(test_histograms_scaled, test_labels)
print(f'Test Accuracy: {test_accuracy}')
