In [2]:
pip install requests beautifulsoup4 nltk scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install pandas numpy requests pillow keras scikit-learn xgboost imbalanced-learn

In [None]:
pip install --upgrade scikit-learn


## Scraping Craigslist data

In [1]:
import requests
from bs4 import BeautifulSoup
import json

# Base URL of the Craigslist page
base_url = 'https://chicago.craigslist.org/search/sya'

# Function to scrape a single page
def scrape_page(page_number, product_id_counter):
    url = f'{base_url}?s={page_number * 120}'  # Update the 's' parameter for pagination

    # Send a GET request to the page
    response = requests.get(url)
    data = response.text

    # Parse the HTML content of the page
    soup = BeautifulSoup(data, 'html.parser')

    # Find the script tag that contains the JSON data
    script = soup.find('script', {'id': 'ld_searchpage_results'})
    if not script:
        return product_id_counter

    # Extract and parse the JSON data
    json_data = json.loads(script.string)

    # Extract the list of listings
    listings = json_data['itemListElement']

    # Iterate over the listings and print the titles, images, and product IDs
    for listing in listings:
        title = listing['item']['name']
        images = listing['item'].get('image', [])
        product_id = product_id_counter
        print(f"Product ID: {product_id}, Title: {title}")
        for image in images:
            print(f"Image URL: {image}")
        print('-' * 20)
        product_id_counter += 1

    return product_id_counter

# Initialize product ID counter
product_id_counter = 1

# Scrape all 13 pages
for page in range(13):
    print(f"Scraping page {page + 1}")
    product_id_counter = scrape_page(page, product_id_counter)


Scraping page 1
Product ID: 1, Title: Gaming / Office Desktop PC
Image URL: https://images.craigslist.org/00W0W_dAbkpzR0Yoq_1320MM_600x450.jpg
Image URL: https://images.craigslist.org/01111_4jM5tz1TOM8_1320MM_600x450.jpg
Image URL: https://images.craigslist.org/00g0g_jHezqgaQT8P_1320MM_600x450.jpg
Image URL: https://images.craigslist.org/00I0I_6cxwgW3ctkv_1320MM_600x450.jpg
Image URL: https://images.craigslist.org/00r0r_goL6ev8vB9x_1320MM_600x450.jpg
Image URL: https://images.craigslist.org/00N0N_hrcRKtMsIbI_1320MM_600x450.jpg
Image URL: https://images.craigslist.org/00y0y_aqG1VzPO6Pr_1320MM_600x450.jpg
Image URL: https://images.craigslist.org/00i0i_76m6bcyv01i_1320MM_600x450.jpg
--------------------
Product ID: 2, Title: 4 eero 1st generation wifi mesh network - Model A010001
Image URL: https://images.craigslist.org/00c0c_l8ZfU5FhM3i_0oc0dk_600x450.jpg
Image URL: https://images.craigslist.org/00e0e_hsycooKg9g3_0oc06S_600x450.jpg
Image URL: https://images.craigslist.org/00K0K_iIl1u2I1V

In [2]:
import requests
from bs4 import BeautifulSoup
import json
import csv

# Base URL of the Craigslist page
base_url = 'https://chicago.craigslist.org/search/sya'

# Function to scrape a single page
def scrape_page(page_number, product_id_counter, csv_writer):
    url = f'{base_url}?s={page_number * 120}'  # Update the 's' parameter for pagination

    # Send a GET request to the page
    response = requests.get(url)
    data = response.text

    # Parse the HTML content of the page
    soup = BeautifulSoup(data, 'html.parser')

    # Find the script tag that contains the JSON data
    script = soup.find('script', {'id': 'ld_searchpage_results'})
    if not script:
        return product_id_counter

    # Extract and parse the JSON data
    json_data = json.loads(script.string)

    # Extract the list of listings
    listings = json_data['itemListElement']

    # Iterate over the listings and write to CSV
    for listing in listings:
        title = listing['item']['name']
        images = listing['item'].get('image', [])
        first_image = images[0] if images else 'No Image'  # Get the first image, if available
        product_id = product_id_counter

        # Write data to CSV
        csv_writer.writerow([product_id, title, first_image])

        product_id_counter += 1

    return product_id_counter

# Initialize product ID counter
product_id_counter = 1

# Create a CSV file and write header
csv_filename = 'craigslist_products.csv'
with open(csv_filename, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Product ID', 'Title', 'Image URL'])

    # Scrape all 13 pages
    for page in range(13):
        print(f"Scraping page {page + 1}")
        product_id_counter = scrape_page(page, product_id_counter, csv_writer)

print(f'Data saved to {csv_filename}')


Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11
Scraping page 12
Scraping page 13
Data saved to craigslist_products.csv


## Reading the craigslist data to be used for validation

In [3]:
import pandas as pd
image_df = pd.read_csv('final.csv')

In [4]:
image_df['Product ID'] = image_df['Image URL'].str.extract(r'.org/(.*)_600x450\.jpg')
image_df

Unnamed: 0,Title,Description,Product,Image URL,Product ID
0,7th Gen i5 Lenovo Thinkpad X1 Carbon Laptop (8...,CORE i5 Lenovo Thinkpad Laptop It is in great ...,laptops,https://images.craigslist.org/00000_kytbIPSKIC...,00000_kytbIPSKICf_0bC0fu
1,Dell XPS 8910 Intel Core i7 6700 8GB RAM 1TB H...,"Case has some scatches, cosmetic only. works p...",laptops,https://images.craigslist.org/00202_dIpxHWfxES...,00202_dIpxHWfxESV_0hR0CI
2,"HP Compaq Pro 6300 Desktop Workstation w/ SSD,...",A basic desktop computer running a modern oper...,desktop,https://images.craigslist.org/00303_5lylQ8p0sV...,00303_5lylQ8p0sVb_08I0fu
3,"BenQ 27"" GL2780 16:9 75Hz FHD LCD Monitor","BenQ GL2780 27"" Eye-Care Stylish 16:9 LCD Moni...",monitors,https://images.craigslist.org/00303_kXrm10haP2...,00303_kXrm10haP2O_084084
4,Dell Inspiron 5400 All-in-One PC i7-1165g7 256...,"Dell Inspiron 5400 24"" Intel Core i7-1165G7 25...",desktop,https://images.craigslist.org/00404_eztDG3JpPv...,00404_eztDG3JpPve_0cu09W
...,...,...,...,...,...
77,Slim HP Black Wired Keyboard KU-0841,Pre-owned but excellent condition slim black H...,keyboard,https://images.craigslist.org/01212_9062NxBH1U...,01212_9062NxBH1Uu_0uY0ej
78,2017 MSI gaming laptop,Gaming laptop for parts or repair\nDoes not tu...,laptops,https://images.craigslist.org/00q0q_1FvouTXVCs...,00q0q_1FvouTXVCse_0CI0t2
79,"Samsung Notebook NP300E5E Intel Celeron, 2 ext...","Samsung Notebook NP300E5E Intel Celeron, 2 ext...",laptops,https://images.craigslist.org/01313_fifkDj9TJK...,01313_fifkDj9TJKhz_0cO08w
80,LG MONITOR 19â€ inch,Works perfect donâ€™t use any more.,monitor,https://images.craigslist.org/00b0b_8nya1Bhvxn...,00b0b_8nya1Bhvxns_0CI0t2


## Read the amazon data for training

In [26]:
amazon_df = pd.read_csv('Products.csv')
amazon_df = amazon_df.rename(columns={'Link':'Image URL','Tag':'Product','Name':'Title'})
# amazon_df = amazon_df['Image URL','Product','Title']
amazon_df

Unnamed: 0,Product,Title,Image URL
0,Laptop & Accessories,"Matein Travel Laptop Backpack, Business Anti T...",https://images-na.ssl-images-amazon.com/images...
1,Laptop & Accessories,Mac Book Pro Charger - 118W USB C Charger Fast...,https://images-na.ssl-images-amazon.com/images...
2,Laptop & Accessories,"OMOTON Laptop Stand, Detachable Laptop Mount, ...",https://images-na.ssl-images-amazon.com/images...
3,Laptop & Accessories,"YOREPEK Travel Backpack, Extra Large 50L Lapto...",https://images-na.ssl-images-amazon.com/images...
4,Laptop & Accessories,MOSISO Compatible with MacBook Air 13 inch Cas...,https://images-na.ssl-images-amazon.com/images...
...,...,...,...
1507,Keyboard,Logitech Signature K650 Comfort Full-Size Wire...,https://images-na.ssl-images-amazon.com/images...
1508,Keyboard,Logitech POP Keys Mechanical Wireless Keyboard...,https://images-na.ssl-images-amazon.com/images...
1509,Keyboard,"iClever Bluetooth Keyboard, BK08 Folding Keybo...",https://images-na.ssl-images-amazon.com/images...
1510,Keyboard,Perixx Periboard-512 Ergonomic Split Keyboard ...,https://images-na.ssl-images-amazon.com/images...


In [72]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [73]:
pip install tensorflow


Note: you may need to restart the kernel to use updated packages.


## CNN

In [13]:
import pandas as pd
import numpy as np
import requests
from PIL import Image
from io import BytesIO
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from keras.applications import VGG16
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score


df=amazon_df.copy()

# Preprocess function for images
def preprocess_image(url, size=(64, 64)):
    try:
        response = requests.get(url)
        image = Image.open(BytesIO(response.content))
        image = image.resize(size)
        image = image.convert('RGB')  # Ensure 3 color channels
        return np.array(image) / 255.0  # Normalize the image
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None

# Encoding labels
label_encoder = LabelEncoder()
label_encoder.fit(df['Product'])  # Fit on all available labels
integer_encoded = label_encoder.transform(df['Product'])

# Preprocess images and prepare dataset
X = []
y = []

for _, row in df.iterrows():
    image_url = row['Image URL']
    label = row['Product']

    image = preprocess_image(image_url)
    if image is not None:
        X.append(image)
        y.append(label_encoder.transform([label])[0])

X = np.array(X)
y = np.array(y)

# Split the dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode the labels
y_train = to_categorical(y_train, num_classes=len(label_encoder.classes_))
y_val = to_categorical(y_val, num_classes=len(label_encoder.classes_))

# Load the VGG16 model pre-trained on ImageNet
base_model = VGG16(include_top=False, weights='imagenet', input_shape=(64, 64, 3))
for layer in base_model.layers:
    layer.trainable = False  # Freeze the layers

# Build the model
model = Sequential([
    base_model,
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Data Augmentation
train_datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest')

validation_datagen = ImageDataGenerator()

batch_size = 32
if len(X_train) < batch_size:
    batch_size = len(X_train)  # Reduce batch size if training data is small

# Adjust train_generator and validation_generator
train_generator = train_datagen.flow(X_train, y_train, batch_size=batch_size)
validation_generator = validation_datagen.flow(X_val, y_val, batch_size=batch_size)

# Calculate steps per epoch
train_steps_per_epoch = np.ceil(len(X_train) / batch_size)
val_steps_per_epoch = np.ceil(len(X_val) / batch_size)

# Check if steps per epoch are zero
if train_steps_per_epoch == 0 or val_steps_per_epoch == 0:
    raise ValueError("Training or validation set is too small for the batch size.")

# Train the model
history = model.fit(
    train_generator,
    steps_per_epoch=train_steps_per_epoch,
    epochs=10,
    validation_data=validation_generator,
    validation_steps=val_steps_per_epoch)

# Evaluate the model
_, accuracy = model.evaluate(X_val, y_val)
print(f'Validation Accuracy: {accuracy}')


y_val_pred = model.predict(X_val)
y_val_pred_classes = np.argmax(y_val_pred, axis=1)
y_val_true = np.argmax(y_val, axis=1)

# Calculate the accuracy
accuracy = accuracy_score(y_val_true, y_val_pred_classes)
print(f'Validation Accuracy: {accuracy}')

# Calculate the misclassification rate
misclassification_rate = 1 - accuracy
print(f'Misclassification Rate: {misclassification_rate}')

# Calculate precision, recall, and F1 score
precision = precision_score(y_val_true, y_val_pred_classes, average='weighted')
recall = recall_score(y_val_true, y_val_pred_classes, average='weighted')
f1 = f1_score(y_val_true, y_val_pred_classes, average='weighted')

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

# Confusion Matrix
cm = confusion_matrix(y_val_true, y_val_pred_classes)
print('Confusion Matrix:\n', cm)

# ... [previous code for training the model] ...

def predict_new_images_with_probabilities(new_image_urls, model, label_encoder, preprocess_func, size=(64, 64)):
    new_images = [preprocess_func(url, size) for url in new_image_urls]
    new_images = [image for image in new_images if image is not None]
    new_images = np.array(new_images)

    if new_images.size == 0:
        return "No valid images for prediction."

    probabilities = model.predict(new_images)
    predicted_labels = label_encoder.inverse_transform([np.argmax(p) for p in probabilities])
    max_probabilities = np.max(probabilities, axis=1)

    return [(url, label, prob) for url, label, prob in zip(new_image_urls, predicted_labels, max_probabilities)]

# Example Usage with New Image URLs
image_url_list = image_df['Image URL'].tolist()

predictions_with_prob = predict_new_images_with_probabilities(image_url_list, model, label_encoder, preprocess_image)

# Display the predictions
for item in predictions_with_prob:
    if isinstance(item, tuple):
        print(f"Image URL: {item[0]}, Predicted Label: {item[1]}, Probability: {item[2]:.2f}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.5s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=150; total time=   0.4s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=4, n_estimators=150; total time=   0.4s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=6, n_estimators=100; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time=   0.5s
[CV] END max_depth=10, min_samples_leaf=3, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=10, min_samples_leaf=3, min_samples_split=6, n_estimators=100; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=3, min_samples_split=6, n_estimators=200; total time=   0.5s
[CV] END max_depth=15, min_samples_l

In [14]:
predictions_df = pd.DataFrame(predictions_with_prob, columns=['Image URL', 'predicted_label', 'probability'])

# Merge the predictions DataFrame with the original image_df DataFrame
# Ensure that the column names used in 'on=' are the same in both DataFrames
final_df = pd.merge(image_df, predictions_df, on='Image URL')

# # Display the final DataFrame
# print(final_df)

In [15]:
final_df
final_df['predicted_label'] = np.where(final_df['probability']< 0.2,'Others',final_df['predicted_label'])
final_df.to_csv('cnn_data.csv')

## Random Forest

In [10]:
import numpy as np
import requests
from PIL import Image
from io import BytesIO
from keras.applications.vgg16 import VGG16, preprocess_input
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

# Function Definitions
def preprocess_image(url, size=(64, 64)):
    try:
        response = requests.get(url)
        image = Image.open(BytesIO(response.content))
        image = image.resize(size)
        image = image.convert('RGB')
        return np.array(image) / 255.0
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None

def extract_features_vgg16(images):
    base_model = VGG16(weights='imagenet', include_top=False, input_shape=(64, 64, 3))
    base_model.trainable = False
    features = base_model.predict(preprocess_input(np.array(images)))
    return features.reshape(features.shape[0], -1)

# Load dataset
 # Update the path accordingly
label_encoder = LabelEncoder()

# Preprocess images and extract features
X, y = [], []
for url, label in zip(amazon_df['Image URL'], amazon_df['Product']):  # Update column names if different
    processed_image = preprocess_image(url)
    if processed_image is not None:
        X.append(processed_image)
        y.append(label)



X_features = extract_features_vgg16(np.array(X))
y_encoded = label_encoder.fit_transform(y)

# Check shapes of extracted features
print("Shape of Extracted Features:", X_features.shape)

# Split the dataset
X_train, X_val, y_train, y_val = train_test_split(X_features, y_encoded, test_size=0.2, random_state=42)

param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3]
}


# Initialize the GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42, class_weight='balanced'), 
                           param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)



# Retrieve the best model
best_model = grid_search.best_estimator_

# Cross-validation
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))

# Evaluate the best model on validation set
y_val_pred = best_model.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy}")

# Feature importance
importances = best_model.feature_importances_
print("Top 10 Feature Importances:\n", importances[:10])

# Prediction function for new images using Random Forest
def predict_with_rf(new_image_urls, model, label_encoder, preprocess_func):
    new_images = [preprocess_func(url) for url in new_image_urls]
    new_images = [img for img in new_images if img is not None]
    new_features = extract_features_vgg16(np.array(new_images))
    probabilities = model.predict_proba(new_features)
    predicted_labels = label_encoder.inverse_transform([np.argmax(p) for p in probabilities])
    max_probabilities = np.max(probabilities, axis=1)
    return [(url, label, prob) for url, label, prob in zip(new_image_urls, predicted_labels, max_probabilities)]

# Example usage with new image URLs
image_url_list = image_df['Image URL'].tolist()  # Make sure this matches your dataframe
predictions_with_prob = predict_with_rf(image_url_list, grid_search.best_estimator_, label_encoder, preprocess_image)

# Display the predictions
for item in predictions_with_prob:
    print(f"Image URL: {item[0]}, Predicted Label: {item[1]}, Probability: {item[2]:.2f}")
# Example usage with new image URLsx
image_url_list = image_df['Image URL'].tolist()  # Make sure this matches your dataframe
predictions_with_prob = predict_with_rf(image_url_list, best_model, label_encoder, preprocess_image)

# Display the predictions
for item in predictions_with_prob:
    print(f"Image URL: {item[0]}, Predicted Label: {item[1]}, Probability: {item[2]:.2f}")

# Evaluate the best model on the validation set
y_val_pred = best_model.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred, average='weighted')
recall = recall_score(y_val, y_val_pred, average='weighted')
f1 = f1_score(y_val, y_val_pred, average='weighted')

print(f"Validation Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Shape of Extracted Features: (1512, 2048)
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Cross-validation scores: [0.85123967 0.84297521 0.82644628 0.82644628 0.85477178]
Mean CV Score: 0.8403758444497788
Validation Accuracy: 0.8316831683168316
Top 10 Feature Importances:
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Image URL: https://images.craigslist.org/00000_kytbIPSKICf_0bC0fu_600x450.jpg, Predicted Label: Printers, Probability: 0.38
Image URL: https://images.craigslist.org/00202_dIpxHWfxESV_0hR0CI_600x450.jpg, Predicted Label: Laptop & Accessories, Probability: 0.33
Image URL: https://images.craigslist.org/00303_5lylQ8p0sVb_08I0fu_600x450.jpg, Predicted Label: Tablets, Probability: 0.18
Image URL: https://images.craigslist.org/00303_kXrm10haP2O_084084_600x450.jpg, Predicted Label: Monitors, Probability: 0.47
Image URL: https://images.craigslist.org/00404_eztDG3JpPve_0cu09W_600x450.jpg, Predicted Label: Laptop & Accessories, Probability: 0.24
Image URL: https://images.craigslist

In [11]:
predictions_df = pd.DataFrame(predictions_with_prob, columns=['Image URL', 'predicted_label', 'probability'])

# Merge the predictions DataFrame with the original image_df DataFrame
# Ensure that the column names used in 'on=' are the same in both DataFrames
final_df = pd.merge(image_df, predictions_df, on='Image URL')

# # Display the final DataFrame
# print(final_df)

In [12]:
final_df
final_df['predicted_label'] = np.where(final_df['probability']< 0.2,'Others',final_df['predicted_label'])
final_df.to_csv('rf_data.csv')

## XGBoost Classifier

In [27]:
import pandas as pd
import numpy as np
import requests
from PIL import Image
from io import BytesIO
from keras.applications.vgg16 import VGG16, preprocess_input
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

# Function to preprocess images
def preprocess_image(url, size=(64, 64)):
    try:
        response = requests.get(url)
        image = Image.open(BytesIO(response.content))
        image = image.resize(size)
        image = image.convert('RGB')
        return np.array(image) / 255.0
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None

# Function to extract VGG16 features
def extract_features_vgg16(images):
    base_model = VGG16(weights='imagenet', include_top=False, input_shape=(64, 64, 3))
    base_model.trainable = False
    features = base_model.predict(preprocess_input(np.array(images)))
    return features.reshape(features.shape[0], -1)

# Load your dataset here
# Example: amazon_df = pd.read_csv('your_dataset.csv')
# Make sure you have a column 'Image URL' for image URLs and 'Product' for labels


# Preprocess images and extract features
X, y = [], []
for url, label in zip(amazon_df['Image URL'], amazon_df['Product']):
    processed_image = preprocess_image(url)
    if processed_image is not None:
        X.append(processed_image)
        y.append(label)

# Encoding labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Extract features
X_features = extract_features_vgg16(np.array(X))

# Split the dataset
X_train, X_val, y_train, y_val = train_test_split(X_features, y_encoded, test_size=0.2, random_state=42)

# Initialize XGBoost Classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.5, 0.7, 1.0]
}

# Initialize the GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Retrieve the best model
best_model = grid_search.best_estimator_

# Evaluate the best model on validation set
y_val_pred = best_model.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred, average='weighted')
recall = recall_score(y_val, y_val_pred, average='weighted')
f1 = f1_score(y_val, y_val_pred, average='weighted')

print(f"Validation Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Prediction function for new images
def predict_with_rf(new_image_urls, model, label_encoder, preprocess_func):
    new_images = [preprocess_func(url) for url in new_image_urls if preprocess_func(url) is not None]
    new_features = extract_features_vgg16(np.array(new_images))
    probabilities = model.predict_proba(new_features)
    predicted_labels = label_encoder.inverse_transform([np.argmax(p) for p in probabilities])
    max_probabilities = np.max(probabilities, axis=1)
    return [(url, label, prob) for url, label, prob in zip(new_image_urls, predicted_labels, max_probabilities)]

# Using the DataFrame's image URLs for prediction
image_url_list = amazon_df['Image URL'].tolist()
predictions_with_prob = predict_with_rf(image_url_list, best_model, label_encoder, preprocess_image)

# Display the predictions
for item in predictions_with_prob:
    print(f"Image URL: {item[0]}, Predicted Label: {item[1]}, Probability: {item[2]:.2f}")
    
# Evaluate the best model on the validation set
y_val_pred = best_model.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred, average='weighted')
recall = recall_score(y_val, y_val_pred, average='weighted')
f1 = f1_score(y_val, y_val_pred, average='weighted')

print(f"Validation Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Fitting 3 folds for each of 81 candidates, totalling 243 fits
Validation Accuracy: 0.8382838283828383
Precision: 0.8512996580676094
Recall: 0.8382838283828383
F1 Score: 0.8414283509798943
Image URL: https://images-na.ssl-images-amazon.com/images/I/81L8quiJXhL._AC_UL300_SR300,200_.jpg, Predicted Label: Laptop & Accessories, Probability: 0.99
Image URL: https://images-na.ssl-images-amazon.com/images/I/51cpWg2Ay7L._AC_UL300_SR300,200_.jpg, Predicted Label: Laptop & Accessories, Probability: 0.96
Image URL: https://images-na.ssl-images-amazon.com/images/I/71cWPw6TxnL._AC_UL300_SR300,200_.jpg, Predicted Label: Laptop & Accessories, Probability: 0.67
Image URL: https://images-na.ssl-images-amazon.com/images/I/711J5f5DBhL._AC_UL300_SR300,200_.jpg, Predicted Label: Laptop & Accessories, Probability: 0.98
Image URL: https://images-na.ssl-images-amazon.com/images/I/51A2y8WC7mL._AC_UL300_SR300,200_.jpg, Predicted Label: Laptop & Accessories, Probability: 0.96
Image URL: https://images-na.ssl-imag

In [None]:
predictions_df = pd.DataFrame(predictions_with_prob, columns=['Image URL', 'predicted_label', 'probability'])

# Merge the predictions DataFrame with the original image_df DataFrame
# Ensure that the column names used in 'on=' are the same in both DataFrames
final_df = pd.merge(image_df, predictions_df, on='Image URL')

# # Display the final DataFrame
# print(final_df)

In [None]:
final_df
final_df['predicted_label'] = np.where(final_df['probability']< 0.2,'Others',final_df['predicted_label'])
final_df.to_csv('xgb_data.csv')

## Final model selected is Random Forest which is then compared with text classification results for final classification