# Combine text and image feature vectors and use different clustering algorithms
**This notebook runs on Kaggle.**
In this notebook we combine image features and text features and cluster them with the best-performing algorithms. For the text features we use both a Bag-of-Words approach and Sentence Transformers.

Needed for this notebook:
* Pre-processed CSV file using [data_exploration_and_cleaning.ipynb](data_exploration_and_cleaning.ipynb) with name in English and German (both clean) 
* Image features dictionary in a pickle file generated in [finetuning-resnet50.ipynb](finetuning-resnet50.ipynb) or [finetuning-vgg16.ipynb](finetuning-vgg16.ipynb)

In [None]:
# The line below is only needed when running on Colab or Kaggle, where sentence_transformers must be installed first
!pip install sentence_transformers

In [None]:
# import os
import pickle
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# For embedding text
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from sklearn.cluster import DBSCAN, KMeans
from sklearn.feature_extraction.text import CountVectorizer

# models
import tensorflow as tf
print('TF Version:', tf.__version__)

## Helper functions

In [None]:
def dump_to_pickle_file(parts_feature, dump_path):
    """
    Serialize a dictionary to disk using pickle.

    Args:
        parts_feature (dict): Dictionary to be saved.
        dump_path (str): Path of the pickle file to write (opened in
            binary write mode, so an existing file is overwritten).
    """
    with open(dump_path, mode='wb') as handle:
        pickler = pickle.Pickler(handle)
        pickler.dump(parts_feature)

def load_from_pickle_file(pickle_path):
    """
    Read back an object that was stored with pickle.

    Args:
        pickle_path (str): Path to the pickle file.

    Returns:
        dict: The unpickled object (a dictionary when the file was written
        by dump_to_pickle_file).
    """
    # NOTE: unpickling can execute arbitrary code, so only load files that
    # come from a trusted source.
    with open(pickle_path, mode='rb') as handle:
        loaded = pickle.Unpickler(handle).load()
    return loaded

In [None]:
# This set of functions helps in creating BOW
import unicodedata
import re

def remove_accented_chars(text):
    """
    Strip accents from a string, keeping only ASCII characters.

    Args:
        text (str): Input text that may contain accented characters.

    Returns:
        str: The text with accented letters reduced to their base ASCII
        letter; characters that have no ASCII decomposition are dropped.
    """
    # NFKD splits an accented letter into its base letter plus combining marks.
    decomposed = unicodedata.normalize('NFKD', text)
    # Encoding to ASCII with 'ignore' discards the combining marks and any
    # other non-ASCII code point.
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def pre_process_corpus(docs):
    """
    Normalize a collection of documents for text analysis.

    Every document is lower-cased, stripped of accents, reduced to ASCII
    letters, digits and whitespace, and has runs of spaces collapsed.

    Args:
        docs (list of str): Documents to be pre-processed.

    Returns:
        list of str: The normalized documents, in the same order.
    """
    def _clean(raw):
        # Newlines, tabs and carriage returns become plain spaces.
        spaced = raw.translate(raw.maketrans("\n\t\r", "   "))
        ascii_only = remove_accented_chars(spaced.lower())
        # Anything that is not an ASCII letter, digit or whitespace becomes a space.
        kept = re.sub(r'[^a-zA-Z0-9\s]', ' ', ascii_only, flags=re.I|re.A)
        # Collapse runs of spaces and trim both ends.
        return re.sub(' +', ' ', kept).strip()

    return [_clean(doc) for doc in docs]

In [None]:
# This helper function prints evaluation metrics for a clustering result
def print_evaluation(target, labels, itemnumber, remove_noise = False):
    """
    Print evaluation metrics for clustering results.

    Every cluster is assigned the most frequent target among its members. A
    sample counts as correct when its own target equals the target assigned
    to its cluster.

    Args:
        target (list): List of true target labels.
        labels (list): List of predicted cluster labels.
        itemnumber (list): List of item numbers corresponding to the samples.
        remove_noise (bool, optional): Whether to drop samples whose cluster
            label is -1 before evaluating. Defaults to False.
    """
    cluster_labels = pd.DataFrame(
        {"target": target, "cluster": labels, "ItemNumber": itemnumber}
    )
    if remove_noise:
        # Copy the filtered rows so the columns added below are written to an
        # independent frame rather than to a slice of the unfiltered one.
        cluster_labels = cluster_labels[~cluster_labels["cluster"].isin([-1])].copy()

    n_samples = len(cluster_labels)
    if n_samples == 0:
        # Nothing left to evaluate (e.g. every sample was labelled as noise):
        # report that explicitly instead of computing 0 / 0.
        print("-------------------------------------")
        print("Number of samples", 0)
        print("Number of clusters:", 0)
        print("Percentage correct:", float("nan"))
        print("------------------------------------------")
        return

    # Map each cluster to the most frequent target class among its members.
    cluster_map = {}
    for cluster, members in cluster_labels.groupby("cluster"):
        counts = members["target"].value_counts()
        # value_counts() skips missing targets, so it can be empty.
        cluster_map[cluster] = counts.index[0] if len(counts) else None

    cluster_labels["predicted_target"] = cluster_labels["cluster"].map(cluster_map)
    cluster_labels["correct"] = (
        cluster_labels["target"] == cluster_labels["predicted_target"]
    ).astype("int64")
    n_correct = int(cluster_labels["correct"].sum())

    print("-------------------------------------")
    print("Number of samples", n_samples)
    print("Number of clusters:", cluster_labels["cluster"].nunique())
    print(cluster_labels["correct"].value_counts())
    print("Percentage correct:", n_correct / n_samples)
    print("------------------------------------------")

## Data and image features loading

In [None]:
# Import the cleaned CSV file.
PROCESSED_DATA_CSV_PATHFILE="/kaggle/input/syrus-data/SyrusMasterDataAnonymisedProc.csv"
df = pd.read_csv(PROCESSED_DATA_CSV_PATHFILE)
# Keep only the items that have a cleaned English name and an image. The
# explicit copy makes df an independent frame, so adding the FileName column
# below does not write to a slice of the unfiltered data.
df = df[df['EnglishItemNameClean'].notna() & (df['hasImage'] == 1)].copy()
# Image file name of each item: its item number plus the ".jpeg" extension.
df['FileName'] = df['ItemNumber'] + ".jpeg"

In [None]:
# Load the image feature dictionary.
# NOTE(review): keys are presumably image file names of the form
# "<ItemNumber>.jpeg" and values the per-image feature vectors - confirm
# against the notebook that produced the pickle.
IMAGE_FEATURES_PATH = "/kaggle/input/syrus-data/features_resnet50_finetuned_segmented_final_full_data.pkl"
image_features = load_from_pickle_file(IMAGE_FEATURES_PATH)
image_feature_df = pd.DataFrame(list(image_features.items()), columns=['ItemNumber', 'ImageFeature'])
# Remove the ".jpeg" extension to recover the item number. str.strip('.jpeg')
# must not be used here: it removes any of the characters '.', 'j', 'p', 'e',
# 'g' from BOTH ends of the string, which corrupts item numbers that start or
# end with one of those letters and makes them drop out of the merge below.
image_feature_df["ItemNumber"] = image_feature_df["ItemNumber"].apply(
    lambda x: x[:-len(".jpeg")] if x.endswith(".jpeg") else x
)
# Merge the image features into the CSV data; the inner join keeps only the
# items present in both.
df_merged = pd.merge(df, image_feature_df, on='ItemNumber', how='inner')
# Stack the per-item feature vectors into one matrix (one row per item).
ImFeatureMatrix= np.vstack(df_merged['ImageFeature'])

## Feature extraction from English item names using Sentence Transformer or BOW and clustering using DBSCAN or k-means

In [None]:
# Load the Sentence Transformer model ('sentence-t5-large') used to embed the item names.
# NOTE(review): presumably downloads the model weights on first use - confirm
# that internet access is enabled for the session.
EMBEDDER = SentenceTransformer('sentence-t5-large')

In [None]:
# Create the text embeddings: the cleaned English item names are normalized
# with pre_process_corpus and then encoded by the Sentence Transformer.
# Rows follow the order of df_merged, so they line up with ImFeatureMatrix.
embeddings = EMBEDDER.encode(pre_process_corpus(df_merged['EnglishItemNameClean']))

In [None]:
# Append the image features to the text embeddings by stacking them
# horizontally (column-wise). Each row is one spare part (one data point).
full_features_unnormalized = np.hstack((embeddings,ImFeatureMatrix))
# Normalize every feature column (axis=0) to unit L1 norm so that all features
# have the same importance.
full_features = normalize(full_features_unnormalized, axis=0, norm='l1')

In [None]:
# Perform density-based clustering (DBSCAN with cosine distance) on the
# combined image features and Sentence Transformer embeddings.
dbscanModel = DBSCAN(eps = 0.1, min_samples = 5, n_jobs = -1, metric= "cosine")
labels = dbscanModel.fit_predict(full_features)
# remove_noise=True excludes the samples labelled -1 from the evaluation.
print_evaluation(df_merged.target, labels, df_merged.ItemNumber,  remove_noise = True )

In [None]:
# Perform k-means clustering on the combined image features and Sentence
# Transformer embeddings.
true_k = 500
# k-means++ initialisation is randomised; fixing random_state makes the
# cluster assignment, and therefore the reported accuracy, reproducible.
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, random_state=42)
labels = model.fit_predict(full_features)
print_evaluation(df_merged.target, labels, df_merged.ItemNumber)

In [None]:
# Create the Bag-of-Words representation: token counts of the normalized
# English item names (one row per item, in df_merged order).
cv = CountVectorizer()
cv_features = cv.fit_transform(pre_process_corpus(df_merged['EnglishItemNameClean']))

In [None]:
# Append the image features to the Bag-of-Words counts by stacking them
# horizontally (column-wise). Each row is one spare part (one data point).
bow_and_image_blocks = (cv_features.toarray(), ImFeatureMatrix)
full_features_unnormalized = np.hstack(bow_and_image_blocks)
# Normalize every feature column (axis=0) to unit L1 norm so that all features
# have the same importance.
full_features = normalize(full_features_unnormalized, norm='l1', axis=0)

In [None]:
# Perform density-based clustering (DBSCAN with cosine distance) on the
# combined image features and Bag-of-Words counts.
dbscanModel = DBSCAN(eps = 0.1, min_samples = 5, n_jobs = -1, metric= "cosine")
labels = dbscanModel.fit_predict(full_features)
# remove_noise=True excludes the samples labelled -1 from the evaluation.
print_evaluation(df_merged.target, labels, df_merged.ItemNumber,  remove_noise = True )

In [None]:
# Perform k-means clustering on the combined image features and Bag-of-Words
# counts. All samples are evaluated: remove_noise is left at its default.
true_k = 500
# k-means++ initialisation is randomised; fixing random_state makes the
# cluster assignment, and therefore the reported accuracy, reproducible.
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, random_state=42)
labels = model.fit_predict(full_features)
print_evaluation(df_merged.target, labels, df_merged.ItemNumber)