# Extracting Feature Vectors from Text
**This notebook runs locally.**

It extracts feature vectors from the text file name. It uses both Bag of Words approach and Sentence Transformer.

Needed for this notebook:
* Pre-processed CSV file using [data_exploration_and_cleaning.ipynb](data_exploration_and_cleaning.ipynb) with name in English and German (both clean)


In [8]:
# import packages
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans, DBSCAN

In [4]:
# import csv file
PROCESSED_DATA_CSV_PATHFILE="../../data/processed/SyrusMasterDataAnonymisedProc.csv"
df = pd.read_csv(PROCESSED_DATA_CSV_PATHFILE)
df = df[df['GermanItemNameClean'].notna()]

## Helper functions

In [16]:
import unicodedata
import re

def remove_accented_chars(text):
    """
    Remove accented characters from a string.

    Args:
        text (str): Input text containing accented characters.

    Returns:
        str: Text with accented characters replaced with their ASCII equivalents.
    """
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def pre_process_corpus(docs):
    """
    Pre-process a list of documents for text analysis.

    Args:
        docs (list of str): List of documents to be pre-processed.

    Returns:
        list of str: List of pre-processed documents.
    """
    norm_docs = []
    for doc in docs:
        doc = doc.translate(doc.maketrans("\n\t\r", "   "))
        doc = doc.lower()
        doc = remove_accented_chars(doc)
        # lower case and remove special characters\whitespaces
        doc = re.sub(r'[^a-zA-Z0-9\s]', ' ', doc, flags=re.I|re.A)
        doc = re.sub(' +', ' ', doc)
        doc = doc.strip()
        norm_docs.append(doc)
    return norm_docs

def print_evaluation(target, labels, itemnumber, remove_noise = False ):
    """
    Print evaluation metrics for clustering results.

    Args:
        target (list): List of true target labels.
        labels (list): List of predicted cluster labels.
        itemnumber (list): List of item numbers corresponding to the samples.
        remove_noise (bool, optional): Whether to remove noise clusters. Defaults to False.
    """
    # create df with inputs
    d = {"target" : target, "cluster":labels,"ItemNumber":itemnumber  }
    cluster_labels = pd.DataFrame(d)
    if remove_noise:
        cluster_labels = cluster_labels[~cluster_labels['cluster'].isin([-1])]
    cluster_nums=cluster_labels.cluster.unique()
    # create a cluster map assignning each cluster to most frequent target_subfamily class in it
    cluster_map = {}
    for cluster in cluster_nums:
        cluster_map[cluster] = cluster_labels[cluster_labels.cluster.isin([cluster])].target.value_counts().index[0]
    # print results
    print("-------------------------------------")
    cluster_labels["predicted_target"] = cluster_labels.cluster.map(cluster_map) 
    cluster_labels["correct"] = cluster_labels.apply(lambda x: 1 if x["target"]== x["predicted_target"] else 0, axis =1)
    print("Number of samples",cluster_labels.correct.count())
    print("Number of clusters:",cluster_labels.cluster.nunique())
    print(cluster_labels.correct.value_counts())
    print("Percentage correct:",cluster_labels[cluster_labels["correct"] ==1].correct.count()/cluster_labels.correct.count())
    print("------------------------------------------")

In [10]:
# generate clean text
pre_proc_text = pre_process_corpus(df.EnglishItemName.values)
# create Bag of Words (BOW)
cv = CountVectorizer()
cv_features = cv.fit_transform(pre_proc_text)

## K-means clustering using BOW

In [12]:
# do a Kmeans clustering only on BOW features and print resutls using helper function
true_k = 500
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=300)
labels = model.fit_predict(cv_features)
print_evaluation(df.target, labels, df.ItemNumber )

  super()._check_params_vs_input(X, default_n_init=10)


-------------------------------------
Number of samples 13369
Number of clusters: 500
correct
1    9186
0    4183
Name: count, dtype: int64
Percentage correct: 0.6871119754656294
------------------------------------------


## Clustering using Sentence Transformer instead of BOW

In [13]:
# import and load sentence transformer
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('sentence-t5-large')

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# use loaded embedder to extract feature vectors from cleaned text
embeddings = embedder.encode(pre_proc_text)

In [17]:
true_k = 500
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=300)
labels = model.fit_predict(embeddings)
print_evaluation(df.target, labels, df.ItemNumber )

  super()._check_params_vs_input(X, default_n_init=10)


-------------------------------------
Number of samples 13369
Number of clusters: 500
correct
1    9133
0    4236
Name: count, dtype: int64
Percentage correct: 0.6831475802229038
------------------------------------------


## Clustering using DBSCAN algorithm and removing noise labelled data points for evaluation

In [18]:
# First with BOW
dbscanModel = DBSCAN(eps = 0.1, min_samples = 5, n_jobs = -1, metric= "cosine")
labels = dbscanModel.fit_predict(cv_features)
print_evaluation(df.target, labels, df.ItemNumber,  remove_noise = True )

-------------------------------------
Number of samples 6325
Number of clusters: 196
correct
1    4386
0    1939
Name: count, dtype: int64
Percentage correct: 0.6934387351778656
------------------------------------------


In [20]:
# second with transformers
dbscanModel = DBSCAN(eps = 0.1, min_samples = 5, n_jobs = -1, metric= "cosine")
labels = dbscanModel.fit_predict(embeddings)
print_evaluation(df.target, labels, df.ItemNumber,  remove_noise = True )

-------------------------------------
Number of samples 12089
Number of clusters: 68
correct
0    9755
1    2334
Name: count, dtype: int64
Percentage correct: 0.19306807841839688
------------------------------------------
