# **Sentence Transformers installation**
The "sentence-transformers" library is a Python package that provides pre-trained models for converting sentences or texts into dense vector representations. These vector representations, also known as embeddings, capture the semantic meaning of the input text.



In [None]:
!pip install -U sentence-transformers

# **Tweet-preprocessor installation**
 The "tweet-preprocessor" package is a useful tool when working with tweet data, allowing you to preprocess and clean tweet text for a variety of applications, including text analysis, machine learning, data mining, and social media analytics.

In [2]:
!pip install tweet-preprocessor

Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


# **Load librairies**





In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import re
import preprocessor as p
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# **SentenceTransformer instance Initialization**
 Initialization of an instance of the SentenceTransformer class from the "sentence-transformers" package, specifically using the 'all-MiniLM-L6-v2' model as the underlying embedding model.

In [24]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

# **Read the Dataset**

In [5]:
import pandas as pd

#creating a dataframe for our dataset "subject_questions"
df = pd.read_csv("subjects-questions.csv")
df=df.head(7000)
#printing first 5 rows of our dataset/dataframe
df.head()

Unnamed: 0,eng,Subject
0,An anti-forest measure is\nA. Afforestation\nB...,Biology
1,"Among the following organic acids, the acid pr...",Chemistry
2,If the area of two similar triangles are equal...,Maths
3,"In recent year, there has been a growing\nconc...",Biology
4,Which of the following statement\nregarding tr...,Physics


In [6]:
df.shape

(7000, 2)

# **Transform subject labels by numerical values (cluster number)**

In [7]:
unique_values = df['Subject'].unique()
print(unique_values)

['Biology' 'Chemistry' 'Maths' 'Physics']


In [8]:
#encoding labels

df['Subject']=df['Subject'].map({'Biology':0,
                             'Chemistry':1,
                             'Physics':2,
                                 'Maths':3})

In [9]:
df.head()

Unnamed: 0,eng,Subject
0,An anti-forest measure is\nA. Afforestation\nB...,0
1,"Among the following organic acids, the acid pr...",1
2,If the area of two similar triangles are equal...,3
3,"In recent year, there has been a growing\nconc...",0
4,Which of the following statement\nregarding tr...,2


# **Cleaning the Data Frame**




In [10]:
def clean_tweets(df):
    # punctuations we want to be replaced
    REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\|)|(\()|(\))|(\[)|(\])|(\%)|(\$)|(\>)|(\<)|(\{)|(\})")
    REPLACE_WITH_SPACE = re.compile("(<br\s/><br\s/?)|(-)|(/)|(:).")
    corpus = []
    for line in df:
        # clean data by sending it to tweet_processor
        sentence = p.clean(line)
        # remove puctuation
        sentence = REPLACE_NO_SPACE.sub("", sentence.lower()) # convert all tweets to lower cases
        sentence = REPLACE_WITH_SPACE.sub(" ", sentence)
        corpus.append(sentence)
    return corpus

In [11]:
df['eng_clean'] = clean_tweets(df['eng'])
df.shape

(7000, 3)

In [12]:
df['eng_clean']

0       an anti forest measure isa afforestationb sele...
1       among the following organic acids the acid pre...
2       if the area of two similar triangles are equal...
3       in recent year there has been a growingconcern...
4       which of the following statementregarding tran...
                              ...                        
6995    as per brauns principle yield of ammonia will ...
6996    when spheres are raised to the sametemperature...
6997    what is the smallest positive numbergreater th...
6998    the strongest attractive forces area electrost...
6999    the wavelength of a matter wave isgiven bya he...
Name: eng_clean, Length: 7000, dtype: object

# **Corpus**

In [13]:
corpus = list(df['eng_clean'])

# **Word Embeddings using Sentence Transformer**

In [25]:
import numpy as np
corpus_embeddings= embedder.encode(corpus)

corpus_embeddings=corpus_embeddings /  np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)


# **K-Means Clustering**

In [26]:
num_clusters = 4
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_
cluster_assignment



array([1, 1, 0, ..., 0, 3, 3], dtype=int32)

# **Add clusters predicted values in the dataFrame**

In [16]:
df['cluster']=cluster_assignment
df.head(20)

Unnamed: 0,eng,Subject,eng_clean,cluster
0,An anti-forest measure is\nA. Afforestation\nB...,0,an anti forest measure isa afforestationb sele...,0
1,"Among the following organic acids, the acid pr...",1,among the following organic acids the acid pre...,0
2,If the area of two similar triangles are equal...,3,if the area of two similar triangles are equal...,3
3,"In recent year, there has been a growing\nconc...",0,in recent year there has been a growingconcern...,0
4,Which of the following statement\nregarding tr...,2,which of the following statementregarding tran...,2
5,Fern plants reproduce by\nA. Seeds\nB. Spores\...,0,fern plants reproduce bya seedsb sporesc layin...,0
6,Electric current flows through:\nA. a conducto...,2,electric current flows througha a conductorb a...,2
7,The sides of a right angled triangle are in A....,3,the sides of a right angled triangle are in ap...,3
8,If the mass of a body is \( M \) on the\nsurfa...,2,if the mass of a body is \ m \ on thesurface o...,2
9,A particle of mass \( m \) is made to move\nwi...,2,a particle of mass \ m \ is made to movewith u...,2


# **Mapping predicted clusters numbers**



In [17]:
import numpy as np

def tri_et_index(tableau):
    # Obtenir les indices des valeurs triées en ordre décroissant
    indices_tries = np.flip(np.argsort(tableau))

    return indices_tries

In [27]:
import numpy as np

def map_labels(predicted_labels, true_labels):
    unique_predicted_labels = np.unique(predicted_labels)
    label_mapping = {index: 10 for index in unique_predicted_labels}

    for predicted_label in unique_predicted_labels:
        i=0
        mask = predicted_labels == predicted_label
        cluster_true_labels = true_labels[mask]
        true_label_counts = np.bincount(cluster_true_labels)
        indexs=tri_et_index(true_label_counts)
        most_frequent_true_label=indexs[i]
        l=[label_mapping[label] for label in unique_predicted_labels]

        while most_frequent_true_label in l[:predicted_label]:
            most_frequent_true_label = indexs[i+1]
            i+=1
        label_mapping[predicted_label] = most_frequent_true_label

    return label_mapping

label_mapping = map_labels(cluster_assignment,df["Subject"])


In [28]:
mapped_predicted_labels = np.array([label_mapping[label] for label in cluster_assignment])
unique_values = np.unique(mapped_predicted_labels)
label_mapping

{0: 3, 1: 0, 2: 1, 3: 2}

# **Accuracy**

In [29]:
from sklearn.metrics import accuracy_score
clustering_accuracy = accuracy_score(df["Subject"], mapped_predicted_labels)

clustering_accuracy

0.7457142857142857

#**Silhouette_score**
The average Silhouette score is also used as an evaluation measure in clustering. The best silhouette score is 1 and the worst is -1. Values close to zero indicate that data points are on the boundary i.e overlapping the clusters.

In [35]:
from sklearn.metrics import silhouette_score

# Assuming 'corpus_embeddings' is the sentence representations and 'mapped_predicted_labels' is the predicted clusters
silhouette_avg = silhouette_score(corpus_embeddings, mapped_predicted_labels)
print("Silhouette Score:", silhouette_avg)

Silhouette Score: 0.026897624



# **F_measure (F1-score)**
 F-measure score, which is a metric commonly used to evaluate the clustering performance. It quantifies the trade-off between precision and recall, providing a single value that represents the overall clustering quality. The higher the F-measure score, the better the clustering performance.



In [34]:
from sklearn.metrics import f1_score

# Calculate F-measure
f_measure = f1_score(df["Subject"],  mapped_predicted_labels, average='weighted')

print("F-measure:", f_measure)

F-measure: 0.7576026641564053
