# **Install preprocessor**

In [2]:
!pip install tweet-preprocessor

Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


# **Load libraries**

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from gensim.models import KeyedVectors


In [4]:
import pandas as pd
import re
import preprocessor as p
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# **Read the dataset**

In [5]:
import pandas as pd

# Number of questions kept for the experiment.
N_ROWS = 7000

# Load the "subjects-questions" dataset. `nrows` stops parsing after the first
# N_ROWS records instead of reading the whole file and truncating it afterwards.
df = pd.read_csv("subjects-questions.csv", nrows=N_ROWS)

# Preview the first 5 rows of the dataframe.
df.head()

Unnamed: 0,eng,Subject
0,An anti-forest measure is\nA. Afforestation\nB...,Biology
1,"Among the following organic acids, the acid pr...",Chemistry
2,If the area of two similar triangles are equal...,Maths
3,"In recent year, there has been a growing\nconc...",Biology
4,Which of the following statement\nregarding tr...,Physics


In [6]:
# Sanity check: (rows, columns) of the truncated dataframe.
df.shape

(7000, 2)

# **Encode subject labels as numerical values (class numbers)**

In [7]:
# List the distinct subject names that occur in the dataset.
unique_values = pd.unique(df['Subject'])
print(unique_values)

['Biology' 'Chemistry' 'Maths' 'Physics']


In [8]:
# Encode the subject names as integer class ids.
SUBJECT_TO_ID = {'Biology': 0, 'Chemistry': 1, 'Physics': 2, 'Maths': 3}

# Re-running this cell on an already-encoded column would map every value to
# NaN, so only encode while the column still holds the raw subject names.
if not df['Subject'].isin(list(SUBJECT_TO_ID.values())).all():
    encoded_subject = df['Subject'].map(SUBJECT_TO_ID)
    unknown_subjects = df.loc[encoded_subject.isna(), 'Subject'].unique()
    if len(unknown_subjects) > 0:
        # Series.map leaves unmapped values as NaN; fail loudly instead of
        # letting NaN labels reach the evaluation cells.
        raise ValueError(f"Unmapped subject labels: {list(unknown_subjects)}")
    df['Subject'] = encoded_subject.astype(int)

In [9]:
# Check that the Subject column now holds the integer codes.
df.head()

Unnamed: 0,eng,Subject
0,An anti-forest measure is\nA. Afforestation\nB...,0
1,"Among the following organic acids, the acid pr...",1
2,If the area of two similar triangles are equal...,3
3,"In recent year, there has been a growing\nconc...",0
4,Which of the following statement\nregarding tr...,2


# **Clean the dataFrame**

In [10]:
# Raw strings keep the regex escapes (`\[`, `\s`, ...) away from Python's own
# string-escape handling, which otherwise warns about invalid escape sequences.

# Punctuation characters that are deleted outright.
_REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"|()\[\]%$><{}]""")
# Separators that are replaced by a single space.
_REPLACE_WITH_SPACE = re.compile(r"(<br\s/><br\s/?)|(-)|(/)|(:).")


def clean_tweets(df):
    """Clean an iterable of raw text strings.

    Each string is passed through ``p.clean`` (tweet-preprocessor), lower-cased,
    stripped of the punctuation in ``_REPLACE_NO_SPACE``, and has the separators
    in ``_REPLACE_WITH_SPACE`` replaced by a space.

    Parameters
    ----------
    df : iterable of str
        The texts to clean, e.g. a dataframe column.

    Returns
    -------
    list of str
        The cleaned texts, in the same order as the input.
    """
    # NOTE(review): the saved output of this notebook shows words fused across
    # line breaks (e.g. "isa afforestationb") - confirm whether line breaks
    # should be turned into spaces before the punctuation is stripped.
    corpus = []
    for line in df:
        # Clean the text with tweet-preprocessor first.
        sentence = p.clean(line)
        # Lower-case, then delete punctuation.
        sentence = _REPLACE_NO_SPACE.sub("", sentence.lower())
        # ':' , '<' and '>' were already deleted above, so in practice only
        # '-' and '/' are still present to be replaced here.
        sentence = _REPLACE_WITH_SPACE.sub(" ", sentence)
        corpus.append(sentence)
    return corpus

In [11]:
# Store the cleaned question text in a new column next to the raw text.
df['eng_clean'] = clean_tweets(df['eng'])
df.shape

(7000, 3)

In [12]:
# Inspect the cleaned text column.
df['eng_clean']

0       an anti forest measure isa afforestationb sele...
1       among the following organic acids the acid pre...
2       if the area of two similar triangles are equal...
3       in recent year there has been a growingconcern...
4       which of the following statementregarding tran...
                              ...                        
6995    as per brauns principle yield of ammonia will ...
6996    when spheres are raised to the sametemperature...
6997    what is the smallest positive numbergreater th...
6998    the strongest attractive forces area electrost...
6999    the wavelength of a matter wave isgiven bya he...
Name: eng_clean, Length: 7000, dtype: object

# **Corpus**

In [13]:
# Plain Python list of the cleaned sentences, in dataframe order.
corpus = df['eng_clean'].tolist()

In [None]:
# Preview a few cleaned sentences instead of dumping the whole corpus.
corpus[:5]


# **Word Embeddings using GloVe pre-trained model with spaCy**



In [None]:
# Download the spaCy pipeline `en_core_web_md`, which is loaded below with spacy.load.
!python -m spacy download en_core_web_md

In [16]:
import spacy
import numpy as np
from sklearn.cluster import KMeans

# Load the spaCy pipeline that provides the pretrained word vectors.
nlp = spacy.load('en_core_web_md')

# One embedding per sentence: `doc.vector` is spaCy's document-level vector.
# `nlp.pipe` streams the corpus through the pipeline in batches, which is
# faster than calling `nlp()` once per sentence and yields the same docs.
# Stacking equal-length vectors already gives a 2-D array of shape
# (n_sentences, vector_size), so no reshape is needed.
embeddings = np.array([doc.vector for doc in nlp.pipe(corpus)])




## **GaussianMixture Clustering**

In [18]:
from sklearn.mixture import GaussianMixture

# One mixture component per subject (Biology, Chemistry, Physics, Maths).
num_clusters = 4
# GaussianMixture is randomly initialised; fixing the seed makes the cluster
# assignment, and every metric derived from it, reproducible across runs.
RANDOM_STATE = 42
clustering_model = GaussianMixture(n_components=num_clusters, random_state=RANDOM_STATE)
cluster_assignment = clustering_model.fit_predict(embeddings)


# **Add the predicted cluster values to the DataFrame**

In [19]:
# Attach the predicted cluster id of every question and preview the result.
df['cluster']=cluster_assignment
df.head(20)

Unnamed: 0,eng,Subject,eng_clean,cluster
0,An anti-forest measure is\nA. Afforestation\nB...,0,an anti forest measure isa afforestationb sele...,1
1,"Among the following organic acids, the acid pr...",1,among the following organic acids the acid pre...,1
2,If the area of two similar triangles are equal...,3,if the area of two similar triangles are equal...,1
3,"In recent year, there has been a growing\nconc...",0,in recent year there has been a growingconcern...,1
4,Which of the following statement\nregarding tr...,2,which of the following statementregarding tran...,2
5,Fern plants reproduce by\nA. Seeds\nB. Spores\...,0,fern plants reproduce bya seedsb sporesc layin...,1
6,Electric current flows through:\nA. a conducto...,2,electric current flows througha a conductorb a...,1
7,The sides of a right angled triangle are in A....,3,the sides of a right angled triangle are in ap...,2
8,If the mass of a body is \( M \) on the\nsurfa...,2,if the mass of a body is \ m \ on thesurface o...,2
9,A particle of mass \( m \) is made to move\nwi...,2,a particle of mass \ m \ is made to movewith u...,2


# **Mapping predicted cluster numbers to subject labels**

In [21]:
import numpy as np

def tri_et_index(tableau):
    """Return the indices of ``tableau`` ordered from largest to smallest value.

    The indices that sort the values in ascending order are computed with
    ``np.argsort`` and then reversed with ``np.flip``.
    """
    ascending_order = np.argsort(tableau)
    return np.flip(ascending_order)

In [22]:
import numpy as np

def map_labels(predicted_labels, true_labels):
    """Greedily map each predicted cluster id to a true class label.

    Clusters are visited in ascending id order. Each cluster takes the true
    label that occurs most often among its members and has not already been
    claimed by an earlier cluster.

    Parameters
    ----------
    predicted_labels : array-like of int
        Cluster id of every sample.
    true_labels : array-like of non-negative int
        Ground-truth class of every sample, aligned with ``predicted_labels``.

    Returns
    -------
    dict
        ``{cluster_id: true_label}`` with one entry per distinct cluster id.
        If every true label has already been claimed (more clusters than
        classes), the cluster reuses its own most frequent label, so the
        mapping is many-to-one in that case.
    """
    # Plain arrays make the boolean mask below work for lists and Series alike.
    predicted_labels = np.asarray(predicted_labels)
    true_labels = np.asarray(true_labels)
    n_classes = int(true_labels.max()) + 1 if true_labels.size else 0

    label_mapping = {}
    claimed = set()
    for predicted_label in np.unique(predicted_labels):
        members = true_labels[predicted_labels == predicted_label]
        # minlength gives every cluster a count for every class, so there is
        # always a complete ranking of candidate labels to fall back on.
        counts = np.bincount(members, minlength=n_classes)
        ranked = np.flip(np.argsort(counts))  # most frequent label first
        # Track claimed labels in a set instead of slicing a list by the
        # cluster id, which only works for ids 0..k-1.
        chosen = next(
            (int(label) for label in ranked if int(label) not in claimed),
            int(ranked[0]),
        )
        claimed.add(chosen)
        label_mapping[predicted_label.item()] = chosen

    return label_mapping

# Build the cluster-id -> subject-id mapping for the predicted clusters.
label_mapping = map_labels(
    predicted_labels=cluster_assignment,
    true_labels=df["Subject"],
)


In [23]:
# Translate every predicted cluster id into its mapped subject id.
mapped_predicted_labels = np.asarray(
    [label_mapping[cluster_id] for cluster_id in cluster_assignment]
)
unique_values = np.unique(mapped_predicted_labels)
# Display the mapping that was applied.
label_mapping

{0: 1, 1: 2, 2: 3, 3: 0}

# **Accuracy**

In [24]:
from sklearn.metrics import accuracy_score

# Fraction of questions whose mapped cluster label equals the true subject.
clustering_accuracy = accuracy_score(
    y_true=df["Subject"],
    y_pred=mapped_predicted_labels,
)

clustering_accuracy

0.26371428571428573

# **Silhouette score**
The average silhouette score is also used as an evaluation measure in clustering. The best silhouette score is 1 and the worst is -1. Values close to zero indicate that data points lie on the boundary between clusters, i.e., the clusters overlap.

In [25]:
from sklearn.metrics import silhouette_score

# `embeddings` holds the sentence vectors and `cluster_assignment` the raw
# cluster ids predicted by the mixture model. The silhouette score depends only
# on how the points are grouped, not on what the groups are called, so it is
# computed on the raw assignment and stays independent of the label mapping.
silhouette_avg = silhouette_score(embeddings, cluster_assignment)
print("Silhouette Score:", silhouette_avg)

Silhouette Score: 0.045924377



# **F_measure (F1-score)**
The F-measure (F1-score) is a metric commonly used to evaluate clustering performance. It quantifies the trade-off between precision and recall, providing a single value that represents the overall clustering quality. The higher the F-measure score, the better the clustering performance.



In [26]:
from sklearn.metrics import f1_score

# F1 per subject, averaged with each subject weighted by its number of true samples.
f_measure = f1_score(
    y_true=df["Subject"],
    y_pred=mapped_predicted_labels,
    average='weighted',
)

print("F-measure:", f_measure)

F-measure: 0.2660663308302307
