In [None]:
!pip install -U sentence-transformers

In [2]:
!pip install tweet-preprocessor

Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import re
import preprocessor as p
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:
embedder = SentenceTransformer('multi-qa-MiniLM-L6-dot-v1')

In [68]:
import pandas as pd

#creating a dataframe for our dataset "subject_questions"
df = pd.read_csv("subjects-questions.csv")
df=df.head(4000)
#printing first 5 rows of our dataset/dataframe
df.head()

Unnamed: 0,eng,Subject
0,An anti-forest measure is\nA. Afforestation\nB...,Biology
1,"Among the following organic acids, the acid pr...",Chemistry
2,If the area of two similar triangles are equal...,Maths
3,"In recent year, there has been a growing\nconc...",Biology
4,Which of the following statement\nregarding tr...,Physics


In [69]:
df.shape

(4000, 2)

In [70]:
unique_values = df['Subject'].unique()
print(unique_values)

['Biology' 'Chemistry' 'Maths' 'Physics']


In [71]:
#encoding labels

df['Subject']=df['Subject'].map({'Biology':0,
                             'Chemistry':1,
                             'Physics':2,
                                 'Maths':3})

In [72]:
df.head()

Unnamed: 0,eng,Subject
0,An anti-forest measure is\nA. Afforestation\nB...,0
1,"Among the following organic acids, the acid pr...",1
2,If the area of two similar triangles are equal...,3
3,"In recent year, there has been a growing\nconc...",0
4,Which of the following statement\nregarding tr...,2


In [73]:
def clean_tweets(df):
    # punctuations we want to be replaced
    REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\|)|(\()|(\))|(\[)|(\])|(\%)|(\$)|(\>)|(\<)|(\{)|(\})")
    REPLACE_WITH_SPACE = re.compile("(<br\s/><br\s/?)|(-)|(/)|(:).")
    corpus = []
    for line in df:
        # clean data by sending it to tweet_processor
        sentence = p.clean(line)
        # remove puctuation
        sentence = REPLACE_NO_SPACE.sub("", sentence.lower()) # convert all tweets to lower cases
        sentence = REPLACE_WITH_SPACE.sub(" ", sentence)
        corpus.append(sentence)
    return corpus

In [74]:
df['eng_clean'] = clean_tweets(df['eng'])
df.shape

(4000, 3)

In [75]:
df['eng_clean']

0       an anti forest measure isa afforestationb sele...
1       among the following organic acids the acid pre...
2       if the area of two similar triangles are equal...
3       in recent year there has been a growingconcern...
4       which of the following statementregarding tran...
                              ...                        
3995    the distance between the ends of thewings of a...
3996    the radius of \ h e^+ \ ion is \ x \dota \ in ...
3997    one oscillation completed by a vibrating body ...
3998    three resistances of \ \omega \omega \ and \ \...
3999    a block of mass \ m \ moving at a speed \ v \c...
Name: eng_clean, Length: 4000, dtype: object

In [76]:
corpus = list(df['eng_clean'])

In [None]:
corpus

In [78]:
import numpy as np
corpus_embeddings1 = embedder.encode(corpus)

#corpus_embeddings=corpus_embeddings /  np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)


In [108]:
num_clusters = 4
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings1)
cluster_assignment = clustering_model.labels_
cluster_assignment



array([1, 1, 0, ..., 2, 2, 2], dtype=int32)

In [109]:
df['cluster']=cluster_assignment
df.head(20)

Unnamed: 0,eng,Subject,eng_clean,cluster
0,An anti-forest measure is\nA. Afforestation\nB...,0,an anti forest measure isa afforestationb sele...,1
1,"Among the following organic acids, the acid pr...",1,among the following organic acids the acid pre...,1
2,If the area of two similar triangles are equal...,3,if the area of two similar triangles are equal...,0
3,"In recent year, there has been a growing\nconc...",0,in recent year there has been a growingconcern...,1
4,Which of the following statement\nregarding tr...,2,which of the following statementregarding tran...,2
5,Fern plants reproduce by\nA. Seeds\nB. Spores\...,0,fern plants reproduce bya seedsb sporesc layin...,1
6,Electric current flows through:\nA. a conducto...,2,electric current flows througha a conductorb a...,2
7,The sides of a right angled triangle are in A....,3,the sides of a right angled triangle are in ap...,0
8,If the mass of a body is \( M \) on the\nsurfa...,2,if the mass of a body is \ m \ on thesurface o...,2
9,A particle of mass \( m \) is made to move\nwi...,2,a particle of mass \ m \ is made to movewith u...,2


In [102]:
#label_mapping={0:0,2:1,3:2,1:3}

In [106]:
#df['cluster']=df['cluster'].map(label_mapping)


In [110]:
import numpy as np

def tri_et_index(tableau):
    # Obtenir les indices des valeurs triées en ordre décroissant
    indices_tries = np.flip(np.argsort(tableau))

    return indices_tries

In [111]:
import numpy as np

def map_labels(predicted_labels, true_labels):
    unique_predicted_labels = np.unique(predicted_labels)
    label_mapping = {index: 10 for index in unique_predicted_labels}

    for predicted_label in unique_predicted_labels:
        i=0
        mask = predicted_labels == predicted_label
        cluster_true_labels = true_labels[mask]
        true_label_counts = np.bincount(cluster_true_labels)
        indexs=tri_et_index(true_label_counts)
        most_frequent_true_label=indexs[i]
        l=[label_mapping[label] for label in unique_predicted_labels]

        while most_frequent_true_label in l[:predicted_label]:
            most_frequent_true_label = indexs[i+1]
            i+=1
        label_mapping[predicted_label] = most_frequent_true_label

    return label_mapping

label_mapping = map_labels(cluster_assignment,df["Subject"])


In [112]:
mapped_predicted_labels = np.array([label_mapping[label] for label in cluster_assignment])
unique_values = np.unique(mapped_predicted_labels)
label_mapping

{0: 3, 1: 1, 2: 2, 3: 0}

In [113]:
from sklearn.metrics import accuracy_score
clustering_accuracy = accuracy_score(df["Subject"], mapped_predicted_labels)

clustering_accuracy

0.71575