In [None]:
import pandas as pd

In [None]:
from google.colab import files

In [None]:
df = pd.read_csv(r'/content/topics_sups.csv', encoding='latin-1') #reading the given data set

In [None]:
df.head()
# head() used to display first five tupels in a data set

Unnamed: 0,ID,Name,cleaned_text,lemmatized_text,Cluster
0,1,Air and Space Vehicles (5) NSc,r n pce Vehcle (5) Nc,r n pce Vehcle ( 5 ) Nc,0
1,2,Special Topics in Aeronautics and Astronautics...,"pecl pc n ernuc n rnuc (1-5, x. 10) Nc","pecl pc n ernuc n rnuc ( 1-5 , x . 10 ) Nc",2
2,3,Engineering Statics (4) NSc,Engneerng c (4) Nc,Engneerng c ( 4 ) Nc,0
3,4,Thermodynamics (4) NSc,hernc (4) Nc,hernc ( 4 ) Nc,0
4,5,"Undergraduate Research (1-5, max. 10)","Unergrue Reerch (1-5, x. 10)","Unergrue Reerch ( 1-5 , x . 10 )",1


# CHECK FOR NULL VALUES IN THE DATA SET

In [None]:
df.isnull().sum()
#isnull is used for checking whether a value is null or not
# sum() method is used for counting number of tuples

ID        0
Name    209
dtype: int64

We have found that we have 209 null values in the column 'Name'

Fill the null values with mean values

In [None]:
len(df) #finding length of data set

326728

In [None]:
df.dropna(inplace=True) # removing the null values

In [None]:
len(df) # length of data set after removing the null values

326519

Finding the duplicate vales in data set

In [None]:
d_n = df[df.duplicated(keep='first')]
#duplicated is used for finding duplicate values
#first will ensure that first value is preserved

In [None]:
d_n

Unnamed: 0,ID,Name


We found no duplicate values

Removing the stopwords and cleaning the columns in the data set


In [None]:
from nltk.corpus import stopwords
import nltk

In [None]:
nltk.download('stopwords')#downloading the stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
stop_words=set(stopwords.words('english'))#sorting the stopwords into the set to access it directly

In [None]:
df['cleaned_text'] = df['Name'].apply(lambda tokens: ''.join([word for word in tokens if word.lower() not in stop_words]))#removing the stopwords and adding it in another column named 'cleaned_text'

In [None]:
df['cleaned_text'].head()

0                     r n pce Vehcle (5) Nc
1    pecl pc n ernuc n rnuc (1-5, x. 10) Nc
2                        Engneerng c (4) Nc
3                              hernc (4) Nc
4              Unergrue Reerch (1-5, x. 10)
Name: cleaned_text, dtype: object

In [None]:
from nltk.stem import WordNetLemmatizer
# using lemmitization to get root words

In [None]:
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
def lemmatize_text(text):
    words = nltk.word_tokenize(text)  # Tokenize the text into words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatize each word
    return ' '.join(lemmatized_words)  # Join the lemmatized words back into a sentence

In [None]:
# Assuming 'cleaned_text' is the name of the column you want to lemmatize
df['lemmatized_text'] = df['cleaned_text'].apply(lemmatize_text)


In [None]:
df.head()

Unnamed: 0,ID,Name,cleaned_text,lemmatized_text
0,1,Air and Space Vehicles (5) NSc,r n pce Vehcle (5) Nc,r n pce Vehcle ( 5 ) Nc
1,2,Special Topics in Aeronautics and Astronautics...,"pecl pc n ernuc n rnuc (1-5, x. 10) Nc","pecl pc n ernuc n rnuc ( 1-5 , x . 10 ) Nc"
2,3,Engineering Statics (4) NSc,Engneerng c (4) Nc,Engneerng c ( 4 ) Nc
3,4,Thermodynamics (4) NSc,hernc (4) Nc,hernc ( 4 ) Nc
4,5,"Undergraduate Research (1-5, max. 10)","Unergrue Reerch (1-5, x. 10)","Unergrue Reerch ( 1-5 , x . 10 )"


In [None]:
from sklearn.cluster import KMeans
#importing kmeans for clustering

In [None]:
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
#n_clusters is used to denote number of clusters
#init Initialization method for K-Means clustering, setting initial cluster centroids.
#`max_iter`  the max iterations for K-Means clustering convergence.
# `n_init` in K-Means: Number of times to reinitialize centroids.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# importing tfidfvectorizer for converting categorical data to numerical data

In [None]:
tfidf_vectorizer = TfidfVectorizer()


In [None]:
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])
# converting cleaned_text data to numerical data

In [None]:
tfidf_matrix

<326519x13296 sparse matrix of type '<class 'numpy.float64'>'
	with 1136816 stored elements in Compressed Sparse Row format>

In [None]:
kmeans.fit(tfidf_matrix)



In [None]:
cluster_labels = kmeans.labels_


In [None]:
cluster_labels

array([0, 2, 0, ..., 0, 2, 1], dtype=int32)

In [None]:
df['Cluster'] = cluster_labels


In [None]:
df.head()

Unnamed: 0,ID,Name,cleaned_text,lemmatized_text,Cluster
0,1,Air and Space Vehicles (5) NSc,r n pce Vehcle (5) Nc,r n pce Vehcle ( 5 ) Nc,0
1,2,Special Topics in Aeronautics and Astronautics...,"pecl pc n ernuc n rnuc (1-5, x. 10) Nc","pecl pc n ernuc n rnuc ( 1-5 , x . 10 ) Nc",2
2,3,Engineering Statics (4) NSc,Engneerng c (4) Nc,Engneerng c ( 4 ) Nc,0
3,4,Thermodynamics (4) NSc,hernc (4) Nc,hernc ( 4 ) Nc,0
4,5,"Undergraduate Research (1-5, max. 10)","Unergrue Reerch (1-5, x. 10)","Unergrue Reerch ( 1-5 , x . 10 )",1


In [None]:
df.drop('cleaned_text',axis=1)

Unnamed: 0,ID,Name,lemmatized_text,Cluster
0,1,Air and Space Vehicles (5) NSc,r n pce Vehcle ( 5 ) Nc,0
1,2,Special Topics in Aeronautics and Astronautics...,"pecl pc n ernuc n rnuc ( 1-5 , x . 10 ) Nc",2
2,3,Engineering Statics (4) NSc,Engneerng c ( 4 ) Nc,0
3,4,Thermodynamics (4) NSc,hernc ( 4 ) Nc,0
4,5,"Undergraduate Research (1-5, max. 10)","Unergrue Reerch ( 1-5 , x . 10 )",1
...,...,...,...,...
326723,326724,Current Issues in Feminist Theory,Curren ue n Fen her,0
326724,326725,Special Topics in Feminist Theory & Research,pecl pc n Fen her & Reerch,2
326725,326726,Cultural Study of Masculinities,Culurl u f culne,0
326726,326727,Special Study for Graduate Students,pecl u fr Grue uen,2
