In [27]:
import numpy as np
import pandas as pd
import nltk
# natural language toolkit
# https://www.nltk.org/
from nltk.corpus import stopwords 
#https://www.geeksforgeeks.org/removing-stop-words-nltk-python/

from nltk.stem.wordnet import WordNetLemmatizer


import string


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

data= pd.read_csv("Dataset/articles.csv" ,encoding='latin1')
print(data.head())

                                             Article  \
0  Data analysis is the process of inspecting and...   
1  The performance of a machine learning algorith...   
2  You must have seen the news divided into categ...   
3  When there are only two classes in a classific...   
4  The Multinomial Naive Bayes is one of the vari...   

                                               Title  
0                  Best Books to Learn Data Analysis  
1         Assumptions of Machine Learning Algorithms  
2          News Classification with Machine Learning  
3  Multiclass Classification Algorithms in Machin...  
4        Multinomial Naive Bayes in Machine Learning  


#### As we are working on a NLP problem, we need to clean the textual content by removing punctuation and stopwords. Here's how we can clean the textual data:

In [29]:
def preprocess_text(text):
    #convert text to lowercase
    text = text.lower()
    # remove punctuation
    text = text.translate(str.maketrans('','',string.punctuation))
    # Tokenize text 
    tokens = nltk.word_tokenize(text)
    #  Remove stop words 
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize tokens
    lemma = WordNetLemmatizer()
    tokens = [lemma.lemmatize(word)  for word in tokens]
    # print(tokens)
    # join tokens to form preprocessed text
    preprocess_text = ''.join(tokens)
    return preprocess_text
data['Article'] = data['Article'].apply(preprocess_text)


In [30]:
# Now we need to convert the textual data into a numerical representaion . We can use text vectorizer here:


In [31]:
vectorizer = TfidfVectorizer()
x= vectorizer.fit_transform(data['Article'].values)

Now we eill use an algorithm to identify relationships between the texual data to assign topic labels . We can use the Latent Disichelet Allocation algorithm (LDA) for this task . Latent Dirichet Allocation (LDA) is a generative probabilistic algorithm used to uncover the underlying topics in a corpus of textual data. Let's use the LDA algorithm to assign topic labels :

In [32]:
lda = LatentDirichletAllocation(n_components=5,random_state=42)
lda.fit(x)

topic_modelling = lda.transform(x)

topic_labels = np.argmax(topic_modelling,axis=1)
data['topic_labels'] = topic_labels

In [33]:
print(data.head())

                                             Article  \
0  dataanalysisprocessinspectingexploringdatagene...   
1  performancemachinelearningalgorithmparticulard...   
2  mustseennewsdividedcategorygonewswebsitepopula...   
3  twoclassclassificationproblemproblembinaryclas...   
4  multinomialnaivebayesonevariantnaivebayesalgor...   

                                               Title  topic_labels  
0                  Best Books to Learn Data Analysis             1  
1         Assumptions of Machine Learning Algorithms             3  
2          News Classification with Machine Learning             0  
3  Multiclass Classification Algorithms in Machin...             1  
4        Multinomial Naive Bayes in Machine Learning             1  


### Summary
Topic Modelling is a Natural Language Processing technique to uncover hidden topics from text documents. It helps identify topics of the text documents to find relationships between the content of a text document and the topic. I hope you liked this article on Topic Modelling with Machine Learning using Python. Feel free to ask valuable questions in the comments section below.