<a href="https://colab.research.google.com/github/humayun-mhk/Elevvo-NLP-Internship/blob/main/Topic_Modeling_on_News_Articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Install Required Libraries

In [2]:
!pip install gensim pyLDAvis nltk wordcloud scikit-learn




## 2. Load and Preprocess Data

In [6]:
from google.colab import files
files.upload()  # Upload kaggle.json


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"humayunmhk","key":"9c7ea0e8fc7cfd58eccfc177898cad97"}'}

In [7]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [11]:
!kaggle datasets download -d gpreda/bbc-news
!unzip /content/bbc-news.zip


Dataset URL: https://www.kaggle.com/datasets/gpreda/bbc-news
License(s): CC0-1.0
bbc-news.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  /content/bbc-news.zip
  inflating: bbc_news.csv            


In [15]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

# Load CSV
df = pd.read_csv("/content/bbc_news.csv")  # Replace with actual filename

# Use the 'description' column for topic modeling
texts = df['description'].fillna('').tolist()

# Preprocessing
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = re.sub(r'\W', ' ', text.lower())
    tokens = text.split()
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 3]

processed_docs = [preprocess(doc) for doc in texts]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 3. Prepare for LDA

In [16]:
from gensim import corpora

dictionary = corpora.Dictionary(processed_docs)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]


## 4. Train LDA Model

In [17]:
from gensim.models.ldamodel import LdaModel

lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=10,
    random_state=42
)

# Print Topics
topics = lda_model.print_topics(num_words=10)
for i, topic in topics:
    print(f"Topic {i}: {topic}")


Topic 0: 0.010*"first" + 0.010*"year" + 0.007*"trump" + 0.006*"star" + 0.006*"medium" + 0.005*"scottish" + 0.005*"social" + 0.005*"president" + 0.004*"film" + 0.004*"said"
Topic 1: 0.016*"england" + 0.015*"world" + 0.012*"final" + 0.012*"league" + 0.012*"2024" + 0.011*"manchester" + 0.009*"first" + 0.009*"say" + 0.008*"match" + 0.008*"city"
Topic 2: 0.018*"say" + 0.013*"government" + 0.010*"people" + 0.010*"minister" + 0.009*"election" + 0.009*"party" + 0.007*"could" + 0.007*"israel" + 0.007*"labour" + 0.006*"said"
Topic 3: 0.013*"police" + 0.013*"year" + 0.012*"say" + 0.011*"people" + 0.008*"died" + 0.008*"attack" + 0.007*"killed" + 0.006*"family" + 0.006*"found" + 0.006*"child"
Topic 4: 0.015*"say" + 0.007*"wale" + 0.006*"year" + 0.006*"former" + 0.005*"school" + 0.005*"would" + 0.005*"first" + 0.005*"england" + 0.005*"king" + 0.004*"president"


## 5. Visualize with pyLDAvis

In [18]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)
