# Installing Necessary Libraries

In [None]:
! pip install gensim pandas nltk



# Imports

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models
import os

# Loading and Processing the Text Data

In [None]:
# Fetch the NLTK resources this notebook asks for.
# NOTE(review): the code visible here tokenizes with RegexpTokenizer only, so
# 'punkt' does not appear to be needed - confirm before removing it.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Read the dataset and pull the raw documents out of its 'Text' column.
df = pd.read_csv('/content/df_file.csv')
text_column = df['Text']
texts = text_column.tolist()

# Shared preprocessing resources used by preprocess():
# the English stop-word list as a set, and a tokenizer driven by the \w+ pattern.
english_stopword_list = stopwords.words('english')
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(english_stopword_list)

def preprocess(text):
    """Return the cleaned tokens of ``text``.

    The text is lowercased and split with the module-level ``tokenizer``;
    tokens found in the module-level ``stop_words`` and tokens that are only
    one character long are dropped. Token order is preserved.
    """
    kept = []
    for token in tokenizer.tokenize(text.lower()):
        # Skip stop words and single-character tokens.
        if token in stop_words or len(token) <= 1:
            continue
        kept.append(token)
    return kept

# Run every raw document through preprocess(), keeping document order.
processed_texts = list(map(preprocess, texts))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Creating Dictionary and Corpus

In [None]:
# Build the vocabulary (gensim Dictionary) from the tokenized documents.
dictionary = corpora.Dictionary(processed_texts)

# Prune the vocabulary to limit the number of features; see gensim's
# Dictionary.filter_extremes documentation for the exact meaning of each bound.
dictionary.filter_extremes(
    no_below=5,
    no_above=0.5,
    keep_n=100000,
)

# Convert each tokenized document into its bag-of-words vector.
corpus = list(map(dictionary.doc2bow, processed_texts))


# Training the LDA Model

In [None]:
# Training settings for the LDA model; random_state is fixed at 100.
lda_params = dict(
    num_topics=5,
    id2word=dictionary,
    passes=15,
    random_state=100,
)
lda_model = models.LdaModel(corpus, **lda_params)

# Show each topic as (topic id, formatted string of its 5 highest-weighted words).
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)


(0, '0.009*"people" + 0.006*"new" + 0.005*"technology" + 0.005*"mobile" + 0.004*"one"')
(1, '0.012*"us" + 0.009*"growth" + 0.008*"bank" + 0.007*"economy" + 0.006*"market"')
(2, '0.016*"mr" + 0.011*"government" + 0.006*"us" + 0.006*"new" + 0.006*"eu"')
(3, '0.006*"mr" + 0.005*"one" + 0.004*"best" + 0.004*"first" + 0.004*"time"')
(4, '0.010*"us" + 0.009*"sales" + 0.007*"new" + 0.005*"company" + 0.005*"market"')


# Saving the Trained Model

In [None]:
# All LDA artifacts are written into this directory (created on demand).
base_dir = 'lda_model_files'
os.makedirs(base_dir, exist_ok=True)

# Destination file for each artifact.
dictionary_path = os.path.join(base_dir, 'lda_dictionary.dict')
corpus_path = os.path.join(base_dir, 'lda_corpus.mm')
model_path = os.path.join(base_dir, 'lda_model.lda')

# Persist the dictionary, then the corpus (via MmCorpus.serialize), then the model.
dictionary.save(dictionary_path)
corpora.MmCorpus.serialize(corpus_path, corpus)
lda_model.save(model_path)

# Identifying Topics with the LDA Model

In [None]:
# Locations of the artifacts previously written under base_dir.
dictionary_file = os.path.join(base_dir, 'lda_dictionary.dict')
corpus_file = os.path.join(base_dir, 'lda_corpus.mm')
model_file = os.path.join(base_dir, 'lda_model.lda')

# Restore the dictionary, the serialized corpus, and the trained LDA model.
loaded_dictionary = corpora.Dictionary.load(dictionary_file)
loaded_corpus = corpora.MmCorpus(corpus_file)
loaded_lda_model = models.LdaModel.load(model_file)

In [None]:
import functools


@functools.lru_cache(maxsize=1)
def _lda_preprocessing_tools():
    """Build the tokenizer and the English stop-word set once, then reuse them."""
    # The stop-word set is frozen because the cached object is shared by all calls.
    return RegexpTokenizer(r'\w+'), frozenset(stopwords.words('english'))


def preprocess_lda(text):
    """Return the cleaned tokens of ``text`` for LDA inference.

    The text is lowercased and tokenized with a ``\\w+`` RegexpTokenizer;
    English stop words and single-character tokens are dropped. Token order is
    preserved.

    The tokenizer and stop-word set are created on the first call and cached,
    so they are not rebuilt for every document.
    """
    tokenizer, stop_words = _lda_preprocessing_tools()
    # Tokenize and remove stopwords / one-character tokens
    return [word for word in tokenizer.tokenize(text.lower()) if word not in stop_words and len(word) > 1]

def identify_topics_lda(text, lda_model, dictionary):
    """Print the topics ``lda_model`` assigns to ``text``, highest weight first.

    ``text`` is cleaned with ``preprocess_lda``, converted to a bag-of-words
    vector with ``dictionary.doc2bow`` and passed to the model. One line is
    printed per returned topic: its number, its weight to three decimals, and
    the five words returned by ``lda_model.show_topic``. Nothing is returned.
    """
    tokens = preprocess_lda(text)
    bow_vector = dictionary.doc2bow(tokens)

    # Rank the (topic_number, weight) pairs by descending weight.
    ranked_topics = sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True)

    for topic_number, prob in ranked_topics:
        # Keep only the words of the topic; their individual weights are not shown.
        word_weight_pairs = lda_model.show_topic(topic_number, 5)
        words_only = [word for word, _ in word_weight_pairs]
        topic_words = ", ".join(words_only)
        print(f"Topic {topic_number} ({prob:.3f}): {topic_words}")

# Example usage


In [None]:
# Sample passage to classify with the model and dictionary reloaded from disk.
new_text = "to control immigration and asylum and criticised its record on the NHS, telling delegates Labour cannot be trusted on education or crime. A Tory government would sort out the shambles of immigration, put patients before statistics and bring discipline to schools, he said. Michael Howard, who had been due to welcome delegates to the conference on Friday, will address them in a lunchtime speech. His welcome address had to be postponed after he stayed in London to lead the party's opposition to the Prevention of Terrorism Bill in its lengthy progress through Parliament. The bill was finally passed on Friday evening, after more than 30 hours of debate. Mr Howard is likely to defend his party's handling of the bill, which was only passed after the Conservatives accepted Prime Minister Tony Blair's promise that MPs would be able to review it within a year."
identify_topics_lda(
    text=new_text,
    lda_model=loaded_lda_model,
    dictionary=loaded_dictionary,
)

Topic 3 (0.876): mr, one, best, first, time
Topic 2 (0.116): mr, government, us, new, eu


# Transferring the Model Files to Google Drive

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab filesystem at this path.
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
import shutil

# Move the saved model directory into the mounted Drive folder. The call is
# left as the final bare expression so the cell still displays its return value.
local_model_dir = '/content/lda_model_files'
drive_folder = '/content/drive/My Drive'
shutil.move(local_model_dir, drive_folder)

'/content/drive/My Drive/lda_model_files'