# Lab work №5

In [1]:
import pandas as pd
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import random

# Read the news dataset from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='news.csv')

# Function to clean text
# Cache for the stop-word set and the lemmatizer: both are the same for every
# call, so they are built once instead of once per document.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the (stop_words, lemmatizer) pair, creating it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def clean_text(text):
    """Turn a raw text into a list of normalised tokens.

    The text is lower-cased and tokenised with ``word_tokenize``; tokens that
    are not purely alphanumeric or that are English stop words are dropped,
    and the remaining tokens are lemmatised with ``WordNetLemmatizer``.

    Args:
        text: The raw document as a string. Any non-string value (for
            example ``None`` or a NaN produced by a missing CSV cell) is
            treated as an empty document.

    Returns:
        A list of lemmatised token strings, in their original order.
    """
    # Missing values have no ``.lower()``; return an empty token list
    # instead of raising in the middle of a DataFrame ``apply``.
    if not isinstance(text, str):
        return []

    stop_words, lemmatizer = _get_text_resources()
    return [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]

# Tokenise and normalise every news text, and keep the result as a new column
cleaned_documents = data['text'].apply(clean_text)
data['cleaned_text'] = cleaned_documents

# Map each token to an integer id, then encode every document as a bag of words
dictionary = corpora.Dictionary(documents=cleaned_documents)
corpus = list(map(dictionary.doc2bow, cleaned_documents))

# Function to compute coherence metric
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1, random_state=42):
    """Train one LDA model per candidate topic count and score each one.

    A model is trained for every value in ``range(start, limit, step)`` and
    evaluated with the ``c_v`` coherence measure.

    Args:
        dictionary: Token/id mapping passed to ``LdaModel`` as ``id2word`` and
            to ``CoherenceModel`` as ``dictionary``.
        corpus: Bag-of-words corpus the models are trained on.
        texts: Tokenised documents used by ``CoherenceModel``.
        limit: Exclusive upper bound on the number of topics.
        start: First number of topics to try.
        step: Increment between successive numbers of topics.
        random_state: Seed forwarded to every ``LdaModel`` so that repeated
            runs start from the same state and the coherence ranking is
            reproducible. Pass ``None`` to keep unseeded training.

    Returns:
        A ``(model_list, coherence_values)`` tuple of two parallel lists:
        the i-th model was trained with ``range(start, limit, step)[i]``
        topics and the i-th value is its coherence score.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=random_state,
        )
        model_list.append(model)
        coherence_model = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherence_model.get_coherence())
    return model_list, coherence_values

# Computing coherence metric for various numbers of topics
limit = 25
start = 2
step = 1
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=data['cleaned_text'], start=start, limit=limit, step=step)

# Choosing the optimal number of topics
# The i-th model was trained with range(start, limit, step)[i] topics, so the
# best index is mapped back through the same range; "start + index" would
# only be correct for step == 1.
best_index = coherence_values.index(max(coherence_values))
optimal_num_topics = range(start, limit, step)[best_index]
print("Optimal number of topics:", optimal_num_topics)

# Building the final model with the optimal number of topics
final_model = model_list[best_index]

# Displaying the most important topics for four randomly selected documents
# A dedicated seeded generator keeps the selection reproducible across runs,
# and sample() draws distinct positions so no document is shown twice.
document_rng = random.Random(42)
num_documents = min(4, len(data))
random_positions = document_rng.sample(range(len(data)), num_documents)
for document_number, random_index in enumerate(random_positions, start=1):
    # Positional lookup: works even if the DataFrame index is not 0..n-1
    random_document = data['text'].iloc[random_index]
    print("\nDocument", document_number, ":")
    print(random_document)
    print("Most important topics:")
    # Only topics with probability of at least 0.2 are reported
    topics = final_model.get_document_topics(dictionary.doc2bow(clean_text(random_document)), minimum_probability=0.2)
    for topic, prob in topics:
        print("Topic", topic, "probability:", prob)


Optimal number of topics: 24

Document 1 :
Appleton Partners Senior Vice President Whitney Fitts says too many Federal Reserve rate hikes are “cooked” into markets.   Speaking with Taylor Riggs on “Bloomberg Markets: The Close,” she adds that she sees a lot of opportunities in municipal bonds  https://t.co/K60bTZDXl9  https://t.co/m7U5vRIBOi
Most important topics:
Topic 9 probability: 0.2220063
Topic 14 probability: 0.36111036

Document 2 :
The global oil market is ‘walking a tightrope’ between scarce supply and the possibility of a recession, the International Energy Agency said, with higher prices and worsening economic conditions already taking a toll on demand  https://t.co/R5VV9ATIce
Most important topics:
Topic 14 probability: 0.39592344
Topic 16 probability: 0.22615068

Document 3 :
Tesla CEO Elon Musk sees inflation declining 'towards the end of this year'  https://t.co/Au0sTZAbAY by @BrianSozzi
Most important topics:
Topic 4 probability: 0.33242756
Topic 14 probability: 0.3083

In [2]:
from nltk.corpus import gutenberg
from nltk.util import ngrams
from collections import Counter

# Fetch the raw text of "Alice's Adventures in Wonderland" from the Gutenberg corpus
alice_text = gutenberg.raw('carroll-alice.txt')

# Tokenise and normalise the book with the same cleaning routine as above
tokens = clean_text(alice_text)

# Build every run of three consecutive cleaned tokens
trigrams = list(ngrams(tokens, 3))

# Count how many times each trigram appears
trigram_freq = Counter()
trigram_freq.update(trigrams)

# Report the 30 most frequent trigrams as the key ones
print("Key trigrams:")
top_trigrams = trigram_freq.most_common(30)
for trigram, freq in top_trigrams:
    phrase = ' '.join(trigram)
    print(phrase, "-", freq, "occurrences")


Key trigrams:
said mock turtle - 20 occurrences
said march hare - 10 occurrences
said alice said - 8 occurrences
march hare said - 6 occurrences
mock turtle said - 6 occurrences
little golden key - 5 occurrences
poor little thing - 5 occurrences
white kid glove - 5 occurrences
certainly said alice - 4 occurrences
know said alice - 4 occurrences
might well say - 4 occurrences
mouse mouse mouse - 4 occurrences
join dance wo - 4 occurrences
dance wo wo - 4 occurrences
wo wo join - 4 occurrences
wo join dance - 4 occurrences
beau ootiful soo - 4 occurrences
ootiful soo oop - 4 occurrences
king white rabbit - 4 occurrences
said white rabbit - 4 occurrences
cat eat bat - 3 occurrences
thought poor alice - 3 occurrences
like said alice - 3 occurrences
said alice rather - 3 occurrences
much said alice - 3 occurrences
indeed said alice - 3 occurrences
alice looked round - 3 occurrences
took hookah mouth - 3 occurrences
said caterpillar alice - 3 occurrences
old father william - 3 occurrences
