In [2]:
import os
import glob
import pandas as pd
import nltk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from wordcloud import WordCloud
!pip install gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

# Define directory containing 50 text files
dataset_path = "/path/to/your/dataset"  # Update this path
# glob returns matches in an arbitrary, filesystem-dependent order; sorting
# keeps the document order (and every model fitted on it) stable across runs.
file_paths = sorted(glob.glob(os.path.join(dataset_path, "*.txt")))
if not file_paths:
    # Stop here with an actionable message instead of letting the vectorizers
    # below fail on an empty corpus.
    raise FileNotFoundError(
        f"No .txt files found in {dataset_path!r}; update dataset_path before running."
    )

# Step 1: Load and Preprocess Text Data
all_texts = []
for file in file_paths:
    # errors="ignore" drops bytes that are not valid UTF-8 instead of raising.
    with open(file, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()
    all_texts.append(text)

# Remove stopwords & tokenize: keep purely alphabetic tokens, lower-cased,
# that are not English stop words. One token list per document.
stop_words = set(stopwords.words('english'))
tokenized_texts = [
    [
        word.lower()
        for word in word_tokenize(text)
        if word.isalpha() and word.lower() not in stop_words
    ]
    for text in all_texts
]

# Convert tokenized text back to strings (the sklearn vectorizers take strings)
preprocessed_texts = [" ".join(tokens) for tokens in tokenized_texts]

# Step 2: Frequency-Based Methods (BoW & TF-IDF)
vectorizer = CountVectorizer(max_features=500)  # Limit to top 500 words
X_bow = vectorizer.fit_transform(preprocessed_texts)
bow_features = vectorizer.get_feature_names_out()

tfidf_vectorizer = TfidfVectorizer(max_features=500)
X_tfidf = tfidf_vectorizer.fit_transform(preprocessed_texts)
tfidf_features = tfidf_vectorizer.get_feature_names_out()

# Step 3: Word Embeddings (Word2Vec), trained on the token lists (one per document)
word2vec_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=2, workers=4)
word_vectors = word2vec_model.wv  # Access the word vectors

# Step 4: Topic Modeling (LDA) on the bag-of-words counts
num_topics = 5  # Set number of topics
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_model.fit(X_bow)
# Step 5: Display Top Words in Each Topic
def display_topics(model, feature_names, num_top_words=10):
    """Print one line per topic listing its highest-weighted words.

    Args:
        model: Object exposing ``components_``, iterated row by row; each row
            must provide ``argsort()``.
        feature_names: Indexable sequence mapping a column index to its word.
        num_top_words: How many words to list for each topic.

    Returns:
        None; each topic is written to standard output as
        ``Topic <n>: word, word, ...`` with topics numbered from 1.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort() is ascending, so reverse it and keep the leading entries.
        strongest_first = weights.argsort()[::-1]
        labels = []
        for column in strongest_first[:num_top_words]:
            labels.append(feature_names[column])
        print(f"Topic {topic_number}: {', '.join(labels)}")

print("\nTop Words in Each Topic:")
display_topics(lda_model, bow_features)

# Step 6: Generate Word Clouds
# Sum each vocabulary column over all documents to get one weight per word.
bow_totals = np.asarray(X_bow.sum(axis=0)).ravel()
tfidf_totals = np.asarray(X_tfidf.sum(axis=0)).ravel()
bow_word_freq = dict(zip(bow_features, bow_totals))
tfidf_word_freq = dict(zip(tfidf_features, tfidf_totals))

# One panel per weighting scheme, drawn left to right on a single figure.
cloud_panels = (
    ("Bag of Words - Word Cloud", bow_word_freq),
    ("TF-IDF - Word Cloud", tfidf_word_freq),
)
plt.figure(figsize=(12, 6))
for panel_position, (panel_title, frequencies) in enumerate(cloud_panels, start=1):
    plt.subplot(1, 2, panel_position)
    plt.title(panel_title)
    cloud_image = WordCloud(width=800, height=400).generate_from_frequencies(frequencies)
    plt.imshow(cloud_image, interpolation="bilinear")
    plt.axis("off")

plt.show()

# Step 7: Word2Vec Similarity Check (Example)
# Only query the model when the probe word is present in its vocabulary.
if "franchise" not in word_vectors:
    print("\n'franchise' not found in Word2Vec vocabulary.")
else:
    similar_words = word_vectors.most_similar("franchise", topn=10)
    print("\nTop 10 words similar to 'franchise':")
    for neighbour, score in similar_words:
        print(f"{neighbour}: {score:.4f}")




ModuleNotFoundError: No module named 'gensim'

In [None]:
!pip install gensim
import gensim
print(gensim.__version__)



In [8]:
# Standard library
import glob
import os
from collections import Counter

# Third-party
import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
# Word2Vec is the class instantiated in Step 3; it lives in gensim.models.
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import WordCloud

print(gensim.__version__)  # Check if it's accessible

# NLTK resources used by word_tokenize and stopwords.words below.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer data from 'punkt_tab' instead of
# 'punkt'; fetching both keeps word_tokenize working on either version.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Define directory containing 50 text files
dataset_path = "/path/to/your/dataset"  # Update this path
# glob returns matches in an arbitrary, filesystem-dependent order; sorting
# keeps the document order (and every model fitted on it) stable across runs.
file_paths = sorted(glob.glob(os.path.join(dataset_path, "*.txt")))
if not file_paths:
    # Stop here with an actionable message instead of letting the vectorizers
    # below fail on an empty corpus.
    raise FileNotFoundError(
        f"No .txt files found in {dataset_path!r}; update dataset_path before running."
    )

# Step 1: Load and Preprocess Text Data
all_texts = []
for file in file_paths:
    # errors="ignore" drops bytes that are not valid UTF-8 instead of raising.
    with open(file, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()
    all_texts.append(text)

# Remove stopwords & tokenize: keep purely alphabetic tokens, lower-cased,
# that are not English stop words. One token list per document.
stop_words = set(stopwords.words('english'))
tokenized_texts = [
    [
        word.lower()
        for word in word_tokenize(text)
        if word.isalpha() and word.lower() not in stop_words
    ]
    for text in all_texts
]

# Convert tokenized text back to strings (the sklearn vectorizers take strings)
preprocessed_texts = [" ".join(tokens) for tokens in tokenized_texts]

# Step 2: Frequency-Based Methods (BoW & TF-IDF)
vectorizer = CountVectorizer(max_features=500)  # Limit to top 500 words
X_bow = vectorizer.fit_transform(preprocessed_texts)
bow_features = vectorizer.get_feature_names_out()

tfidf_vectorizer = TfidfVectorizer(max_features=500)
X_tfidf = tfidf_vectorizer.fit_transform(preprocessed_texts)
tfidf_features = tfidf_vectorizer.get_feature_names_out()

# Step 3: Word Embeddings (Word2Vec), trained on the token lists (one per document)
word2vec_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=2, workers=4)
word_vectors = word2vec_model.wv  # Access the word vectors

# Step 4: Topic Modeling (LDA) on the bag-of-words counts
num_topics = 5  # Set number of topics
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_model.fit(X_bow)

# Step 5: Display Top Words in Each Topic
def display_topics(model, feature_names, num_top_words=10):
    """Print one line per topic listing its highest-weighted words.

    Args:
        model: Object exposing ``components_``, iterated row by row; each row
            must provide ``argsort()``.
        feature_names: Indexable sequence mapping a column index to its word.
        num_top_words: How many words to list for each topic.

    Returns:
        None; each topic is written to standard output as
        ``Topic <n>: word, word, ...`` with topics numbered from 1.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort() is ascending, so reverse it and keep the leading entries.
        strongest_first = weights.argsort()[::-1]
        labels = []
        for column in strongest_first[:num_top_words]:
            labels.append(feature_names[column])
        print(f"Topic {topic_number}: {', '.join(labels)}")

print("\nTop Words in Each Topic:")
display_topics(lda_model, bow_features)

# Step 6: Generate Word Clouds
# Sum each vocabulary column over all documents to get one weight per word.
bow_totals = np.asarray(X_bow.sum(axis=0)).ravel()
tfidf_totals = np.asarray(X_tfidf.sum(axis=0)).ravel()
bow_word_freq = dict(zip(bow_features, bow_totals))
tfidf_word_freq = dict(zip(tfidf_features, tfidf_totals))

# One panel per weighting scheme, drawn left to right on a single figure.
cloud_panels = (
    ("Bag of Words - Word Cloud", bow_word_freq),
    ("TF-IDF - Word Cloud", tfidf_word_freq),
)
plt.figure(figsize=(12, 6))
for panel_position, (panel_title, frequencies) in enumerate(cloud_panels, start=1):
    plt.subplot(1, 2, panel_position)
    plt.title(panel_title)
    cloud_image = WordCloud(width=800, height=400).generate_from_frequencies(frequencies)
    plt.imshow(cloud_image, interpolation="bilinear")
    plt.axis("off")

plt.show()

# Step 7: Word2Vec Similarity Check (Example)
# Only query the model when the probe word is present in its vocabulary.
if "franchise" not in word_vectors:
    print("\n'franchise' not found in Word2Vec vocabulary.")
else:
    similar_words = word_vectors.most_similar("franchise", topn=10)
    print("\nTop 10 words similar to 'franchise':")
    for neighbour, score in similar_words:
        print(f"{neighbour}: {score:.4f}")


ModuleNotFoundError: No module named 'gensim'