In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import nltk
# Fetch the NLTK resources used by this notebook. The 'stopwords' corpus is
# needed by stopwords.words('english') in the preprocessing cell; if it is not
# installed NLTK raises LookupError there.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
import os
import pandas as pd
# Read the Tamil Nadu training CSV from the Kaggle input directory into `df`.
df = pd.read_csv(
    "/kaggle/input/kcctn-final-2/train_TamilNadu.csv"
)


In [None]:
# Restrict the frame to rows whose QueryType is 'Government Schemes'.
df = df.loc[df['QueryType'].isin(['Government Schemes'])]

In [None]:
import re

# Drop rows with no processed query text. `.copy()` detaches the result from the
# filtered frame so that adding a column below does not write into a slice
# (which is what triggers pandas' SettingWithCopyWarning).
df = df.dropna(subset=['QueryText_processed']).copy()

# Remove basic punctuation (, . ! ?) and lowercase the text. The pattern is a
# raw string so that no backslash escape is interpreted by the Python parser;
# inside a character class '.' needs no escaping.
df['paper_text_processed'] = (
    df['QueryText_processed']
    .astype(str)
    .str.replace(r'[,.!?]', '', regex=True)
    .str.lower()
)

# Preview the first cleaned queries
df['paper_text_processed'].head()


In [None]:
import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
# NLTK's English stop-word list plus a few extra tokens to discard.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']
def sent_to_words(sentences):
    """Lazily yield one token list per item of ``sentences``.

    Each item is converted with ``str()`` and passed through
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.
    """
    tokenised = (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )
    yield from tokenised
def remove_stopwords(texts):
    """Return ``texts`` with every word found in ``stop_words`` removed.

    Parameters
    ----------
    texts : iterable
        Documents to filter. A document that is already a list/tuple of tokens
        (e.g. the output of ``sent_to_words``) is filtered directly; any other
        value is converted with ``str()`` and tokenised with
        ``simple_preprocess`` first.

    Returns
    -------
    list[list[str]]
        One filtered token list per input document, in input order.
    """
    # Set membership is O(1); `stop_words` itself is a list.
    blocked = set(stop_words)

    cleaned = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            # Already tokenised: avoid stringifying the list and re-parsing
            # its repr just to recover the same tokens.
            tokens = doc
        else:
            tokens = simple_preprocess(str(doc))
        cleaned.append([word for word in tokens if word not in blocked])
    return cleaned

# Cleaned query strings; `documents` is the same list under a second name and
# is what the TF-IDF vectoriser consumes later.
data = df['paper_text_processed'].tolist()
documents = data

# Tokenise every document, then strip stop words.
data_words = remove_stopwords(list(sent_to_words(data)))

# Show up to the first 30 tokens of the first document.
print(data_words[0][:30])

In [None]:
import gensim.corpora as corpora
# Token <-> id mapping built from the tokenised documents.
id2word = corpora.Dictionary(data_words)
texts = data_words

# Bag-of-words representation of every document.
corpus = list(map(id2word.doc2bow, texts))

# Show up to the first 30 (token_id, count) pairs of the first document.
print(corpus[0][:30])

In [None]:

# TF-IDF matrix of the cleaned documents (min_df=50, sklearn's English stop words).
tf_idf_vect = TfidfVectorizer(min_df=50, stop_words='english')
X = tf_idf_vect.fit_transform(documents)

# Fit a 6-component NMF, then project the documents onto the components.
model = NMF(n_components=6, random_state=5)
model.fit(X)
nmf_features = model.transform(X)

print(f'Input features matrix shape - {X.shape}')
print(f'NMF features shape - {nmf_features.shape}')

# One row per topic, one column per vocabulary term.
components_df = pd.DataFrame(
    model.components_,
    columns=tf_idf_vect.get_feature_names_out(),
)

# Print the ten highest-weighted terms of each topic (numbered from 1).
for topic, tmp in components_df.iterrows():
    print(f'For topic {topic+1} the words with the highest value are:')
    print(tmp.nlargest(10))
    print('\n')


In [None]:
# One row per topic, one column per vocabulary term.
components_df = pd.DataFrame(model.components_, columns=tf_idf_vect.get_feature_names_out())

# Look the vocabulary up once instead of rebuilding it for every selected word.
feature_names = components_df.columns

# For each topic take the ten highest-weighted terms, strongest first
# (argsort is ascending, so read the last ten positions backwards).
topics = [
    [feature_names[i] for i in weights.argsort()[:-11:-1]]
    for weights in components_df.to_numpy()
]

# Compute coherence score (c_v measure)
coherence_model = CoherenceModel(topics=topics, texts=data_words, dictionary=id2word, coherence='c_v')
coherence_score = coherence_model.get_coherence()

print("Coherence Score:", coherence_score)

In [None]:
# Coherence of the same topic word lists under the c_uci measure.
coherence_model = CoherenceModel(
    topics=topics,
    texts=data_words,
    dictionary=id2word,
    coherence='c_uci',
)
coherence_score = coherence_model.get_coherence()

print("Coherence Score:", coherence_score)

In [None]:
# Coherence of the same topic word lists under the u_mass measure.
coherence_model = CoherenceModel(
    topics=topics,
    texts=data_words,
    dictionary=id2word,
    coherence='u_mass',
)
coherence_score = coherence_model.get_coherence()

print("Coherence Score:", coherence_score)

In [None]:
import matplotlib.pyplot as plt

# Dominant topic of each document = column with the largest NMF weight.
topic_distribution = nmf_features.argmax(axis=1)

# argmax returns 0 for a row that is entirely zero, so documents with no
# topic weight at all would be silently counted as topic 0. NMF weights are
# non-negative, so a positive row sum means at least one topic is present.
has_topic = nmf_features.sum(axis=1) > 0

# Count documents per topic; reindex so that a topic with no documents still
# appears on the chart (as a zero-height bar) instead of being dropped.
topic_counts = (
    pd.Series(topic_distribution[has_topic])
    .value_counts()
    .reindex(range(nmf_features.shape[1]), fill_value=0)
)

plt.figure(figsize=(10, 6))
topic_counts.plot(kind='bar', color='skyblue')
plt.title('Number of Documents per Topic')
plt.xlabel('Topic')
plt.ylabel('Number of Documents')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
import matplotlib.pyplot as plt
from itertools import cycle

# Yearly trend of the dominant NMF topic per document.
#
# The topic assignment comes from `nmf_features` (the document-topic matrix of
# the NMF model fitted above); its rows are in the same order as the rows of
# `df`, because both were built from df['paper_text_processed'].

# Number of topics, taken from the fitted document-topic matrix
num_topics = nmf_features.shape[1]

# Define the starting and ending years (inclusive)
start_year = 2009
end_year = 2018
years = range(start_year, end_year + 1)

# yearly_topic_counts[year][topic] -> number of documents
yearly_topic_counts = {
    year: {topic: 0 for topic in range(num_topics)} for year in years
}

# The zip below silently truncates on a length mismatch, so check alignment.
assert len(df) == nmf_features.shape[0], "df rows and NMF rows are misaligned"

dominant_topics = nmf_features.argmax(axis=1)
# argmax yields 0 for an all-zero row; skip such documents rather than
# crediting them to topic 0 (NMF weights are non-negative).
has_topic = nmf_features.sum(axis=1) > 0

for dominant_topic, topic_present, year in zip(dominant_topics, has_topic, df['Year']):
    # Skip documents with no topic weight or no recorded year
    if not topic_present or pd.isna(year):
        continue

    year = int(year)  # Convert year to integer

    # Only count years inside the plotted range
    if start_year <= year <= end_year:
        yearly_topic_counts[year][int(dominant_topic)] += 1


def _plot_topic_trends(topic_ids, colors, title):
    """Draw one line per topic in ``topic_ids`` showing its yearly document count."""
    plt.figure(figsize=(10, 6))
    for topic, color in zip(topic_ids, colors):
        counts = [yearly_topic_counts[year][topic] for year in years]
        plt.plot(years, counts, color=color, label=f'Topic {topic}')

    plt.xlabel('Year')
    plt.ylabel('Number of Examples')
    plt.title(title)
    plt.legend(title='Topic', loc='upper left')
    plt.grid(True)
    plt.show()


# Plotting all topics over the years 2009-2018.
# There are 6 topics, split here into two graphs of three lines each;
# the topic IDs in each group can be adjusted as needed.
top_topics = [0, 1, 2]  # Topics for the first graph
remaining_topics = [4, 5, 3]  # Topics for the second graph

# First graph for top 3 topics
_plot_topic_trends(
    top_topics,
    ['blue', 'orange', 'green'],
    'Trends of Top 3 Topics (2009-2018)',
)

# Second graph for remaining topics (three distinct colours, one per line)
_plot_topic_trends(
    remaining_topics,
    ['red', 'purple', 'brown'],
    'Trends of Remaining Topics (2009-2018)',
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# components_df has one row per topic and one column per vocabulary term.
# DataFrame.corr() correlates *columns*, so transpose first: this yields the
# (n_topics x n_topics) topic-to-topic correlation the plot is labelled as,
# rather than a term-by-term matrix.
topic_correlation = components_df.T.corr()

# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(topic_correlation, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Topic Correlation Heatmap')
plt.xlabel('Topics')
plt.ylabel('Topics')
plt.show()