In [None]:
import pandas as pd
import numpy as np
import string
import spacy
import matplotlib.pyplot as plt
from collections import Counter
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow import keras

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.util import ngrams
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import DBSCAN
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import NMF
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import LatentDirichletAllocation
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import seaborn as sns
import re

In [None]:
# Load the Kisan Call Center query log; rows the CSV parser cannot read are
# skipped (on_bad_lines='skip') instead of aborting the load.
csv_path = '/content/Kisan Call center Queries.csv'
Kisan = pd.read_csv(csv_path, on_bad_lines='skip')

In [None]:
# Data Preprocessing
# Replace missing questions with a single space so the vectorizer sees no NaN.
# The result is assigned back to the column rather than using
# fillna(..., inplace=True) on the column selection: pandas warns that the
# chained in-place form will stop updating the DataFrame in pandas 3.0.
Kisan['questions'] = Kisan['questions'].fillna(' ')

# TF-IDF over the questions: English stop words removed, vocabulary capped at 5000 terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
x = vectorizer.fit_transform(Kisan['questions'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Kisan['questions'].fillna(' ',inplace=True)


In [None]:
# Replace missing answers with a single space. Assigning the result back
# (instead of the chained fillna(..., inplace=True)) keeps working under
# pandas 3.0, where the chained in-place form no longer updates the DataFrame.
Kisan['answers'] = Kisan['answers'].fillna(' ')

# TF-IDF over the answers. This rebinds `vectorizer`, so from here on its
# vocabulary is the answers vocabulary; X is the answers document-term matrix.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(Kisan['answers'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Kisan['answers'].fillna(' ',inplace=True)


In [None]:
# Reduce the TF-IDF matrix X to 100 components with truncated SVD.
# X_reduced is the dense matrix that the DBSCAN cell clusters.
svd = TruncatedSVD(n_components=100, random_state=42)
X_reduced = svd.fit_transform(X)


In [None]:
# Apply DBSCAN clustering
# Each row of X_reduced gets a cluster label stored in Kisan['Cluster'];
# DBSCAN uses the label -1 for noise points.
# NOTE(review): assuming X_reduced is the TruncatedSVD projection of the default
# (L2-normalised, non-negative) TfidfVectorizer output, any two rows are at most
# sqrt(2) ~= 1.41 apart in Euclidean distance. With eps = 1.8 every point is then
# a neighbour of every other point, so DBSCAN would return one single cluster and
# build O(n^2) neighbourhoods. Confirm and pick a much smaller eps (e.g. from a
# k-distance plot) before trusting these clusters.
eps = 1.8  # Adjust based on dataset density
min_samples = 10  # Minimum points to form a cluster
dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean')
Kisan['Cluster'] = dbscan.fit_predict(X_reduced)


In [None]:
# Draw one word cloud per DBSCAN cluster from the questions assigned to it.
# The label -1 (noise) is not plotted.
unique_clusters = set(Kisan['Cluster'])
for cluster in unique_clusters:
    if cluster != -1:
        in_cluster = Kisan['Cluster'] == cluster
        cluster_text = " ".join(Kisan.loc[in_cluster, 'questions'])

        cloud_builder = WordCloud(width=800, height=400, background_color='white')
        wordcloud = cloud_builder.generate(cluster_text)

        plt.figure(figsize=(8, 4))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.title(f"Cluster {cluster} Word Cloud")
        plt.show()

In [None]:
# Print up to five example questions for every cluster label, noise included.
for cluster in unique_clusters:
    heading = (
        "Noise Points (Outliers):"
        if cluster == -1
        else f"Cluster {cluster} Sample Questions:"
    )
    print(heading)

    sample_questions = Kisan.loc[Kisan['Cluster'] == cluster, 'questions'].head(5)
    print(sample_questions.to_string(index=False))
    print("-" * 50)

In [None]:
 # Topic Modeling (LDA)

In [None]:
# Data Preprocessing
# Fill missing questions by assigning the result back to the column; the chained
# fillna(..., inplace=True) form is deprecated and stops updating the DataFrame
# in pandas 3.0.
Kisan['questions'] = Kisan['questions'].fillna(' ')

# TF-IDF over the questions (English stop words removed, at most 5000 terms).
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
x = vectorizer.fit_transform(Kisan['questions'])

In [None]:
# Fill missing answers by assigning the result back to the column; the chained
# fillna(..., inplace=True) form stops updating the DataFrame in pandas 3.0.
Kisan['answers'] = Kisan['answers'].fillna(' ')

# TF-IDF over the answers; `vectorizer` and X are the inputs of the topic model.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(Kisan['answers'])

In [None]:
# Apply Latent Dirichlet Allocation (LDA) for topic modeling
# Fits a 5-topic model on X. With top-to-bottom execution X is the TF-IDF matrix
# of the 'answers' column from the previous cell, so the topics describe answers.
# NOTE(review): scikit-learn's LDA examples fit on raw term counts
# (CountVectorizer) rather than TF-IDF weights; confirm TF-IDF input is intended.
num_topics = 5
lda = LatentDirichletAllocation(n_components=num_topics,random_state=42)
lda.fit(X)

In [None]:
# Display the 10 highest-weighted words of each LDA topic.
# Fixes over the previous version of this cell:
#   * the fitted topic-word matrix is `lda.components_` (trailing underscore);
#   * `argsort()[:-11:-1]` takes the last 10 indices of the ascending sort in
#     reverse, i.e. the 10 largest weights, largest first;
#   * the words are stored in `top_words`, the name that is actually printed.
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic {topic_idx}:")
    top_words_idx = topic.argsort()[:-11:-1]  # Top 10 words
    top_words = [feature_names[i] for i in top_words_idx]
    print(",".join(top_words))
    print("-" * 50)

In [None]:
# One word cloud per LDA topic, sized by the topic's weight for each word.
for topic_idx, topic in enumerate(lda.components_):
    # Indices of the 50 largest weights, largest first.
    top_50_idx = topic.argsort()[:-51:-1]
    word_freqs = {feature_names[i]: topic[i] for i in top_50_idx}

    cloud_builder = WordCloud(width=800, height=400, background_color='white')
    wordcloud = cloud_builder.generate_from_frequencies(word_freqs)

    plt.figure(figsize=(8, 4))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"Topic {topic_idx} Word Cloud")
    plt.show()


In [None]:
# Topic Modeling (NMF)

In [None]:
# Data Preprocessing
# Fill missing questions by assigning the result back to the column; the chained
# fillna(..., inplace=True) form is deprecated and stops updating the DataFrame
# in pandas 3.0.
Kisan['questions'] = Kisan['questions'].fillna(' ')

# TF-IDF over the questions (English stop words removed, at most 5000 terms).
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
x = vectorizer.fit_transform(Kisan['questions'])

In [None]:
# Fill missing answers by assigning the result back to the column; the chained
# fillna(..., inplace=True) form stops updating the DataFrame in pandas 3.0.
Kisan['answers'] = Kisan['answers'].fillna(' ')

# TF-IDF over the answers; `vectorizer` and X are the inputs of the NMF model.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(Kisan['answers'])

In [None]:
# Apply Non-negative Matrix Factorization (NMF) for topic modeling
# Factorises the TF-IDF matrix X into 5 topics. init='nndsvd' gives a
# deterministic SVD-based initialisation. This cell only defines `nmf`; it does
# not rebind `lda`, so a previously fitted LDA model stays usable.
num_topics = 5
nmf = NMF(n_components=num_topics, random_state=42, init='nndsvd')
nmf.fit(X)

In [None]:
# Print the ten strongest words of every NMF topic, strongest first.
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(nmf.components_):
    print(f"Topic {topic_idx}:")
    strongest_first = topic.argsort()[::-1]
    top_words = [feature_names[i] for i in strongest_first[:10]]
    print(" ".join(top_words))
    print("-" * 50)

In [None]:
# One word cloud per NMF topic, sized by the topic's weight for each word.
for topic_idx, topic in enumerate(nmf.components_):
    # Indices of the 50 largest weights, largest first.
    top_50_idx = topic.argsort()[:-51:-1]
    word_freqs = {feature_names[i]: topic[i] for i in top_50_idx}

    cloud_builder = WordCloud(width=800, height=400, background_color='white')
    wordcloud = cloud_builder.generate_from_frequencies(word_freqs)

    plt.figure(figsize=(8, 4))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"Topic {topic_idx} Word Cloud")
    plt.show()


In [None]:
#  Anomaly Detection Model

In [None]:
# Data Preprocessing
# Fill missing questions by assigning the result back to the column; the chained
# fillna(..., inplace=True) form is deprecated and stops updating the DataFrame
# in pandas 3.0.
Kisan['questions'] = Kisan['questions'].fillna(' ')

# TF-IDF over the questions (English stop words removed, at most 5000 terms).
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
x = vectorizer.fit_transform(Kisan['questions'])

In [None]:
# Fill missing answers by assigning the result back to the column; the chained
# fillna(..., inplace=True) form stops updating the DataFrame in pandas 3.0.
Kisan['answers'] = Kisan['answers'].fillna(' ')

# TF-IDF over the answers (English stop words removed, at most 5000 terms).
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(Kisan['answers'])

In [None]:
# Build the single feature used for anomaly detection: the character length of
# each question, standardised to zero mean and unit variance.
Kisan['text_length'] = Kisan['questions'].apply(len)
scaler = StandardScaler()

# The Isolation Forest cell reads the scaled matrix under the name `X_scaled`,
# so define that name explicitly. `X` is kept bound to the same array because
# this cell previously left the scaled matrix in `X`.
X_scaled = scaler.fit_transform(Kisan[['text_length']])
X = X_scaled


In [None]:
# Flag unusually long or short questions with an Isolation Forest.
# contamination=0.08 tells the model to label about 8% of the rows as anomalies;
# fit_predict returns -1 for anomalies and 1 for inliers.
# `X` is the standardised text_length matrix left by the scaling cell, so it is
# used here directly.
iso_forest = IsolationForest(n_estimators=100, contamination=0.08, random_state=42)
Kisan['anomaly'] = iso_forest.fit_predict(X)

In [None]:
# Scatter of question length against row index, coloured by the anomaly label.
plt.figure(figsize=(10, 5))
plt.scatter(
    Kisan.index,
    Kisan['text_length'],
    c=Kisan['anomaly'],
    cmap='coolwarm',
    edgecolors='k',
)
plt.title("Anomaly Detection in Text Data")
plt.xlabel("Index")
plt.ylabel("Text Length")
plt.show()


In [None]:
# Keep the rows labelled -1 (anomalous) and print the first ten of them.
flagged = Kisan['anomaly'] == -1
anomalies = Kisan[flagged]

print("Detected Anomalies:")
preview = anomalies[['questions', 'text_length']].head(10)
print(preview.to_string(index=False))

In [None]:
# Apriori Algorithm

In [None]:
# Data Preprocessing
# Fill missing questions by assigning the result back to the column; the chained
# fillna(..., inplace=True) form is deprecated and stops updating the DataFrame
# in pandas 3.0.
Kisan['questions'] = Kisan['questions'].fillna(' ')

# TF-IDF over the questions (English stop words removed, at most 5000 terms).
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
x = vectorizer.fit_transform(Kisan['questions'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Kisan['questions'].fillna(' ',inplace=True)


In [None]:
# Work on an explicit copy so the column assignment below neither touches Kisan
# nor triggers a SettingWithCopyWarning.
Kisan_cleaned = Kisan.dropna(subset=['questions']).copy()

# Lower-case each question and split it on whitespace into a list of tokens.
# str.split must be *called*; without the parentheses the bound method itself
# is assigned to the column instead of the token lists.
Kisan_cleaned['questions'] = Kisan_cleaned['questions'].str.lower().str.split()


In [None]:
# Collect the 'questions' column of Kisan_cleaned as a plain Python list, one
# entry per row (intended to be one token list per question for Apriori).
transactions = Kisan_cleaned['questions'].tolist()

In [None]:
# Data Preprocessing
# Fill missing questions by assigning the result back to the column; the chained
# fillna(..., inplace=True) form is deprecated and stops updating the DataFrame
# in pandas 3.0.
Kisan['questions'] = Kisan['questions'].fillna(' ')
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
x = vectorizer.fit_transform(Kisan['questions'])

# Tokenise on an explicit copy so Kisan keeps the raw question strings and the
# column assignment does not raise a SettingWithCopyWarning.
Kisan_cleaned = Kisan.dropna(subset=['questions']).copy()

# Lower-case each question and split on whitespace: every question becomes a
# list of words, i.e. one "transaction" whose items are its words.
# NOTE(review): these tokens are not stop-word filtered (the vectorizer's
# stop_words setting does not apply to them), so rules between words such as
# 'a', 'of' and 'the' will appear among the results.
Kisan_cleaned['questions'] = Kisan_cleaned['questions'].str.lower().str.split()

transactions = Kisan_cleaned['questions'].tolist()

# One-hot encode the transactions: one boolean column per distinct word.
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
Kisan_encoded = pd.DataFrame(te_ary, columns=te.columns_)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Kisan['questions'].fillna(' ',inplace=True)


In [None]:
# Apply the Apriori algorithm with a minimum support threshold
# min_support=0.01 keeps only itemsets present in at least 1% of the rows of
# Kisan_encoded; use_colnames=True reports itemsets by column name (the words)
# instead of column index.
frequent_itemsets = apriori(Kisan_encoded,min_support=0.01,use_colnames=True)

In [None]:
# Derive association rules from the frequent itemsets, keeping those whose
# confidence is at least 0.5, and print the first ten.
rules = association_rules(
    frequent_itemsets,
    metric='confidence',
    min_threshold=0.5,
)
rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
print(rules[rule_columns].head(10))

     antecedents consequents   support  confidence      lift
0            (a)     (about)  0.017179    0.569998  0.980343
1            (a)    (asking)  0.019414    0.644168  0.986468
2            (a)        (of)  0.022483    0.745967  1.532240
3            (a)       (the)  0.017179    0.569998  1.736261
4          (and)     (about)  0.020895    0.586142  1.008109
5  (application)     (about)  0.010842    0.501681  0.862844
6        (asked)     (about)  0.018705    0.828670  1.425235
7       (asking)     (about)  0.545666    0.835624  1.437195
8        (about)    (asking)  0.545666    0.938495  1.437195
9    (bacterial)     (about)  0.012155    0.616322  1.060016
