<a href="https://colab.research.google.com/github/kushalkamboj/PortfolioProject/blob/main/NLP_Project_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing required libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel

In [2]:
# Downloading required NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Task 1: Reading the .csv file using Pandas
from google.colab import files
uploaded = files.upload()
df = pd.read_csv('K8 Reviews v0.2.csv')
print(df.head())

Saving K8 Reviews v0.2.csv to K8 Reviews v0.2.csv
   sentiment                                             review
0          1             Good but need updates and improvements
1          0  Worst mobile i have bought ever, Battery is dr...
2          1  when I will get my 10% cash back.... its alrea...
3          1                                               Good
4          0  The worst phone everThey have changed the last...


In [6]:
# Task 2: Normalizing casings and extracting text into a list
reviews = df['review'].str.lower().tolist()


In [8]:
# Task 3: Tokenizing the reviews
tokenized_reviews = [word_tokenize(review) for review in reviews]


In [10]:
# Task 4: Performing parts-of-speech tagging
tagged_reviews = [pos_tag(tokens) for tokens in tokenized_reviews]


In [11]:
# Task 5: Limiting data to only terms with noun tags
noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']
nouns_only = [[token for token, tag in review if tag in noun_tags] for review in tagged_reviews]

In [12]:
# Task 6: Lemmatizing the tokens
lemmatizer = WordNetLemmatizer()
lemmatized_reviews = [[lemmatizer.lemmatize(token) for token in review] for review in nouns_only]

In [13]:
# Task 7: Removing stopwords and punctuation
stop_words = set(stopwords.words('english'))
cleaned_reviews = [[token for token in review if token not in stop_words and token.isalpha()] for review in lemmatized_reviews]

In [15]:
# Task 8: Creating a topic model using LDA with 12 topics
dictionary = Dictionary(cleaned_reviews)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in cleaned_reviews]
lda_model = LdaModel(doc_term_matrix, num_topics=12, id2word=dictionary, passes=50)
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx}\nWords: {topic}\n')

Topic: 0
Words: 0.086*"call" + 0.078*"option" + 0.067*"screen" + 0.062*"waste" + 0.032*"cast" + 0.032*"money" + 0.032*"work" + 0.024*"application" + 0.024*"notification" + 0.023*"contact"

Topic: 1
Words: 0.349*"product" + 0.062*"amazon" + 0.044*"charger" + 0.026*"experience" + 0.021*"service" + 0.021*"replacement" + 0.020*"delivery" + 0.018*"lenovo" + 0.018*"purchase" + 0.015*"customer"

Topic: 2
Words: 0.095*"sim" + 0.083*"everything" + 0.066*"budget" + 0.050*"jio" + 0.043*"volta" + 0.039*"delivery" + 0.037*"side" + 0.033*"excellent" + 0.028*"support" + 0.024*"awesome"

Topic: 3
Words: 0.376*"phone" + 0.048*"feature" + 0.035*"hai" + 0.025*"h" + 0.014*"ho" + 0.013*"plz" + 0.010*"killer" + 0.010*"hi" + 0.009*"function" + 0.009*"company"

Topic: 4
Words: 0.104*"money" + 0.097*"device" + 0.052*"value" + 0.040*"return" + 0.040*"update" + 0.036*"dolby" + 0.027*"lenovo" + 0.021*"atmos" + 0.021*"policy" + 0.019*"version"

Topic: 5
Words: 0.265*"battery" + 0.095*"phone" + 0.052*"backup" + 0.0

In [16]:
# Task 8.2: Computing the coherence of the model with the c_v metric
coherence_model_lda = CoherenceModel(model=lda_model, texts=cleaned_reviews, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}')

Coherence Score: 0.5313391596196517


In [29]:
# create a list of all the topics and their top keywords
num_topics=12
topic_keywords = lda_model.show_topics(num_topics=num_topics, num_words=10, formatted=False)

In [30]:
# create a dictionary to map topic numbers to their coherence scores
topic_coherences = {}
for i in range(num_topics):
    # convert the topic keywords to a list of token ids
    topic_tokens = [token[0] for token in topic_keywords[i][1]]
    topic_ids = [dictionary.token2id[token] for token in topic_tokens if token in dictionary.token2id]
    
    # calculate the coherence score using the token ids
    cm = CoherenceModel(topics=[topic_ids], texts=cleaned_reviews, dictionary=dictionary, coherence='c_v')
    topic_coherences[i] = cm.get_coherence()

In [26]:
# sort the topics by their coherence scores (in descending order)
sorted_topics = sorted(topic_coherences.items(), key=lambda x: x[1], reverse=True)


In [27]:
# print out the topics and their coherence scores
print("Topics and Coherence Scores:")
for topic in sorted_topics:
    print(f"Topic {topic[0]}: {topic[1]}")


Topics and Coherence Scores:
Topic 7: 0.8215510499955329
Topic 11: 0.7135453479962782
Topic 1: 0.7127067479765388
Topic 8: 0.6925356483974601
Topic 5: 0.6922234386603883
Topic 0: 0.6272983178915674
Topic 4: 0.5442960437009777
Topic 2: 0.4184792210702492
Topic 6: 0.38841443587324326
Topic 9: 0.3572777713938146
Topic 3: 0.3240975900223187
Topic 10: 0.3130043966634967


In [28]:
# determine which topics can be combined
# we can use a threshold coherence score to decide which topics are coherent enough to be combined
threshold = 0.4
print(f"\nTopics to Combine (coherence score >= {threshold}):")
for i in range(num_topics):
    if topic_coherences[sorted_topics[i][0]] >= threshold:
        print(f"Topic {sorted_topics[i][0]}: {topic_keywords[sorted_topics[i][0]]}")


Topics to Combine (coherence score >= 0.4):
Topic 7: (7, [('camera', 0.17169197), ('quality', 0.08288315), ('phone', 0.06487694), ('feature', 0.02376089), ('mode', 0.022651972), ('sound', 0.022288702), ('display', 0.020143854), ('speaker', 0.019672541), ('music', 0.013392466), ('video', 0.013227598), ('note', 0.012360801), ('speed', 0.012237436)])
Topic 11: (11, [('note', 0.15336715), ('lenovo', 0.07887897), ('software', 0.06673406), ('ram', 0.04422346), ('system', 0.03579163), ('card', 0.030276945), ('smartphone', 0.028753325), ('memory', 0.025523987), ('update', 0.025511593), ('gb', 0.021250602), ('apps', 0.019339235), ('model', 0.018802516)])
Topic 1: (1, [('product', 0.3493943), ('amazon', 0.062465373), ('charger', 0.04415039), ('experience', 0.026239965), ('service', 0.021369549), ('replacement', 0.020739608), ('delivery', 0.019622166), ('lenovo', 0.018209707), ('purchase', 0.01802833), ('customer', 0.015337502), ('day', 0.014608677), ('month', 0.0133802295)])
Topic 8: (8, [('pro

In [32]:
lda_model = LdaModel(corpus=doc_term_matrix,id2word=dictionary,num_topics=num_topics,random_state=42,passes=10,per_word_topics=True)

# calculate the coherence score of the model
cm = CoherenceModel(model=lda_model, texts=cleaned_reviews, dictionary=dictionary, coherence='c_v')
coherence = cm.get_coherence()

print(f"Coherence score: {coherence}")

Coherence score: 0.629370123043419


In [33]:
# Interpret the topics and name them based on the top terms
topic_names = []
for topic in topic_keywords:
    top_terms = [term[0] for term in topic[1]]
    topic_name = ', '.join(top_terms)
    topic_names.append(topic_name)

# Create a table with the topic name and the top 10 terms for each topic

terms_table = pd.DataFrame(columns=['Topic Name', 'Top 10 Terms'])

for i, topic in enumerate(topic_keywords):
    top_terms = [term[0] for term in topic[1]]
    topic_name = topic_names[i]
    row = {'Topic Name': topic_name, 'Top 10 Terms': top_terms}
    terms_table = terms_table.append(row, ignore_index=True)

# Print the table
print(terms_table)

                                           Topic Name  \
0   call, option, screen, waste, cast, money, work...   
1   product, amazon, charger, experience, service,...   
2   sim, everything, budget, jio, volta, delivery,...   
3   phone, feature, hai, h, ho, plz, killer, hi, f...   
4   money, device, value, return, update, dolby, l...   
5   battery, phone, backup, time, day, hour, life,...   
6   performance, camera, look, processor, star, su...   
7   camera, quality, phone, feature, mode, sound, ...   
8   problem, phone, issue, heating, network, servi...   
9   mobile, glass, superb, expectation, gorilla, s...   
10  phone, price, range, feature, headphone, earph...   
11  note, lenovo, software, ram, system, card, sma...   

                                         Top 10 Terms  
0   [call, option, screen, waste, cast, money, wor...  
1   [product, amazon, charger, experience, service...  
2   [sim, everything, budget, jio, volta, delivery...  
3   [phone, feature, hai, h, ho, p

  terms_table = terms_table.append(row, ignore_index=True)
  terms_table = terms_table.append(row, ignore_index=True)
  terms_table = terms_table.append(row, ignore_index=True)
  terms_table = terms_table.append(row, ignore_index=True)
  terms_table = terms_table.append(row, ignore_index=True)
  terms_table = terms_table.append(row, ignore_index=True)
  terms_table = terms_table.append(row, ignore_index=True)
  terms_table = terms_table.append(row, ignore_index=True)
  terms_table = terms_table.append(row, ignore_index=True)
  terms_table = terms_table.append(row, ignore_index=True)
  terms_table = terms_table.append(row, ignore_index=True)
  terms_table = terms_table.append(row, ignore_index=True)
