In [65]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize  # For tokenization

from textblob import TextBlob           # For sentiment analysis
# from openai import OpenAI    # For OpenAI API calls
import os
import re                              # For regular expressions
from transformers import pipeline
from tqdm.auto import tqdm 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import emoji
# from spellchecker import SpellChecker
from deep_translator import GoogleTranslator  
#import spacy                          
#import matplotlib.pyplot as plt         
#import seaborn as sns                  

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')



[nltk_data] Downloading package punkt to /Users/lalit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/lalit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/lalit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/lalit/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [66]:
# df = pd.read_csv('review.txt', sep='\n', header=None, names=['review'])
# # sep('\n') is used to separate the reviews by new line
# # header=None is used to specify that there is no header in the file
# # names=['review'] is used to specify the name of the column


In [67]:
# Read reviews from the file
with open('reviews.txt', 'r') as file:
    reviews = file.readlines()

# Create DataFrame
df = pd.DataFrame(reviews, columns=['reviews'])

In [68]:
# Display the first 5 rows
df.head()

Unnamed: 0,reviews
0,I've been using the ShibaSolv app for a while ...
1,Worst application. It doesn't let me connect m...
2,I have a crypto portfolio worth close to $70K ...
3,Bahut hi vdiya app hai chote mote traders ko b...
4,Exchange API key. hey I'm unable to add my exc...


In [69]:
len(df)

100

Preprocessing

In [70]:
# Lowercase
df['clean_reviews'] = df['reviews'].astype(str).apply(lambda x: x.lower())

In [71]:
# Removing punctuation
df['clean_reviews'] = df['clean_reviews'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

In [72]:
# Removing numbers
df['clean_reviews'] = df['clean_reviews'].apply(lambda x: re.sub(r'\d+', '', x))

In [73]:
# Removing whitespace
df['clean_reviews'] = df['clean_reviews'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

In [74]:
# Replace emojis with their descriptions
df['clean_reviews'] = df['clean_reviews'].apply(emoji.demojize)

In [75]:
# Tokenization (splitting into words)
df['tokens'] = df['clean_reviews'].apply(word_tokenize)

In [76]:
# # Load words to ignore from the file
# with open("most_common_filtered_words.txt", "r") as file:
#     words_to_ignore = set(file.read().splitlines())

# # Spell Correction (excluding words to ignore)
# spell = SpellChecker()
# df['tokens'] = df['tokens'].apply(lambda tokens: [SpellChecker().correction(w) or w if w not in words_to_ignore else w for w in tokens])

In [77]:
df.head()

Unnamed: 0,reviews,clean_reviews,tokens
0,I've been using the ShibaSolv app for a while ...,ive been using the shibasolv app for a while n...,"[ive, been, using, the, shibasolv, app, for, a..."
1,Worst application. It doesn't let me connect m...,worst application it doesnt let me connect my ...,"[worst, application, it, doesnt, let, me, conn..."
2,I have a crypto portfolio worth close to $70K ...,i have a crypto portfolio worth close to k sti...,"[i, have, a, crypto, portfolio, worth, close, ..."
3,Bahut hi vdiya app hai chote mote traders ko b...,bahut hi vdiya app hai chote mote traders ko b...,"[bahut, hi, vdiya, app, hai, chote, mote, trad..."
4,Exchange API key. hey I'm unable to add my exc...,exchange api key hey im unable to add my excha...,"[exchange, api, key, hey, im, unable, to, add,..."


In [78]:
# Stopword Removal (remove common, less meaningful words)
stop_words = set(stopwords.words('english'))
df['tokens'] = df['tokens'].apply(lambda tokens: [w for w in tokens if not w in stop_words])

In [79]:
# Lemmatization (reduce words to their base form)
lemmatizer = WordNetLemmatizer()
df['tokens'] = df['tokens'].apply(lambda tokens: [lemmatizer.lemmatize(w) for w in tokens])

In [80]:
# Filter Short Words (remove very short words, often noise)
df['tokens'] = df['tokens'].apply(lambda tokens: [token for token in tokens if len(token) > 2])

In [81]:
# Join Tokens Back into Sentences
df['clean_reviews'] = df['tokens'].apply(lambda tokens: ' '.join(tokens)) 

In [82]:
# Display preprocessed reviews
print(df['clean_reviews'].head().to_markdown(index=False, numalign="left", stralign="left"))

| clean_reviews                                                                                                                                                                                                                                                                                                                                                                               |
|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| ive using shibasolv app find really interesting fun app managing cryptocurrencies give lot opportunity one track investment improve trading strategy                                                                                  

In [83]:
df.head()

Unnamed: 0,reviews,clean_reviews,tokens
0,I've been using the ShibaSolv app for a while ...,ive using shibasolv app find really interestin...,"[ive, using, shibasolv, app, find, really, int..."
1,Worst application. It doesn't let me connect m...,worst application doesnt let connect wallet ne...,"[worst, application, doesnt, let, connect, wal..."
2,I have a crypto portfolio worth close to $70K ...,crypto portfolio worth close still cant find t...,"[crypto, portfolio, worth, close, still, cant,..."
3,Bahut hi vdiya app hai chote mote traders ko b...,bahut vdiya app hai chote mote trader invest k...,"[bahut, vdiya, app, hai, chote, mote, trader, ..."
4,Exchange API key. hey I'm unable to add my exc...,exchange api key hey unable add exchange api k...,"[exchange, api, key, hey, unable, add, exchang..."


In [84]:
with open('most_common_filtered_words.txt', 'r') as file:
    loanwords = set(word.strip() for word in file)

print(loanwords)



{'months', 'best', 'account', 'trade', 'wallet', 'advanced', 'good', 'traders', 'shibasolv', 'crypto', 'profitable', 'features', 'worst', 'even', 'app', 'trading', 'hai', 'connect', 'strategies', 'exchange'}


In [85]:
def detect_language_char(text):
    latin_chars = re.findall(r'[a-zA-Z]', text)
    non_latin_ratio = 1 - (len(latin_chars) / len(text))
    return 'hi' if non_latin_ratio > 0.155 else 'en' # 0.155 is the threshold

In [86]:
df['language'] = df['clean_reviews'].apply(detect_language_char)


In [87]:
df_hinglish = df[df['language'] == 'hi'].copy()
df_english = df[df['language'] == 'en'].copy()

In [88]:
translator = GoogleTranslator(source='hi', target='en')  # Specify source and target languages


In [89]:
tqdm.pandas(desc="Translating Hinglish Reviews")
def translate_with_progress(text, translator):
    return translator.translate(text)
df_hinglish['clean_reviews'] = [translate_with_progress(text, translator) for text in tqdm(df_hinglish['clean_reviews'])]

  0%|          | 0/12 [00:00<?, ?it/s]

In [90]:
df_hinglish['review_trans'] = [translate_with_progress(text, translator) for text in tqdm(df_hinglish['reviews'])]
df_hinglish['clean_review_trans'] = df_hinglish['clean_reviews']

  0%|          | 0/12 [00:00<?, ?it/s]

In [91]:
# Create new columns for original English reviews and cleaned English reviews
df_english['review_trans'] = df_english['reviews']
df_english['clean_review_trans'] = df_english['clean_reviews']

In [92]:
# Combine clean_review_trans from both DataFrames into the original df
df = pd.concat([df_english, df_hinglish], ignore_index=True)
df.reset_index(drop=True, inplace=True)

In [93]:
df['clean_review_trans'] = df.apply(
    lambda row: row['clean_reviews'] if row['language'] == 'en' else row['clean_review_trans'],
    axis=1
)

In [94]:
print(df.tail(50).to_markdown(index=False, numalign="left", stralign="left"))


| reviews                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            | clean_reviews                                                                                                                                                                                                                                                                                                                                                                                      | tokens                                                                      

In [95]:
# #Initialize OpenAI Client
# client = OpenAI()


In [96]:
# # @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
# def get_sentiment_and_summary(review_text):
#     response = client.chat.completions.create(
#         model="gpt-3.5-turbo",
#         messages=[
#             {"role": "system", "content": "You are a helpful assistant."},
#             {"role": "user", "content": f"Analyze the sentiment of this app review and summarize its key points in a few words:\n{review_text}"}
#         ],
#         max_tokens=30
#     )
#     summary = response.choices[0].message['content'].strip()
#     sentiment = TextBlob(summary).sentiment.polarity
#     return sentiment, summary

# # Apply the function to the DataFrame
# df[['sentiment', 'summary']] = df['clean_reviews'].apply(lambda x: pd.Series(get_sentiment_and_summary(x)))


In [97]:
# Summarization
summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn") 


In [98]:
# df['summary'] = df['clean_reviews'].apply(lambda x: summarization_pipeline(x, max_length=30, min_length=10)[0]['summary_text']) 
tqdm.pandas()

df['summary'] = df['clean_reviews'].progress_apply(lambda x: summarization_pipeline(x, max_length=30, min_length=10)[0]['summary_text'])

  0%|          | 0/100 [00:00<?, ?it/s]

Your max_length is set to 30, but your input_length is only 27. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)
Your max_length is set to 30, but your input_length is only 13. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Your max_length is set to 30, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Your max_length is set to 30, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)
Your max_l

In [99]:
print(df['summary'].head(10).to_markdown(index=False, numalign="left", stralign="left"))

| summary                                                                                                                                                    |
|:-----------------------------------------------------------------------------------------------------------------------------------------------------------|
| Shibasolv app managing cryptocurrencies give lot opportunity one track investment improve trading strategy.                                                |
| The worst application doesnt let connect wallet neither exchange account nothing work. Worst application lets you connect wallet but not exchange account. |
| crypto portfolio worth close still cant find transaction connect wallet.                                                                                   |
| playstore app say minimum investment needed to download open search trading feature.                                                                       |
| Exchange owner shows app individual trader. 

In [100]:
# Sentiment Analysis
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")


In [101]:

df['sentiment'] = df['summary'].apply(lambda x: sentiment_pipeline(x)[0]['label'])


In [102]:
# Display the first 5 rows of the DataFrame
print(df[['summary', 'sentiment']].head(10).to_markdown(index=False, numalign="left", stralign="left"))


| summary                                                                                                                                                    | sentiment   |
|:-----------------------------------------------------------------------------------------------------------------------------------------------------------|:------------|
| Shibasolv app managing cryptocurrencies give lot opportunity one track investment improve trading strategy.                                                | POSITIVE    |
| The worst application doesnt let connect wallet neither exchange account nothing work. Worst application lets you connect wallet but not exchange account. | NEGATIVE    |
| crypto portfolio worth close still cant find transaction connect wallet.                                                                                   | NEGATIVE    |
| playstore app say minimum investment needed to download open search trading feature.                                                 

In [103]:
# Sentiment Analysis (using TextBlob)
df['sentiment_dec'] = df['summary'].astype(str).apply(lambda x: TextBlob(x).sentiment.polarity)
# Display DataFrame
print(df[['summary', 'sentiment_dec']].head(10).to_markdown(index=False, numalign="left", stralign="left"))


| summary                                                                                                                                                    | sentiment_dec   |
|:-----------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------|
| Shibasolv app managing cryptocurrencies give lot opportunity one track investment improve trading strategy.                                                | 0               |
| The worst application doesnt let connect wallet neither exchange account nothing work. Worst application lets you connect wallet but not exchange account. | -1              |
| crypto portfolio worth close still cant find transaction connect wallet.                                                                                   | 0.3             |
| playstore app say minimum investment needed to download open search trading feature.                             

In [104]:
# Sort by sentiment_dec and get top 5 positive reviews
top_5_positive_reviews = df.nlargest(5, 'sentiment_dec')[['reviews', 'clean_reviews', 'summary', 'sentiment_dec']]

print("\nTop 5 Positive Reviews:")
print(top_5_positive_reviews.to_markdown(index=False, numalign="left", stralign="left"))





Top 5 Positive Reviews:
| reviews                                                                                                                                                                                                                                          | clean_reviews                                                                                                                                                        | summary                                                                                                                                                                | sentiment_dec   |
|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------

In [105]:
# Sort by sentiment_dec and get top 5 negative reviews
top_5_negative_reviews = df.nsmallest(5, 'sentiment_dec')[['reviews', 'clean_review_trans', 'summary', 'sentiment_dec']]

print("\nTop 5 Negative Reviews:")
print(top_5_negative_reviews.to_markdown(index=False, numalign="left", stralign="left"))


Top 5 Negative Reviews:
| reviews                                                                                                                                                                                                     | clean_review_trans                                                                                                                      | summary                                                                                                                                                        | sentiment_dec   |
|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------