# Importing Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import parse
import multiprocessing
from bs4 import BeautifulSoup # Text Cleaning
import re, string # Regular Expressions, String
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords # stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer # for word stemming
from nltk.stem import WordNetLemmatizer # for word lemmatization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from pad_sequences import pad_sequences_multi
import unicodedata
import html
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from keras.callbacks import EarlyStopping

from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

%matplotlib inline

[nltk_data] Downloading package stopwords to /home/jasiah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2023-09-12 21:34:23.701424: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-12 21:34:23.740980: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-12 21:34:24.016662: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-12 21:34:24.018158: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appr

In [2]:
def getAppNames(path):
    list_of_files = os.listdir(path)
    app_names = []
    
    for file in list_of_files:
        match = re.search(r'submission_(.+)\.csv', file)
        if match:
            app_name = match.group(1)
            app_names.append(app_name)
    
    return app_names

path = r'./data'
appNames = getAppNames(path)
print(appNames)

['toffee', 'bongobd', 'bioscope', 'hoichoi', 'chorki']


In [3]:

def creating_df(path, appNames):
    dfs = []
    file_names = [file for file in os.listdir(path) if os.path.isfile(os.path.join(path, file))]
    
    for file_name in file_names:
        print("Reading " + file_name)
        data = pd.read_csv(os.path.join(path, file_name))
        df = pd.DataFrame(data)
        print("Current Data Frame shape ")
        print(df.shape)
        
        # Extract the app name from the file name
        app_name = os.path.splitext(file_name)[0].replace('submission_', '')
        
        # Check if the app name exists in the provided list
        if app_name in appNames:
            df['appName'] = app_name
            dfs.append(df)
    
    print("Total Files found: ", len(dfs))
    
    final_df = pd.concat(dfs, axis=0, ignore_index=True)
    
    return final_df

    
final_df = creating_df(path, appNames)
final_df


Reading submission_toffee.csv
Current Data Frame shape 
(54538, 11)
Reading submission_bongobd.csv
Current Data Frame shape 
(13720, 11)
Reading submission_bioscope.csv
Current Data Frame shape 
(28058, 11)
Reading submission_hoichoi.csv
Current Data Frame shape 
(39815, 11)
Reading final_submission.csv
Current Data Frame shape 
(143050, 12)
Reading submission_chorki.csv
Current Data Frame shape 
(6919, 11)
Total Files found:  5


Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,appName
0,d633624c-8763-4cef-aa63-e7dea7bcc757,MD Mahin,https://play-lh.googleusercontent.com/a/ACg8oc...,"Dear developer, may i get your kind attention ...",1,56,5.1.0,2023-08-26 19:50:43,"Dear Mahin, We are apologizing for the inconve...",2023-08-26 19:56:35,5.1.0,toffee
1,dca507a9-dc8e-4fb2-a9df-d1942e8715ba,MD:Najim Babu,https://play-lh.googleusercontent.com/a-/ALV-U...,So much ads if I click on a channel which is v...,1,192,4.9.0,2023-06-27 20:50:44,"Dear Najim, We are apologizing for the inconve...",2023-06-27 20:57:11,4.9.0,toffee
2,b608c585-4e55-4d56-bce5-f8a33076cc3d,Asadujjaman Asif,https://play-lh.googleusercontent.com/a/ACg8oc...,It's a stupid kind of app. This app is very sl...,1,21,5.1.0,2023-08-31 15:56:45,"Dear Asadujjaman, We are apologizing for the i...",2023-08-31 16:50:32,5.1.0,toffee
3,80008754-834c-49b2-b52b-bafe418d8539,Fahad's Notebook,https://play-lh.googleusercontent.com/a-/ALV-U...,Two years ago I have given a review with 5⭐ bu...,1,73,4.9.0,2023-07-11 17:13:40,"Dear Fahad, We are apologizing for the inconve...",2023-07-11 17:16:47,4.9.0,toffee
4,13357764-ddec-4b5f-b3e9-859b5a61a9e3,Shahriar Kabir,https://play-lh.googleusercontent.com/a-/ALV-U...,I am using Toffee for two months now. It has a...,4,622,4.9.0,2023-08-17 02:18:31,"Dear Kabir, Thank you for your valuable feedback!",2023-08-17 02:21:42,4.9.0,toffee
...,...,...,...,...,...,...,...,...,...,...,...,...
143045,8f2d170e-df9f-4a9a-81bd-eacec4d5ec46,Tarun Bhowmick,https://play-lh.googleusercontent.com/a-/ALV-U...,💝💝💝,5,0,,2021-10-02 22:02:38,Thank you very much for your feedback. Keep us...,2021-10-05 22:58:02,,chorki
143046,b4a295c5-d052-4666-9a9c-59753268f994,Shohag Molla,https://play-lh.googleusercontent.com/a/ACg8oc...,😍😍😍😍,5,0,,2023-07-21 20:26:38,,,,chorki
143047,511c2f48-c361-48b6-b8bc-0a9aecd74b46,Sojib Rayhan,https://play-lh.googleusercontent.com/a/ACg8oc...,💿💿💿💿,5,0,,2023-01-18 19:21:36,Thanks for taking out time to rate us. It real...,2023-01-20 11:50:44,,chorki
143048,721ce821-80a0-4a7a-bb22-246f46524cf3,Sariya Jahan,https://play-lh.googleusercontent.com/a/ACg8oc...,❤❤❤❤,5,0,,2021-08-20 19:39:17,Thanks for your love.,2021-10-21 12:00:52,,chorki


In [4]:
final_df.to_csv(path+'/'+'final_submission.csv', index=False) 
print("Final Submission File SAVED!")
print("Final File Shape: ", final_df.shape)
final_df.head()

Final Submission File SAVED!
Final File Shape:  (143050, 12)


Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,appName
0,d633624c-8763-4cef-aa63-e7dea7bcc757,MD Mahin,https://play-lh.googleusercontent.com/a/ACg8oc...,"Dear developer, may i get your kind attention ...",1,56,5.1.0,2023-08-26 19:50:43,"Dear Mahin, We are apologizing for the inconve...",2023-08-26 19:56:35,5.1.0,toffee
1,dca507a9-dc8e-4fb2-a9df-d1942e8715ba,MD:Najim Babu,https://play-lh.googleusercontent.com/a-/ALV-U...,So much ads if I click on a channel which is v...,1,192,4.9.0,2023-06-27 20:50:44,"Dear Najim, We are apologizing for the inconve...",2023-06-27 20:57:11,4.9.0,toffee
2,b608c585-4e55-4d56-bce5-f8a33076cc3d,Asadujjaman Asif,https://play-lh.googleusercontent.com/a/ACg8oc...,It's a stupid kind of app. This app is very sl...,1,21,5.1.0,2023-08-31 15:56:45,"Dear Asadujjaman, We are apologizing for the i...",2023-08-31 16:50:32,5.1.0,toffee
3,80008754-834c-49b2-b52b-bafe418d8539,Fahad's Notebook,https://play-lh.googleusercontent.com/a-/ALV-U...,Two years ago I have given a review with 5⭐ bu...,1,73,4.9.0,2023-07-11 17:13:40,"Dear Fahad, We are apologizing for the inconve...",2023-07-11 17:16:47,4.9.0,toffee
4,13357764-ddec-4b5f-b3e9-859b5a61a9e3,Shahriar Kabir,https://play-lh.googleusercontent.com/a-/ALV-U...,I am using Toffee for two months now. It has a...,4,622,4.9.0,2023-08-17 02:18:31,"Dear Kabir, Thank you for your valuable feedback!",2023-08-17 02:21:42,4.9.0,toffee


In [5]:
final_df = final_df.drop(['reviewCreatedVersion', 'replyContent', 'repliedAt','appVersion','at','userImage'], axis=1)

In [6]:
final_df

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,appName
0,d633624c-8763-4cef-aa63-e7dea7bcc757,MD Mahin,"Dear developer, may i get your kind attention ...",1,56,toffee
1,dca507a9-dc8e-4fb2-a9df-d1942e8715ba,MD:Najim Babu,So much ads if I click on a channel which is v...,1,192,toffee
2,b608c585-4e55-4d56-bce5-f8a33076cc3d,Asadujjaman Asif,It's a stupid kind of app. This app is very sl...,1,21,toffee
3,80008754-834c-49b2-b52b-bafe418d8539,Fahad's Notebook,Two years ago I have given a review with 5⭐ bu...,1,73,toffee
4,13357764-ddec-4b5f-b3e9-859b5a61a9e3,Shahriar Kabir,I am using Toffee for two months now. It has a...,4,622,toffee
...,...,...,...,...,...,...
143045,8f2d170e-df9f-4a9a-81bd-eacec4d5ec46,Tarun Bhowmick,💝💝💝,5,0,chorki
143046,b4a295c5-d052-4666-9a9c-59753268f994,Shohag Molla,😍😍😍😍,5,0,chorki
143047,511c2f48-c361-48b6-b8bc-0a9aecd74b46,Sojib Rayhan,💿💿💿💿,5,0,chorki
143048,721ce821-80a0-4a7a-bb22-246f46524cf3,Sariya Jahan,❤❤❤❤,5,0,chorki


In [7]:
def stringcast(text):
    return str(text)
final_df['content'] = final_df['content'].apply(stringcast) #typecast

In [8]:
import emoji
def emoji_to_text(text):
    return emoji.demojize(text)

In [9]:
final_df['content'] = final_df['content'].apply(emoji_to_text)

In [10]:
final_df

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,appName
0,d633624c-8763-4cef-aa63-e7dea7bcc757,MD Mahin,"Dear developer, may i get your kind attention ...",1,56,toffee
1,dca507a9-dc8e-4fb2-a9df-d1942e8715ba,MD:Najim Babu,So much ads if I click on a channel which is v...,1,192,toffee
2,b608c585-4e55-4d56-bce5-f8a33076cc3d,Asadujjaman Asif,It's a stupid kind of app. This app is very sl...,1,21,toffee
3,80008754-834c-49b2-b52b-bafe418d8539,Fahad's Notebook,Two years ago I have given a review with 5:sta...,1,73,toffee
4,13357764-ddec-4b5f-b3e9-859b5a61a9e3,Shahriar Kabir,I am using Toffee for two months now. It has a...,4,622,toffee
...,...,...,...,...,...,...
143045,8f2d170e-df9f-4a9a-81bd-eacec4d5ec46,Tarun Bhowmick,:heart_with_ribbon::heart_with_ribbon::heart_w...,5,0,chorki
143046,b4a295c5-d052-4666-9a9c-59753268f994,Shohag Molla,:smiling_face_with_heart-eyes::smiling_face_wi...,5,0,chorki
143047,511c2f48-c361-48b6-b8bc-0a9aecd74b46,Sojib Rayhan,:optical_disk::optical_disk::optical_disk::opt...,5,0,chorki
143048,721ce821-80a0-4a7a-bb22-246f46524cf3,Sariya Jahan,:red_heart::red_heart::red_heart::red_heart:,5,0,chorki


In [11]:
import pandas as pd
import langid

# Function to detect the language of a text
def detect_language(text):
    try:
        return langid.classify(text)[0]
    except:
        return None

# Apply language detection to the "content" column
final_df['language'] = final_df['content'].apply(detect_language)

# Filter for Bengali comments
bengali_subset = final_df[final_df['language'] == 'bn']

# bengali_subset now contains only rows with Bengali comments



In [12]:
bengali_subset

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,appName,language
1142,2af5b27a-8f1a-414f-b460-6f5869ea6001,Nura Siddika,This app is very Fine app.but i wish any other...,5,0,toffee,bn
1193,fd592ed3-6ddf-4a5f-b0af-bae27ca901de,Abu Raihan,এটা একটা খুবই ভালো লাইভ টিভি অ্যাপ্লিকেশন। ধন্...,5,26,toffee,bn
1518,73a20207-84cd-49b3-8cc4-c74c54cf867c,MD Monir Hussain,কি এ্যাপ বানাইলেন এক ফোনের এ্যাপে সেলজুক ভিডিও...,1,8,toffee,bn
1738,da83e6d5-e04d-481d-9d6b-799e0deb4d62,Md Yeasin Arafat (Arafat),Very bad app. We can't watching our main footb...,1,0,toffee,bn
1768,aa180034-1c19-4b62-bb1c-235bf201f9f3,Tanvir Ahmed,App টা ঠিক নাই loser app.I love free fire .I h...,1,0,toffee,bn
...,...,...,...,...,...,...,...
143003,f5e04e48-6ce4-4343-b0bb-984f54944192,Md. Riyad Hassan,নাইস,5,0,chorki,bn
143007,8283d272-9977-4c62-99a6-3e21826e6377,imtiaz kamal,ভালো,5,0,chorki,bn
143037,f78a9845-d913-4abb-8ed1-1fc7c2b72320,Sabbir Hosen,ইন্টারনেটের এই যুগে এসেও মনে হচ্ছে যেন পৃথিবীর...,1,0,chorki,bn
143038,298ec689-ceef-4278-b4bf-b0d18c3b35bb,Muntaha M,সাবস্ক্রাইবাররা এখানে তাদের ফিডব্যাক দেয়। আপনা...,1,0,chorki,bn


In [13]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("shihab17/bengali-bn-to-en")
model = AutoModelForSeq2SeqLM.from_pretrained("shihab17/bengali-bn-to-en")


In [14]:
def translate_to_english(text):
    inputs = tokenizer.encode(">>bn<< " + text, return_tensors="pt", max_length=512, truncation=True)
    translations = model.generate(inputs, max_length=512, num_return_sequences=1, num_beams=4, no_repeat_ngram_size=3)

    translated_text = tokenizer.decode(translations[0], skip_special_tokens=True)
    return translated_text

In [15]:
# Create a copy of bengali_subset to avoid the SettingWithCopyWarning
bengali_subset = bengali_subset.copy()
bengali_subset["content"] = bengali_subset["content"].apply(translate_to_english)

# save bengali_subset to a CSV file named "bengali_subset.csv"
bengali_subset.to_csv("bengali_subset.csv", index=False)