In [3]:
!pip install pandas



In [4]:
!pip install openpyxl



In [5]:
import pandas as pd

In [6]:
# Load the labelled reviews from the workbook (path is relative to the notebook's
# working directory).
df = pd.read_excel(io="review.xlsx")

In [7]:
# Preview the first rows to check which columns were read from the workbook.
df.head()

Unnamed: 0,id,review,class
0,39,Make it like better with a giant pig bigger th...,feature request
1,42,These screens are small enough without crowdin...,information giving
2,49,Dear Rovio; If you absolutely must continue tr...,information giving
3,56,App crashes when new power up notice pops up.,problem discovery
4,62,It would be nice to have an update that didn ...,information giving


In [8]:
# Class distribution of the raw string labels; the saved output shows the
# classes are imbalanced.
df['class'].value_counts()

class
information giving     603
problem discovery      494
feature request        192
information seeking    101
Name: count, dtype: int64

In [9]:
# Drop the 'id' column so only the review text and its class label remain.
# drop() returns a new frame, so `df` itself is left untouched.
df_id_removed = df.drop('id', axis='columns')
df_id_removed.head()

Unnamed: 0,review,class
0,Make it like better with a giant pig bigger th...,feature request
1,These screens are small enough without crowdin...,information giving
2,Dear Rovio; If you absolutely must continue tr...,information giving
3,App crashes when new power up notice pops up.,problem discovery
4,It would be nice to have an update that didn ...,information giving


In [10]:
#encoding target labels
#reference https://pbpython.com/categorical-encoding.html
# Integer code for every review category. The nested {column: {label: code}}
# shape is kept so anything that reads find_replace["class"] keeps working.
find_replace = {"class":     {"feature request": 0, "problem discovery": 1, "information giving": 2, "information seeking": 3}}

# Encode with an explicit per-column lookup instead of DataFrame.replace():
# replace() relies on implicit object -> int downcasting, which appears to be
# what triggered the warning this cell used to emit.
class_codes = df_id_removed["class"].map(find_replace["class"])

# map() yields NaN for any label missing from the dict; fail loudly rather than
# letting unencoded rows reach the train/test split and the classifier.
unmapped_labels = df_id_removed.loc[class_codes.isna(), "class"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unmapped class labels: {list(unmapped_labels)}")

# assign() returns a new frame (df_id_removed is not mutated); "class" is a
# Python keyword, hence the ** unpacking. The dtype is set to int64 explicitly.
target_encoded_df = df_id_removed.assign(**{"class": class_codes.astype("int64")})
target_encoded_df.head()

  target_encoded_df = df_id_removed.replace(find_replace)


Unnamed: 0,review,class
0,Make it like better with a giant pig bigger th...,0
1,These screens are small enough without crowdin...,2
2,Dear Rovio; If you absolutely must continue tr...,2
3,App crashes when new power up notice pops up.,1
4,It would be nice to have an update that didn ...,2


In [11]:
# Sanity check after encoding: the per-code counts should match the counts of
# the original string labels (603 / 494 / 192 / 101 in the saved outputs).
target_encoded_df['class'].value_counts()

class
2    603
1    494
0    192
3    101
Name: count, dtype: int64

In [12]:
#train test split

In [13]:
from sklearn.model_selection import train_test_split

# Features are the raw review texts; the target is the integer-encoded class.
X, y = target_encoded_df['review'], target_encoded_df['class']

# Hold out 20% of the reviews for evaluation. stratify=y keeps the class
# proportions the same in both splits, and the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [14]:
# Shapes of the four split pieces, in the order X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1112,), (278,), (1112,), (278,))

In [15]:
# Inspect the training reviews; the index keeps the original row labels, which
# are shuffled by the split.
X_train

821     \nBut you should be able to rearrange your boa...
1285    I  m planning my wedding and getting ideas fro...
859     I recommend this app to all my friends and wil...
430     boards/subjects should have some type of restr...
1102    I have never had an in issue with this App it ...
                              ...                        
785     s problems with this app , 5,  ptcgkm ,  This ...
909     Pintrest provides an easy way to find and stor...
622     I love this site, but it has been crashing sin...
1312    Wish the app was available on my Philips smart...
79                                         Please fix it!
Name: review, Length: 1112, dtype: object

In [16]:
# Inspect the held-out test reviews.
X_test

627     I  ve found so many good ideas and recipes and...
595     No matter what you  re into and what you  re l...
1065                               I can  t stop pinning!
725     I have been a Dropbox user for years and, unfo...
1031    It  s awesome that you can find almost anythin...
                              ...                        
550     I liked the previous version of the app, but n...
860            Please add a back to top of page" button."
97      I even have the Premium version and bought sev...
159          They updated the app and it is a lot better.
982     Some day my things will be on this app this ap...
Name: review, Length: 278, dtype: object

In [17]:
#TF-IDF Feature Extraction
#TF - Term Frequency
#IDF - Inverse Document Frequency

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams of the lowercased review text.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),    # single words and adjacent word pairs
    stop_words='english',  # drop scikit-learn's built-in English stop words
    lowercase=True,
    min_df=5,              # ignore terms found in fewer than 5 training reviews
    max_df=0.9,            # ignore terms found in more than 90% of training reviews
)

# Learn the vocabulary and IDF weights from the training split only, then apply
# the same fitted transform to the test split so no test information leaks in.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [19]:
#naive bayes

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Multinomial Naive Bayes on the TF-IDF features. fit() returns the estimator
# itself, so construction and training are chained.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Score the model on the held-out test split.
y_pred_nb = nb_model.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_report = classification_report(y_test, y_pred_nb)

print("Naive Bayes Accuracy:", nb_accuracy)
print(nb_report)


Naive Bayes Accuracy: 0.6510791366906474
              precision    recall  f1-score   support

           0       0.69      0.29      0.41        38
           1       0.73      0.64      0.68        99
           2       0.60      0.86      0.71       121
           3       1.00      0.15      0.26        20

    accuracy                           0.65       278
   macro avg       0.76      0.48      0.51       278
weighted avg       0.69      0.65      0.62       278



In [24]:
# A few hand-written reviews to spot-check the trained classifier.
sample_reviews = [
    "The app crashes whenever I open it",
    "Please add multiplayer mode",
    "This update explains everything clearly",
    "Please add joystick input"
]

# Reuse the vectorizer fitted on the training split so the samples are mapped
# onto the same vocabulary the model was trained with.
sample_tfidf = tfidf.transform(sample_reviews)

# Decode integer predictions back to label names. The mapping is derived by
# inverting the encoding dict from the label-encoding cell, so the two cannot
# drift apart if a label or its code is ever changed.
label_map = {code: label for label, code in find_replace["class"].items()}

sample_preds = nb_model.predict(sample_tfidf)
for review, pred in zip(sample_reviews, sample_preds):
    print(f"Review: {review}")
    print(f"Predicted Class: {label_map[pred]}\n")

Review: The app crashes whenever I open it
Predicted Class: problem discovery

Review: Please add multiplayer mode
Predicted Class: feature request

Review: This update explains everything clearly
Predicted Class: problem discovery

Review: Please add joystick input
Predicted Class: feature request

