In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to C:\Users\Jerome
[nltk_data]     Pintucan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Jerome
[nltk_data]     Pintucan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Jerome
[nltk_data]     Pintucan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package vader_lexicon to C:\Users\Jerome
[nltk_data]     Pintucan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
path = 'CommentKeywords.xlsx'

data = pd.read_excel(path)

def column_cleaner(selected_column):
    data[selected_column] = data[selected_column].str.lower()
    data[selected_column] = data[selected_column].str.strip()
    data[selected_column].drop_duplicates(keep='first', inplace=True)
    return data

data = column_cleaner('Key Words')

lemmatizer = WordNetLemmatizer()
english_stop_words = set(stopwords.words('english'))
filipino_stop_words = set(stopwords.words('filipino'))
chinese_stop_words = set(stopwords.words('chinese'))
stop_words = english_stop_words | filipino_stop_words | chinese_stop_words

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

data['Key Words'] = data['Key Words'].apply(preprocess_text)

# Adding a feature: Length of comments
data['Comment Length'] = data['Key Words'].apply(lambda x: len(x.split()))

X_train, X_val, y_train, y_val = train_test_split(data[['Key Words', 'Comment Length']], data['Category'], test_size=0.2, random_state=42)

# Create a pipeline with TF-IDF vectorization and Multinomial Naive Bayes
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1000)),
    ('clf', MultinomialNB())
])

param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__max_df': [0.7, 0.8, 0.9],
    'clf__alpha': [0.1, 0.5, 1.0]
}

# Use StratifiedKFold for cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
cross_val_scores = cross_val_score(text_clf, X_train['Key Words'], y_train, cv=cv, scoring='accuracy')

grid_search = GridSearchCV(text_clf, param_grid, cv=cv, n_jobs=-1)
grid_search.fit(X_train['Key Words'], y_train)

print("Best hyperparameters: ", grid_search.best_params_)

y_pred = grid_search.predict(X_val['Key Words'])

accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print(f'Accuracy: {accuracy}')
print(report)

# Sentiment Analysis
sid = SentimentIntensityAnalyzer()

def analyze_sentiment(comment):
    sentiment_scores = sid.polarity_scores(comment)
    if sentiment_scores['compound'] >= 0.05:
        return 'positive'
    elif sentiment_scores['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

Best hyperparameters:  {'clf__alpha': 0.1, 'tfidf__max_df': 0.7, 'tfidf__ngram_range': (1, 2)}
Accuracy: 0.825
                        precision    recall  f1-score   support

Food Quality and Taste       0.85      0.94      0.89        54
        Order Accuracy       0.73      0.67      0.70        12
               Portion       0.83      0.83      0.83         6
               Pricing       0.67      0.25      0.36         8

              accuracy                           0.82        80
             macro avg       0.77      0.67      0.70        80
          weighted avg       0.81      0.82      0.81        80



In [4]:
data['Sentiment'] = data['Key Words'].apply(analyze_sentiment)

target_path = 'JKT Reviews.xlsx'
target_comment = pd.read_excel(target_path)
target_comments = target_comment['Review'].tolist()

sentiments = [analyze_sentiment(comment) for comment in target_comments]
X_target = pd.DataFrame({'Comments': target_comments, 'Sentiment': sentiments})
predicted_categories = grid_search.predict(X_target['Comments'])

df = pd.DataFrame({'Comments': target_comments, 'Predicted Category': predicted_categories, 'Sentiment': sentiments})

df.head(10)

Unnamed: 0,Comments,Predicted Category,Sentiment
0,Best,Food Quality and Taste,positive
1,The Best Stewed Pork :),Food Quality and Taste,positive
2,New Jap Comfort Food,Food Quality and Taste,positive
3,Super Yummy!!!! My,Food Quality and Taste,positive
4,Great Food As Always!!,Food Quality and Taste,positive
5,Tasty,Food Quality and Taste,neutral
6,Perfect!❤️❤️❤️,Food Quality and Taste,neutral
7,Their Lu Rou Fan Is The Bomb!!! Their,Food Quality and Taste,negative
8,I Super Like The Gyoza And The Chicken,Food Quality and Taste,positive
9,I Requested Utensils But The Store Did Not,Order Accuracy,neutral
