In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# Download NLTK stopwords
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

In [3]:
# Stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Fungsi untuk preprocessing teks
def preprocess_text(text):
	# Lowercase
	text = text.lower()
	# Remove special characters
	text = re.sub(r'[^a-z0-9\s]', '', text)
	# Remove stopwords
	text = ' '.join([word for word in word_tokenize(text) if word not in stop_words])
	# Lemmatization
	text = ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(text)])
	return text

In [5]:
# Load data
df = pd.read_csv('../datasets/top_anime_reviews_labeled.csv')
data = df.copy()
data.head()

Unnamed: 0,title,review,score,sentiment
0,Beastars Final Season,First review here so please bear with meBeen a...,0.993,positive
1,"Clannad: Mou Hitotsu no Sekai, Tomoyo-hen",I've come to realize that I kind of do like th...,0.9076,positive
2,Kimi wa Meido-sama.,nah you gotta pass this one honestly its way t...,0.9657,positive
3,Cyberpunk: Edgerunners,Bro... I ain't even know where to begin with t...,-0.9618,negative
4,Vivy: Fluorite Eye's Song,Why are you reading this review and not watchi...,0.9878,positive


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   title      1000 non-null   object 
 1   review     1000 non-null   object 
 2   score      1000 non-null   float64
 3   sentiment  1000 non-null   object 
dtypes: float64(1), object(3)
memory usage: 31.4+ KB


In [7]:
# Apply preprocessing to the 'review' column
data['review'] = data['review'].apply(preprocess_text)
data['sentiment'] = data['sentiment'].apply(lambda x: 1 if x == 'positive' else (0 if x == 'neutral' else -1))
data.head()

Unnamed: 0,title,review,score,sentiment
0,Beastars Final Season,first review please bear mebeen beastars fan s...,0.993,1
1,"Clannad: Mou Hitotsu no Sekai, Tomoyo-hen",ive come realize kind like format story isnt a...,0.9076,1
2,Kimi wa Meido-sama.,nah got ta pas one honestly way generic slice ...,0.9657,1
3,Cyberpunk: Edgerunners,bro aint even know begin absolute atrocity nev...,-0.9618,-1
4,Vivy: Fluorite Eye's Song,reading review watching thisvivy fluorite eye ...,0.9878,1


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      1000 non-null   object
 1   review     1000 non-null   object
 2   sentiment  1000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 23.6+ KB


In [8]:
# Split data into training and testing sets
X = data['review']
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
# Create pipeline with TfidfVectorizer and LogisticRegression
model = Pipeline([
	('tfidf', TfidfVectorizer()),
	('lr', LogisticRegression(class_weight='balanced'))
])

In [10]:
# Train the model
model.fit(X_train, y_train)

In [11]:
# Evaluate model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.64      0.40      0.49        53
           0       0.00      0.00      0.00         2
           1       0.87      0.95      0.91       245

    accuracy                           0.85       300
   macro avg       0.50      0.45      0.47       300
weighted avg       0.83      0.85      0.83       300

[[ 21   0  32]
 [  0   0   2]
 [ 12   0 233]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
