In [2]:
import json
import pandas as pd
import dataset_util
from tqdm import tqdm

[nltk_data] Downloading package punkt to
[nltk_data]     /home/user/welzs0/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/user/welzs0/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/user/welzs0/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Load Datasets

In [4]:
rest_train = dataset_util.load_dataset(split="train", domain="rest") + dataset_util.load_dataset(split="dev", domain = "rest")
rest_test = dataset_util.load_dataset(split="test", domain="rest")
laptop_train = dataset_util.load_dataset(split="train", domain="laptop") + dataset_util.load_dataset(split="dev", domain = "laptop")
laptop_test = dataset_util.load_dataset(split="test", domain="laptop")

### Preprocess 

In [5]:
dataset_util.preprocess_dataset(rest_train)
dataset_util.preprocess_dataset(laptop_train)
dataset_util.preprocess_dataset(rest_test)
dataset_util.preprocess_dataset(laptop_test)

Processing Text: 100%|█████████████████████████████████████████████████████████████| 3602/3602 [00:05<00:00, 674.37it/s]
Processing Text: 100%|█████████████████████████████████████████████████████████████| 2313/2313 [00:03<00:00, 760.91it/s]
Processing Text: 100%|█████████████████████████████████████████████████████████████| 1120/1120 [00:01<00:00, 852.88it/s]
Processing Text: 100%|███████████████████████████████████████████████████████████████| 638/638 [00:00<00:00, 890.01it/s]


In [6]:
print(rest_train[0])

{'polarity': 'positive', 'term': 'server', 'id': '1592_0', 'sentence': ['our', 'server', 'be', 'very', 'helpful', 'and', 'friendly']}


### TFIDF with Logistic Regression

In [7]:
rest_train_sentences = [" ".join(item['sentence']) for item in rest_train]
rest_train_polarities = [item['polarity'] for item in rest_train]

rest_test_sentences = [" ".join(item['sentence']) for item in rest_test]
rest_test_polarities = [item['polarity'] for item in rest_test]

In [8]:
laptop_train_sentences = [" ".join(item['sentence']) for item in laptop_train]
laptop_train_polarities = [item['polarity'] for item in laptop_train]

laptop_test_sentences = [" ".join(item['sentence']) for item in laptop_test]
laptop_test_polarities = [item['polarity'] for item in laptop_test]

Use Tfidf vectorizer to encode sentences.

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming 'sentences' is a list of text data (your reviews)
tfidf_vectorizer = TfidfVectorizer()

all_sentences = laptop_test_sentences + laptop_test_sentences
tfidf_vectorizer.fit(all_sentences)
X_train = tfidf_vectorizer.transform(laptop_train_sentences)
X_test = tfidf_vectorizer.transform(laptop_test_sentences)

In [10]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(laptop_train_polarities)
y_test = label_encoder.transform(laptop_test_polarities)

Use Logistic regression to classify the sentiments. To use a SVM simply replace logistic regression by SVC in the cell below.

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_reg_model = LogisticRegression(solver='lbfgs', max_iter=1000)
log_reg_model.fit(X_train, y_train)

In [12]:
# Predict on test data
y_pred = log_reg_model.predict(X_test)

In [13]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

    negative       0.40      0.75      0.52       128
     neutral       0.80      0.12      0.21       169
    positive       0.76      0.83      0.79       341

    accuracy                           0.63       638
   macro avg       0.65      0.57      0.51       638
weighted avg       0.70      0.63      0.58       638

