In [1]:
#import dataset_utils
#import preprocessing_utils
from tqdm import tqdm

In [2]:
import json
import pandas as pd

### Load Datasets

In [3]:
def load_dataset(split={"train", "test", "dev"}, domain={"rest", "laptop"}):
    with open(f'asc/{domain}/{split}.json', 'r') as file:
        dataset = json.load(file)
    return list(dataset.values())

In [4]:
import random
def print_random_element(dataset):
    random_element = random.choice(dataset)
    print(random_element)

In [5]:
rest_train = load_dataset(split="train", domain="rest") + load_dataset(split="dev", domain = "rest")
rest_test = load_dataset(split="test", domain="rest")
laptop_train = load_dataset(split="train", domain="laptop") + load_dataset(split="dev", domain = "laptop")
laptop_test = load_dataset(split="test", domain="laptop")

In [6]:
print_random_element(rest_train)

{'polarity': 'positive', 'term': 'staff', 'id': '2170_0', 'sentence': 'The staff is no nonsense.'}


### Preprocess 

In [7]:
import re
import string

def clean_text(text):
    """
    Function to clean text data.
    :param text: The input string containing the text to be cleaned.
    :return: Cleaned text.
    """

    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespaces
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)

    return text

In [8]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/user/welzs0/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/user/welzs0/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/user/welzs0/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [9]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """
    Map POS tag to the first character lemmatize() accepts.
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def tokenize_and_lemmatize(text):
    """
    Function to tokenize and lemmatize the input text.
    :param text: The input string containing the text to be processed.
    :return: A list of lemmatized tokens.
    """
    lemmatizer = WordNetLemmatizer()
    tokenized_text = word_tokenize(text)
    lemmatized_text = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in tokenized_text]
    return lemmatized_text


In [10]:
def preprocess_text(text):
    cleaned_text = clean_text(text)
    preprocessed_text = tokenize_and_lemmatize(cleaned_text)
    
    return preprocessed_text

In [11]:
def clean_dataset(dataset):
    for element in tqdm(dataset, desc="Processing Text"):
        element['sentence'] = clean_text(element['sentence'])

In [12]:
def preprocess_dataset(dataset):
    for element in tqdm(dataset, desc="Processing Text"):
        element['sentence'] = preprocess_text(element['sentence'])

In [13]:
preprocess_dataset(rest_train)
preprocess_dataset(laptop_train)
preprocess_dataset(rest_test)
preprocess_dataset(laptop_test)

Processing Text: 100%|█████████████████████████████████████████████████████████████| 3602/3602 [00:05<00:00, 654.21it/s]
Processing Text: 100%|█████████████████████████████████████████████████████████████| 2313/2313 [00:03<00:00, 753.14it/s]
Processing Text: 100%|█████████████████████████████████████████████████████████████| 1120/1120 [00:01<00:00, 861.90it/s]
Processing Text: 100%|███████████████████████████████████████████████████████████████| 638/638 [00:00<00:00, 891.32it/s]


In [14]:
print(rest_train[0])

{'polarity': 'positive', 'term': 'server', 'id': '1592_0', 'sentence': ['our', 'server', 'be', 'very', 'helpful', 'and', 'friendly']}


### TFIDF with Logistic Regression

In [22]:
rest_train_sentences = [" ".join(item['sentence']) for item in rest_train]
rest_train_polarities = [item['polarity'] for item in rest_train]

rest_test_sentences = [" ".join(item['sentence']) for item in rest_test]
rest_test_polarities = [item['polarity'] for item in rest_test]

In [46]:
laptop_train_sentences = [" ".join(item['sentence']) for item in laptop_train]
laptop_train_polarities = [item['polarity'] for item in laptop_train]

laptop_test_sentences = [" ".join(item['sentence']) for item in laptop_test]
laptop_test_polarities = [item['polarity'] for item in laptop_test]

Use Tfidf vectorizer to encode sentences.

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming 'sentences' is a list of text data (your reviews)
tfidf_vectorizer = TfidfVectorizer()

all_sentences = laptop_test_sentences + laptop_test_sentences
tfidf_vectorizer.fit(all_sentences)
X_train = tfidf_vectorizer.transform(laptop_train_sentences)
X_test = tfidf_vectorizer.transform(laptop_test_sentences)

In [48]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(laptop_train_polarities)
y_test = label_encoder.transform(laptop_test_polarities)

Use Logistic regression to classify the sentiments. To use a SVM simply replace logistic regression by SVM from sklearn.svm.SVC in the cell below.

In [50]:
from sklearn.linear_model import LogisticRegression

# Example: Training a Logistic Regression model
log_reg_model = LogisticRegression(solver='lbfgs', max_iter=1000)
log_reg_model.fit(X_train, y_train)

In [51]:
# Predict on test data
y_pred = log_reg_model.predict(X_test)

In [52]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

    negative       0.43      0.70      0.53       128
     neutral       0.63      0.20      0.30       169
    positive       0.76      0.85      0.80       341

    accuracy                           0.65       638
   macro avg       0.61      0.58      0.55       638
weighted avg       0.66      0.65      0.62       638

