In [1]:
pip install hazm



In [3]:
import numpy as np
import pandas as pd

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/ML/6.classification/comment_mining/data/train.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   comment      40000 non-null  object
 1   price_value  40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [7]:
# splitting the dataset
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size = 0.2)

In [10]:
# function to preprocess the text
import re
from hazm import *

normalizer = Normalizer()
stemmer = Stemmer()
stopwords = stopwords_list()

def preprocessing(text):

    # 1. remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # 2. normalization
    normalized_text = normalizer.normalize(text)

    # 3. apply stemming
    stemmed_text = ' '.join([stemmer.stem(word) for word in normalized_text.split()])

    # 4. tokenization
    tokens = word_tokenize(stemmed_text)

    # 5. convert to lowercase, remove digits and stopwords
    stopwords = stopwords_list()
    output_tokens = []
    for token in tokens:
        token = token.lower()
        if token.isdigit() or token in stopwords:
            continue
        output_tokens.append(token)

    return output_tokens

In [17]:
# calculating prior probabilities for each class (price_value = 0 or 1)
total_count = len(train)
count_0 = len(train[train['price_value'] == 0])
count_1 = len(train[train['price_value'] == 1])
prior_probability = {0: count_0 / total_count, 1: count_1 / total_count}
prior_probability

{0: 0.51915625, 1: 0.48084375}

In [18]:
# function to the occurrences of each token in the given texts
def token_counter(texts):
    count_dict = {}
    for text in texts:
        tokens = preprocessing(text)
        for token in tokens:
            if token in count_dict:
                count_dict[token] += 1
            else:
                count_dict[token] = 1

    return count_dict

In [19]:
negative_comments = train[train['price_value'] == 0]['comment']
negative_class_count = token_counter(negative_comments)

In [20]:
positive_comments = train[train['price_value'] == 1]['comment']
positive_class_count = token_counter(positive_comments)

In [21]:
# function to compute the probability of a class given a text
def compute_probability(text, cls):
    tokens = preprocessing(text)

    # Determine the class count based on the class (0 or 1)
    class_count = positive_class_count if cls == 1 else negative_class_count
    total_class_count = sum(class_count.values())

    probability = prior_probability[cls]

    # Multiply the probability of each token by the overall probability
    for token in tokens:
        token_count = class_count.get(token, 0)
        token_probability = (token_count + 1) / (total_class_count + len(class_count))
        probability *= token_probability

    return probability

In [22]:
# function to predict the class labels for a list of texts
def predict(texts):
    predictions = []

    for text in texts:

        prob_class_0 = compute_probability(text, 0)
        prob_class_1 = compute_probability(text, 1)

        predicted_class = 0 if prob_class_0 > prob_class_1 else 1
        predictions.append(predicted_class)

    return np.array(predictions)

In [23]:
# predicting on the train and test sets

train_texts = train['comment'].tolist()
train_labels = train['price_value'].values

test_texts = test['comment'].tolist()
test_labels = test['price_value'].values

y_train_pred = predict(train_texts)
y_test_pred = predict(test_texts)

In [24]:
# evaluation
from sklearn.metrics import accuracy_score

accuracy_1 = accuracy_score(train_labels, y_train_pred)
accuracy_2 = accuracy_score(test_labels, y_test_pred)

print(f"Accuracy on Training Data: {accuracy_1}")
print(f"Accuracy on Test Data: {accuracy_2}")

Accuracy on Training Data: 0.89509375
Accuracy on Test Data: 0.8325
