<a href="https://colab.research.google.com/github/iv-alex-glitch/labs-for-uni/blob/main/human-machine-interactionlab_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Лабораторна робота №2
## Наївний баєсів класифікатор + SentiWordNet
---
Цей ноутбук використовує Naive Bayes та словник SentiWordNet для аналізу тональності.

In [None]:
import numpy as np
import nltk
import re
import string
from nltk.corpus import twitter_samples, stopwords, sentiwordnet as swn
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

# Corpora and lexicons required by the cells below.
nltk.download('twitter_samples')
nltk.download('stopwords')
nltk.download('sentiwordnet')
# SentiWordNet resolves words through WordNet synsets, so the WordNet corpus
# has to be installed as well; without it swn.senti_synsets() raises
# LookupError on a fresh environment.
nltk.download('wordnet')

In [None]:
def process_tweet(tweet):
    """Clean, tokenize and stem a raw tweet.

    The text is stripped of dollar-prefixed tickers, a leading "RT" marker,
    http(s) URLs and '#' characters. It is then tokenized with TweetTokenizer
    (preserve_case=False, strip_handles=True, reduce_len=True). English
    stop words and punctuation tokens are dropped and every remaining token
    is Porter-stemmed.

    Args:
        tweet: Raw tweet text.

    Returns:
        List of stemmed tokens in their original order.
    """
    stemmer = PorterStemmer()
    # A set gives O(1) membership tests; the raw NLTK list would be scanned
    # linearly for every single token.
    stopwords_english = set(stopwords.words('english'))

    tweet = re.sub(r'\$\w*', '', tweet)                # tickers such as $GE
    tweet = re.sub(r'^RT[\s]+', '', tweet)             # old-style retweet prefix
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)  # hyperlinks
    tweet = re.sub(r'#', '', tweet)                    # keep the hashtag word, drop '#'

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(tweet)

    # `in string.punctuation` is a substring test on the punctuation string.
    # Emoticons such as ':)' are not substrings of it, so they are kept.
    return [
        stemmer.stem(token)
        for token in tokens
        if token not in stopwords_english and token not in string.punctuation
    ]

In [None]:
# Number of tweets per class used for training; the rest of each class is
# held out for testing.
TRAIN_PER_CLASS = 4000

all_pos = twitter_samples.strings('positive_tweets.json')
all_neg = twitter_samples.strings('negative_tweets.json')

train_pos, test_pos = all_pos[:TRAIN_PER_CLASS], all_pos[TRAIN_PER_CLASS:]
train_neg, test_neg = all_neg[:TRAIN_PER_CLASS], all_neg[TRAIN_PER_CLASS:]

# Positives first, negatives second: the label vectors below rely on this order.
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# 1 = positive, 0 = negative, aligned element-wise with train_x / test_x.
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

print("Train size:", len(train_x))
print("Test size:", len(test_x))

In [None]:
def sentiwordnet_score(word):
    """Return the average SentiWordNet polarity of ``word``.

    Every synset yielded by ``swn.senti_synsets(word)`` contributes with
    equal weight. The result is the mean positive score minus the mean
    negative score.

    Args:
        word: Token to look up.

    Returns:
        The polarity difference, or 0.0 when no synset is found.
    """
    positives, negatives = [], []
    for sense in swn.senti_synsets(word):
        positives.append(sense.pos_score())
        negatives.append(sense.neg_score())

    # Unknown word: treat it as neutral.
    if not positives:
        return 0.0
    return np.mean(positives) - np.mean(negatives)

In [None]:
def count_tweets(result, tweets, ys):
    """Accumulate (token, label) occurrence counts into ``result``.

    Each tweet is run through ``process_tweet`` and every resulting token is
    counted under the key ``(token, label)``.

    Args:
        result: Dict to update in place; existing counts are kept and
            incremented.
        tweets: Iterable of raw tweet strings.
        ys: Iterable of labels paired position-wise with ``tweets``.

    Returns:
        The same ``result`` dict that was passed in.
    """
    for label, text in zip(ys, tweets):
        for token in process_tweet(text):
            pair = (token, label)
            if pair in result:
                result[pair] += 1
            else:
                result[pair] = 1
    return result

# Build the (token, label) frequency table from the training split only.
freqs = count_tweets(dict(), train_x, train_y)
print("Унікальних пар:", len(freqs))

In [None]:
def train_naive_bayes(freqs, train_x, train_y):
    """Fit the Naive Bayes parameters, shifted by SentiWordNet polarity.

    For every vocabulary word the Laplace-smoothed log-likelihood ratio
    log(P(w|pos) / P(w|neg)) is computed, and the value returned by
    ``sentiwordnet_score(word)`` is added to it.

    NOTE(review): the vocabulary comes from ``freqs``, whose tokens are
    produced by ``process_tweet`` (i.e. stemmed), so the SentiWordNet lookup
    is done on stems and presumably misses many words. Confirm this is
    intended.

    Args:
        freqs: Dict mapping (word, label) to an occurrence count.
        train_x: Training tweets; not used by the computation, kept for
            interface compatibility.
        train_y: Labels of the training tweets (1 = positive, 0 = negative).

    Returns:
        Tuple ``(logprior, loglikelihood)`` where ``logprior`` is
        log(D_pos) - log(D_neg) and ``loglikelihood`` maps each vocabulary
        word to its adjusted log-likelihood ratio.
    """
    vocab = {token for token, _ in freqs}
    V = len(vocab)

    # Total token occurrences per class; any label other than 1 counts as negative.
    N_pos = sum(count for (_, label), count in freqs.items() if label == 1)
    N_neg = sum(count for (_, label), count in freqs.items() if label != 1)

    # Number of positive / negative training documents.
    D_pos = sum(train_y)
    D_neg = len(train_y) - D_pos
    logprior = np.log(D_pos) - np.log(D_neg)

    # Laplace-smoothing denominators are the same for every word.
    denom_pos = N_pos + V
    denom_neg = N_neg + V

    loglikelihood = {}
    for word in vocab:
        p_pos = (freqs.get((word, 1), 0) + 1) / denom_pos
        p_neg = (freqs.get((word, 0), 0) + 1) / denom_neg
        loglikelihood[word] = np.log(p_pos / p_neg) + sentiwordnet_score(word)

    return logprior, loglikelihood

# Fit the model on the training frequencies; both values are reused by the
# prediction and evaluation cells.
logprior, loglikelihood = train_naive_bayes(freqs=freqs, train_x=train_x, train_y=train_y)
print("Навчено.")

In [None]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    """Score a tweet with the trained model.

    The tweet is run through ``process_tweet``. Starting from ``logprior``,
    the log-likelihood of every token present in ``loglikelihood`` is added;
    tokens missing from it are ignored.

    Args:
        tweet: Raw tweet text.
        logprior: Log prior ratio of the classes.
        loglikelihood: Dict mapping a token to its log-likelihood ratio.

    Returns:
        The accumulated score; callers treat a value > 0 as positive.
    """
    score = logprior
    for token in process_tweet(tweet):
        if token not in loglikelihood:
            continue  # out-of-vocabulary token contributes nothing
        score += loglikelihood[token]
    return score

In [None]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    y_hat = []
    for t in test_x:
        p = naive_bayes_predict(t, logprior, loglikelihood)
        y_hat.append(1 if p > 0 else 0)
    return np.mean(np.array(y_hat) == test_y)

# Accuracy on the held-out tweets.
acc = test_naive_bayes(
    test_x=test_x,
    test_y=test_y,
    logprior=logprior,
    loglikelihood=loglikelihood,
)
print("Точність:", acc)

In [None]:
def get_ratio(freqs, word):
    """Return the add-one smoothed positive/negative count ratio of ``word``.

    Args:
        freqs: Dict mapping (word, label) to an occurrence count.
        word: Token to look up.

    Returns:
        (positive count + 1) / (negative count + 1); missing counts are
        treated as 0.
    """
    smoothed_positive = freqs.get((word, 1), 0) + 1
    smoothed_negative = freqs.get((word, 0), 0) + 1
    return smoothed_positive / smoothed_negative

# Smoothed positive/negative ratio for every distinct token seen in training.
ratios = {w: get_ratio(freqs, w) for w in {pair[0] for pair in freqs}}

# Highest ratios lean positive, lowest ratios lean negative.
top_pos = sorted(ratios.items(), key=lambda item: item[1], reverse=True)[:20]
top_neg = sorted(ratios.items(), key=lambda item: item[1])[:20]

print("ТОП позитивні слова:\n", top_pos)
print("\nТОП негативні слова:\n", top_neg)

In [None]:
# List every misclassified test tweet as: true label, predicted label, tokens.
print("Помилки класифікації:")
for tweet, label in zip(test_x, test_y):
    predicted = int(naive_bayes_predict(tweet, logprior, loglikelihood) > 0)
    if label != predicted:
        print(label, predicted, process_tweet(tweet))

In [None]:
# Try the classifier on a hand-written example.
my_tweet = "This course is awesome and I love learning new things!"
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print("Мій твіт:", my_tweet)
print("Полярність:", "Негативна" if not p > 0 else "Позитивна")