<a href="https://colab.research.google.com/github/iv-alex-glitch/labs-for-uni/blob/main/human-machineinteractionlab1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Лабораторна робота
## Логістична регресія для аналізу тональності текстів


In [None]:

import re
import string
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords, twitter_samples
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import matplotlib.pyplot as plt

# Fetch the NLTK resources behind the `stopwords` and `twitter_samples`
# corpus readers imported above, so a fresh kernel can run the notebook.
nltk.download('stopwords')
nltk.download('twitter_samples')


### Попередня обробка тексту

In [None]:

# Module-level helpers shared by every call to process_tweet: built once
# here instead of on each call.
stopwords_english = stopwords.words('english')
tokenizer = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)
stemmer = PorterStemmer()

def process_tweet(tweet):
    """Turn a raw tweet string into a list of stemmed tokens.

    Steps, in order: strip http/https URLs, drop '#' characters (the
    hashtag word itself is kept), lowercase and tokenize with the
    module-level ``tokenizer``, discard stopwords and punctuation tokens,
    and stem what remains with the module-level ``stemmer``.

    Args:
        tweet: The tweet text.

    Returns:
        A list of stemmed token strings, in their original order.
    """
    without_links = re.sub(r'https?://\S+', '', tweet)
    without_hashes = re.sub(r'#', '', without_links)
    return [
        stemmer.stem(token)
        for token in tokenizer.tokenize(without_hashes.lower())
        # NOTE: string.punctuation is a str, so this is a substring test;
        # multi-character tokens such as ":)" are therefore kept.
        if not (token in stopwords_english or token in string.punctuation)
    ]


### Побудова словника частотностей

In [None]:

def build_freqs(tweets, ys, tokenize=None):
    """Count how often each token occurs under each sentiment label.

    Args:
        tweets: Iterable of tweet strings.
        ys: Labels aligned with ``tweets``. May be a flat sequence or an
            array of any shape (e.g. an (m, 1) column); it is flattened
            before use.
        tokenize: Optional callable mapping a tweet string to an iterable
            of tokens. Defaults to ``process_tweet``.

    Returns:
        dict mapping ``(token, label)`` -> occurrence count, where
        ``label`` is a plain Python ``int``.
    """
    if tokenize is None:
        tokenize = process_tweet

    # Flatten to scalars first: iterating an (m, 1) array yields shape-(1,)
    # rows, and calling int() on those is deprecated since NumPy 1.25.
    labels = np.ravel(ys).tolist()

    freqs = {}
    for label, tweet in zip(labels, tweets):
        for word in tokenize(tweet):
            pair = (word, int(label))
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs


### Функція ознак

In [None]:

def extract_features(tweet, freqs, tokenize=None):
    """Build the 3-element feature row for one tweet.

    The row is ``[bias, positive_total, negative_total]`` where the totals
    are the sums, over the tweet's tokens (repeats included), of the
    counts stored in ``freqs`` under labels 1 and 0 respectively. Tokens
    missing from ``freqs`` contribute 0.

    Args:
        tweet: The tweet text.
        freqs: dict mapping ``(token, label)`` -> count.
        tokenize: Optional callable mapping a tweet string to an iterable
            of tokens. Defaults to ``process_tweet``. It should be the same
            tokenizer that produced the keys of ``freqs``, otherwise the
            lookups will not match.

    Returns:
        A float numpy array of shape (1, 3).
    """
    if tokenize is None:
        tokenize = process_tweet
    words = tokenize(tweet)

    x = np.zeros((1, 3))
    x[0, 0] = 1  # bias term
    for w in words:
        x[0, 1] += freqs.get((w, 1), 0)  # evidence for label 1
        x[0, 2] += freqs.get((w, 0), 0)  # evidence for label 0
    return x


### Логістична регресія

In [None]:

def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Args:
        z: Scalar or numpy array.

    Returns:
        Value(s) in [0, 1] with the same shape as ``z``.
    """
    # exp(-log(1 + exp(-z))) equals 1 / (1 + exp(-z)); logaddexp computes
    # log(exp(0) + exp(-z)) stably, so very negative z no longer overflows
    # inside exp(-z).
    return np.exp(-np.logaddexp(0, -z))

def gradient_descent(X, y, theta, alpha, iters, log_every=50):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        X: Feature matrix; ``X.shape[0]`` is the number of examples m.
        y: Labels; must have the same shape as ``sigmoid(X @ theta)``
            (an (m, 1) column when ``theta`` is (n, 1)).
        theta: Initial weights. Not modified: the optimisation runs on a
            float copy, so integer-dtype input is accepted too.
        alpha: Learning rate.
        iters: Number of gradient steps.
        log_every: Record the loss every ``log_every``-th iteration
            (starting with iteration 0). Must be a positive integer.

    Returns:
        Tuple ``(theta, J_hist)``: the fitted weights (a new array) and
        the list of recorded cross-entropy losses.
    """
    m = X.shape[0]
    # Work on a private float copy so the caller's array is left untouched
    # and the in-place update below cannot hit an int-casting error.
    theta = np.array(theta, dtype=float)
    J_hist = []
    for i in range(iters):
        h = sigmoid(X @ theta)
        if i % log_every == 0:
            # Loss of the current (pre-update) weights; the 1e-12 offset
            # keeps log() finite when h saturates at 0 or 1.
            loss = -(1/m) * np.sum(y*np.log(h+1e-12) + (1-y)*np.log(1-h+1e-12))
            J_hist.append(loss)
        theta -= (alpha/m) * (X.T @ (h - y))
    return theta, J_hist


### Завантаження датасету Twitter Samples та навчання моделі

In [None]:

# Balanced training set: the first N_PER_CLASS tweets of each polarity.
N_PER_CLASS = 4000
pos = twitter_samples.strings('positive_tweets.json')[:N_PER_CLASS]
neg = twitter_samples.strings('negative_tweets.json')[:N_PER_CLASS]

train_x = pos + neg
# Label column aligned with train_x: 1 for positive tweets, 0 for negative.
train_y = np.append(np.ones(len(pos)), np.zeros(len(neg))).reshape(-1,1)

freqs = build_freqs(train_x, train_y)

# One (1, 3) feature row per tweet, stacked into the design matrix.
X = np.vstack([extract_features(t, freqs) for t in train_x])
y = train_y

# Fail fast if features and labels ever get out of sync.
assert X.shape == (len(train_x), 3)
assert y.shape == (len(train_x), 1)

theta = np.zeros((3,1))
theta, losses = gradient_descent(X, y, theta, alpha=1e-9, iters=1500)

# gradient_descent records the loss on every 50th iteration, so scale the
# sample index back to iteration numbers for the x axis.
LOSS_LOG_INTERVAL = 50
fig, ax = plt.subplots()
ax.plot(np.arange(len(losses)) * LOSS_LOG_INTERVAL, losses)
ax.set_title("Loss curve")
ax.set_xlabel("Iteration")
ax.set_ylabel("Cross-entropy loss")
plt.show()


### Тестування моделі

In [None]:

def predict(tweet, freqs, theta):
    """Score one tweet with the logistic-regression model.

    Args:
        tweet: The tweet text.
        freqs: ``(token, label)`` -> count dictionary used to build the
            feature row.
        theta: Model weights, multiplied on the right of the (1, 3)
            feature row.

    Returns:
        ``sigmoid(features @ theta)``: the model's score for label 1
        (positive sentiment), as a numpy array.
    """
    features = extract_features(tweet, freqs)
    logit = features @ theta
    return sigmoid(logit)

# Spot-check the trained model on two hand-written examples.
for sample in ("I love this movie!", "This is terrible"):
    print(predict(sample, freqs, theta))
