In [27]:
import pandas as pd
import random
from nltk import classify, NaiveBayesClassifier
import nltk
from nltk.corpus import twitter_samples, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
import re, string

In [28]:
nltk.download('twitter_samples')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [29]:
def process_tokens(tweet_tokens, stop_words=None):
    """Clean and lemmatize the tokens of a single text.

    Every token is POS-tagged, stripped of @-mentions and http(s) URLs,
    lower-cased and lemmatized. A token is dropped when it is empty after
    stripping, when its lower-cased form is a stop word, or when it is a
    substring of ``string.punctuation``.

    Args:
        tweet_tokens: List of string tokens belonging to one text.
        stop_words: Optional collection of lower-cased stop words. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        list[str]: Lower-cased, lemmatized tokens that survived filtering,
        in their original order.
    """
    cleaned_tokens = []
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests; with a list every token
    # triggered a linear scan over the whole stop-word list.
    stop_words = set(stop_words)
    lemmatizer = WordNetLemmatizer()

    for token, tag in pos_tag(tweet_tokens):
        # Remove @-mentions and URLs; a token made only of those becomes ''.
        token = re.sub('(@[A-Za-z0-9_]+)|(https?://[A-Za-z0-9./]+)', '', token)
        if not token:
            continue

        lowered = token.lower()
        # The punctuation test is a substring check against string.punctuation.
        if lowered in stop_words or token in string.punctuation:
            continue

        # Map the Penn Treebank tag to a WordNet POS; anything that is not a
        # noun or a verb is lemmatized as an adjective.
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        cleaned_tokens.append(lemmatizer.lemmatize(lowered, pos))
    return cleaned_tokens

# Tokenized tweets from the NLTK twitter_samples corpus, one token list per tweet.
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

# Run every tweet through the cleaning / lemmatization step.
positive_cleaned_tokens_list = list(map(process_tokens, positive_tweet_tokens))
negative_cleaned_tokens_list = list(map(process_tokens, negative_tweet_tokens))

# TASK 0 

In [30]:
def get_token_dict(tokens):
    """Turn an iterable of tokens into a ``{token: True}`` feature dict.

    Duplicate tokens collapse into a single key; keys keep the order in
    which each token first appears.
    """
    return dict.fromkeys(tokens, True)

def get_tweets_for_model(cleaned_tokens_list):
    """Build the NLTK classifier input: one feature dict per token list.

    Each element of ``cleaned_tokens_list`` is passed to ``get_token_dict``
    and the resulting dicts are returned as a list in the same order.
    """
    return list(map(get_token_dict, cleaned_tokens_list))

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

# Attach the class label to every feature dict.
positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset
# Shuffle with a dedicated, seeded generator so the train/test split (and
# therefore the reported accuracy) is the same on every run, without touching
# the global `random` state.
random.Random(42).shuffle(dataset)

# First 7000 examples train the model, the remainder is held out for testing.
train_data = dataset[:7000]
test_data = dataset[7000:]

classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))
# show_most_informative_features() prints the table itself and returns None,
# so wrapping it in print() only added a stray "None" line to the output.
classifier.show_most_informative_features(10)


Accuracy is: 0.9943333333333333
Most Informative Features
                     sad = True           Negati : Positi =     34.2 : 1.0
                follower = True           Positi : Negati =     23.1 : 1.0
                     bam = True           Positi : Negati =     21.5 : 1.0
                    sick = True           Negati : Positi =     19.2 : 1.0
                    cool = True           Positi : Negati =     18.1 : 1.0
              appreciate = True           Positi : Negati =     16.0 : 1.0
                    blog = True           Positi : Negati =     14.0 : 1.0
             opportunity = True           Positi : Negati =     13.3 : 1.0
                 welcome = True           Positi : Negati =     13.0 : 1.0
                    miss = True           Negati : Positi =     12.6 : 1.0
None


In [31]:
def get_sentiment(text):
    """Return the label the trained tweet classifier assigns to ``text``.

    The text is tokenized with ``word_tokenize``, cleaned by
    ``process_tokens``, converted to a feature dict and classified by the
    module-level ``classifier``.
    """
    # NOTE(review): the training tokens come from twitter_samples.tokenized(),
    # which may tokenize differently from word_tokenize — confirm.
    raw_tokens = word_tokenize(text)
    features = get_token_dict(process_tokens(raw_tokens))
    return classifier.classify(features)

# Hand-written phrases for a quick sanity check of the classifier.
test_phrases = [
    "The service was amazing",
    "I am very happy",
    "This is the worst experience ever",
    "It was a total disaster",
]
for t in test_phrases:
    print(f'"{t}": {get_sentiment(t)}')

"The service was amazing": Positive
"I am very happy": Positive
"This is the worst experience ever": Negative
"It was a total disaster": Negative


# TASK 1

In [None]:
financial_raw = pd.read_csv('data1.csv', encoding='latin1')

# Keep the two columns used below, drop rows without a sentence (word_tokenize
# cannot handle NaN) and restrict to the binary positive/negative task.
# .copy() makes `df` an independent frame, so adding the 'Tokens' column does
# not write into a slice of `financial_raw` (avoids SettingWithCopyWarning).
df = (
    financial_raw[['Sentiment', 'Sentence']]
    .dropna(subset=['Sentence'])
    .loc[lambda frame: frame['Sentiment'].isin(['positive', 'negative'])]
    .copy()
)

df['Tokens'] = df['Sentence'].apply(lambda x: process_tokens(word_tokenize(x)))

# (feature dict, label) pairs; zipping the two columns avoids the per-row
# Series construction done by DataFrame.iterrows().
new_data = [
    (get_token_dict(tokens), sentiment)
    for tokens, sentiment in zip(df['Tokens'], df['Sentiment'])
]

# Seeded, dedicated generator: reproducible split without touching the global
# `random` state.
random.Random(42).shuffle(new_data)

# 80 % of the examples for training, the remaining 20 % for testing.
split_point = int(len(new_data) * 0.8)
train_data_new = new_data[:split_point]
test_data_new = new_data[split_point:]

financial_classifier = NaiveBayesClassifier.train(train_data_new)

print("\n--- Financial Sentiment Analysis ---")
print("Accuracy on financial dataset:", classify.accuracy(financial_classifier, test_data_new))
# show_most_informative_features() prints the table itself and returns None,
# so it must not be wrapped in print().
financial_classifier.show_most_informative_features(10)

# Quick manual check on a single phrase
financial_phrase = "The company's revenue increased significantly, leading to higher profits."
financial_tokens = process_tokens(word_tokenize(financial_phrase))
print(f'"{financial_phrase}": {financial_classifier.classify(get_token_dict(financial_tokens))}')



--- Financial Sentiment Analysis ---
Accuracy on financial dataset: 0.7697974217311234
Most Informative Features
                decrease = True           negati : positi =     25.2 : 1.0
                    fell = True           negati : positi =     22.8 : 1.0
                 decline = True           negati : positi =     21.9 : 1.0
                   staff = True           negati : positi =     17.7 : 1.0
                    fall = True           negati : positi =     16.7 : 1.0
                     lay = True           negati : positi =     16.3 : 1.0
                    drop = True           negati : positi =     14.3 : 1.0
                      25 = True           negati : positi =     13.5 : 1.0
               agreement = True           positi : negati =     13.0 : 1.0
                    sign = True           positi : negati =     12.4 : 1.0
None
"The company's revenue increased significantly, leading to higher profits.": positive


# TASK 2

In [33]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression

# Train scikit-learn's logistic regression, wrapped in NLTK's SklearnClassifier,
# on the same tweet training set as the Naive Bayes classifier.
lr_classifier = SklearnClassifier(LogisticRegression(max_iter=1000))
lr_classifier.train(train_data)

# Evaluate each model exactly once on the shared test set and reuse the
# numbers below; the Naive Bayes accuracy was previously recomputed for the
# report line and again for the comparison.
accuracy_lr = classify.accuracy(lr_classifier, test_data)
accuracy_nb = classify.accuracy(classifier, test_data)

print("\n--- Logistic Regression vs. Naive Bayes ---")
print(f"Logistic Regression Accuracy: {accuracy_lr}")
print(f"Naive Bayes Accuracy: {accuracy_nb}")

print("\nПорівняння:")
if accuracy_lr > accuracy_nb:
    print("Логістична регресія показала кращу точність.")
else:
    print("Наївний Байєс показав кращу або таку ж точність.")


--- Logistic Regression vs. Naive Bayes ---
Logistic Regression Accuracy: 0.997
Naive Bayes Accuracy: 0.9943333333333333

Порівняння:
Логістична регресія показала кращу точність.
