In [3]:
!pip install nltk scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Using cached scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
Collecting numpy>=1.19.5
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Collecting scipy>=1.6.0
  Using cached scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
Collecting threadpoolctl>=3.1.0
  Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
[0mInstalling collected packages: threadpoolctl, numpy, scipy, scikit-learn
Successfully installed numpy-1.26.4 scikit-learn-1.5.0 scipy-1.13.1 threadpoolctl-3.5.0


In [4]:
import nltk
from nltk.corpus import movie_reviews
import random
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer


# Fetch the corpus if it is not available locally yet; without it the first
# movie_reviews access below raises LookupError on a fresh environment.
nltk.download('movie_reviews')

# Build one (word list, category) pair per review file.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle with a dedicated, seeded generator so that anything derived from
# this order is the same on every run, and the global `random` state is left
# untouched.
random.Random(42).shuffle(documents)

# Count how often each lower-cased token occurs across the whole corpus and
# show the 20 most common ones.
all_words = nltk.FreqDist(map(str.lower, movie_reviews.words()))
top_words = all_words.most_common(20)
print(f"Найчастотніші слова з відповідними частотами: {top_words}", end="\n\n")

# Look up the corpus-wide count of the token 'radical'.
word_frequency = all_words['radical']
print(f"Слово 'radical' зустрічається {word_frequency} рази/ів.", end="\n\n")

# Count the reviews in each category that contain the token 'radical'
# (this is a number of files, not a number of occurrences).
positive_reviews = movie_reviews.fileids('pos')
negative_reviews = movie_reviews.fileids('neg')


def _count_reviews_containing(token, fileids):
    """Return how many of the given corpus files contain `token`."""
    hits = 0
    for fileid in fileids:
        if token in movie_reviews.words(fileid):
            hits += 1
    return hits


positive_count = _count_reviews_containing('radical', positive_reviews)
negative_count = _count_reviews_containing('radical', negative_reviews)
print(f"'radical' серед позитивних відгуків: {positive_count}")
print(f"'radical' серед негативних відгуків: {negative_count}")

# Feature extractor: flags which vocabulary words occur in a document.
def document_features(document, word_features):
    """Map each word in `word_features` to whether it occurs in `document`.

    Args:
        document: iterable of the document's tokens.
        word_features: iterable of vocabulary words to test for.

    Returns:
        dict keyed by the words of `word_features` (in iteration order) whose
        values are True when the word is present in `document`, else False.
    """
    # Build the set once so each membership test below is a hash lookup.
    present = set(document)
    return {word: word in present for word in word_features}

# Take the first 4000 entries of all_words as the feature vocabulary.
# NOTE(review): this relies on FreqDist iterating in descending-frequency
# order — confirm for the installed NLTK version.
word_features = list(all_words)[:4000]

# Extract features for one sample review and show only the vocabulary words
# that actually occur in it.
sample_words = movie_reviews.words('pos/cv019_14482.txt')
features = document_features(sample_words, word_features)
present_features = {word: flag for word, flag in features.items() if flag}
print(present_features)
print()

# Implement Naive Bayes Classifier
# Turn every review in `documents` into a (feature dict, label) pair.
featuresets = [
    (document_features(review_words, word_features), review_label)
    for review_words, review_label in documents
]

# Hold out the first 100 examples for evaluation; train on the remainder.
test_set = featuresets[:100]
train_set = featuresets[100:]

classifier = nltk.NaiveBayesClassifier.train(train_set)
nb_accuracy = nltk.classify.accuracy(classifier, test_set)
print(f"Точність алгоритма Naive Bayes: {nb_accuracy}")
classifier.show_most_informative_features(20)
print()

# Implement Multinomial NB Classifier
vectorizer = CountVectorizer()

# movie_reviews.fileids() returns ids grouped by category ('neg/...' before
# 'pos/...'), so slicing it directly would leave a single class in the
# held-out tail. Shuffle a copy with a seeded generator first so both splits
# contain both classes and the split is reproducible.
all_fileids = list(movie_reviews.fileids())
random.Random(42).shuffle(all_fileids)
train_fileids, test_fileids = all_fileids[:1800], all_fileids[1800:]

# Raw review texts and their category labels, in matching order.
train_data = [movie_reviews.raw(fileid) for fileid in train_fileids]
test_data = [movie_reviews.raw(fileid) for fileid in test_fileids]
train_labels = [movie_reviews.categories(fileid)[0] for fileid in train_fileids]
test_labels = [movie_reviews.categories(fileid)[0] for fileid in test_fileids]

# Learn the vocabulary on the training texts only, then reuse it for the test
# texts so no information from the test set leaks into the features.
train_matrix = vectorizer.fit_transform(train_data)
test_matrix = vectorizer.transform(test_data)

multi_nb_classifier = MultinomialNB()
multi_nb_classifier.fit(train_matrix, train_labels)
accuracy = multi_nb_classifier.score(test_matrix, test_labels)
print(f"Точність Multinomial NB: {accuracy}")


Кличлієв К. С., Група №2, Лабораторна робота №3


[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/klychliiev/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595), (')', 11781), ('(', 11664), ('as', 11378), ('with', 10792), ('for', 9961)]
The word 'radical' appears 15 times.
'radical' in positive reviews: 11
'radical' in negative reviews: 3
{',': True, 'the': True, '.': True, 'a': True, 'and': True, 'of': True, 'to': True, "'": True, 'is': True, 'in': True, 's': True, '"': True, 'it': True, 'that': True, '-': True, ')': True, '(': True, 'as': True, 'with': True, 'for': True, 'his': True, 'this': True, 'film': True, 'i': True, 'he': True, 'but': True, 'on': True, 't': True, 'by': True, 'be': True, 'one': True, 'who': True, 'you': True, 'from': True, 'was': True, 'they': True, 'has': True, 'her': True, 'all': True, '?': True, 'there': True, 'like': True, 'so': True, 'out': True, 'about': True, 'up': True, 'what': True, 'when': Tr