<a href="https://colab.research.google.com/github/irwanmaulana42/google-collab/blob/master/Naive%20Bayes%20Movie%20Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import random
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')

import pandas as pd
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize


# and the second element is the corresponding label
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the documents
random.shuffle(documents)

# Define the feature extractor
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

def extract_features(document):
  document_words = set(document)
  features = {}
  for word in word_features:
    features['contains({})'.format(word)] = (word in document_words)
  return features


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
# Extract the features for all reviews
featuresets = [(document_features(d), c) for (d,c) in documents]

# Split the dataset into training and testing sets
train_set, test_set = featuresets[100:], featuresets[:100]

pd.DataFrame(train_set)

Unnamed: 0,0,1
0,"{'contains(,)': True, 'contains(the)': True, '...",neg
1,"{'contains(,)': True, 'contains(the)': True, '...",pos
2,"{'contains(,)': True, 'contains(the)': True, '...",neg
3,"{'contains(,)': True, 'contains(the)': True, '...",neg
4,"{'contains(,)': True, 'contains(the)': True, '...",neg
...,...,...
1895,"{'contains(,)': True, 'contains(the)': True, '...",pos
1896,"{'contains(,)': True, 'contains(the)': True, '...",pos
1897,"{'contains(,)': True, 'contains(the)': True, '...",neg
1898,"{'contains(,)': True, 'contains(the)': True, '...",pos


In [13]:
# Train the Naive Bayes classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Evaluate the accuracy of the classifier
accuracy = nltk.classify.accuracy(classifier, test_set) * 100
print("Akurasi: {}%".format(accuracy))


Akurasi: 84.0%


In [14]:

# Teks yang ingin diuji
new_texts = [
    "This movie is great. The story is well-written and the actors gave outstanding performances.",
    "This movie was an absolute disaster. The plot was nonsensical and the acting was terrible.",
    "I found the film to be incredibly boring. The pacing was slow and nothing really happened.",
    "The special effects were the only redeeming feature of this movie. The story was poorly written and the acting was wooden.",
    "I was really disappointed with this film. The trailer looked great, but the actual movie fell flat.",
    "I found the characters to be completely unlikable. I didn't care what happened to any of them.",
    "This movie was a triumph. The acting was superb and the story was gripping from start to finish.",
    "I was on the edge of my seat the entire time. This film was a rollercoaster of emotions.",
    "The cinematography was stunning. This movie was a work of art.",
    "I couldn't believe how much I enjoyed this film. The writing was top-notch and the acting was brilliant.",
    "This movie was everything I wanted it to be and more. I can't recommend it highly enough."
];

for text in new_texts:
  # Lakukan tokenisasi pada teks
  new_text_tokens = word_tokenize(text)

  # Ekstraksi fitur dari teks
  new_text_features = extract_features(new_text_tokens)

  # Lakukan prediksi sentimen menggunakan model
  sentiment = classifier.classify(new_text_features)

  # Cetak hasil prediksi sentimen
  print("Sentimen teks \"{}\" adalah {}".format(text, sentiment))

Sentimen teks "This movie is great. The story is well-written and the actors gave outstanding performances." adalah neg
Sentimen teks "This movie was an absolute disaster. The plot was nonsensical and the acting was terrible." adalah neg
Sentimen teks "I found the film to be incredibly boring. The pacing was slow and nothing really happened." adalah neg
Sentimen teks "The special effects were the only redeeming feature of this movie. The story was poorly written and the acting was wooden." adalah neg
Sentimen teks "I was really disappointed with this film. The trailer looked great, but the actual movie fell flat." adalah neg
Sentimen teks "I found the characters to be completely unlikable. I didn't care what happened to any of them." adalah neg
Sentimen teks "This movie was a triumph. The acting was superb and the story was gripping from start to finish." adalah neg
Sentimen teks "I was on the edge of my seat the entire time. This film was a rollercoaster of emotions." adalah neg
Senti

In [None]:

import random
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import movie_reviews, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split


# and the second element is the corresponding label
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the documents
random.shuffle(documents)

# Define the feature extractor
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words() if w.lower() not in stopwords.words('english') and w.isalpha())
word_features = list(all_words)[:2000]
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

def extract_features(document):
    features = {}
    for word in word_tokenize(document):
        word = lemmatizer.lemmatize(stemmer.stem(word.lower()))
        features[word] = True
    return features

# Extract the features for all reviews
featuresets = [(extract_features(d), c) for (d,c) in documents]

# Split the dataset into training and testing sets
train_set, test_set = train_test_split(featuresets, test_size=0.1, random_state=42)

# Train the Naive Bayes classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Evaluate the accuracy of the classifier
accuracy = nltk.classify.accuracy(classifier, test_set) * 100
print("Akurasi: {}%".format(accuracy))


# Teks yang ingin diuji
new_texts = [
    "This movie is great. The story is well-written and the actors gave outstanding performances.",
    "This movie was an absolute disaster. The plot was nonsensical and the acting was terrible.",
    "I found the film to be incredibly boring. The pacing was slow and nothing really happened.",
    "The special effects were the only redeeming feature of this movie. The story was poorly written and the acting was wooden.",
    "I was really disappointed with this film. The trailer looked great, but the actual movie fell flat.",
    "I found the characters to be completely unlikable. I didn't care what happened to any of them.",
    "This movie was a triumph. The acting was superb and the story was gripping from start to finish.",
    "I was on the edge of my seat the entire time. This film was a rollercoaster of emotions.",
    "The cinematography was stunning. This movie was a work of art.",
    "I couldn't believe how much I enjoyed this film. The writing was top-notch and the acting was brilliant.",
    "This movie was everything I wanted it to be and more. I can't recommend it highly enough."
];

for text in new_texts:
    # Lakukan preprocessing pada teks
    text = text.lower()
    features = extract_features(text)

    # Kelas prediksi
    prediction = classifier.classify(features)

    # Tampilkan hasil prediksi
    if prediction == 'pos':
        print("{}\nHasil: Positive\n".format(text))
    else:
        print("{}\nHasil: Negative\n".format(text))


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
