In [98]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
from sklearn.preprocessing import LabelBinarizer

In [14]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khanza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\khanza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\khanza\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [89]:
# Dataset IMDB
imdb_data = pd.read_csv('IMDB Dataset.csv')

In [90]:
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [91]:
tokenizer = nltk.RegexpTokenizer(r'\w+')
stopword_list = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_between_square_brackets(text):
    return re.sub(r'\[[^]]*\]', '', text)

def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

def remove_special_characters(text, remove_digits=True):
    pattern = r'[^a-zA-Z0-9\s]' if remove_digits else r'[^a-zA-Z\s]'
    return re.sub(pattern, '', text)

def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

def remove_stopwords(text):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    return ' '.join(filtered_tokens)

def preprocess_text(text):
    text = denoise_text(text)
    text = remove_special_characters(text)
    text = lemmatize_text(text)
    text = remove_stopwords(text)
    return text

# Apply the preprocessing function to the dataset
imdb_data['review'] = imdb_data['review'].apply(preprocess_text)

  soup = BeautifulSoup(text, "html.parser")


In [92]:
norm_train_reviews = imdb_data.review[:25000]
norm_test_reviews = imdb_data.review[25000:]

In [93]:
tv = TfidfVectorizer(min_df=0.01, max_df=0.9, ngram_range=(1, 2), use_idf=True)
tv_train_reviews = tv.fit_transform(norm_train_reviews)
tv_test_reviews = tv.transform(norm_test_reviews)

print('Tfidf_train:', tv_train_reviews.shape)
print('Tfidf_test:', tv_test_reviews.shape)

Tfidf_train: (25000, 1767)
Tfidf_test: (25000, 1767)


In [94]:
lb = LabelBinarizer()
sentiment_data = lb.fit_transform(imdb_data['sentiment'])
print(sentiment_data.shape)

# Split sentiment data
train_sentiments = sentiment_data[:25000]
test_sentiments = sentiment_data[25000:]
print(train_sentiments.shape)
print(test_sentiments.shape)

(50000, 1)
(25000, 1)
(25000, 1)


In [95]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Menggunakan TF-IDF
model_tfidf = MultinomialNB()
model_tfidf.fit(tv_train_reviews, train_sentiments.ravel())
y_pred_tfidf = model_tfidf.predict(tv_test_reviews)
print('Akurasi TF-IDF:', accuracy_score(test_sentiments, y_pred_tfidf))
print('Laporan Klasifikasi TF-IDF:')
print(classification_report(test_sentiments, y_pred_tfidf))

Akurasi TF-IDF: 0.84524
Laporan Klasifikasi TF-IDF:
              precision    recall  f1-score   support

           0       0.86      0.83      0.84     12474
           1       0.83      0.86      0.85     12526

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000



In [101]:
model = make_pipeline(TfidfVectorizer(min_df=0.01, max_df=0.9, ngram_range=(1, 2), use_idf=True), MultinomialNB())
model.fit(norm_train_reviews, train_sentiments.ravel())
y_pred = model.predict(norm_test_reviews)
print('Akurasi:', accuracy_score(test_sentiments, y_pred))
print('Laporan Klasifikasi:')
print(classification_report(test_sentiments, y_pred))

Akurasi: 0.84524
Laporan Klasifikasi:
              precision    recall  f1-score   support

           0       0.86      0.83      0.84     12474
           1       0.83      0.86      0.85     12526

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000



In [102]:
import pickle

# Save the model and vectorizer
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model_tfidf, model_file)

with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tv, vectorizer_file)


In [96]:
def preprocess_review(review):
    review = denoise_text(review)
    review = remove_special_characters(review)
    review = lemmatize_text(review)
    review = remove_stopwords(review)
    return review

def predict_sentiment(review, model, vectorizer):
    # Praproses review pengguna
    review = preprocess_review(review)
    
    # Transformasikan review pengguna menggunakan vectorizer
    review_vector = vectorizer.transform([review])
    
    # Prediksi sentimen menggunakan model yang telah dilatih
    sentiment = model.predict(review_vector)
    
    # Kembalikan hasil prediksi
    return sentiment

In [97]:
# Review contoh dari pengguna
user_review = "So boring and too long. I will never watch this movie again."

# Prediksi menggunakan model TF-IDF
predicted_sentiment_tfidf = predict_sentiment(user_review, model_tfidf, tv)
print(f'Prediksi Sentimen (TF-IDF): {predicted_sentiment_tfidf}')

Prediksi Sentimen (TF-IDF): [0]


# TWITTER

In [None]:
import tweepy

# Masukkan kunci dan token API Twitter Anda
api_key = "LHTTHtP3FnALVIwjPHnagUB4T"
api_key_secret = "6dAGvU4o0aeu2fmTlXqzG9VTzNRD2dZKfbzt12x5ZCZPMuZJNI"
access_token = "1426060555853303810-JYn6p9TpRhwliryX39IozAc8WFg41v"
access_token_secret = "ok5JHdz2gXiF5YZetiKc3wUQHO6Asd2QJ2AffLPNEseT4"
bearer_token = "AAAAAAAAAAAAAAAAAAAAAEApigEAAAAAyT1ss8QtOmbKsbwePVusZt9u8pg%3DslkoL8lW5VOdMQnFKJH0Qi1JXk4D82s4nf9Do5EQdw10XTRQMQ"

# Autentikasi dengan Twitter API
# auth = tweepy.OAuth1UserHandler(api_key, api_key_secret, access_token, access_token_secret)
auth = tweepy.OAuth2AppHandler(
    api_key, api_key_secret
)
api = tweepy.API(auth)

# Kumpulkan tweet terkait film
query = "hit man"  # Ganti dengan nama film yang Anda inginkan
tweets = api.search_tweets(query, count=100, lang='en')

tweet_data = []
for tweet in tweets:
    tweet_data.append(tweet.text)

# Konversi tweet menjadi DataFrame
new_tweets_df = pd.DataFrame(tweet_data, columns=['review'])

In [3]:
# Praproses data
new_tweets_df['review'] = new_tweets_df['review'].apply(denoise_text)
new_tweets_df['review'] = new_tweets_df['review'].apply(remove_special_characters)
new_tweets_df['review'] = new_tweets_df['review'].apply(simple_stemmer)
new_tweets_df['review'] = new_tweets_df['review'].apply(remove_stopwords)

In [None]:
# Transformasi tweet baru menggunakan model TF-IDF
new_tweets_tfidf = tv.transform(new_tweets_df['review'])

# Prediksi sentimen menggunakan model TF-IDF
new_tweets_predictions = model_tfidf.predict(new_tweets_tfidf)

# Konversi prediksi menjadi label
new_tweets_df['sentiment'] = lb.inverse_transform(new_tweets_predictions)

print(new_tweets_df.head())