In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
# Load the labeled tweet collection; the file is parsed as tab-separated text.
df = pd.read_csv('../data/Indonesian Sentiment Twitter Dataset Labeled.csv', sep="\t")

# 'Tweet' holds the text of each document and 'sentimen' holds its sentiment label.
corpus = df['Tweet'].tolist()
sentiments = df['sentimen'].values

# Collapse the labels to binary: 1 where the label equals 1, 0 for every other value.
# NOTE(review): presumably 1 marks a positive tweet -- confirm against the dataset's label scheme.
binary_labels = np.where(sentiments == 1, 1, 0)

In [3]:
# Build the two estimators up front: a TF-IDF text vectorizer and a
# Multinomial Naive Bayes classifier, both with default settings.
vectorizer = TfidfVectorizer()
classifier = MultinomialNB()

# Learn the vocabulary from the corpus and encode every document as a TF-IDF row.
tfidf_matrix = vectorizer.fit_transform(corpus)

# Train the classifier on the TF-IDF rows against the binary labels.
# Note: the whole corpus is used for fitting; no hold-out split is made in this cell.
classifier.fit(tfidf_matrix, binary_labels)

In [4]:
# Vocabulary terms, one per column of the TF-IDF matrix.
feature_names = vectorizer.get_feature_names_out()

# The TF-IDF matrix is sparse (documents x vocabulary). Converting all of it to a
# dense array just to print it can exhaust memory on a large corpus, so report
# its dimensions and densify only the first few rows as a preview.
PREVIEW_ROWS = 5

print("TF-IDF Matrix:")
print(f"shape={tfidf_matrix.shape}, stored non-zero entries={tfidf_matrix.nnz}")
print(tfidf_matrix[:PREVIEW_ROWS].toarray())
print("\nFeature Names:")
print(feature_names)

In [25]:
# Predict a binary label for every document in tfidf_matrix.
# Caveat: tfidf_matrix is the same matrix the classifier was fitted on, so these
# are predictions on the training data (demonstration only), not on unseen text.
predicted_labels = classifier.predict(tfidf_matrix)

In [26]:
# Summarize per-class precision and recall plus overall accuracy by comparing
# the predictions with the true binary labels.
# NOTE(review): if predicted_labels was produced from the fitting data, these
# scores describe training performance rather than generalization.
report = classification_report(y_true=binary_labels, y_pred=predicted_labels)

print("Classification Report:")
print(report)