import library

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

Load data

In [3]:
train_df = pd.read_csv('train_preprocess.tsv', sep='\t', header=None, names=['text','label'])
valid_df = pd.read_csv('valid_preprocess.tsv', sep='\t', header=None, names=['text','label'])
test_df = pd.read_csv('test_preprocess.tsv', sep='\t', header=None, names=['text','label'])
test_masked_df = pd.read_csv('test_preprocess_masked_label.tsv', sep='\t', header=None, names=['text','label'])

Pisahkan fitur dan label

In [4]:
X_train_text = train_df['text']
y_train = train_df['label']
X_valid_text = valid_df['text']
y_valid = valid_df['label']
X_test_text = test_df['text']
y_test_masked = test_masked_df['label']

fungsi model

In [5]:
def evaluate_model(model, X_train, X_valid, y_train, y_valid, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)

    print(f"Evaluasi untuk {model_name}")
    print(classification_report(y_valid, y_pred))
    print(f"Accuracy: {accuracy_score(y_valid, y_pred)}")
    print(f"Precision: {precision_score(y_valid, y_pred, average='weighted')}")
    print(f"Recall: {recall_score(y_valid, y_pred, average='weighted')}")
    print(f"F1-Score: {f1_score(y_valid, y_pred, average='weighted')}")
    print("\n")

Bag of words unigram

In [7]:
print("----Unigram Bag of Words----")
vectorizer_unigram = CountVectorizer(ngram_range=(1, 1))  # Unigram model
X_train_bow_unigram = vectorizer_unigram.fit_transform(X_train_text)
X_valid_bow_unigram = vectorizer_unigram.transform(X_valid_text)
# Train and evaluate models
evaluate_model(LogisticRegression(), X_train_bow_unigram, X_valid_bow_unigram, y_train, y_valid, "Logistic Regression - Unigram")
evaluate_model(MultinomialNB(), X_train_bow_unigram, X_valid_bow_unigram, y_train, y_valid, "Naive Bayes - Unigram")
evaluate_model(SVC(), X_train_bow_unigram, X_valid_bow_unigram, y_train, y_valid, "SVM - Unigram")

----Unigram Bag of Words----
Evaluasi untuk Logistic Regression - Unigram
              precision    recall  f1-score   support

    negative       0.81      0.87      0.84       394
     neutral       0.83      0.69      0.76       131
    positive       0.92      0.91      0.91       735

    accuracy                           0.87      1260
   macro avg       0.85      0.83      0.84      1260
weighted avg       0.88      0.87      0.87      1260

Accuracy: 0.8746031746031746
Precision: 0.875828414930559
Recall: 0.8746031746031746
F1-Score: 0.8741505866783319


Evaluasi untuk Naive Bayes - Unigram
              precision    recall  f1-score   support

    negative       0.77      0.85      0.81       394
     neutral       0.91      0.64      0.75       131
    positive       0.90      0.90      0.90       735

    accuracy                           0.86      1260
   macro avg       0.86      0.80      0.82      1260
weighted avg       0.86      0.86      0.86      1260

Accuracy: 0

In [8]:
# 2. Bigram Bag of Words
print("----Bigram Bag of Words----")
vectorizer_bigram = CountVectorizer(ngram_range=(1, 2))  # Bigram model
X_train_bow_bigram = vectorizer_bigram.fit_transform(X_train_text)
X_valid_bow_bigram = vectorizer_bigram.transform(X_valid_text)

# Train and evaluate models
evaluate_model(LogisticRegression(), X_train_bow_bigram, X_valid_bow_bigram, y_train, y_valid, "Logistic Regression - Bigram")
evaluate_model(MultinomialNB(), X_train_bow_bigram, X_valid_bow_bigram, y_train, y_valid, "Naive Bayes - Bigram")
evaluate_model(SVC(), X_train_bow_bigram, X_valid_bow_bigram, y_train, y_valid, "SVM - Bigram")


----Bigram Bag of Words----
Evaluasi untuk Logistic Regression - Bigram
              precision    recall  f1-score   support

    negative       0.85      0.90      0.87       394
     neutral       0.82      0.67      0.74       131
    positive       0.93      0.93      0.93       735

    accuracy                           0.89      1260
   macro avg       0.87      0.83      0.85      1260
weighted avg       0.89      0.89      0.89      1260

Accuracy: 0.8928571428571429
Precision: 0.8923324967857992
Recall: 0.8928571428571429
F1-Score: 0.8915856050526766


Evaluasi untuk Naive Bayes - Bigram
              precision    recall  f1-score   support

    negative       0.75      0.80      0.78       394
     neutral       0.96      0.38      0.55       131
    positive       0.86      0.93      0.89       735

    accuracy                           0.83      1260
   macro avg       0.86      0.70      0.74      1260
weighted avg       0.84      0.83      0.82      1260

Accuracy: 0.8

In [9]:
# 3. TF-IDF Bag of Words
print("----TF-IDF Bag of Words----")
vectorizer_tfidf = TfidfVectorizer(ngram_range=(1, 1))  # Unigram model with TF-IDF weighting
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train_text)
X_valid_tfidf = vectorizer_tfidf.transform(X_valid_text)

# Train and evaluate models
evaluate_model(LogisticRegression(), X_train_tfidf, X_valid_tfidf, y_train, y_valid, "Logistic Regression - TF-IDF")
evaluate_model(MultinomialNB(), X_train_tfidf, X_valid_tfidf, y_train, y_valid, "Naive Bayes - TF-IDF")
evaluate_model(SVC(), X_train_tfidf, X_valid_tfidf, y_train, y_valid, "SVM - TF-IDF")


----TF-IDF Bag of Words----
Evaluasi untuk Logistic Regression - TF-IDF
              precision    recall  f1-score   support

    negative       0.82      0.85      0.83       394
     neutral       0.88      0.66      0.76       131
    positive       0.91      0.92      0.92       735

    accuracy                           0.87      1260
   macro avg       0.87      0.81      0.84      1260
weighted avg       0.88      0.87      0.87      1260

Accuracy: 0.8746031746031746
Precision: 0.8751984219727619
Recall: 0.8746031746031746
F1-Score: 0.8731706074320327


Evaluasi untuk Naive Bayes - TF-IDF
              precision    recall  f1-score   support

    negative       0.76      0.73      0.75       394
     neutral       1.00      0.23      0.37       131
    positive       0.82      0.96      0.88       735

    accuracy                           0.81      1260
   macro avg       0.86      0.64      0.67      1260
weighted avg       0.82      0.81      0.79      1260

Accuracy: 0.8

In [10]:
# Additional: Evaluation on the masked test set
print("----Evaluation on Test Masked Data----")
X_test_bow_unigram = vectorizer_unigram.transform(X_test_text)  # Unigram feature transformation for test set
model = LogisticRegression()  # Replace with the model you want to use
model.fit(X_train_bow_unigram, y_train)
y_pred_test = model.predict(X_test_bow_unigram)
print(classification_report(y_test_masked, y_pred_test))

----Evaluation on Test Masked Data----
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         0
     neutral       1.00      0.10      0.18       500
    positive       0.00      0.00      0.00         0

    accuracy                           0.10       500
   macro avg       0.33      0.03      0.06       500
weighted avg       1.00      0.10      0.18       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
