<a href="https://colab.research.google.com/github/ihyaulumuddin044/ML_Portofolio/blob/main/sentimen_analysis/sentimen_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

import re
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('twitter_training_updeted.csv')
df.head()

Unnamed: 0,id,game,sentiment,text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [3]:
df.shape

(74681, 4)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         74681 non-null  int64 
 1   game       74681 non-null  object
 2   sentiment  74681 non-null  object
 3   text       73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [5]:
def preprocess_text(text):
    # mengatasi data yang hilang
    if isinstance(text, float):  # Periksa apakah nilainya adalah float (bisa berupa NaN)
        text = str(text)  # Ubah ke string jika perlu
    # Mengubah teks menjadi huruf kecil
    text = text.lower()
    # Menghilangkan karakter spesial, angka, dan tanda baca
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Menghapus ekstra spasi
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Terapkan preprocessing ke kolom 'text'
df['clean_text'] = df['text'].apply(preprocess_text)

# Encode label sentimen
le = LabelEncoder()
df['label'] = le.fit_transform(df['sentiment'])

# Pisahkan data menjadi train dan test
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

# Ekstraksi fitur menggunakan TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f"Train set size: {X_train_tfidf.shape}")
print(f"Test set size: {X_test_tfidf.shape}")
print("Label classes:", le.classes_)

Train set size: (59744, 5000)
Test set size: (14937, 5000)
Label classes: ['Irrelevant' 'Negative' 'Neutral' 'Positive']


#menggunkan Naive Bayes

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Training model Naive Bayes
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Evaluasi model
predictions = model.predict(X_test_tfidf)
print("\nClassification Report:\n", classification_report(y_test, predictions, target_names=le.classes_))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, predictions))
print("\nAccuracy Score:", accuracy_score(y_test, predictions))


Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.75      0.34      0.47      2598
    Negative       0.62      0.81      0.70      4509
     Neutral       0.67      0.51      0.58      3664
    Positive       0.61      0.74      0.67      4166

    accuracy                           0.64     14937
   macro avg       0.66      0.60      0.61     14937
weighted avg       0.65      0.64      0.62     14937


Confusion Matrix:
 [[ 879  721  289  709]
 [  64 3663  304  478]
 [ 141  858 1872  793]
 [  82  645  340 3099]]

Accuracy Score: 0.6368748744727857


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# reducing data dengan PCA
svd = TruncatedSVD(n_components=300, random_state=42)
X_train_reduced = svd.fit_transform(X_train_tfidf)
X_test_reduced = svd.transform(X_test_tfidf)


def train_and_evaluate(model, model_name, X_train_data, X_test_data): # Added X_train_data and X_test_data as parameters
    print(f"\nTraining {model_name}...")
    if model_name == 'LinearSVC':  # Changed from 'Linear SVM'
        X_train_data = X_train_reduced
        X_test_data = X_test_reduced
    else:
        X_train_data = X_train_tfidf
        X_test_data = X_test_tfidf
    model.fit(X_train_data, y_train) # Changed to use X_train_data
    predictions = model.predict(X_test_data) # Changed to use X_test_data
    print(f"\n{model_name} - Classification Report:\n", classification_report(y_test, predictions, target_names=le.classes_))
    print(f"\n{model_name} - Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    print(f"\n{model_name} - Accuracy Score:", accuracy_score(y_test, predictions))

# Daftar model yang akan diuji
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    # 'Support Vector Machine': SVC(kernel='linear', random_state=42)
    'LinearSVC': LinearSVC(random_state=42)
}

# Melatih dan mengevaluasi setiap model
for model_name, model in models.items():
    if model_name == 'LinearSVC': # Changed from 'Linear SVM' to match the model dictionary key
        # Gunakan data yang sudah direduksi untuk SVM
        train_and_evaluate(model, model_name, X_train_reduced, X_test_reduced)
    else:
        # Gunakan data asli untuk model lainnya
        train_and_evaluate(model, model_name, X_train_tfidf, X_test_tfidf)


Training Naive Bayes...

Naive Bayes - Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.75      0.34      0.47      2598
    Negative       0.62      0.81      0.70      4509
     Neutral       0.67      0.51      0.58      3664
    Positive       0.61      0.74      0.67      4166

    accuracy                           0.64     14937
   macro avg       0.66      0.60      0.61     14937
weighted avg       0.65      0.64      0.62     14937


Naive Bayes - Confusion Matrix:
 [[ 879  721  289  709]
 [  64 3663  304  478]
 [ 141  858 1872  793]
 [  82  645  340 3099]]

Naive Bayes - Accuracy Score: 0.6368748744727857

Training Logistic Regression...

Logistic Regression - Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.67      0.52      0.59      2598
    Negative       0.72      0.78      0.75      4509
     Neutral       0.63      0.64      0.64      3664
    Positive       0.69    

In [8]:
# Training SVM tanpa PCA
print("\nTraining SVM without PCA...")
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_tfidf, y_train)
svm_predictions = svm_model.predict(X_test_tfidf)
print("\nSVM without PCA - Classification Report:\n", classification_report(y_test, svm_predictions, target_names=le.classes_))
print("\nSVM without PCA - Confusion Matrix:\n", confusion_matrix(y_test, svm_predictions))
print("\nSVM without PCA - Accuracy Score:", accuracy_score(y_test, svm_predictions))



Training SVM without PCA...

SVM without PCA - Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.69      0.58      0.63      2598
    Negative       0.74      0.79      0.77      4509
     Neutral       0.67      0.67      0.67      3664
    Positive       0.71      0.73      0.72      4166

    accuracy                           0.71     14937
   macro avg       0.70      0.69      0.70     14937
weighted avg       0.71      0.71      0.71     14937


SVM without PCA - Confusion Matrix:
 [[1499  347  315  437]
 [ 204 3566  406  333]
 [ 260  481 2438  485]
 [ 222  411  474 3059]]

SVM without PCA - Accuracy Score: 0.7071031666331927


In [10]:
# Training SVM tanpa PCA
print("\nTraining SVM without PCA...")
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train_tfidf, y_train)
svm_predictions = svm_model.predict(X_test_tfidf)
print("\nLinearSVC without PCA - Classification Report:\n", classification_report(y_test, svm_predictions, target_names=le.classes_))
print("\nLinearSVC without PCA - Confusion Matrix:\n", confusion_matrix(y_test, svm_predictions))
print("\nLinearSVC without PCA - Accuracy Score:", accuracy_score(y_test, svm_predictions))


Training SVM without PCA...

LinearSVC without PCA - Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.71      0.59      0.64      2598
    Negative       0.75      0.78      0.77      4509
     Neutral       0.66      0.66      0.66      3664
    Positive       0.70      0.74      0.72      4166

    accuracy                           0.71     14937
   macro avg       0.70      0.69      0.70     14937
weighted avg       0.71      0.71      0.70     14937


LinearSVC without PCA - Confusion Matrix:
 [[1520  319  324  435]
 [ 169 3538  429  373]
 [ 262  460 2420  522]
 [ 189  407  504 3066]]

LinearSVC without PCA - Accuracy Score: 0.7058981053759121


In [14]:
from scipy.stats import uniform
# Tuning Hyperparameter untuk LinearSVC
param_dist = {
    'C': uniform(0.01, 10),
    'penalty': ['l2'],  # 'l1' tidak didukung tanpa liblinear
    'loss': ['hinge', 'squared_hinge']
}

svc = LinearSVC(random_state=42, max_iter=1000)
random_search = RandomizedSearchCV(
    svc,
    param_distributions=param_dist,
    n_iter=20,
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
    random_state=42,
    verbose=2
)

print("\nTuning LinearSVC with RandomizedSearchCV...")
random_search.fit(X_train_tfidf, y_train)
print("\nBest Parameters:", random_search.best_params_)
print("\nBest Cross-Validation Score:", random_search.best_score_)

# Evaluasi dengan parameter terbaik
best_svc = random_search.best_estimator_
predictions = best_svc.predict(X_test_tfidf)
print("\nBest LinearSVC - Classification Report:\n", classification_report(y_test, predictions, target_names=le.classes_))
print("\nBest LinearSVC - Confusion Matrix:\n", confusion_matrix(y_test, predictions))
print("\nBest LinearSVC - Accuracy Score:", accuracy_score(y_test, predictions))



Tuning LinearSVC with RandomizedSearchCV...
Fitting 3 folds for each of 20 candidates, totalling 60 fits

Best Parameters: {'C': np.float64(9.74755518841459), 'loss': 'hinge', 'penalty': 'l2'}

Best Cross-Validation Score: 0.697844119117175

Best LinearSVC - Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.65      0.61      0.63      2598
    Negative       0.76      0.79      0.77      4509
     Neutral       0.68      0.64      0.66      3664
    Positive       0.71      0.73      0.72      4166

    accuracy                           0.71     14937
   macro avg       0.70      0.69      0.69     14937
weighted avg       0.70      0.71      0.70     14937


Best LinearSVC - Confusion Matrix:
 [[1585  307  305  401]
 [ 238 3551  366  354]
 [ 350  457 2350  507]
 [ 266  387  454 3059]]

Best LinearSVC - Accuracy Score: 0.7059650532235389




In [3]:
# Install necessary libraries
!pip install torch transformers scikit-learn pandas numpy
!pip install --upgrade transformers

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pandas as pd
import numpy as np
import random
import os



In [7]:
# Set random seeds for reproducibility
random.seed(42)
torch.manual_seed(42)
os.environ['PYTHONHASHSEED'] = str(42)

# Load and preprocess the dataset
file_path = 'twitter_training_updeted.csv'
df = pd.read_csv(file_path)

def preprocess_text(text):
    if isinstance(text, float):
        text = str(text)
    return text.lower()

df['clean_text'] = df['text'].apply(preprocess_text)

# Encode labels
label_mapping = {label: idx for idx, label in enumerate(df['sentiment'].unique())}
df['label'] = df['sentiment'].map(label_mapping)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

# Load Tiny BERT model and tokenizer
model_name = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_mapping))

# Tokenize data
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128)

# Create Torch datasets
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = SentimentDataset(train_encodings, y_train.tolist())
test_dataset = SentimentDataset(test_encodings, y_test.tolist())

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    save_total_limit=1, # Only keep the best model after each epoch
    eval_steps=100,  # Evaluate every 100 steps. Adjust as needed.
    report_to="none"  # Nonaktifkan wandb
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Evaluate the model
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=label_mapping.keys()))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,1.3673
200,1.3584
300,1.3423
400,1.3204
500,1.3048
600,1.2878
700,1.257
800,1.2189
900,1.2041
1000,1.1907



Classification Report:
               precision    recall  f1-score   support

    Positive       0.71      0.75      0.73      4166
     Neutral       0.71      0.60      0.65      3664
    Negative       0.76      0.81      0.79      4509
  Irrelevant       0.62      0.62      0.62      2598

    accuracy                           0.71     14937
   macro avg       0.70      0.70      0.70     14937
weighted avg       0.71      0.71      0.71     14937


Confusion Matrix:
 [[3142  343  412  269]
 [ 496 2207  474  487]
 [ 368  236 3673  232]
 [ 412  303  279 1604]]

Accuracy Score: 0.7113878288813015
