# INFORMASI
### LSTM
### Fine Tuning pada data Twitter, pergunakan weights nya lalu fine-tune kembali pada data YouTube

In [2]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from sklearn.model_selection import train_test_split

# --- Konfigurasi ---
MAX_NB_WORDS = 5000
MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 100

# --- Load Data Twitter ---
df_twitter = pd.read_csv('final_data_twitter.csv')
df_twitter = df_twitter.dropna(subset=['clean_text_ML_2'])



In [3]:
# --- Tokenisasi (Fit pada data Twitter) ---
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True)
tokenizer.fit_on_texts(df_twitter['clean_text_ML_2'].values)

X_tw = tokenizer.texts_to_sequences(df_twitter['clean_text_ML_2'].values)
X_tw = pad_sequences(X_tw, maxlen=MAX_SEQUENCE_LENGTH)
y_tw = df_twitter['cyberbullying'].values

# Split Data Twitter
X_temp_tw, X_test_tw, y_temp_tw, y_test_tw = train_test_split(X_tw, y_tw, test_size=0.2, random_state=42)
X_train_tw, X_val_tw, y_train_tw, y_val_tw = train_test_split(X_temp_tw, y_temp_tw, test_size=1/8, random_state=42)


In [4]:
from tensorflow.keras.metrics import Precision, Recall

def f1_metric(y_true, y_pred):
    y_pred = tf.round(y_pred)

    tp = tf.reduce_sum(tf.cast(y_true * y_pred, 'float32'))
    fp = tf.reduce_sum(tf.cast((1 - y_true) * y_pred, 'float32'))
    fn = tf.reduce_sum(tf.cast(y_true * (1 - y_pred), 'float32'))

    precision = tp / (tp + fp + 1e-7)
    recall = tp / (tp + fn + 1e-7)

    return 2 * precision * recall / (precision + recall + 1e-7)


# --- Membangun Model LSTM ---
def create_model():
    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',
        Precision(name="precision"),
        Recall(name="recall"),
        f1_metric
        ])
    return model


In [5]:
# Train Model pada Twitter
model_tw = create_model()
print("Training Model pada Data Twitter...")
model_tw.fit(X_train_tw, y_train_tw, epochs=8, batch_size=64, validation_data=(X_val_tw, y_val_tw), verbose=1)

Training Model pada Data Twitter...
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x2ac0bd9bc10>

In [6]:
model_tw.evaluate(X_test_tw, y_test_tw, verbose=1)




[0.6305282115936279,
 0.794773280620575,
 0.758152186870575,
 0.758152186870575,
 0.7529876232147217]

In [7]:

# --- Simpan Weights ---
model_tw.save_weights('twitter_weights.h5')
print("Bobot model Twitter berhasil disimpan.")

Bobot model Twitter berhasil disimpan.


In [8]:
# --- Load Data YouTube ---
df_yt = pd.read_csv('final_data_yt.csv')
df_yt = df_yt.dropna(subset=['clean_text_ML_2'])

# --- Preprocessing YouTube (Pakai Tokenizer Twitter) ---
X_yt = tokenizer.texts_to_sequences(df_yt['clean_text_ML_2'].values)
X_yt = pad_sequences(X_yt, maxlen=MAX_SEQUENCE_LENGTH)
y_yt = df_yt['cyberbullying'].values

# Split Data YouTube
X_temp_yt, X_test_yt, y_temp_yt, y_test_yt = train_test_split(X_yt, y_yt, test_size=0.2, random_state=42)
X_train_yt, X_val_yt, y_train_yt, y_val_yt = train_test_split(X_temp_yt, y_temp_yt, test_size=1/8, random_state=42)





In [9]:
# --- Load Model & Weights ---
model_finetune = create_model()

# Trik: Jalankan dummy input agar struktur model terbangun sebelum load weights
dummy_input = np.zeros((1, MAX_SEQUENCE_LENGTH))
model_finetune(dummy_input)



<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.50266695]], dtype=float32)>

In [10]:

# Load bobot dari Twitter
model_finetune.load_weights('twitter_weights.h5')
print("\nBobot Twitter dimuat. Mulai Fine-tuning pada Data YouTube...")


Bobot Twitter dimuat. Mulai Fine-tuning pada Data YouTube...


In [11]:
from tensorflow.keras.callbacks import EarlyStopping

# Early Stopping
es = EarlyStopping(
    monitor='val_loss',      # metric yang dipantau
    patience=3,              # stop jika 2 epoch berturut-turut tidak membaik
    restore_best_weights=True # kembalikan weight terbaik (recommended)
)

# --- Fine-Tuning ---
# Kita lanjutkan training dengan data YouTube
history_yt = model_finetune.fit(X_train_yt, y_train_yt, epochs=8, batch_size=64, validation_data=(X_val_yt, y_val_yt), callbacks=[es], verbose=1)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [12]:
model_finetune.evaluate(X_test_yt, y_test_yt, verbose=1)



[0.7284189462661743,
 0.6615384817123413,
 0.4878048896789551,
 0.4651162922382355,
 0.36546745896339417]