# INFORMASI
### Metode Machine Learning
### Dilatih dengan skema *combined training* (gabungan data YouTube dan Twitter)

In [10]:
import pandas as pd
import numpy as np

In [11]:
# Read the Twitter and YouTube datasets from CSV (paths are relative to the notebook).
df_twitter, df_yt = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/final_data_twitter.csv', '../data/final_data_yt.csv')
)

In [12]:
# Discard every Twitter row that has a missing value in any column.
df_twitter = df_twitter.dropna(axis=0, how='any')

In [13]:
# Print column dtypes and non-null counts of the Twitter frame after dropping NaN rows.
df_twitter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13005 entries, 0 to 13016
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   text             13005 non-null  object
 1   cyberbullying    13005 non-null  int64 
 2   length           13005 non-null  int64 
 3   clean_text_bert  13005 non-null  object
 4   clean_text_ML    13005 non-null  object
 5   clean_text_ML_2  13005 non-null  object
dtypes: int64(2), object(4)
memory usage: 711.2+ KB


In [14]:
# Print column dtypes and non-null counts of the YouTube frame.
df_yt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 650 entries, 0 to 649
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   text             650 non-null    object
 1   cyberbullying    650 non-null    int64 
 2   clean_text_bert  650 non-null    object
 3   clean_text_ML    650 non-null    object
 4   clean_text_ML_2  650 non-null    object
dtypes: int64(1), object(4)
memory usage: 25.5+ KB


In [15]:
# Tag every row with its source platform so the test set can be split per platform later.
df_twitter = df_twitter.assign(platform='twitter')
df_yt = df_yt.assign(platform='youtube')

# Stack both sources row-wise under a fresh 0..n-1 index.
df_all = pd.concat([df_twitter, df_yt], axis=0, ignore_index=True)
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13655 entries, 0 to 13654
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   text             13655 non-null  object 
 1   cyberbullying    13655 non-null  int64  
 2   length           13005 non-null  float64
 3   clean_text_bert  13655 non-null  object 
 4   clean_text_ML    13655 non-null  object 
 5   clean_text_ML_2  13655 non-null  object 
 6   platform         13655 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 746.9+ KB


In [16]:
from sklearn.model_selection import train_test_split

# Features are the cleaned text; the target is the binary cyberbullying label.
X, y = df_all['clean_text_ML_2'], df_all['cyberbullying']

# Step 1: hold out 20% of all rows as the test set (stratified on the label).
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Step 2: split the remaining rows again into train and validation (80/20, stratified).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.2,
    stratify=y_train_val,
    random_state=42,
)


In [17]:
# Boolean masks over the combined dataset, one per source platform.
mask_twitter = df_all['platform'].eq('twitter')
mask_youtube = df_all['platform'].eq('youtube')

# Restrict each mask to the test rows and use it to carve out the
# per-platform test features and labels.
X_test_twitter, y_test_twitter = (
    part[mask_twitter.loc[X_test.index]] for part in (X_test, y_test)
)
X_test_youtube, y_test_youtube = (
    part[mask_youtube.loc[X_test.index]] for part in (X_test, y_test)
)


# Text Representation

In [18]:
import gensim

def sentence_vector(text, embedding_model, dim=100):
    """Embed a sentence as the element-wise mean of its word vectors.

    Parameters
    ----------
    text : str
        Whitespace-separated sentence. Any non-string value (for example the
        float NaN that pandas produces for an empty CSV cell, or None) is
        treated as a sentence without tokens.
    embedding_model : mapping-like
        Object supporting ``word in embedding_model`` and
        ``embedding_model[word]``.
    dim : int, default 100
        Length of the zero vector returned when no token has an embedding.
        If ``embedding_model`` exposes a ``vector_size`` attribute, that value
        is used instead so the fallback always matches the real embeddings.

    Returns
    -------
    numpy.ndarray
        1-D array: the mean of the found word vectors, or zeros when none
        of the tokens is known to the model.
    """
    # Prefer the model's own dimensionality: a mismatching `dim` (e.g. the
    # default 100 with a 50-d model) would make the rows impossible to stack.
    model_dim = getattr(embedding_model, 'vector_size', None)
    if model_dim:
        dim = model_dim

    # Non-string input has no tokens; avoid AttributeError on NaN/None.
    words = text.split() if isinstance(text, str) else []

    vectors = [embedding_model[w] for w in words if w in embedding_model]

    # No token has an embedding -> return a zero vector.
    if not vectors:
        return np.zeros(dim)

    # Average the word embeddings.
    return np.mean(vectors, axis=0)

#### TF-IDF

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF; terms in fewer than 5 documents or in more than
# 90% of documents are dropped from the vocabulary.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.9)

# The vocabulary and IDF weights are learned from the training split only;
# validation and test are transformed with the fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (tfidf.transform(split) for split in (X_val, X_test))

#### GloVe

In [20]:
from gensim.models import KeyedVectors
def load_glove(glove_file, dim=50):
    """Load a plain-text GloVe file into a gensim ``KeyedVectors``.

    Each usable line has the form ``word v1 v2 ... v<dim>`` (whitespace
    separated).

    Parameters
    ----------
    glove_file : str
        Path to the UTF-8 encoded GloVe text file.
    dim : int, default 50
        Number of vector components expected per word.

    Returns
    -------
    gensim.models.KeyedVectors
        Model with ``vector_size == dim`` holding one float32 vector per word.

    Notes
    -----
    * Blank lines are ignored. Lines whose field count is not ``dim + 1``
      (for example a word2vec-style header) are skipped, and a single
      warning reports how many were skipped.
    * If a word occurs more than once, the first occurrence wins.
    * All vectors are added with one ``add_vectors`` call. Adding words one
      at a time makes gensim regrow its matrix on every call.
    """
    import warnings

    words = []
    rows = []
    seen = set()
    skipped = 0

    with open(glove_file, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # blank line
            if len(values) != dim + 1:
                skipped += 1  # header or malformed entry
                continue
            word = values[0]
            if word in seen:
                continue  # keep the first vector seen for a word
            seen.add(word)
            words.append(word)
            rows.append(np.asarray(values[1:], dtype='float32'))

    if skipped:
        warnings.warn(
            f"load_glove: skipped {skipped} line(s) in {glove_file!r} "
            f"that did not contain exactly {dim} vector components."
        )

    glove_model = KeyedVectors(vector_size=dim)
    if words:
        # One batched insert instead of one matrix reallocation per word.
        glove_model.add_vectors(words, np.vstack(rows))

    return glove_model

# Load the pretrained 50-dimensional GloVe vectors.
glove_vectors = load_glove("../model pretrained/glove/glove_50dim_wiki.id.case.text.txt", dim=50)

# One averaged sentence embedding per document, stacked into a 2-D matrix per split.
X_train_glove, X_val_glove, X_test_glove = (
    np.vstack([sentence_vector(s, glove_vectors, dim=50) for s in split])
    for split in (X_train, X_val, X_test)
)



#### FastText

In [21]:
from gensim.models import FastText
# Pretrained on Common Crawl data.
# NOTE(review): only `.wv` is used in the cells visible here; gensim's
# `load_facebook_vectors` may be a lighter alternative — confirm nothing else
# needs the full `fasttext_model` before switching.
fasttext_model = gensim.models.fasttext.load_facebook_model("../model pretrained/fasttext/cc.id.300.bin")
fasttext_vectors = fasttext_model.wv
fasttext_dim = fasttext_vectors.vector_size

# One averaged sentence embedding per document, stacked into a 2-D matrix per split.
X_train_fasttext, X_val_fasttext, X_test_fasttext = (
    np.vstack([sentence_vector(text, fasttext_vectors, dim=fasttext_dim) for text in split])
    for split in (X_train, X_val, X_test)
)

#### Word2Vec

In [22]:
from gensim.models import Word2Vec

# Load the saved Word2Vec model and keep a handle on its word vectors.
w2v_wiki = Word2Vec.load("../model pretrained/word2vec/idwiki_word2vec_100/idwiki_word2vec_100.model")
w2v_vectors = w2v_wiki.wv
w2v_dim = w2v_vectors.vector_size

# One averaged sentence embedding per document, stacked into a 2-D matrix per split.
X_train_w2v, X_val_w2v, X_test_w2v = (
    np.vstack([sentence_vector(text, w2v_vectors, dim=w2v_dim) for text in split])
    for split in (X_train, X_val, X_test)
)

# Modeling

#### Baseline

In [23]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Text representations: name -> (train features, validation features).
# TF-IDF is densified here, presumably so that every model (including
# GaussianNB) receives a plain ndarray.
representations = {
    'TF-IDF': (X_train_tfidf.toarray(), X_val_tfidf.toarray()),
    'Glove Wikipedia': (X_train_glove, X_val_glove),
    'FastText CC (Common Crawl)': (X_train_fasttext, X_val_fasttext),
    'Word2Vec Wikipedia': (X_train_w2v, X_val_w2v)
}

# Baseline models (no class-imbalance handling).
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# ignores it and only prints a "Parameters ... are not used" warning per fit.
models = {
    'SVM': SVC(kernel='linear', random_state=42),
    'Naive Bayes': GaussianNB(),
    'XGBoost': XGBClassifier(eval_metric='auc', random_state=42)
}

results = []

# Fit every model on every representation and score it on the validation split.
for model_name, model in models.items():
    for rep_name, (X_tr, X_vl) in representations.items():
        model.fit(X_tr, y_train)
        y_pred = model.predict(X_vl)

        # Precision and recall use scikit-learn's default averaging for these
        # calls (no `average` argument is passed); F1 is macro-averaged.
        results.append({
            'Model': model_name,
            'Text Representation': rep_name,
            'Accuracy': accuracy_score(y_val, y_pred),
            'Precision': precision_score(y_val, y_pred, zero_division=0),
            'Recall': recall_score(y_val, y_pred, zero_division=0),
            'F1 Macro': f1_score(y_val, y_pred, average='macro', zero_division=0)
        })

# Rank the combinations by macro F1, best first.
results_df = (
    pd.DataFrame(results)
    .sort_values(by='F1 Macro', ascending=False)
    .reset_index(drop=True)
)

results_df

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Unnamed: 0,Model,Text Representation,Accuracy,Precision,Recall,F1 Macro
0,SVM,TF-IDF,0.826087,0.812283,0.762786,0.819962
1,XGBoost,TF-IDF,0.814645,0.841755,0.688792,0.803787
2,XGBoost,FastText CC (Common Crawl),0.767506,0.748489,0.673558,0.757726
3,SVM,FastText CC (Common Crawl),0.762929,0.764861,0.630033,0.749326
4,XGBoost,Word2Vec Wikipedia,0.700229,0.658654,0.5963,0.687916
5,SVM,Word2Vec Wikipedia,0.687414,0.65051,0.554951,0.671425
6,XGBoost,Glove Wikipedia,0.681007,0.632143,0.577802,0.668402
7,Naive Bayes,TF-IDF,0.639817,0.543478,0.897715,0.634961
8,SVM,Glove Wikipedia,0.657208,0.62574,0.460283,0.630249
9,Naive Bayes,Glove Wikipedia,0.623799,0.546147,0.624592,0.620121


#### Handling Class Imbalance (Balanced Class Weights)

In [24]:
# Ratio of negative to positive training samples, passed to XGBoost as
# `scale_pos_weight`.
pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

# Class-weighted variants of the baseline models.
# - GaussianNB is constructed exactly as in the baseline (no weighting
#   argument is passed), so its scores are the same as in the baseline table.
# - `use_label_encoder` is intentionally not passed: the installed XGBoost
#   ignores it and only prints a "Parameters ... are not used" warning per fit.
models_balanced = {
    'SVM': SVC(kernel='linear', class_weight='balanced', random_state=42),
    'Naive Bayes': GaussianNB(),
    'XGBoost': XGBClassifier(
        eval_metric='auc',
        scale_pos_weight=pos_weight,
        random_state=42
    )
}

results_class_weight = []

# Fit every model on every representation and score it on the validation split.
for model_name, model in models_balanced.items():
    for rep_name, (X_tr, X_vl) in representations.items():
        model.fit(X_tr, y_train)
        y_pred = model.predict(X_vl)

        # Precision and recall use scikit-learn's default averaging for these
        # calls (no `average` argument is passed); F1 is macro-averaged.
        results_class_weight.append({
            'Model': model_name,
            'Text Representation': rep_name,
            'Accuracy': accuracy_score(y_val, y_pred),
            'Precision': precision_score(y_val, y_pred, zero_division=0),
            'Recall': recall_score(y_val, y_pred, zero_division=0),
            'F1 Macro': f1_score(y_val, y_pred, average='macro', zero_division=0)
        })

# Rank the combinations by macro F1, best first.
results_class_weight_df = (
    pd.DataFrame(results_class_weight)
    .sort_values(by='F1 Macro', ascending=False)
    .reset_index(drop=True)
)

results_class_weight_df

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Unnamed: 0,Model,Text Representation,Accuracy,Precision,Recall,F1 Macro
0,SVM,TF-IDF,0.827002,0.785037,0.810664,0.823283
1,XGBoost,TF-IDF,0.810526,0.800954,0.73123,0.803003
2,XGBoost,FastText CC (Common Crawl),0.767048,0.730856,0.706202,0.759861
3,SVM,FastText CC (Common Crawl),0.752403,0.698529,0.723613,0.747182
4,XGBoost,Word2Vec Wikipedia,0.696568,0.644796,0.620239,0.687001
5,SVM,Word2Vec Wikipedia,0.678261,0.602857,0.688792,0.675086
6,XGBoost,Glove Wikipedia,0.682838,0.623904,0.619151,0.674288
7,SVM,Glove Wikipedia,0.645767,0.568851,0.651795,0.642368
8,Naive Bayes,TF-IDF,0.639817,0.543478,0.897715,0.634961
9,Naive Bayes,Glove Wikipedia,0.623799,0.546147,0.624592,0.620121


# Testing dan Hasil

In [26]:
# 1. Map each test-row index label to its row position in the test feature matrices.
test_idx = X_test.index
test_pos_map = {idx: pos for pos, idx in enumerate(test_idx)}

# 2. Row positions of the Twitter and YouTube samples inside the test matrices.
twitter_positions = [test_pos_map[i] for i in X_test_twitter.index]
youtube_positions = [test_pos_map[i] for i in X_test_youtube.index]

# 3. Per-platform test features for every text representation.
#    For TF-IDF the sparse rows are selected first and only that subset is
#    densified, instead of converting the whole test matrix to dense twice.
representations_test_twitter = {
    'TF-IDF': X_test_tfidf[twitter_positions].toarray(),
    'Glove Wikipedia': X_test_glove[twitter_positions],
    'FastText CC (Common Crawl)': X_test_fasttext[twitter_positions],
    'Word2Vec Wikipedia': X_test_w2v[twitter_positions]
}

representations_test_youtube = {
    'TF-IDF': X_test_tfidf[youtube_positions].toarray(),
    'Glove Wikipedia': X_test_glove[youtube_positions],
    'FastText CC (Common Crawl)': X_test_fasttext[youtube_positions],
    'Word2Vec Wikipedia': X_test_w2v[youtube_positions]
}


In [27]:
def _platform_metrics(prefix, y_true, y_pred):
    """Return accuracy, precision, recall and macro-F1 keyed as ``<prefix>_<Metric>``."""
    return {
        f'{prefix}_Accuracy': accuracy_score(y_true, y_pred),
        f'{prefix}_Precision': precision_score(y_true, y_pred, zero_division=0),
        f'{prefix}_Recall': recall_score(y_true, y_pred, zero_division=0),
        f'{prefix}_F1': f1_score(y_true, y_pred, average='macro', zero_division=0),
    }


# (column prefix, per-representation test features, test labels) for each platform.
# Order matters: Twitter columns come before YouTube columns in the result table.
platform_test_sets = (
    ('Twitter', representations_test_twitter, y_test_twitter),
    ('YouTube', representations_test_youtube, y_test_youtube),
)

results_final = []

for model_name, model in models_balanced.items():
    for rep_name, (X_tr, _) in representations.items():

        # Refit the class-weighted model on the training split only
        # (the validation split is not part of X_tr / y_train).
        model.fit(X_tr, y_train)

        row = {'Model': model_name, 'Representation': rep_name}

        # Score the same fitted model separately on each platform's test rows.
        for prefix, platform_features, y_true in platform_test_sets:
            y_pred = model.predict(platform_features[rep_name])
            row.update(_platform_metrics(prefix, y_true, y_pred))

        results_final.append(row)

# Rank the combinations by macro F1 on the YouTube test rows, best first.
results_final_df = (
    pd.DataFrame(results_final)
    .sort_values(by='YouTube_F1', ascending=False)
    .reset_index(drop=True)
)

results_final_df

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Unnamed: 0,Model,Representation,Twitter_Accuracy,Twitter_Precision,Twitter_Recall,Twitter_F1,YouTube_Accuracy,YouTube_Precision,YouTube_Recall,YouTube_F1
0,SVM,TF-IDF,0.820414,0.775758,0.81086,0.817192,0.664,0.526316,0.454545,0.618902
1,SVM,FastText CC (Common Crawl),0.770146,0.716239,0.758371,0.766377,0.664,0.541667,0.295455,0.575792
2,XGBoost,FastText CC (Common Crawl),0.772448,0.738361,0.717647,0.76617,0.64,0.483871,0.340909,0.571429
3,SVM,Word2Vec Wikipedia,0.694167,0.616667,0.736652,0.692685,0.592,0.414634,0.386364,0.545455
4,XGBoost,Glove Wikipedia,0.698005,0.642729,0.647964,0.691194,0.608,0.424242,0.318182,0.5402
5,Naive Bayes,TF-IDF,0.64505,0.550056,0.895023,0.640379,0.528,0.411765,0.795455,0.527516
6,XGBoost,Word2Vec Wikipedia,0.706447,0.653153,0.656109,0.699686,0.568,0.368421,0.318182,0.510017
7,XGBoost,TF-IDF,0.816961,0.809055,0.743891,0.810394,0.6,0.384615,0.227273,0.503968
8,SVM,Glove Wikipedia,0.66155,0.585441,0.691403,0.659625,0.568,0.352941,0.272727,0.496869
9,Naive Bayes,Glove Wikipedia,0.654643,0.580266,0.670588,0.652073,0.536,0.315789,0.272727,0.473722
