In [4]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from gensim.models import Word2Vec
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Load dataset
def load_dataset(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return pd.DataFrame(data)

# Convert integer list to string
def int_list_to_str(int_list):
    return ' '.join(map(str, int_list))


# Load data
set1_human = load_dataset("./data/set1_human.json")
set1_machine = load_dataset("./data/set1_machine.json")
set2_human = load_dataset("./data/set2_human.json")
set2_machine = load_dataset("./data/set2_machine.json")

# Label data and combine
set1_human["label"] = 1
set1_machine["label"] = 0
set2_human["label"] = 1
set2_machine["label"] = 0

dataset1 = pd.concat([set1_human, set1_machine], ignore_index=True)
dataset2 = pd.concat([set2_human, set2_machine], ignore_index=True)

# Convert integer lists to strings
dataset1['txt'] = dataset1['txt'].apply(int_list_to_str)
dataset2['txt'] = dataset2['txt'].apply(int_list_to_str)
dataset1['prompt'] = dataset1['prompt'].apply(int_list_to_str)
dataset2['prompt'] = dataset2['prompt'].apply(int_list_to_str)

In [5]:
whole_dataset = pd.concat([dataset1, dataset2], ignore_index=True)
whole_dataset['combined'] = whole_dataset['txt'] + ' ' + whole_dataset['prompt']
# # Word2Vec
# all_text = [text.split() for text in whole_dataset['combined'].values]
# word2vec_model = Word2Vec(all_text, vector_size=100, window=5, min_count=1, workers=4)

# # Save model
# word2vec_model.save("word2vec.model")

# Load model
word2vec_model = Word2Vec.load("word2vec.model")

def vectorize(dataset, word2vec_model, has_label=True):
    X_prompt = dataset['prompt'].values
    X_txt = dataset['txt'].values
    X_prompt = [text.split() for text in X_prompt]
    X_txt = [text.split() for text in X_txt]
    # prompt may be empty, use zero vector to represent
    X_prompt = np.array([np.mean([word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv],
                                 axis=0) if tokens else np.zeros(100) for tokens in X_prompt])
    X_txt = np.array([np.mean([word2vec_model.wv[token] 
                               for token in tokens if token in word2vec_model.wv], axis=0) for tokens in X_txt])
    promt_len = np.expand_dims(np.array([len(text.split()) for text in dataset['prompt'].values]), axis=1)
    txt_len = np.expand_dims(np.array([len(text.split()) for text in dataset['txt'].values]), axis=1)
    X = np.concatenate((X_prompt, X_txt, promt_len, txt_len), axis=1)
    if has_label:
        y = dataset['label'].values
        return X, y
    else:
        return X

In [42]:
# Split dataset 1 and dataset 2
dataset1_train, dataset1_test = train_test_split(
    dataset1, test_size=0.2, random_state=90051)
dataset2_train, dataset2_test = train_test_split(
    dataset2, test_size=0.2, random_state=90051)

# Preprocess dataset 1 and dataset 2
X_train1, y_train1 = vectorize(dataset1_train, word2vec_model)
X_test1, y_test1 = vectorize(dataset1_test, word2vec_model)
X_train2, y_train2 = vectorize(dataset2_train, word2vec_model)
X_test2, y_test2 = vectorize(dataset2_test, word2vec_model)
X_train1_whole, y_train1_whole = vectorize(dataset1, word2vec_model)
X_train2_whole, y_train2_whole = vectorize(dataset2, word2vec_model)
print("done vectorize!")

done vectorize!


In [47]:
# Method 2: tansfer learning/ domain adaptation
# Train a LightGBM model on dataset 1
X_train1, y_train1 = vectorize(dataset1_train, word2vec_model)
X_test1, y_test1 = vectorize(dataset1_test, word2vec_model)
X_train2, y_train2 = vectorize(dataset2_train, word2vec_model)
X_test2, y_test2 = vectorize(dataset2_test, word2vec_model)

lgbm1 = LGBMClassifier(objective='binary', n_estimators=500, 
                       boosting_type='gbdt', is_unbalance=True,
                       reg_alpha=0, reg_lambda=0.1)
lgbm1.fit(X_train1, y_train1)

# Train a LightGBM model on dataset 2 using the new labels
lgbm2 = LGBMClassifier(objective='binary', n_estimators=500,
                       boosting_type='gbdt', is_unbalance=True,
                       reg_alpha=0.5, reg_lambda=1, learning_rate=0.01)
lgbm2.fit(X_train2, y_train2, init_model=lgbm1)

# Predict and evaluate on dataset 1 test set
print("*** Summary on dataset 1 test set:")
y_pred1 = lgbm1.predict(X_test1)
print(classification_report(y_test1, y_pred1))
print(confusion_matrix(y_test1, y_pred1))

# Predict and evaluate on dataset 2 test set
print("\n*** Summary on dataset 2 test set:")
y_pred2 = lgbm2.predict(X_test2)
print(classification_report(y_test2, y_pred2))
print(confusion_matrix(y_test2, y_pred2))

*** Summary on dataset 1 test set:
              precision    recall  f1-score   support

           0       0.89      0.82      0.85       691
           1       0.99      1.00      1.00     24526

    accuracy                           0.99     25217
   macro avg       0.94      0.91      0.93     25217
weighted avg       0.99      0.99      0.99     25217

[[  567   124]
 [   69 24457]]

*** Summary on dataset 2 test set:
              precision    recall  f1-score   support

           0       0.84      0.92      0.87        72
           1       0.71      0.54      0.61        28

    accuracy                           0.81       100
   macro avg       0.77      0.73      0.74       100
weighted avg       0.80      0.81      0.80       100

[[66  6]
 [13 15]]


In [48]:
# OUTPUT to CSV
# use the whole dataset2 to train model2
lgbm1 = LGBMClassifier(objective='binary', n_estimators=500,
                       boosting_type='gbdt', is_unbalance=True,
                       reg_alpha=0, reg_lambda=0.1)
lgbm1.fit(X_train1_whole, y_train1_whole)

lgbm2 = LGBMClassifier(objective='binary', n_estimators=500, 
                       boosting_type='gbdt', is_unbalance=True,
                       reg_alpha=0.5, reg_lambda=1, learning_rate=0.01)
lgbm2.fit(X_train2_whole, y_train2_whole, init_model=lgbm1)

# Load test set
test_data = load_dataset("./data/test.json")
test_data['txt'] = test_data['txt'].apply(int_list_to_str)
test_data['prompt'] = test_data['prompt'].apply(int_list_to_str)
test_data1 = test_data.iloc[:600]
test_data2 = test_data.iloc[600:]

# Preprocess test set
X_test1 = vectorize(test_data1, word2vec_model, has_label=False)
X_test2 = vectorize(test_data2, word2vec_model, has_label=False)

y_pred1 = lgbm1.predict(X_test1)
y_pred2 = lgbm2.predict(X_test2)

# Combine predictions for both domains
y_pred_test = np.concatenate([y_pred1, y_pred2])

# Save predictions to a CSV file
output_df = pd.DataFrame(
    {"Id": range(len(y_pred_test)), "Predicted": y_pred_test})
filename = "LGB_domain_adaption_l1l2_all_data.csv"
output_df.to_csv(filename, index=False)
print("Predictions saved to file:", filename)

Predictions saved to file: LGB_domain_adaption_l1l2_all_data.csv


In [41]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'objective': ['binary'],
    'boosting_type': ['gbdt'],
    'is_unbalance': [True],
    'reg_alpha': [0, 0.1, 0.5, 1.0],  # L1 regularization
    'reg_lambda': [0, 0.1, 0.5, 1.0]  # L2 regularization
}

lgbm = LGBMClassifier(n_estimators=500)
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid,
                           cv=5, scoring='f1', verbose=1, n_jobs=-1)
grid_search.fit(X_train1_whole, y_train1_whole)
# Print the best parameters and corresponding score
print("Best parameters found: ", grid_search.best_params_)
print("Best score found: ", grid_search.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters found:  {'boosting_type': 'gbdt', 'is_unbalance': True, 'objective': 'binary', 'reg_alpha': 0, 'reg_lambda': 0}
Best score found:  0.9960543531170402
*** Summary on dataset 1 test set:
              precision    recall  f1-score   support

           0       0.91      0.88      0.89       332
           1       1.00      1.00      1.00     12277

    accuracy                           0.99     12609
   macro avg       0.95      0.94      0.95     12609
weighted avg       0.99      0.99      0.99     12609

[[  292    40]
 [   29 12248]]


In [43]:
lgbm1 = LGBMClassifier(objective='binary', n_estimators=500,
                       boosting_type='gbdt', is_unbalance=True,
                       reg_alpha=0, reg_lambda=0.1)
lgbm1.fit(X_train1, y_train1)
# Evaluate on dataset 1 test set
print("*** Summary on dataset 1 test set:")
y_pred1 = lgbm1.predict(X_test1)
print(classification_report(y_test1, y_pred1))
print(confusion_matrix(y_test1, y_pred1))

*** Summary on dataset 1 test set:
              precision    recall  f1-score   support

           0       0.89      0.82      0.85       691
           1       0.99      1.00      1.00     24526

    accuracy                           0.99     25217
   macro avg       0.94      0.91      0.93     25217
weighted avg       0.99      0.99      0.99     25217

[[  567   124]
 [   69 24457]]


In [44]:
# Fine-tune lgbm1 on dataset 2 with a smaller learning rate
# grid search for best alpha
from sklearn.model_selection import GridSearchCV

# Set up the parameter grid
param_grid = {
    'n_estimators': [500],
    'learning_rate': [0.01],
    'reg_alpha': [0, 0.1, 0.5, 1, 5, 10],
    'reg_lambda': [0, 0.1, 0.5, 1, 5, 10]
}

# Create a LightGBM model for fine-tuning
lgbm_finetuned = LGBMClassifier(objective='binary', boosting_type='gbdt', is_unbalance=True)

# Perform grid search with cross-validation
grid_search = GridSearchCV(lgbm_finetuned, param_grid, scoring='f1', cv=5)
grid_search.fit(X_train2_whole, y_train2_whole, init_model=lgbm1)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found:", best_params)

# Train a new LightGBM model with the best parameters
lgbm_best = LGBMClassifier(**best_params, objective='binary',
                           boosting_type='gbdt', is_unbalance=True)
lgbm_best.fit(X_train2, y_train2, init_model=lgbm1)

# Predict and evaluate on dataset 2 test set
print("*** Summary on dataset 2 test set (best model):")
y_pred2_best = lgbm_best.predict(X_test2)
print(classification_report(y_test2, y_pred2_best))
print(confusion_matrix(y_test2, y_pred2_best))

Best parameters found: {'learning_rate': 0.01, 'n_estimators': 500, 'reg_alpha': 1, 'reg_lambda': 1}
*** Summary on dataset 2 test set (best model):
              precision    recall  f1-score   support

           0       0.82      0.90      0.86        72
           1       0.67      0.50      0.57        28

    accuracy                           0.79       100
   macro avg       0.74      0.70      0.72       100
weighted avg       0.78      0.79      0.78       100

[[65  7]
 [14 14]]
