In [13]:
import json
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from gensim.models import Word2Vec
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE

# Load dataset
def load_dataset(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return pd.DataFrame(data)

# Convert integer list to string
def int_list_to_str(int_list):
    return ' '.join(map(str, int_list))


# Load data
set1_human = load_dataset("./data/set1_human.json")
set1_machine = load_dataset("./data/set1_machine.json")
set2_human = load_dataset("./data/set2_human.json")
set2_machine = load_dataset("./data/set2_machine.json")

# print the ratio of human and machine
print("Domain1 - human:machine = {}:{}".format(len(set1_human), len(set1_machine)))
print("Domain2 - human:machine = {}:{}".format(len(set2_human), len(set2_machine)))
# Label data and combine
set1_human["label"] = 1
set1_machine["label"] = 0
set2_human["label"] = 1
set2_machine["label"] = 0

dataset1 = pd.concat([set1_human, set1_machine], ignore_index=True)
dataset2 = pd.concat([set2_human, set2_machine], ignore_index=True)

# Convert integer lists to strings
dataset1['txt'] = dataset1['txt'].apply(int_list_to_str)
dataset2['txt'] = dataset2['txt'].apply(int_list_to_str)
dataset1['prompt'] = dataset1['prompt'].apply(int_list_to_str)
dataset2['prompt'] = dataset2['prompt'].apply(int_list_to_str)

Domain1 - human:machine = 122584:3500
Domain2 - human:machine = 100:400


In [14]:
whole_dataset = pd.concat([dataset1, dataset2], ignore_index=True)
whole_dataset['combined'] = whole_dataset['txt'] + ' ' + whole_dataset['prompt']
# # Word2Vec
# all_text = [text.split() for text in whole_dataset['combined'].values]
# word2vec_model = Word2Vec(all_text, vector_size=100, window=5, min_count=1, workers=4)

# # Save model
# word2vec_model.save("word2vec.model")

# Load model
word2vec_model = Word2Vec.load("word2vec.model")

def vectorize(dataset, word2vec_model, has_label=True):
    X_prompt = dataset['prompt'].values
    X_txt = dataset['txt'].values
    X_prompt = [text.split() for text in X_prompt]
    X_txt = [text.split() for text in X_txt]
    # prompt may be empty, use zero vector to represent
    X_prompt = np.array([np.mean([word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv],
                                 axis=0) if tokens else np.zeros(100) for tokens in X_prompt])
    X_txt = np.array([np.mean([word2vec_model.wv[token] 
                               for token in tokens if token in word2vec_model.wv], axis=0) for tokens in X_txt])
    promt_len = np.expand_dims(np.array([len(text.split()) for text in dataset['prompt'].values]), axis=1)
    txt_len = np.expand_dims(np.array([len(text.split()) for text in dataset['txt'].values]), axis=1)
    X = np.concatenate((X_prompt, X_txt, promt_len, txt_len), axis=1)
    if has_label:
        y = dataset['label'].values
        return X, y
    else:
        return X

In [69]:
# Split dataset 1 and dataset 2
dataset1_train, dataset1_test = train_test_split(
    dataset1, test_size=0.2, random_state=42)

dataset2_train, dataset2_test = train_test_split(
    dataset2, test_size=0.2, random_state=42)

# Preprocess dataset 1 and dataset 2
X_train1, y_train1 = vectorize(dataset1_train, word2vec_model)
X_test1, y_test1 = vectorize(dataset1_test, word2vec_model)
X_train2, y_train2 = vectorize(dataset2_train, word2vec_model)
X_test2, y_test2 = vectorize(dataset2_test, word2vec_model)
X_train1_whole, y_train1_whole = vectorize(dataset1, word2vec_model)
X_train2_whole, y_train2_whole = vectorize(dataset2, word2vec_model)
print("done vectorize!")

done vectorize!


In [70]:
lgbm1 = LGBMClassifier(objective='binary', n_estimators=500,
                       boosting_type='gbdt', class_weight='balanced',
                       reg_alpha=0.5, reg_lambda=0)
lgbm1.fit(X_train1, y_train1)
# Predict and evaluate on dataset 1 test set
print("*** Summary on dataset 1 test set:")
y_pred1 = lgbm1.predict(X_test1)
print(classification_report(y_test1, y_pred1))
print(confusion_matrix(y_test1, y_pred1))
# f1 score
print("F1 score: ", f1_score(y_test1, y_pred1))

*** Summary on dataset 1 test set:
              precision    recall  f1-score   support

           0       0.89      0.84      0.87       691
           1       1.00      1.00      1.00     24526

    accuracy                           0.99     25217
   macro avg       0.94      0.92      0.93     25217
weighted avg       0.99      0.99      0.99     25217

[[  583   108]
 [   72 24454]]
F1 score:  0.9963331160365059


In [71]:
# # WAY2: SMOTE for oversampling
smote = SMOTE(random_state=90051)
X_train2_resampled, y_train2_resampled = smote.fit_resample(X_train2, y_train2)

lgbm2 = LGBMClassifier(objective='binary', n_estimators=500,
                           boosting_type='gbdt', class_weight='balanced',
                           reg_alpha=0.1, reg_lambda=1, learning_rate=0.01)
lgbm2.fit(X_train2_resampled, y_train2_resampled, init_model=lgbm1)

# Predict and evaluate on dataset 2 test set
print("\n*** Summary on dataset 2 test set:")
y_pred2 = lgbm2.predict(X_test2)
print(classification_report(y_test2, y_pred2))
print(confusion_matrix(y_test2, y_pred2))
# f1 score
print("F1 score: ", f1_score(y_test2, y_pred2))


*** Summary on dataset 2 test set:
              precision    recall  f1-score   support

           0       0.83      0.90      0.87        72
           1       0.68      0.54      0.60        28

    accuracy                           0.80       100
   macro avg       0.76      0.72      0.73       100
weighted avg       0.79      0.80      0.79       100

[[65  7]
 [13 15]]
F1 score:  0.6


In [67]:
# OUTPUT to CSV
# use the whole dataset2 to train model2
lgbm1 = LGBMClassifier(objective='binary', n_estimators=500,
                       boosting_type='gbdt', class_weight='balanced',
                       reg_alpha=0.5, reg_lambda=0)
lgbm1.fit(X_train1_whole, y_train1_whole)

smote = SMOTE(random_state=90051)
X_train2_resampled, y_train2_resampled = smote.fit_resample(
    X_train2_whole, y_train2_whole)

lgbm2 = LGBMClassifier(objective='binary', n_estimators=500, 
                       boosting_type='gbdt', class_weight='balanced',
                       reg_alpha=0.1, reg_lambda=1, learning_rate=0.01)
lgbm2.fit(X_train2_resampled, y_train2_resampled, init_model=lgbm1)

# Load test set
test_data = load_dataset("./data/test.json")
test_data['txt'] = test_data['txt'].apply(int_list_to_str)
test_data['prompt'] = test_data['prompt'].apply(int_list_to_str)
test_data1 = test_data.iloc[:600]
test_data2 = test_data.iloc[600:]

# Preprocess test set
X_test1 = vectorize(test_data1, word2vec_model, has_label=False)
X_test2 = vectorize(test_data2, word2vec_model, has_label=False)

y_pred1 = lgbm1.predict(X_test1)
y_pred2 = lgbm2.predict(X_test2)

# Combine predictions for both domains
y_pred_test = np.concatenate([y_pred1, y_pred2])

# Save predictions to a CSV file
output_df = pd.DataFrame(
    {"Id": range(len(y_pred_test)), "Predicted": y_pred_test})
filename = "LGB_domain_adaption_l1l2_all_data.csv"
output_df.to_csv(filename, index=False)
print("Predictions saved to file:", filename)

Predictions saved to file: LGB_domain_adaption_l1l2_all_data.csv


In [57]:
# Hyperparameter tuning via grid search CV
param_grid = {
    'objective': ['binary'],
    'boosting_type': ['gbdt'],
    'class_weight': ['balanced'],
    'reg_alpha': [0, 0.1, 0.3, 0.5, 0.7, 1.0],  # L1 regularization
    'reg_lambda': [0, 0.1, 0.3, 0.5, 0.7, 1.0]  # L2 regularization
}

lgbm = LGBMClassifier(n_estimators=500)
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid,
                           cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train1_whole, y_train1_whole)
# Print the best parameters and corresponding score
print("Best parameters found: ", grid_search.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters found:  {'boosting_type': 'gbdt', 'class_weight': 'balanced', 'objective': 'binary', 'reg_alpha': 0.5, 'reg_lambda': 0}
Best score found:  0.9959149972679213


In [59]:
# Train model 1 with the best parameters
best_params = grid_search.best_params_
lgbm1 = LGBMClassifier(**best_params, n_estimators=500)
lgbm1.fit(X_train1, y_train1)
# Evaluate on dataset 1 test set
print("*** Summary on dataset 1 test set:")
y_pred1 = lgbm1.predict(X_test1)
print(classification_report(y_test1, y_pred1))
print(confusion_matrix(y_test1, y_pred1))

*** Summary on dataset 1 test set:
              precision    recall  f1-score   support

           0       0.89      0.84      0.87       691
           1       1.00      1.00      1.00     24526

    accuracy                           0.99     25217
   macro avg       0.94      0.92      0.93     25217
weighted avg       0.99      0.99      0.99     25217

[[  583   108]
 [   72 24454]]


In [65]:
# Fine-tune lgbm1 on dataset 2 with a smaller learning rate
# grid search for best alpha
# Set up the parameter grid
param_grid = {
    'n_estimators': [500],
    'learning_rate': [0.01],
    'objective': ['binary'],
    'boosting_type': ['gbdt'],
    'class_weight': ['balanced'],
    'reg_alpha': [0, 0.1, 0.5, 0.7, 1],
    'reg_lambda': [0, 0.1, 0.5, 0.7, 1]
}

# Create a LightGBM model for fine-tuning
lgbm_finetuned = LGBMClassifier()

# Perform grid search with cross-validation
grid_search = GridSearchCV(lgbm_finetuned, param_grid, scoring='f1', cv=5)

X_train2_resampled, y_train2_resampled = smote.fit_resample(
    X_train2_whole, y_train2_whole)

grid_search.fit(X_train2_resampled, y_train2_resampled, init_model=lgbm1)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found:", best_params)

Best parameters found: {'boosting_type': 'gbdt', 'class_weight': 'balanced', 'learning_rate': 0.01, 'n_estimators': 500, 'objective': 'binary', 'reg_alpha': 0.1, 'reg_lambda': 1}


In [66]:
smote = SMOTE(random_state=90051)
X_train2_resampled, y_train2_resampled = smote.fit_resample(X_train2, y_train2)

lgbm_best = LGBMClassifier(**best_params)
lgbm_best.fit(X_train2_resampled, y_train2_resampled)

# Evaluate on resampled dataset 2 test set
print("*** Summary on resampled dataset 2 test set:")
y_pred2 = lgbm_best.predict(X_test2)
print(classification_report(y_test2, y_pred2))
print(confusion_matrix(y_test2, y_pred2))
print("F1 score:", f1_score(y_test2, y_pred2))

*** Summary on resampled dataset 2 test set:
              precision    recall  f1-score   support

           0       0.86      0.94      0.90        72
           1       0.81      0.61      0.69        28

    accuracy                           0.85       100
   macro avg       0.84      0.78      0.80       100
weighted avg       0.85      0.85      0.84       100

[[68  4]
 [11 17]]
F1 score: 0.6938775510204083
