In [12]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# Load dataset
def load_dataset(file_path):
    """Read a JSON file and return its contents as a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed JSON object passed straight to ``pd.DataFrame``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; pin the encoding so the result does not
    # depend on the platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return pd.DataFrame(data)

# Convert sequence of integers to string
def int_list_to_str(int_list):
    """Render an iterable of values as one space-separated string.

    Each element is converted with ``str`` and the pieces are joined by a
    single space; an empty iterable yields the empty string.
    """
    pieces = (str(item) for item in int_list)
    return ' '.join(pieces)

# Read the human and machine records for both sets. The 'machine_id' column is
# removed from the machine frames right after loading.
set1_human = load_dataset("./data/set1_human.json")
set1_machine = load_dataset("./data/set1_machine.json").drop(columns='machine_id')
set2_human = load_dataset("./data/set2_human.json")
set2_machine = load_dataset("./data/set2_machine.json").drop(columns='machine_id')

# Attach the binary target: 1 for human rows, 0 for machine rows.
for human_frame, machine_frame in (
    (set1_human, set1_machine),
    (set2_human, set2_machine),
):
    human_frame["label"] = 1
    machine_frame["label"] = 0

# Stack human and machine rows into one frame per set, with a fresh index.
dataset1 = pd.concat([set1_human, set1_machine], ignore_index=True)
dataset2 = pd.concat([set2_human, set2_machine], ignore_index=True)

# Turn each integer sequence into a space-separated string so that it can be
# consumed by a text vectorizer.
for text_column in ('txt', 'prompt'):
    dataset1[text_column] = dataset1[text_column].apply(int_list_to_str)
    dataset2[text_column] = dataset2[text_column].apply(int_list_to_str)

# 80/20 train/validation split per set, stratified on the label and seeded
# for reproducibility.
train_data1, val_data1 = train_test_split(
    dataset1,
    test_size=0.2,
    random_state=42,
    stratify=dataset1["label"],
)
train_data2, val_data2 = train_test_split(
    dataset2,
    test_size=0.2,
    random_state=42,
    stratify=dataset2["label"],
)

In [14]:
# Preview the first five rows of the first set's training split.
train_data1.head(n=5)

Unnamed: 0,prompt,txt,label
49944,1481 2430 4780 17 86 1607 68 70 1724 1722 2729...,10 1502 2157 1591 2850 15 1502 2157 1591 2451 ...,1
587,1502 2332 1725 1479 3125 15 1493 2209 3034 150...,76 1549 81 10 87 1640 1586 2325 1559 1569 76 1...,1
36306,1602 2158 1944 2437 68 2607 2234 17 1487 1574 ...,76 4356 1764 76 1515 1487 1479 1616 1500 1574 ...,1
6573,1479 1845 2633 1831 1479 3573 1491 2329 2995 1...,13 1520 1678 1479 2025 1557 1502 34 1518 13 15...,1
100102,2571 17 1514 74 1620 1493 3943 2035 1641 2396 ...,1518 3300 1656 4429 15 1520 4630 2145 2038 394...,1


In [15]:
# Vectorize input data using Bag-of-Words representation
def bag_of_words_split(train_text, val_text, max_features=5000):
    """Fit a bag-of-words vocabulary on the training text and encode both splits.

    Parameters
    ----------
    train_text : iterable of str
        Training documents; the vocabulary is learned from these only.
    val_text : iterable of str
        Validation documents, encoded with the training vocabulary.
    max_features : int, default 5000
        Upper bound on the vocabulary size passed to ``CountVectorizer``.

    Returns
    -------
    tuple
        ``(train_counts, val_counts, fitted_vectorizer)`` where the two count
        matrices are dense NumPy arrays.
    """
    # The default token_pattern only keeps tokens of two or more characters,
    # which would silently drop single-digit token IDs (0-9). Accept tokens of
    # any length instead.
    bow = CountVectorizer(max_features=max_features, token_pattern=r"(?u)\b\w+\b")
    # NOTE(review): .toarray() densifies the sparse count matrix and can be
    # memory-heavy on large splits. It is kept so the outputs stay dense
    # arrays as before; LogisticRegression also accepts the sparse form.
    train_counts = bow.fit_transform(train_text).toarray()
    val_counts = bow.transform(val_text).toarray()
    return train_counts, val_counts, bow


# Each feature set gets its own vectorizer so every fitted vocabulary remains
# available afterwards, instead of being overwritten by the next fit.

# Set 1, 'txt' column
X_train1, X_val1, vectorizer1 = bag_of_words_split(train_data1['txt'], val_data1['txt'])
y_train1 = train_data1["label"].values
y_val1 = val_data1["label"].values

# Set 2, 'txt' column
X_train2, X_val2, vectorizer2 = bag_of_words_split(train_data2['txt'], val_data2['txt'])
y_train2 = train_data2["label"].values
y_val2 = val_data2["label"].values

# Set 1, 'prompt' column
X_train3, X_val3, vectorizer3 = bag_of_words_split(train_data1['prompt'], val_data1['prompt'])
y_train3 = train_data1["label"].values
y_val3 = val_data1["label"].values

# Set 2, 'prompt' column
X_train4, X_val4, vectorizer4 = bag_of_words_split(train_data2['prompt'], val_data2['prompt'])
y_train4 = train_data2["label"].values
y_val4 = val_data2["label"].values

# Keep the original name bound to the last-fitted vectorizer (set 2 prompts),
# matching the state this cell used to leave behind.
vectorizer = vectorizer4

In [None]:
# Train logistic regression models on the 'txt' bag-of-words features, one per
# set. max_iter is raised to 1000 (scikit-learn's default is 100).
model1 = LogisticRegression(max_iter=1000)
model1.fit(X_train1, y_train1)

model2 = LogisticRegression(max_iter=1000)
model2.fit(X_train2, y_train2)

# Make predictions on each set's validation split and evaluate
y_pred_val1 = model1.predict(X_val1)
y_pred_val2 = model2.predict(X_val2)

print("\nTXT:\n")
print("Domain1 Validation Accuracy:", accuracy_score(y_val1, y_pred_val1))
print("Domain2 Validation Accuracy:", accuracy_score(y_val2, y_pred_val2))
print("\n" + "-" * 40 + "\n")

# Prompt fit: same procedure on the 'prompt' bag-of-words features
model3 = LogisticRegression(max_iter=1000)
model3.fit(X_train3, y_train3)

model4 = LogisticRegression(max_iter=1000)
model4.fit(X_train4, y_train4)

# Make predictions and evaluate
y_pred_val3 = model3.predict(X_val3)
y_pred_val4 = model4.predict(X_val4)

# "\n" (not the invalid escape "\P") so this header is formatted like the
# "TXT" header above and no stray backslash is printed.
print("\nPROMPT:\n")
print("Domain1 Validation Accuracy:", accuracy_score(y_val3, y_pred_val3))
print("Domain2 Validation Accuracy:", accuracy_score(y_val4, y_pred_val4))