In [1]:
import zlib
import gzip
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report


In [43]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall, AUC
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from kerastuner.tuners import Hyperband
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
import torch



  from kerastuner.tuners import Hyperband


In [2]:
# Calculating compression factor
def calculate_compression_factor(text, algorithm="lz"):
    original_size = len(text.encode('utf-8'))
    
    if algorithm == "lz":
        compressed_text = zlib.compress(text.encode('utf-8'))
    elif algorithm == "gzip":
        compressed_text = gzip.compress(text.encode('utf-8'))
    else:
        raise ValueError("Unsupported compression algorithm!")
    
    compressed_size = len(compressed_text)
    if compressed_size == 0:
        return 0  # Avoid division by zero
    return original_size / compressed_size

In [3]:
# Computing compression factors for different window sizes and algorithms
def sliding_window_compression_factors(text, window_sizes, algorithms):
    
    compression_factors = []
    for algo in algorithms:
        for window_size in window_sizes:
            windows = [
                text[i:i + window_size]
                for i in range(0, len(text) - window_size + 1, window_size)
            ]
            if windows:
                factors = [calculate_compression_factor(window, algorithm=algo) for window in windows]
                avg_factor = sum(factors) / len(factors)
            else:
                avg_factor = 0
            compression_factors.append(avg_factor)
    return compression_factors


In [4]:
df = pd.read_csv("Dataset_with_new_features.csv")

In [5]:
# Removing rows with NaN values
nan_summary = df.isna().sum()
print("\nCount of NaN values per column:")
print(nan_summary[nan_summary > 0]) 


Count of NaN values per column:
source        912
temp            1
perplexity      1
dtype: int64


In [6]:
df = df[~df['perplexity'].isna()]


In [7]:
# Removing rows with invalid values
rows_to_drop = df[df['generated'] == 'generated'].index
dataset = df.drop(rows_to_drop)

In [8]:
dataset.shape

(9610, 29)

In [9]:

df['cleaned_text'] = df['text'].str.lower().str.replace(r'[^a-z\s]', '', regex=True)

In [10]:
# Sliding windows for compression factors
window_sizes = [2**i for i in range(1, 11)] 
algorithms = ["lz", "gzip"] 

In [11]:
# Computing compression features for each text 
compression_features = df['cleaned_text'].apply(
    lambda x: sliding_window_compression_factors(x, window_sizes, algorithms)
)

In [12]:
# Creating a DataFrame with compression features
compression_df = pd.DataFrame(compression_features.tolist(), 
                               columns=[f'comp_{algo}_{w}' for algo in algorithms for w in window_sizes])

In [13]:
# Concatenating the compression features with the original DataFrame
final_data = pd.concat([compression_df, df['generated']], axis=1)


In [14]:

final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9611 entries, 0 to 9610
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   comp_lz_2       9610 non-null   float64
 1   comp_lz_4       9610 non-null   float64
 2   comp_lz_8       9610 non-null   float64
 3   comp_lz_16      9610 non-null   float64
 4   comp_lz_32      9610 non-null   float64
 5   comp_lz_64      9610 non-null   float64
 6   comp_lz_128     9610 non-null   float64
 7   comp_lz_256     9610 non-null   float64
 8   comp_lz_512     9610 non-null   float64
 9   comp_lz_1024    9610 non-null   float64
 10  comp_gzip_2     9610 non-null   float64
 11  comp_gzip_4     9610 non-null   float64
 12  comp_gzip_8     9610 non-null   float64
 13  comp_gzip_16    9610 non-null   float64
 14  comp_gzip_32    9610 non-null   float64
 15  comp_gzip_64    9610 non-null   float64
 16  comp_gzip_128   9610 non-null   float64
 17  comp_gzip_256   9610 non-null   f

In [17]:
# Checking the total number of NaNs in each column
nan_summary = final_data.isna().sum()
print("\nCount of NaN values per column:")
print(nan_summary[nan_summary > 0])  


Count of NaN values per column:
comp_lz_2         1
comp_lz_4         1
comp_lz_8         1
comp_lz_16        1
comp_lz_32        1
comp_lz_64        1
comp_lz_128       1
comp_lz_256       1
comp_lz_512       1
comp_lz_1024      1
comp_gzip_2       1
comp_gzip_4       1
comp_gzip_8       1
comp_gzip_16      1
comp_gzip_32      1
comp_gzip_64      1
comp_gzip_128     1
comp_gzip_256     1
comp_gzip_512     1
comp_gzip_1024    1
generated         1
dtype: int64


In [18]:
# Droppoing rows with NaN values
final_data.dropna(inplace=True)


In [26]:
# Splitting the data into training, validation, and test sets
X = final_data.drop('generated', axis=1)
y = final_data['generated']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [27]:
# Standardizing the data using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

In [22]:
# Training an SVM model
svm = SVC(kernel='rbf', random_state=42)
svm.fit(X_train, y_train)

In [23]:
# Evaluating the model
y_pred = svm.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.97      0.73      1120
           1       0.41      0.03      0.06       802

    accuracy                           0.58      1922
   macro avg       0.50      0.50      0.39      1922
weighted avg       0.51      0.58      0.45      1922



In [38]:
# Defining the objective function for Optuna optimization to find best hyperparameters for SVM
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import optuna


def objective(trial):
    C = trial.suggest_float('C', 0.01, 100.0, log=True)  
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly'])
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
    degree = trial.suggest_int('degree', 2, 5) if kernel == 'poly' else 3  
    
    # Train and validate an SVC model
    model = SVC(C=C, kernel=kernel, gamma=gamma, degree=degree)
    model.fit(X_train, y_train)
    val_predictions = model.predict(X_val)
    val_accuracy = accuracy_score(y_val, val_predictions)
    return val_accuracy


In [39]:
# Creating an Optuna study and optimizing the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100) 

[I 2024-12-08 16:54:38,207] A new study created in memory with name: no-name-6a5055f2-2175-4af0-bf10-7db58f56928c
[I 2024-12-08 16:54:40,117] Trial 0 finished with value: 0.5624349635796045 and parameters: {'C': 14.172611049485681, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 0 with value: 0.5624349635796045.
[I 2024-12-08 16:54:41,779] Trial 1 finished with value: 0.5738813735691988 and parameters: {'C': 0.04529097180734408, 'kernel': 'linear', 'gamma': 'scale'}. Best is trial 1 with value: 0.5738813735691988.
[I 2024-12-08 16:54:42,726] Trial 2 finished with value: 0.5738813735691988 and parameters: {'C': 0.010172329224031076, 'kernel': 'linear', 'gamma': 'scale'}. Best is trial 1 with value: 0.5738813735691988.
[I 2024-12-08 16:54:44,256] Trial 3 finished with value: 0.5723204994797086 and parameters: {'C': 0.4231305254100496, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 1 with value: 0.5738813735691988.
[I 2024-12-08 16:54:45,788] Trial 4 finished with value: 0.57232049947

In [40]:
print("Best Hyperparameters:", study.best_params)
print("Best Validation Accuracy:", study.best_value)

Best Hyperparameters: {'C': 0.04529097180734408, 'kernel': 'linear', 'gamma': 'scale'}
Best Validation Accuracy: 0.5738813735691988


In [41]:
# Training the best model on the entire training set
best_params = study.best_params

best_model = SVC(
    C=best_params['C'],
    kernel=best_params['kernel'],
    gamma=best_params['gamma'],
    degree=best_params.get('degree', 3)
)
best_model.fit(X_train, y_train)

In [42]:
# Evaluating the best model on the test set
test_accuracy = accuracy_score(y_test, best_model.predict(X_test))
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.5738813735691988


In [64]:
# Splitting the data into training, validation, and test sets

X = final_data.drop('generated', axis=1)
y = final_data['generated']
y.astype(int)
X = np.array(X, dtype=np.float32)  
y = np.array(y, dtype=np.float32)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [65]:
# Building model using Keras Tuner for Multi-Layer Perceptron
def build_model(hp):
    model = Sequential([
        Input(shape=(X.shape[1],)), 
        Dense(hp.Int('units_1', 16, 64, step=32), activation='relu', kernel_regularizer=l2(1e-4)),
        Dropout(hp.Float('dropout_1', 0.2, 0.6, step=0.1)),
        Dense(hp.Int('units_2', 8, 32, step=16), activation='relu', kernel_regularizer=l2(1e-4)),
        Dropout(hp.Float('dropout_2', 0.2, 0.6, step=0.1)),
        Dense(1, activation='sigmoid')  
    ])
    model.compile(
        optimizer=Adam(learning_rate=hp.Choice('learning_rate', [1e-4, 1e-5, 1e-6])),
        loss='binary_crossentropy',
        metrics=['accuracy',Precision(), Recall(), AUC()]
    )
    return model


In [66]:
# Creating a tuner hyperband object for tuning the hyperparameters
import shutil
import os


tuning_dir = 'mlp_comp_tuning'
if os.path.exists(tuning_dir):
    shutil.rmtree(tuning_dir)
tuner = Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=5,
    factor=2,
    directory='mlp_comp_tuning',
    project_name='mlp_comp_tune_embeddings'
)

In [67]:
print(y.unique())

AttributeError: 'numpy.ndarray' object has no attribute 'unique'

In [68]:
# Tuning the hyperparameters
tuner.search(X_train, y_train, validation_data=(X_val, y_val), epochs=20, batch_size=32, verbose=2)

Trial 21 Complete [00h 00m 03s]
val_accuracy: 0.5744016766548157

Best val_accuracy So Far: 0.5749219655990601
Total elapsed time: 00h 00m 54s


In [69]:
# Getting the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=10)[0]
print(f"Best hyperparameters: {best_hps.values}")

Best hyperparameters: {'units_1': 16, 'dropout_1': 0.2, 'units_2': 8, 'dropout_2': 0.30000000000000004, 'learning_rate': 0.0001, 'tuner/epochs': 5, 'tuner/initial_epoch': 3, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': '0007'}


In [70]:
# Training the best model
best_model = tuner.hypermodel.build(best_hps)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = best_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20, batch_size=32, callbacks=[early_stopping])

Epoch 1/20
[1m181/181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6056 - auc_3: 0.5038 - loss: 0.6816 - precision_3: 0.4882 - recall_3: 0.0338 - val_accuracy: 0.5739 - val_auc_3: 0.5107 - val_loss: 0.6864 - val_precision_3: 0.0000e+00 - val_recall_3: 0.0000e+00
Epoch 2/20
[1m181/181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5952 - auc_3: 0.4822 - loss: 0.6853 - precision_3: 0.4132 - recall_3: 0.0170 - val_accuracy: 0.5739 - val_auc_3: 0.5087 - val_loss: 0.6858 - val_precision_3: 0.0000e+00 - val_recall_3: 0.0000e+00
Epoch 3/20
[1m181/181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5897 - auc_3: 0.5041 - loss: 0.6843 - precision_3: 0.6026 - recall_3: 0.0205 - val_accuracy: 0.5739 - val_auc_3: 0.5113 - val_loss: 0.6852 - val_precision_3: 0.0000e+00 - val_recall_3: 0.0000e+00
Epoch 4/20
[1m181/181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5930 - auc_3

In [71]:
# Evaluating the best model on the test set
loss, accuracy,precision, recall, auc  = best_model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")
print(f"Test Precision: {precision}")
print(f"Test Recall: {recall}")
print(f"Test AUC: {auc}")

[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5745 - auc_3: 0.5200 - loss: 0.6830 - precision_3: 0.0000e+00 - recall_3: 0.0000e+00 
Test Loss: 0.6830009818077087
Test Accuracy: 0.5738813877105713
Test Precision: 0.0
Test Recall: 0.0
Test AUC: 0.5183936357498169
