In [None]:
import ast
import re

import datasets
import kerastuner as kt
import nltk
import numpy as np
import pandas as pd
from datasets import load_dataset
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Fetch the ChatGPT paraphrase corpus from the Hugging Face Hub.
dataset = load_dataset("humarin/chatgpt-paraphrases")

Downloading readme:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/265M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Sanity check: the 'train' split is expected to report 419197 rows and 4 columns.
print(dataset.shape)

{'train': (419197, 4)}


In [None]:
# `dataset` is keyed by split name (the original note called its type 'DataDict');
# take the 'train' split and convert it to a pandas DataFrame.
df = dataset['train'].to_pandas()

In [None]:
# Peek at the last rows. `paraphrases` appears to hold a stringified list of
# rewrites for each source `text`.
df.tail()

Unnamed: 0,text,paraphrases,category,source
419192,He wants to see prices brought down across the...,['He desires a reduction in prices across all ...,sentence,cnn_news
419193,"Costs: The Department for Environment, Food an...","['The Department for Environment, Food and Rur...",sentence,cnn_news
419194,Details of the action on water bills are expec...,"[""Downing Street is expected to reveal specifi...",sentence,cnn_news
419195,Chancellor George Osborne signalled the govern...,"[""In a speech last month, Chancellor George Os...",sentence,cnn_news
419196,But Mr Miliband appeared to steal a march on t...,"['Yesterday, Mr Miliband warned that the indus...",sentence,cnn_news


### Preprocessing

In [None]:
# Assign labels: 1 = ChatGPT paraphrase, 0 = human-written source text.
# `category` maps text -> label, so identical texts collapse into one entry.
category = {}
n_unparsable = 0
for human_text, raw_paraphrases in zip(df["text"], df["paraphrases"]):
    # `paraphrases` is stored as the string repr of a Python list. Parse it as
    # a literal: splitting the raw string on ', ' cuts any paraphrase that
    # itself contains a comma (e.g. "In a speech last month, ..." was being
    # truncated to "In a speech last mont").
    try:
        if isinstance(raw_paraphrases, str):
            paraphrases = ast.literal_eval(raw_paraphrases)
        else:
            paraphrases = list(raw_paraphrases)
    except (ValueError, SyntaxError):
        # Malformed row: keep the human text, skip its paraphrase.
        paraphrases = []
        n_unparsable += 1

    # Only the first paraphrase per source text is used.
    for paraphrase in paraphrases[:1]:
        category[paraphrase] = 1  # gpt
    # Written after the paraphrase, so within a row the human label wins.
    category[human_text] = 0  # human

if n_unparsable:
    print(f"Skipped paraphrases of {n_unparsable} rows that could not be parsed")

In [None]:
# One row per unique text with its 0/1 label.
combined_df = pd.DataFrame(category.items(), columns=["text", "label"])
# Shuffle the rows. The shuffled frame is assigned back to `combined_df`
# (it used to land in an unused, misspelled variable, so nothing was shuffled)
# and is seeded so a re-run produces the same order.
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Work on a copy: df_train's text column is cleaned in place further down,
# and that must not silently rewrite combined_df as well.
df_train = combined_df.copy()
# Class balance. Kept as the last expression so the notebook renders it.
combined_df["label"].value_counts()

0    419108
1    388920
Name: label, dtype: int64

In [None]:
# Display the labelled frame (the rendered output elides the middle rows).
df_train

Unnamed: 0,text,label
0,Can you provide a detailed procedure for inves...,1
1,What is the step by step guide to invest in sh...,0
2,Can you tell me about the history of the Kohin...,1
3,What is the story of Kohinoor (Koh-i-Noor) Dia...,0
4,What are some ways to enhance my internet spee...,1
...,...,...
808023,Downing Street is expected to reveal specifics...,1
808024,Details of the action on water bills are expec...,0
808025,In a speech last mont,1
808026,Chancellor George Osborne signalled the govern...,0


In [None]:
# data cleaning functions
def remove_emoji(string):
    """Replace every run of emoji-range characters in ``string`` with one space.

    Args:
        string: Text to scrub.

    Returns:
        The text with each maximal run of matching characters replaced by ' '.
    """
    # Character ranges that make up the class; concatenated verbatim below.
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    matcher = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
    return matcher.sub(r' ', string)

def clean_sentence(sentence):
    """Strip URLs, emoji and punctuation from ``sentence`` and tidy whitespace.

    Steps, in order: URLs (``http...`` up to the next whitespace) become a
    space; emoji are replaced via ``remove_emoji``; every character other than
    ASCII letters, digits and the space is deleted; runs of spaces are
    collapsed to a single space; leading/trailing whitespace is stripped.

    Args:
        sentence: Raw text.

    Returns:
        The cleaned text.
    """
    sentence = re.sub(r'http\S+', ' ', sentence)  # remove URLs
    sentence = remove_emoji(sentence)  # remove emoji
    sentence = re.sub("[^0-9A-Za-z ]", "", sentence)  # remove punctuation
    # Collapse runs of spaces to ONE space. Replacing '  ' with '' glued the
    # neighbouring words together ("hello - world" -> "helloworld").
    sentence = re.sub(r" {2,}", " ", sentence)
    return sentence.strip()

def remove_stopwords(tokens, stopwords):
    """Return the tokens that are not stopwords, preserving their order.

    Args:
        tokens: Iterable of word tokens.
        stopwords: Container of words to drop (membership is tested with ``in``).

    Returns:
        A new list holding the surviving tokens.
    """
    kept = []
    for token in tokens:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

def lemmatize(tokens, lemma):
    """Lemmatize each token as a verb.

    Args:
        tokens: Iterable of word tokens.
        lemma: Object exposing ``lemmatize(token, pos=...)``.

    Returns:
        A new list with ``lemma.lemmatize(token, pos='v')`` for every token,
        in the original order.
    """
    reduced = []
    for word in tokens:
        reduced.append(lemma.lemmatize(word, pos='v'))
    return reduced

def processing(df, stopwords, lemma):
    """Normalise ``df['text']`` in place.

    Each text is lowercased, cleaned with ``clean_sentence``, split on
    whitespace, stripped of stopwords, lemmatized, and re-joined with single
    spaces.

    Args:
        df: DataFrame with a string ``text`` column; the column is overwritten.
        stopwords: Iterable of words to drop.
        lemma: Lemmatizer passed through to ``lemmatize``.

    Returns:
        None. ``df`` is modified in place.
    """
    # Build the lookup once: set membership is O(1), whereas a list is scanned
    # for every token of every sentence.
    stopword_set = set(stopwords)

    def _normalize(sentence):
        # Same stages as before, fused into one pass over the column.
        tokens = clean_sentence(sentence.lower()).split()
        tokens = remove_stopwords(tokens, stopword_set)
        return ' '.join(lemmatize(tokens, lemma))

    df['text'] = df['text'].apply(_normalize)

In [None]:
# One-time setup: uncomment to fetch the NLTK corpora if they are not available locally.
# nltk.download('stopwords')
# nltk.download('omw-1.4')
# NOTE(review): WordNetLemmatizer presumably also needs the 'wordnet' corpus — confirm.
stop_words = nltk.corpus.stopwords.words('english')
lemma = WordNetLemmatizer()

# Cleans df_train['text'] in place (processing assigns back to df['text'] and returns nothing).
processing(df_train, stop_words, lemma)

## Train

In [None]:
# Pull the model inputs (text) and targets (0 = human, 1 = gpt) out of the frame as arrays.
texts = df_train['text'].values
labels = df_train['label'].values

In [None]:
# Inspect the text array that is fed to the tokenizer.
# NOTE(review): the captured output below still shows capitals and punctuation,
# which the processing step removes — it looks stale relative to the cleaning cell; re-run to confirm.
texts

array(['Can you provide a detailed procedure for investing in the Indian stock market?',
       'What is the step by step guide to invest in share market in india?',
       'Can you tell me about the history of the Kohinoor (Koh-i-Noor) Diamond?',
       ..., 'In a speech last mont',
       "Chancellor George Osborne signalled the government's plans in a speech last month when he said: 'There are important improvements we can make to the scale of energy and water bills, the cost of housing, the fees paid for everyday financial services, the expense of rail and road travel.",
      dtype=object)

In [None]:
# Constants
VOCAB_SIZE = 10000  # Passed to Tokenizer(num_words=...) and used as the Embedding input dimension
MAX_SEQUENCE_LENGTH = 100  # Length every sequence is padded to; also the model's input length
EMBEDDING_DIM = 50  # Size of the word embeddings
NUM_HEADS = 4  # Number of attention heads
TRANSFORMER_UNITS = [64, 64]  # One transformer block per entry; each value is that block's ff_dim

In [None]:
# Text preprocessing: fit the tokenizer's vocabulary, map each text to a
# sequence of integer ids, then pad the sequences to MAX_SEQUENCE_LENGTH.
# NOTE(review): the tokenizer is fitted on all texts before the train/validation
# split, so validation texts contribute to the vocabulary — consider fitting on
# the training portion only.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# Hold out 20% of the examples for validation; random_state pins the split.
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, random_state=42)

In [None]:
def transformer_block(x, num_heads, ff_dim, rate=0.1):
    """Apply one transformer encoder block to ``x``.

    The block is self-attention followed by a two-layer feed-forward network.
    Each half is followed by dropout, a residual addition and layer
    normalisation.

    Args:
        x: Input tensor; its last dimension sets the block's output width.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network. The same value is
            also passed to the attention layer as ``key_dim``.
        rate: Dropout rate applied after both halves.

    Returns:
        The output of the final LayerNormalization.
    """
    # Self-attention half: x attends to itself.
    attention = MultiHeadAttention(num_heads=num_heads, key_dim=ff_dim)
    attended = attention(x, x)
    attended = Dropout(rate)(attended)
    attn_residual = LayerNormalization(epsilon=1e-6)(x + attended)

    # Feed-forward half: expand to ff_dim, then project back to the input
    # width so the residual addition lines up.
    hidden = Dense(ff_dim, activation="relu")(attn_residual)
    projected = Dense(x.shape[-1])(hidden)
    projected = Dropout(rate)(projected)
    return LayerNormalization(epsilon=1e-6)(attn_residual + projected)

In [None]:
# Model architecture
# Input: one row of MAX_SEQUENCE_LENGTH token ids per example.
input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))
# `input_length` is intentionally not passed: the sequence length is already
# fixed by the Input shape, and current Keras flags the argument as deprecated.
embedding_layer = Embedding(VOCAB_SIZE, EMBEDDING_DIM)(input_layer)


In [None]:
# Stack one transformer block per entry in TRANSFORMER_UNITS on top of the embeddings.
x = embedding_layer
for units in TRANSFORMER_UNITS:
    x = transformer_block(x, NUM_HEADS, units)

# Pool the per-token outputs into one vector, then a single sigmoid unit for the 0/1 label.
x = GlobalAveragePooling1D()(x)
output_layer = Dense(1, activation='sigmoid')(x)

model = Model(inputs=input_layer, outputs=output_layer)

In [None]:
# Compile the model: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
# Training the model for 10 epochs, validating on the held-out split after each one.
# NOTE(review): the captured log shows an ETA of about an hour early in epoch 1 —
# budget accordingly or subsample while iterating.
model.fit(X_train, y_train, batch_size=128, epochs=10, validation_data=(X_val, y_val))


Epoch 1/10
 257/5051 [>.............................] - ETA: 1:05:09 - loss: 0.5519 - accuracy: 0.7049

In [None]:
# Evaluate the model on the validation split (the same data passed to fit as validation_data).
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation Loss: {loss}')
print(f'Validation Accuracy: {accuracy}')

Validation Loss: 0.6831507682800293
Validation Accuracy: 0.8118000030517578


## Hyperparameter Tuning

In [None]:
def build_model(hp):
    """Build and compile one candidate classifier for the hyperparameter search.

    Search space registered on ``hp``:
        embedding_dim: 32 to 512 in steps of 32.
        num_layers:    1 to 3 stacked transformer blocks.
        num_heads:     2 to 8 in steps of 2; one value shared by every block.
        units:         32 to 128 in steps of 32; one value shared by every
                       block, passed to ``transformer_block`` as ``ff_dim``.

    Args:
        hp: Hyperparameter container supplied by the tuner.

    Returns:
        A Keras ``Model`` compiled with Adam, binary cross-entropy and accuracy.
    """
    # Sample every hyperparameter once, up front. num_heads and units use a
    # single name each, so all blocks share the same value; reading them
    # outside the loop makes that explicit.
    embedding_dim = hp.Int('embedding_dim', min_value=32, max_value=512, step=32)
    num_layers = hp.Int('num_layers', 1, 3)
    num_heads = hp.Int('num_heads', 2, 8, step=2)
    units = hp.Int('units', min_value=32, max_value=128, step=32)

    input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))
    # `input_length` is not passed: the Input shape already fixes the sequence
    # length, and current Keras flags the argument as deprecated.
    x = Embedding(VOCAB_SIZE, embedding_dim)(input_layer)
    for _ in range(num_layers):
        x = transformer_block(x, num_heads, units)
    x = GlobalAveragePooling1D()(x)
    output_layer = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [None]:
# Random search over build_model's hyperparameters: up to 10 trials, one
# training run per trial, ranked by validation accuracy.
# NOTE(review): results are presumably stored under my_dir/hparam_tuning and
# reused on re-run; no seed is passed, so sampled trials may differ between
# fresh runs — confirm against the KerasTuner docs.
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=1,
    directory='my_dir',
    project_name='hparam_tuning')

In [None]:
# Run the search: each trial trains for up to 10 epochs and is scored on the validation split.
tuner.search(X_train, y_train, epochs=10, validation_data=(X_val, y_val))

# Fetch the winning configuration and its trained model from the tuner.
# Both names are read below and by later cells, so they must be assigned here.
best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]
best_model = tuner.get_best_models(num_models=1)[0]

# Score the best model and list the hyperparameter values it was built with.
val_loss, val_accuracy = best_model.evaluate(X_val, y_val)
for hp in best_hp.space:
    print(f"{hp.name}: {best_hp.get(hp.name)}")

In [None]:
# Threshold the model's outputs at 0.5 to obtain hard 0/1 class predictions.
predictions = (best_model.predict(X_val) > 0.5).astype("int32")

In [None]:
# Precision, recall and F1 of the thresholded predictions against the validation labels.
# The values are stored but not displayed by this cell.
precision = precision_score(y_val, predictions)
recall = recall_score(y_val, predictions)
f1 = f1_score(y_val, predictions)

In [None]:
# Raw model outputs for the validation set, flattened to 1-D.
# NOTE(review): this repeats the predict() call already made for `predictions`;
# the two could share one call.
y_pred_probs = best_model.predict(X_val).ravel()