In [33]:
import pandas as pd

train_and_pretest_data = pd.read_csv('data/detect_ai.csv')
valid_data = pd.read_csv('data/detect_ai_validation.csv')
test_data = pd.read_csv('data/daigt_v4.csv')

print(f"Test data size before removing rows identical to the training data: {len(test_data)}")

train_and_pretest_data = train_and_pretest_data.drop_duplicates(subset='text')
test_data = test_data[~test_data['text'].isin(train_and_pretest_data['text'])]

print(f"Train and pretest data size: {len(train_and_pretest_data)}")
print(f"Validation data size: {len(valid_data)}")
print(f"Test data size: {len(test_data)}")

valid_data_percentage = len(valid_data) / (len(train_and_pretest_data) + len(test_data) + len(valid_data))

print(f"The valdation data is {valid_data_percentage * 100:.2f}% of the total data")

# start a timer such that we know how the entire notebook takes to run
import time
start = time.time()


Test data size before removing rows identical to the training data: 73573
Train and pretest data size: 158294
Validation data size: 1679
Test data size: 40202
The valdation data is 0.84% of the total data


In [34]:
# Print the name of the columns in the dfs
print(f"Train and pretest data columns: {train_and_pretest_data.columns}")
print(f"Validation data columns: {valid_data.columns}")
print(f"Test data columns: {test_data.columns}")

# print what values test_data[model]] has
print(f"Test data[model] has the following values: {test_data['model'].unique()}")

Train and pretest data columns: Index(['id', 'prompt_id', 'text', 'generated'], dtype='object')
Validation data columns: Index(['id', 'prompt_id', 'text', 'generated'], dtype='object')
Test data columns: Index(['text', 'label', 'prompt_name', 'source', 'RDizzl3_seven', 'model'], dtype='object')
Test data[model] has the following values: ['human' 'mistral' 'llama' 'gpt' 'claude' 'falcon' 'palm' 'cohere' 'ada'
 'babbage' 'curie' 'davinci']


In [35]:
# Removing Irrelevant Data
# and renaming the columns to be the same for all datasets
# to ['text', 'generated'] as string, boolean

train_and_pretest_data = train_and_pretest_data[['text', 'generated']]
valid_data = valid_data[['text', 'generated']]

# Convert 'generated' from 1 or 0 to True or False for both datasets
train_and_pretest_data['generated'] = train_and_pretest_data['generated'].astype(bool)
valid_data['generated'] = valid_data['generated'].astype(bool)

# For the Test dataset
test_data['generated'] = test_data['model'] != 'human'
test_data = test_data[['text', 'generated']]

print(f"Train and pretest data columns: {train_and_pretest_data.columns}")
print(f"Validation data columns: {valid_data.columns}")
print(f"Test data columns: {test_data.columns}")

train_and_pretest_data.head()

Train and pretest data columns: Index(['text', 'generated'], dtype='object')
Validation data columns: Index(['text', 'generated'], dtype='object')
Test data columns: Index(['text', 'generated'], dtype='object')


Unnamed: 0,text,generated
0,"In recent years, there has been a growing move...",True
1,---\nWhy not cars in our life\n===============...,True
2,A car is considered by many a nessecity for ev...,True
3,"H\n\nello fellow citezens , we are here to inf...",False
4,Have you ever known how if feels not being abl...,True


In [36]:
""" # download these if it's your first time running the code
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
"""

" # download these if it's your first time running the code\nimport nltk\nnltk.download('punkt')\nnltk.download('wordnet')\nnltk.download('stopwords')\nnltk.download('averaged_perceptron_tagger')\nnltk.download('vader_lexicon')\n"

### Start commenting in train_and_pretest_data & test_data from here and below if you want to process all the data

In [37]:
# Tokenization
from nltk import word_tokenize

#train_and_pretest_data['tokenized_text'] = train_and_pretest_data['text'].apply(word_tokenize)
valid_data['tokenized_text'] = valid_data['text'].apply(word_tokenize)
#test_data['tokenized_text'] = test_data['text'].apply(word_tokenize)

In [38]:
# Lowercasing

#train_and_pretest_data['tokenized_text'] = train_and_pretest_data['tokenized_text'].apply(lambda x: [word.lower() for word in x])
valid_data['tokenized_text'] = valid_data['tokenized_text'].apply(lambda x: [word.lower() for word in x])
#test_data['tokenized_text'] = test_data['tokenized_text'].apply(lambda x: [word.lower() for word in x])

In [39]:
# Syntax and Grammar Patterns
from nltk import pos_tag

#train_and_pretest_data['pos_tags'] = train_and_pretest_data['tokenized_text'].apply(pos_tag)
valid_data['pos_tags'] = valid_data['tokenized_text'].apply(pos_tag)
#test_data['pos_tags'] = test_data['tokenized_text'].apply(pos_tag)

"""

Write code here

"""

valid_data['pos_tags'].head()

0    [(_, NN), (,, ,), (_, FW), (_and, NN), (it, PR...
1    [(there, EX), (are, VBP), (advantages, NNS), (...
2    [(limiting, VBG), (car, NN), (usage, NN), (ii,...
3    [(cars, NNS), (have, VBP), (been, VBN), (one, ...
4    [(are, VBP), (cars, NNS), (even, RB), (really,...
Name: pos_tags, dtype: object

In [40]:
# Text Complexity and Diversity
"""Features like sentence length,
lexical diversity, sentiment analysis,
and complexity of ideas could also be
indicative of the source of the text."""

from nltk import sent_tokenize

# Function to calculate average sentence length in words
def average_sentence_length(text):
    sentences = sent_tokenize(text)
    if sentences:  # Check if there are any sentences to avoid division by zero
        return sum(len(word_tokenize(sentence)) for sentence in sentences) / len(sentences)
    else:
        return 0  # Return 0 if text contains no sentences

#train_and_pretest_data['av_sentence_length'] = train_and_pretest_data['text'].apply(average_sentence_length)
valid_data['av_sentence_length'] = valid_data['text'].apply(average_sentence_length)
#test_data['av_sentence_length'] = test_data['text'].apply(average_sentence_length)

# Lexical diversity
from nltk import FreqDist

#train_and_pretest_data['vocabulary_diversity'] = train_and_pretest_data['tokenized_text'].apply(lambda x: len(set(x)) / len(x))
valid_data['vocabulary_diversity'] = valid_data['tokenized_text'].apply(lambda x: len(set(x)) / len(x))
#test_data['vocabulary_diversity'] = test_data['tokenized_text'].apply(lambda x: len(set(x)) / len(x))

# Readability
"""The Flesch Reading Ease score is a
readability test that provides a numerical
score indicating how easy or difficult a
text is to understand. The score is calculated
based on the average length of sentences and
the average number of syllables per word in the text.
Scores typically range from 0 to 100,
with higher scores indicating easier readability.
Here's what the scores generally imply:

90-100: Very easy to read, easily understood by an average 11-year-old student.
60-70: Plain English, easily understood by 13- to 15-year-old students.
0-30: Very difficult to read, best understood by university graduates.
This metric is commonly used in educational settings and for assessing the accessibility of texts to different audiences."""
from textstat import flesch_reading_ease

#train_and_pretest_data['flesch_reading_ease'] = train_and_pretest_data['text'].apply(flesch_reading_ease)
valid_data['flesch_reading_ease'] = valid_data['text'].apply(flesch_reading_ease)
#test_data['flesch_reading_ease'] = test_data['text'].apply(flesch_reading_ease)



# Zipfian Coefficient
"""The Zipfian coefficient is a measure of the
distribution of word frequencies in a text.
It is calculated by plotting the frequency of
each word in the text against its rank in the
frequency table and fitting a curve to the data.
"""
from nltk import ngrams

def zipfian_coefficient(text, n=1):
    ngrams_list = list(ngrams(text, n))
    freq_dist = FreqDist(ngrams_list)
    freq_values = list(freq_dist.values())
    freq_values.sort(reverse=True)
    return freq_values[0] / freq_values[1]

#train_and_pretest_data['zipfian_coefficient'] = train_and_pretest_data['tokenized_text'].apply(zipfian_coefficient)
valid_data['zipfian_coefficient'] = valid_data['tokenized_text'].apply(zipfian_coefficient)
#test_data['zipfian_coefficient'] = test_data['tokenized_text'].apply(zipfian_coefficient)

# Perplexity
# Denne er fra artikeln men jeg fikk den ikke til å fungere
""" "Perplexity serves as another widely used metric for LLM-
generated text detection. It measures the degree of uncer-
tainty or surprise in predicting the next word in a sequence,

based on the preceding words, by calculating the negative
average log-likelihood of the texts under the language model
[5]." """
"""
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

def perplexity(text, n=2):
    train_data, padded_sents = padded_everygram_pipeline(n, text)
    model = MLE(n)
    model.fit(train_data, padded_sents)
    return model.perplexity(text)

#train_and_pretest_data['perplexity'] = train_and_pretest_data['tokenized_text'].apply(perplexity)
valid_data['perplexity'] = valid_data['tokenized_text'].apply(perplexity)
#test_data['perplexity'] = test_data['tokenized_text'].apply(perplexity)
"""


# print the head of the new features for valid_data
valid_data[['av_sentence_length', 'vocabulary_diversity', 'flesch_reading_ease', 'zipfian_coefficient']].head()

Unnamed: 0,av_sentence_length,vocabulary_diversity,flesch_reading_ease,zipfian_coefficient
0,68.666667,0.57767,77.77,1.125
1,41.666667,0.432,57.27,1.3
2,21.0,0.510204,68.81,1.142857
3,23.0,0.536232,83.8,1.222222
4,9.939394,0.027439,96.18,1.0


In [41]:
# Stop word removal
from nltk.corpus import stopwords

# Removing stopwords from the text
#train_and_pretest_data['tokenized_text'] = train_and_pretest_data['tokenized_text'].apply(lambda x: [word for word in x if word not in stopwords.words('english')])
valid_data['tokenized_text'] = valid_data['tokenized_text'].apply(lambda x: [word for word in x if word not in stopwords.words('english')])
#test_data['tokenized_text'] = test_data['tokenized_text'].apply(lambda x: [word for word in x if word not in stopwords.words('english')])

print(f"valid_data['tokenized_text'] after removing stopwords: {valid_data['tokenized_text'].head()}")

valid_data['tokenized_text'] after removing stopwords: 0    [_, ,, _, _and, fact, automobile, ,, danger, g...
1    [advantages, limiting, car, usage, less, green...
2    [limiting, car, usage, ii, beneifial, envirome...
3    [cars, one, advanced, invention, world, ;, las...
4    [cars, even, really, necessary, ?, vehicles, c...
Name: tokenized_text, dtype: object


In [42]:
# Lemmatization

from nltk.stem import WordNetLemmatizer

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize the text
#train_and_pretest_data['tokenized_text'] = train_and_pretest_data['tokenized_text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
valid_data['tokenized_text'] = valid_data['tokenized_text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
#test_data['tokenized_text'] = test_data['tokenized_text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

print(f"valid_data['tokenized_text'] after lemmatization: {valid_data['tokenized_text'].head()}")

valid_data['tokenized_text'] after lemmatization: 0    [_, ,, _, _and, fact, automobile, ,, danger, g...
1    [advantage, limiting, car, usage, le, greenhou...
2    [limiting, car, usage, ii, beneifial, envirome...
3    [car, one, advanced, invention, world, ;, last...
4    [car, even, really, necessary, ?, vehicle, cau...
Name: tokenized_text, dtype: object


In [43]:
# Sentiment Analysis
from nltk.sentiment import SentimentIntensityAnalyzer

#train_and_pretest_data['sentiment_scores'] = train_and_pretest_data['text'].apply(lambda x: SentimentIntensityAnalyzer().polarity_scores(x))
#sentiment_columns = train_and_pretest_data['sentiment_scores'].apply(pd.Series)
#train_and_pretest_data = pd.concat([train_and_pretest_data, sentiment_columns], axis=1)
#train_and_pretest_data.drop('sentiment_scores', axis=1, inplace=True)

valid_data['sentiment_scores'] = valid_data['text'].apply(lambda x: SentimentIntensityAnalyzer().polarity_scores(x))
sentiment_columns = valid_data['sentiment_scores'].apply(pd.Series)
valid_data = pd.concat([valid_data, sentiment_columns], axis=1)
valid_data.drop('sentiment_scores', axis=1, inplace=True)

#test_data['sentiment_scores'] = test_data['text'].apply(lambda x: SentimentIntensityAnalyzer().polarity_scores(x))
#sentiment_columns = test_data['sentiment_scores'].apply(pd.Series)
#test_data = pd.concat([test_data, sentiment_columns], axis=1)
#test_data.drop('sentiment_scores', axis=1, inplace=True)

valid_data[['neg', 'neu', 'pos', 'compound']].head()


Unnamed: 0,neg,neu,pos,compound
0,0.069,0.878,0.053,0.1397
1,0.069,0.832,0.099,0.8277
2,0.044,0.868,0.087,0.9392
3,0.088,0.802,0.11,0.5878
4,0.311,0.689,0.0,-0.9996


In [44]:
# TF-IDF Vectorization


In [None]:
# Remove 'text' and 'tokenized_text' columns from the datasets

In [45]:
# Normalize all features to be between 0 and 1

In [None]:
# Save datasets

In [46]:

# stop and print timer such that we know how the entire notebook takes to run
end = time.time()
print(f"Time taken to run the notebook: {end - start:.2f} seconds")

total_seconds = (end - start) / valid_data_percentage
hours = int(total_seconds // 3600)
minutes = int((total_seconds % 3600) // 60)
seconds = total_seconds % 60

print(f"\nTime it will take to run the entire notebook for all the data:\n{hours} hours, {minutes} minutes, and {seconds:.2f} seconds")


Time taken to run the notebook: 81.41 seconds

Time it will take to run the entire notebook for all the data:
2 hours, 41 minutes, and 46.19 seconds
