# EXPERIMENT

In [None]:
# Installation - do only one time

!pip install matplotlib
!pip install PyPDF2
!pip install nltk
!pip install pandas
!pip install openpyxl
!pip install scipy

In [None]:
# Import - do only one time

import math
import os
from collections import Counter
from string import punctuation

import matplotlib.pyplot as plt
import nltk
import openpyxl
import pandas as pd
import PyPDF2
from nltk.tokenize import word_tokenize
# word_tokenize needs the Punkt models: newer NLTK releases (3.8.2 and later) load them
# from 'punkt_tab', older releases from 'punkt'. Request both so either version works.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

In [None]:
# Function definitions - do only one time

def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        One string made of each page's ``extract_text()`` result, joined
        without any separator between pages.
    """
    with open(file_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Pages must be read while the file is still open.
        page_texts = [page.extract_text() for page in reader.pages]
        return "".join(page_texts)


def extract_text_from_txt(file_path, encoding='utf-8'):
    """Read a plain-text file and return its content as one line of text.

    Every line is stripped of surrounding whitespace, blank lines are dropped
    and the remaining lines are joined with a single space, so the last word
    of one line is never glued to the first word of the next one.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale settings.

    Returns:
        The file content as a single string with line breaks replaced by
        spaces; an empty string for an empty file.
    """
    with open(file_path, encoding=encoding) as f:
        stripped_lines = (line.strip() for line in f)
        return " ".join(line for line in stripped_lines if line)

def calculate_entropy(frequencies):
    """Compute the Shannon entropy (in bits) of a frequency table.

    Args:
        frequencies: Mapping from item to its count (for example a Counter).

    Returns:
        H = -sum(p * log2(p)) over all items with a positive count, where
        p = count / total. Returns 0.0 when there is nothing to count
        (empty mapping or all counts equal to zero).

    Raises:
        ValueError: If any count is negative.
    """
    counts = list(frequencies.values())
    if any(count < 0 for count in counts):
        raise ValueError("frequencies must not contain negative counts")

    total_count = sum(counts)
    if total_count == 0:
        return 0.0

    entropy = 0.0
    for count in counts:
        # Zero counts contribute nothing (p * log2(p) -> 0 as p -> 0) and
        # would otherwise make math.log2 raise a domain error.
        if count > 0:
            probability = count / total_count
            # Subtracting each term avoids a final negation that would turn
            # an all-zero sum into -0.0.
            entropy -= probability * math.log2(probability)
    return entropy

In [None]:
# File paths
# Change only sample_name to switch samples; the input and output paths stay in sync.

sample_name = 'chatgpt12'
file_path = 'resources/' + sample_name + '.txt'
output_path = 'results/' + sample_name + '.xlsx'

In [None]:
# Import Txt files
# Reads the file at file_path into a single string via extract_text_from_txt.

text = extract_text_from_txt(file_path)

In [None]:
# Inspect text
# Prints at most the first 10000 characters of the loaded text.

print(text[:10000])

In [None]:
# Tokenize text
# Tokens are lower-cased after tokenization, then counted per distinct token.
# NOTE(review): nothing is filtered out here, so every token word_tokenize emits is counted
# (`punctuation` is imported but not used in the visible cells) - confirm that is intended.

tokens = list(map(str.lower, word_tokenize(text)))
word_counts = Counter(tokens)

In [None]:
# Print total number of tokens (words)
# len(tokens) counts every token occurrence, repeats included (not the number of distinct tokens).

print(len(tokens))

In [None]:
# Print token result
# Prints the whole Counter: each distinct token together with its number of occurrences.

print(word_counts)

In [None]:
# Create dataframe from tokens
# One row per distinct token (the index) with a single 'Frequency' column, most frequent first.

df = (
    pd.DataFrame.from_dict(word_counts, orient='index', columns=['Frequency'])
    .sort_values(by='Frequency', ascending=False)
)
display(df)

In [None]:
# Save result to excel
# to_excel does not create missing folders, so make sure the target directory exists first.

os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
df.to_excel(output_path, index_label='Token')

In [None]:
# Sort words - frequencies, extract values for calculations
# (token, count) pairs ordered by descending count, then split into two parallel lists.

sorted_word_counts = word_counts.most_common()
words = [pair[0] for pair in sorted_word_counts]
frequencies = [pair[1] for pair in sorted_word_counts]

In [None]:
# Calculate entropy
# dict(sorted_word_counts) rebuilds the token -> count mapping passed to calculate_entropy.

word_frequencies = dict(sorted_word_counts)
entropy = calculate_entropy(word_frequencies)

print("Entropy:", entropy)

In [None]:
# Calculate normalized Shannon entropy (equitability): entropy / Hmax, where
# Hmax = log2(N) is the entropy of N equally frequent distinct tokens.

N = len(words) # Number of distinct tokens (vocabulary size), not the total token count
# With fewer than two distinct tokens Hmax is 0 (or undefined), so the ratio is undefined;
# report NaN instead of failing with a division or math domain error.
Hmax = math.log2(N) if N > 1 else 0.0
equitability = entropy / Hmax if Hmax > 0 else math.nan

print("Normalized Shannon entropy (equitability):", equitability)

In [None]:
# Graph
# Token frequency plotted against the token's position in the descending-frequency ordering.

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(len(words)), frequencies, marker='o')
ax.set_xlabel('Index')
ax.set_ylabel('Frequency')
ax.set_title('Zipf distribution')
fig.tight_layout()
plt.show()

# STATISTICS - Calculated after the whole experiment

In [None]:
# Imports - Do only once
import numpy as np
import scipy.stats as stats

In [None]:
# Sets of Shannon entropy results - CAUTION: VALUES ARE ADDED MANUALLY AFTER CONDUCTING EXPERIMENT!!!!!
# NOTE(review): presumably one value per analysed text, copied from the entropy / Hmax
# (equitability) output of the experiment cells - confirm against the experiment runs.
humans = [0.833113900126817, 0.8271454592911431, 0.8308515612682966, 0.8260935292878034, 0.8046836710696255, 0.8032746326804925, 0.8275908995933942, 0.8195706120205691, 0.8292101326005887, 0.8049577285396834, 0.8074345124072044, 0.8361819144172309]
chatgpt = [0.7803750294587815, 0.7809830951558806, 0.8158560359315511, 0.8291589192856613, 0.8157356267811509, 0.8203193596589072, 0.8031243808928831, 0.8161985942211579, 0.8191580234185233, 0.7998535368220583, 0.8013652965671577, 0.8051804650034923]

In [None]:
# Mean
# Arithmetic mean of each group, followed by the humans-minus-chatgpt difference.
mean_humans = np.average(humans)
mean_chatgpt = np.average(chatgpt)
mean_difference = mean_humans - mean_chatgpt

print(mean_humans, mean_chatgpt, sep="\n")
print("Difference: ", str(mean_difference))

In [None]:
# Median
# Median of each group, followed by the humans-minus-chatgpt difference.
median_humans = np.median(humans)
median_chatgpt = np.median(chatgpt)
median_difference = median_humans - median_chatgpt

print(median_humans, median_chatgpt, sep="\n")
print("Difference: ", str(median_difference))

In [None]:
# Variance
# Both lists are samples, so use the unbiased sample variance (ddof=1).
# np.var defaults to ddof=0, the population variance, which underestimates
# the spread of a small sample.
variance_humans = np.var(humans, ddof=1)
print(variance_humans)
variance_chatgpt = np.var(chatgpt, ddof=1)
print(variance_chatgpt)

In [None]:
# T-test
# Independent two-sample t-test on the two groups; the result is the cell's displayed output.
# NOTE(review): equal_var=True selects the pooled-variance (Student) test; equal_var=False
# would run Welch's test, which does not assume equal variances - confirm the assumption holds.
stats.ttest_ind(a=humans, b=chatgpt, equal_var=True)