# Appendix 1. Code (Python)

The following is the complete code used for the analyses in the main text.

Data used in the paper can be found at https://github.com/jhshlee/ling4181-progress/tree/main

All of the code below is written in Python.

In [None]:
# Package imports and NLTK data download

import nltk
import os
import random
import pandas as pd
import collections
import string
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist

# Fetch every NLTK resource this notebook relies on.
nltk.download('punkt')      # tokenizer models used by sent_tokenize / word_tokenize
nltk.download('punkt_tab')  # tokenizer tables that newer NLTK releases load instead of 'punkt'
nltk.download('stopwords')  # stop-word lists read by the lexical density cell
print("Done!")

In [None]:
# Preprocessing for everything except mean sentence length

def preprocess(filename, encoding=None):
    """Read a text file and return it as a list of normalised tokens.

    Newlines are replaced by spaces, the text is lower-cased, every character
    in ``string.punctuation`` is deleted, and the result is split on
    whitespace.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding passed to ``open``; ``None`` keeps the
            platform default.

    Returns:
        A list of token strings.
    """
    # The context manager closes the file handle as soon as the text is read.
    with open(filename, 'r', encoding=encoding) as handle:
        text = handle.read()
    text = text.replace("\n", " ").lower()
    return text.translate(str.maketrans("", "", string.punctuation)).split()

In [None]:
# preprocessing:
# Sample texts, one token list per author (a-f).
text_a, text_b, text_c, text_d, text_e, text_f = [
    preprocess(path)
    for path in (
        'author_a.txt',
        'author_b.txt',
        'author_c.txt',
        'author_d.txt',
        'author_e.txt',
        'author_f.txt',
    )
]

# Query texts, tokenised the same way as the samples.
q_a, q_b, q_c, q_d, q_e, q_f = [
    preprocess(path)
    for path in ('q1.txt', 'q2.txt', 'q3.txt', 'q4.txt', 'q5.txt', 'q6.txt')
]

In [None]:
# Lexical Analysis
## type-token ratio
def typetoken_ratio(text):
    """Return the type-token ratio of a token list, rounded to 4 decimals.

    Types are the distinct tokens; the ratio is types divided by tokens.
    """
    n_types = len(set(text))
    n_tokens = len(text)
    ratio = n_types / n_tokens
    return round(ratio, 4)

In [None]:
## simpson's D
def simpson_D(text):
    """Return Simpson's D of a token list, rounded to 4 decimals.

    D = sum over r of V(r) * (r^2 - r) / (n^2 - n), where n is the number of
    tokens and V(r) is the number of types that occur exactly r times.

    Returns 1 when the formula is undefined or trivially 1: fewer than two
    tokens, or a text made of a single repeated type.
    """
    n = len(text)
    count = collections.Counter(text)
    # n < 2 would divide by zero; a single type gives (n^2 - n) / (n^2 - n).
    if n < 2 or len(count) == 1:
        return 1
    # V(r) for every frequency r that actually occurs, built in one pass
    # instead of rescanning all types for each r in 1..n.
    voc = collections.Counter(count.values())
    denominator = n**2 - n
    # Ascending r keeps the floating-point summation order of a plain
    # r = 1..n loop (frequencies with V(r) == 0 contribute nothing).
    return round(sum(voc[r] * (r**2 - r) / denominator for r in sorted(voc)), 4)

In [None]:
## Big Word index
def BWI(text):
    """Return the share of tokens that are at least 7 characters long."""
    long_tokens = sum(1 for token in text if len(token) >= 7)
    return long_tokens / len(text)

In [None]:
## Mean sentence length
def mean_sent(filename, encoding=None):
    """Return the mean sentence length, in words, of a raw text file.

    The file must hold raw text (not pre-tokenised, punctuation intact),
    because sentence boundaries are found with ``sent_tokenize``. Each
    sentence is then stripped of ``string.punctuation`` and split on
    whitespace to count its words.

    Args:
        filename: Path of the raw text file.
        encoding: Text encoding passed to ``open``; ``None`` keeps the
            platform default.

    Returns:
        Total word count divided by the number of sentences.
    """
    # The context manager closes the file handle as soon as the text is read.
    with open(filename, 'r', encoding=encoding) as handle:
        text = handle.read().replace("\n", " ").lower()
    # NOTE(review): the text is lower-cased before sentence splitting; if the
    # tokenizer uses capitalisation cues this may shift boundaries - confirm.
    sentences = sent_tokenize(text)
    # Build the punctuation-removal table once rather than per sentence.
    strip_punct = str.maketrans("", "", string.punctuation)
    lengths = [len(sent.translate(strip_punct).split()) for sent in sentences]
    return sum(lengths) / len(lengths)

In [None]:
## Mean word length
def mean_word(text):
    """Return the average number of characters per token in a token list."""
    total_chars = 0
    for token in text:
        total_chars += len(token)
    return total_chars / len(text)

In [None]:
### Splitting sample texts to match lengths
# Keep only the first 15000 tokens of samples a-c so the three texts have
# matching lengths. NOTE(review): the names say "5k" but the cut-off is 15000.
text_a_5k, text_b_5k, text_c_5k = (
    tokens[:15000] for tokens in (text_a, text_b, text_c)
)

In [None]:
## Word length distribution
def wordlength(text):
    """Return the word-length distribution of a token list.

    Returns:
        A dict mapping every length from 1 up to the longest token's length
        to the number of tokens of that length (0 for lengths that do not
        occur). An empty token list gives an empty dict.
    """
    # One pass over the text instead of one full scan per candidate length.
    counts = collections.Counter(len(word) for word in text)
    if not counts:
        return {}
    longest = max(counts)
    return {size: counts[size] for size in range(1, longest + 1)}

In [None]:
## Lexical density
# Import the corpus reader under its own name: binding the stop-word set to
# the same name as the reader would make this cell fail when it is re-run.
from nltk.corpus import stopwords as stopwords_corpus
stopwords = set(stopwords_corpus.words('english'))

def l_density(text):
    """Return the lexical density of a token list, rounded to 4 decimals.

    Lexical density here is the share of tokens that are not in the English
    stop-word set bound to the module-level name ``stopwords``.
    """
    content_words = [w for w in text if w not in stopwords]
    return round(len(content_words) / len(text), 4)

In [None]:
# Distance-based analysis (Burrows' Delta)
## Most frequent word list
def MFW(text, n=300):
    """Return the most frequent words of a token list.

    Args:
        text: List of tokens.
        n: How many words to return; defaults to 300.

    Returns:
        A list of ``(word, count)`` pairs as produced by
        ``FreqDist.most_common``.
    """
    return FreqDist(text).most_common(n)

def MFW_100(text):
    """Return the percentage of tokens covered by the words MFW(text) returns."""
    covered = 0
    for _word, frequency in MFW(text):
        covered += frequency
    return 100 * covered / len(text)

In [None]:
## Absolute frequency table
def abs_table(xa, xb, xc):
    """Build the absolute frequency table shared by three texts.

    A word is kept only if it is among the most frequent words (as returned
    by ``MFW``) of all three texts. Rows follow the frequency ranking of
    ``xa``.

    Args:
        xa, xb, xc: Token lists of the three texts.

    Returns:
        A DataFrame indexed by ``word`` with integer count columns
        ``a``, ``b`` and ``c``.
    """
    mfw_a = MFW(xa)  # computed once; also fixes the row order
    counts_b = dict(MFW(xb))
    counts_c = dict(MFW(xc))
    # Select the shared words up front instead of filling placeholders and
    # dropping NaN rows on every loop iteration.
    rows = [
        (word, count_a, counts_b[word], counts_c[word])
        for word, count_a in mfw_a
        if word in counts_b and word in counts_c
    ]
    table = pd.DataFrame(rows, columns=['word', 'a', 'b', 'c'])
    table.set_index('word', inplace=True)
    return table

In [None]:
## Relative frequency table
def rel_table(xa, xb, xc):
    """Build the relative frequency table for three texts.

    Args:
        xa, xb, xc: Token lists of the three texts.

    Returns:
        The table from ``abs_table`` with columns ``a``, ``b``, ``c``
        converted to relative frequencies (count / text length, rounded to
        5 decimals), plus ``words`` (copy of the index), ``mean`` and ``sd``
        (row-wise over ``a``, ``b``, ``c``).
    """
    table = abs_table(xa, xb, xc).astype(float)
    table["words"] = table.index
    for column, tokens in (("a", xa), ("b", xb), ("c", xc)):
        table[column] = (table[column] / len(tokens)).round(5)
    # Reduce over the three frequency columns explicitly: the string column
    # "words" must not take part (pandas >= 2 raises on it).
    freqs = table[["a", "b", "c"]]
    table["mean"] = freqs.mean(axis='columns')
    # Population SD (ddof=0). This equals the default ddof=1 std taken over
    # a, b, c together with their own mean, which adds one observation with
    # zero deviation.
    table["sd"] = freqs.std(axis='columns', ddof=0)
    return table

In [None]:
## Z-score table
def zscore_table(a, b, c, q=None):
    """Build the z-score table used for Burrows' Delta.

    Args:
        a, b, c: Token lists of the three candidate texts.
        q: Token list of the query text. When omitted, the module-level
            variable ``q`` is used, so existing three-argument calls keep
            working.

    Returns:
        A DataFrame indexed by word with the relative frequencies ``a``,
        ``b``, ``c``, ``q``, their row-wise ``mean`` and ``sd`` (over
        ``a``, ``b``, ``c`` only) and the z-scores ``z_a``, ``z_b``,
        ``z_c``, ``z_q``. Rows containing NaN (e.g. words absent from the
        query text) are removed.

    Raises:
        NameError: If ``q`` is omitted and no module-level ``q`` exists.
    """
    if q is None:
        try:
            q = globals()["q"]
        except KeyError:
            raise NameError("name 'q' is not defined") from None
    q_counts = collections.Counter(q)
    q_len = len(q)

    table = abs_table(a, b, c).astype(float)
    for column, tokens in (("a", a), ("b", b), ("c", c)):
        table[column] = (table[column] / len(tokens)).round(5)
    # Snapshot of the three candidate columns so mean/sd never see "q".
    freqs = table[["a", "b", "c"]]

    # Relative frequency in the query text; NaN marks words it lacks.
    table["q"] = [
        round(q_counts[word] / q_len, 5) if word in q_counts else np.nan
        for word in table.index
    ]
    table["mean"] = freqs.mean(axis='columns')
    # Population SD (ddof=0). This equals the default ddof=1 std taken over
    # a, b, c together with their own mean, which adds one observation with
    # zero deviation.
    table["sd"] = freqs.std(axis='columns', ddof=0)

    for column in ("a", "b", "c", "q"):
        table["z_" + column] = (table[column] - table["mean"]) / table["sd"]
    table.dropna(inplace=True)  # deletes rows that contain NaN
    return table

In [None]:
## Delta score
def delta(df):
    """Return the Delta scores of columns z_a, z_b and z_c against z_q.

    Each score is the mean absolute difference between one candidate's
    z-scores and the query text's z-scores, rounded to 5 decimals.

    Returns:
        A tuple ``(delta_a, delta_b, delta_c)``.
    """
    def mean_abs_gap(column):
        gaps = abs(df[column] - df["z_q"])
        return round(sum(list(gaps)) / len(df), 5)

    return tuple(mean_abs_gap(column) for column in ("z_a", "z_b", "z_c"))

## Tables and visualizations

In [None]:
# Tables 1-3 (type-token ratio):

# Number of tokens
# Token count of each sample text, one value per line (a-f).
print(*(len(tokens) for tokens in (text_a, text_b, text_c, text_d, text_e, text_f)), sep="\n")

# Number of types
# Distinct-token count of each sample text, one value per line (a-f).
print(*(len(set(tokens)) for tokens in (text_a, text_b, text_c, text_d, text_e, text_f)), sep="\n")

# Token and type counts per sample text, transcribed from the printed values.
tt = pd.DataFrame({'token':[21654, 22897, 15234, 51333, 51063, 50910],
                        'type':[3560, 3640, 2287, 4848, 6679, 7382]}, index = ['a', 'b', 'c', 'd','e','f'])
# Compute the ratio from tt itself (ttratio does not exist yet at this point),
# then store it both as its own Series and as a column of tt.
ttratio = round(tt["type"] / tt["token"],4)
tt["ratio"] = ttratio

In [None]:
# Table 1 (tt) followed by Table 2 (ttratio)
print(tt, ttratio, sep="\n")

In [None]:
# Type-token ratio of each query text, one value per line (q_a-q_f).
print(*(typetoken_ratio(tokens) for tokens in (q_a, q_b, q_c, q_d, q_e, q_f)), sep="\n")

# Table 3
# Sample vs. query type-token ratios, entered by hand.
ttratio_q = pd.DataFrame(
    {
        'sample': [0.1644, 0.1590, 0.1501, 0.0944, 0.1308, 0.1450],
        'query': [0.2593, 0.3952, 0.3203, 0.3065, 0.3578, 0.3496],
    },
    index=list('abcdef'),
)
print(ttratio_q)

In [None]:
# Table 5 (Simpson's D)
# Simpson's D of the six sample texts, then of the six query texts.
print(
    *(
        simpson_D(tokens)
        for tokens in (
            text_a, text_b, text_c, text_d, text_e, text_f,
            q_a, q_b, q_c, q_d, q_e, q_f,
        )
    ),
    sep="\n",
)
# These results were compiled into a .csv file and imported back to create a dataframe:

simpsonD = pd.read_csv('Simpson_D.csv').set_index('index')

In [None]:
# Table 7 (Big Words Index)
# Big Word index of the six sample texts, then of the six query texts.
print(
    *(
        BWI(tokens)
        for tokens in (
            text_a, text_b, text_c, text_d, text_e, text_f,
            q_a, q_b, q_c, q_d, q_e, q_f,
        )
    ),
    sep="\n",
)
# These results were compiled into a .csv file and imported back to create a dataframe.

# The dataframe is bound to BWI_table rather than BWI: reusing the name BWI
# would overwrite the BWI() function and break this cell when it is re-run.
BWI_table = pd.read_csv('BWI.csv')
BWI_table.set_index('index',inplace=True)

In [None]:
# Table 9 (Mean sentence length)

# Mean sentence length of the raw sample files (authors a-f).
sent_a, sent_b, sent_c, sent_d, sent_e, sent_f = [
    mean_sent(path)
    for path in (
        'author_a.txt',
        'author_b.txt',
        'author_c.txt',
        'author_d.txt',
        'author_e.txt',
        'author_f.txt',
    )
]
# Mean sentence length of the raw query files.
q_sent_a, q_sent_b, q_sent_c, q_sent_d, q_sent_e, q_sent_f = [
    mean_sent(path)
    for path in ('q1.txt', 'q2.txt', 'q3.txt', 'q4.txt', 'q5.txt', 'q6.txt')
]

print(
    sent_a, sent_b, sent_c, sent_d, sent_e, sent_f,
    q_sent_a, q_sent_b, q_sent_c, q_sent_d, q_sent_e, q_sent_f,
    sep="\n",
)
# These results were compiled into a .csv file and imported back to create a dataframe:

MSL = pd.read_csv('MSL.csv').set_index('index')

In [None]:
# Table 11 (Mean word length)
# Samples a-c use the length-matched slices defined in the "Splitting sample
# texts to match lengths" cell; the names text_*_split are not defined anywhere.
# NOTE(review): confirm the slices (not the full texts) were used for Table 11.
print(mean_word(text_a_5k))
print(mean_word(text_b_5k))
print(mean_word(text_c_5k))
print(mean_word(text_d))
print(mean_word(text_e))
print(mean_word(text_f))
print(mean_word(q_a))
print(mean_word(q_b))
print(mean_word(q_c))  # third query text (q_f was printed twice and q_c skipped)
print(mean_word(q_d))
print(mean_word(q_e))
print(mean_word(q_f)) # these results were compiled into a .csv file and imported back to create a dataframe:

MWL = pd.read_csv('MWL.csv')
MWL.set_index('index',inplace=True)

In [None]:
# a function for data correction for word length distribution graph (figure 2&3)
def find_length(text,r):
    """Return, in order, every token of ``text`` whose length is exactly ``r``."""
    return [token for token in text if len(token) == r]

In [None]:
# Table 13 (Lexical Density)
# Lexical density of the six sample texts.
print(*(l_density(tokens) for tokens in (text_a, text_b, text_c, text_d, text_e, text_f)), sep="\n")

# Lexical density of the six query texts.
print(*(l_density(tokens) for tokens in (q_a, q_b, q_c, q_d, q_e, q_f)), sep="\n")
# These results were compiled into a .csv file and imported back to create a dataframe:

LD = pd.read_csv('LD.csv').set_index('index')
LD

In [None]:
# Burrows' Delta
# zscore_table() is called with three arguments here, so it reads the query
# text from the module-level name `q`. `q` must therefore be rebound right
# before each call; do not reorder or merge these statement pairs.
# Query texts 1-3 are compared against samples a-c ...
q = q_a
abc_a = zscore_table(text_a, text_b, text_c) # Table 15
q = q_b
abc_b = zscore_table(text_a, text_b, text_c)
q = q_c
abc_c = zscore_table(text_a, text_b, text_c)
# ... and query texts 4-6 against samples d-f.
q = q_d
def_d = zscore_table(text_d, text_e, text_f)
q = q_e
def_e = zscore_table(text_d, text_e, text_f)
q = q_f
def_f = zscore_table(text_d, text_e, text_f)

In [None]:
# Delta scores (one 3-tuple per line) for each of the six z-score tables.
print(*(delta(table) for table in (abc_a, abc_b, abc_c, def_d, def_e, def_f)), sep="\n")

In [None]:
# Table 17 (F1-score)
# Load the F1 scores from the .csv file, indexed by its 'index' column.
f1 = pd.read_csv('f1_score.csv').set_index('index')
f1

In [None]:
# Table 18
# Load the .csv file behind Table 18, indexed by its 'data' column.
assigned = pd.read_csv('assigned_table.csv').set_index('data')
assigned

In [None]:
# Table 19
# Load the .csv file behind Table 19, indexed by its 'data' column.
acc = pd.read_csv('ans_rate.csv').set_index('data')
acc