# Comparing stop words and tokens in snippets

Lynette Boos

In [1]:
import nltk
from nltk.corpus import treebank
from statistics import mean

from nltk.corpus import stopwords
import string
import matplotlib.pyplot as plt
from collections import Counter
from math import log, log10
import csv

In [25]:
def read_csvtagged(csvtagged_path: str):
    """
    Reads rows from a csv .tagged file.
    Each row consists of 3 columns of information:

    COLUMN	DESCRIPTION
    ID	Unique ID for this datapoint
    TEXT	Two snippets of text separated by [SNIPPET]
    LABEL	The label for this datapoint (see below)

    The labels are:
    0	Not the same author
    1	Same author
    """
    rows = []
    with open(csvtagged_path, newline='') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            rows.append((row[0], row[1], row[2],row[3]))
    return rows

everything = read_csvtagged('/Users/lynette/Desktop/Ling-582/Class_Competition/train_split_sorted.csv')[1:]

diff = everything[:1245]
same = everything[1245:]
print(f"len diff: {len(diff)}")
print(f"len same: {len(same)}")

diffa = [text for _,text,_,_ in diff]
diffb = [text for _,_,text,_ in diff]
samea = [text for _,text,_,_ in same]
sameb = [text for _,_,text,_ in same]
# print(len(diffa))

len diff: 1245
len same: 356


## Difference in variety of tokens used

In [57]:
diff_tok = [(set(nltk.word_tokenize(texta)), set(nltk.word_tokenize(textb))) for _,texta,textb,_ in diff]
same_tok = [(set(nltk.word_tokenize(texta)), set(nltk.word_tokenize(textb))) for _,texta,textb,_ in same]

diff_tok_variety_diff = [abs(len(toka)-len(tokb)) for toka,tokb in diff_tok]
same_tok_variety_diff = [abs(len(toka)-len(tokb)) for toka,tokb in same_tok]
freqMeanDiffTok = mean(diff_tok_variety_diff)
freqMeanSameTok = mean(same_tok_variety_diff)
print(f"mean difference in variety of tokens for different texts: {freqMeanDiffTok}")
print(f"mean difference in variety of tokens for same texts: {freqMeanSameTok}")

avg_diff = mean([freqMeanDiffTok,freqMeanSameTok])
print(avg_diff)

diff_counter = 0
for row in diff_tok_variety_diff:
    if abs(row) > avg_diff:
        diff_counter+= 1
print(f"number of different texts with difference in variety of tokens higher than mean: {diff_counter}")

same_counter = 0
for row in same_tok_variety_diff:
    if abs(row) < avg_diff:
        same_counter+= 1
print(f"number of same texts with difference in variety of tokens lower than mean: {same_counter}")

mean difference in variety of tokens for different texts: 35.42168674698795
mean difference in variety of tokens for same texts: 30.60674157303371
33.01421416001083
number of different texts with difference in variety of tokens higher than mean: 533
number of same texts with difference in variety of tokens lower than mean: 234


## Density of stop words

In [52]:
diffa_stop = []
for row in diffa:
    row_stop = []
    for word in row.lower().split():
        if word in set(stopwords.words('english')):
            row_stop.append(word)
    diffa_stop.append(row_stop)
# print(diffa_stop[:10])
# print(f"len of diffa_stop: {len(diffa_stop)}")

diffb_stop = []
for row in diffb:
    row_stop = []
    for word in row.lower().split():
        if word in set(stopwords.words('english')):
            row_stop.append(word)
    diffb_stop.append(row_stop)
# print(diffb_stop[:10])
# print(f"len of diffa_stop: {len(diffa_stop)}")

samea_stop = []
for row in samea:
    row_stop = []
    for word in row.lower().split():
        if word in set(stopwords.words('english')):
            row_stop.append(word)
    samea_stop.append(row_stop)
# print(diffa_stop[:10])
# print(f"len of diffa_stop: {len(diffa_stop)}")

sameb_stop = []
for row in sameb:
    row_stop = []
    for word in row.lower().split():
        if word in set(stopwords.words('english')):
            row_stop.append(word)
    sameb_stop.append(row_stop)
# print(diffb_stop[:10])
# print(f"len of diffa_stop: {len(diffa_stop)}")

In [55]:
diff_stop_density = []
same_stop_density = []

for i in range(len(diff)):
    diff_stop_density.append((len(diffa_stop[i])/len(diffa[i]),len(diffb_stop[i])/len(diffa[i])))
    
for i in range(len(same)):
    same_stop_density.append((len(samea_stop[i])/len(samea[i]),len(sameb_stop[i])/len(samea[i])))
# print(same_stop_density[:10])

diff_stop_diff = [abs(a-b) for a,b in diff_stop_density]
same_stop_diff = [abs(a-b) for a,b in same_stop_density]
freqMeanDiffDensity = mean(diff_stop_diff)
freqMeanSameDensity = mean(same_stop_diff)
print(f"mean difference in density of stop words for different texts: {freqMeanDiffDensity}")
print(f"mean difference in density of stop words for same texts: {freqMeanSameDensity}")

avg_diff = mean([freqMeanDiffDensity,freqMeanSameDensity])
print(avg_diff)

diff_counter = 0
for row in diff_stop_diff:
    if abs(row) > avg_diff:
        diff_counter+= 1
print(f"number of different texts with density of stop words higher than mean: {diff_counter}")

same_counter = 0
for row in same_stop_diff:
    if abs(row) < avg_diff:
        same_counter+= 1
print(f"number of same texts with density of stop words lower than mean: {same_counter}")

mean difference in variety of tokens for different texts: 0.07892826523979596
mean difference in variety of tokens for same texts: 0.06025661241210909
0.06959243882595252
number of different texts with difference in variety of tokens higher than mean: 394
number of same texts with difference in variety of tokens lower than mean: 270


## Difference in actual tokens used

In [28]:
diff_tok_variety_diff = [len(set(toka)-set(tokb)) for toka,tokb in diff_tok]
same_tok_variety_diff = [len(set(toka)-set(tokb)) for toka,tokb in same_tok]
freqMeanDiffTok = mean(diff_tok_variety_diff)
freqMeanSameTok = mean(same_tok_variety_diff)
print(f"mean difference in variety of tokens for different texts: {freqMeanDiffTok}")
print(f"mean difference in variety of tokens for same texts: {freqMeanSameTok}")

avg_diff = mean([freqMeanDiffTok,freqMeanSameTok])
print(avg_diff)

diff_counter = 0
for row in diff_tok_variety_diff:
    if abs(row) > avg_diff:
        diff_counter+= 1
print(f"number of different texts with different tokens higher than mean: {diff_counter}")

same_counter = 0
for row in same_tok_variety_diff:
    if abs(row) < avg_diff:
        same_counter+= 1
print(f"number of same texts with different tokens lower than mean: {same_counter}")

mean difference in variety of tokens for different texts: 59.40803212851406
mean difference in variety of tokens for same texts: 56.28651685393258
57.84727449122332
number of different texts with different tokens higher than mean: 541
number of same texts with different tokens lower than mean: 224
