### Part 1: Data Preporcessing

In [1]:
#imports
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [2]:
# Necessary data is downloaded
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('universal_tagset')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/iremesendemir/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/iremesendemir/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/iremesendemir/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/iremesendemir/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/iremesendemir/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# reading the file
corpora_file = open("Jane Austen Processed.txt", "r")
corpora = corpora_file.read()
corpora_file.close()

In [None]:
# tokenize the text
tokens = word_tokenize(corpora)

In [33]:
# part 1.b
N = len(tokens)
print("Number of tokens in the corpus (N):", N);

Number of tokens in the corpus (N): 686618


In [None]:
# tokenized text
pos_tags = pos_tag(tokens, tagset='universal')

In [7]:
# lemmatizing the tokens based on the pos tags
def lemmatize_tokens(pos_tags):
    tag_dict = {"ADJ": wordnet.ADJ,
                "NOUN": wordnet.NOUN}
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []
    for word, pos_tag in pos_tags:
        if pos_tag in tag_dict:
            lemmatized_token = lemmatizer.lemmatize(word, tag_dict[pos_tag]).lower()
        else:
            lemmatized_token = word.lower()
        lemmatized_tokens.append((lemmatized_token, pos_tag))
    return lemmatized_tokens

In [None]:
# lemmatized tokens
lemmatized_tokens = lemmatize_tokens(pos_tags)

In [9]:
# part 1.d, 
part1d_tokens = ["that","the","london","honor","."]
#the counts of the query_tokens words in the lemmatized_tokens
word_counts = {word: 0 for word in part1d_tokens}
for word, _ in lemmatized_tokens:
    if word in word_counts:
        word_counts[word] += 1

print("The counts of the part 1.d words are:\n")
for word, count in word_counts.items():
    print(f"{word}: {count}")


The counts of the part 1.d words are:

that: 6432
the: 21715
london: 193
honor: 2
.: 18319


In [10]:
# all bigrams are found based on the given window size, with their POS tags
def find_all_bigrams(lemmatized_tokens, window_size):
    bigrams = []
    for i in range(len(lemmatized_tokens)):
        word_1_pair = lemmatized_tokens[i]
        window = lemmatized_tokens[i+1:i+1+window_size]
        for word_pair in window:
            bigram = (word_1_pair, word_pair)
            bigrams.append(bigram)
    return bigrams

In [12]:
# filtering the bigrams by their pos tag 
def filter_by_tag(bigrams):
    candidate_collocations = []
    for bigram in bigrams:
        if (bigram[0][1] == "NOUN" or bigram[0][1] == "ADJ") and bigram[1][1] == "NOUN":
            candidate_collocation = (bigram[0][0], bigram[1][0])
            candidate_collocations.append(candidate_collocation)
    return candidate_collocations

In [16]:
# eliminate bigrams including stopwords (https://gist.github.com/sebleier/554280).
def get_stopwords():
    stop_words_file = open("nltk_list_of_english_stopwords.txt","r", encoding='utf-8-sig')
    stop_words_list = [stop_word.strip() for stop_word in stop_words_file]
    stop_words_file.close()
    return stop_words_list

In [17]:
# filtering the bigrams by the stop words
def filter_by_stopwords(bigrams, stopwords):
    candidate_collocations = []
    for bigram in bigrams:
        if bigram[0] not in stopwords and bigram[1] not in stopwords:
            candidate_collocation = bigram
            candidate_collocations.append(candidate_collocation)
    return candidate_collocations

In [19]:
# filtering the bigrams by the punctuation_marks 
def filter_by_punctuation_marks(bigrams):
    candidate_collocations = []
    for word1, word2 in bigrams:   
        if word1.isalpha() and word2.isalpha():
            bigram = (word1, word2)
            candidate_collocations.append(bigram)
    return candidate_collocations

In [20]:
# filtering the bigrams by their occurance number
def filter_by_occurance(bigrams, min_occurance):
    bigram_counts = {}
    # Count occurrences
    for bigram in bigrams:
        if bigram not in bigram_counts:
            bigram_counts[bigram] = 1
        else:
            bigram_counts[bigram] += 1

    # Filter bigrams by min_occurance
    candidate_collocations = {}
    for bigram, occurance in bigram_counts.items():
        if occurance >= min_occurance:
            candidate_collocations[bigram] = occurance
    return candidate_collocations.keys(), candidate_collocations


In [21]:
# get the word count of the bigrams
def get_word_counts(bigrams):
    word_counts = {"word1":{}, "word2":{}}
    for ((word1, pos_tag1), (word2, pos_tag2)) in bigrams:
        if word1 not in word_counts["word1"]:
            word_counts["word1"][word1] = 1
        else:
            word_counts["word1"][word1] += 1
            
        if word2 not in word_counts["word2"]:
            word_counts["word2"][word2] = 1
        else:
            word_counts["word2"][word2] += 1
    return word_counts

In [22]:
# candidate collocations, bigram counts and word counts are found for the given window sizes
window_sizes = [1,3]
collocations_dict = {}
stopwords = get_stopwords()
for window_size in window_sizes:
    bigrams = find_all_bigrams(lemmatized_tokens, window_size) #print(f"Number of bigrams for window size {window_size}:", len(bigrams))
    candidate_collocations = filter_by_tag(bigrams) 
    candidate_collocations = filter_by_stopwords(candidate_collocations, stopwords) 
    candidate_collocations = filter_by_punctuation_marks(candidate_collocations) 
    candidate_collocations, bigram_counts = filter_by_occurance(candidate_collocations, min_occurance=10)
    word_counts = get_word_counts(bigrams)
    collocations_dict[window_size] = {
        "candidate_collocations": candidate_collocations,
        "bigram_counts": bigram_counts,
        "word_counts": word_counts,
        "bigrams": bigrams
    }

In [23]:
#part 1.e
window_1_bigrams = collocations_dict[1]["bigrams"]
window_1_bigrams_without_pos_tags = []
for ((word_1, pos_tag_1),(word_2, pos_tag_2)) in window_1_bigrams:
    window_1_bigrams_without_pos_tags.append((word_1, word_2))

asked_bigram = ("good", "company")
print("Count of {}: {}".format(asked_bigram, window_1_bigrams_without_pos_tags.count(asked_bigram)))

window_1_bigrams = collocations_dict[3]["bigrams"]
window_1_bigrams_without_pos_tags = []
for ((word_1, pos_tag_1),(word_2, pos_tag_2)) in window_1_bigrams:
    window_1_bigrams_without_pos_tags.append((word_1, word_2))
asked_bigram = ("old", "friend")
print("Count of {}: {}".format(asked_bigram, window_1_bigrams_without_pos_tags.count(asked_bigram)))

Count of ('good', 'company'): 11
Count of ('old', 'friend'): 16


In [24]:
#part 1.f

asked_bigram = ("mr.","skimpole")
result = asked_bigram in collocations_dict[1]["candidate_collocations"]
print("Is the bigram {} a candidate collocation: {}".format(asked_bigram, result))


#part 1.f
asked_bigram = ("large", "fortune")
result = asked_bigram in collocations_dict[3]["candidate_collocations"]
print("Is the bigram {} a candidate collocation: {}".format(asked_bigram, result))


Is the bigram ('mr.', 'skimpole') a candidate collocation: False
Is the bigram ('large', 'fortune') a candidate collocation: True


### Part 2: Finding the Collocations

In [26]:
import pandas as pd
import numpy as np
from scipy.stats import binom
from IPython.display import display

In [27]:
# calculating t-score
def t_test_score(bigram, bigram_counts, word_counts, N):
    word1 = bigram[0]
    word2 = bigram[1]
    bigram_count = bigram_counts[bigram]
    word1_count = word_counts["word1"][word1]
    word2_count = word_counts["word2"][word2]
    # t_score = sample_mean - population_mean / (sample_variance / N)^0.5
    mle = bigram_count / N
    h0 = (word1_count/N) * (word2_count/N)
    sample_mean = mle
    sample_variance = mle # as p is small, p is approximated by mle
    t_score = (sample_mean - h0) / (sample_variance / N)**0.5
    return t_score

In [29]:
# calculating chi-square score
def chi_square_score(bigram, bigram_counts, word_counts, N):
    word1 = bigram[0]
    word2 = bigram[1]
    bigram_count = bigram_counts[bigram]
    word1_count = word_counts["word1"][word1]
    word2_count = word_counts["word2"][word2]
    #observed matrix 
    observed = np.array([[bigram_count, word2_count - bigram_count], 
                         [word1_count - bigram_count, N - (word1_count + word2_count - bigram_count)]
                        ], dtype=np.float64)

    X2_num = N * (observed[0][0]*observed[1][1] - observed[0][1]*observed[1][0])**2 
    X2_denum = (observed[0][0] + observed[0][1]) * (observed[0][0] + observed[1][0]) * (observed[0][1] + observed[1][1]) * (observed[1][0] + observed[1][1])
    X2 = X2_num / X2_denum
    return X2

In [30]:
# calculating the likelihood ratio score
def likelihood_ratio_score(bigram, bigram_counts, word_counts, N):
    word1 = bigram[0]
    word2 = bigram[1]
    c12 = bigram_counts[bigram]
    c1 = word_counts["word1"][word1]
    c2 = word_counts["word2"][word2]
    p = c2 / N
    p1 = c12 / c1
    p2 = (c2 - c12) / (N - c1)
    threshold = 1e-100 # to not to calculate log(0), threshold is used
    #likelihood of h1(independence), using binomial distribution
    L1_1 = np.log(max(binom.pmf(c12, c1, p), threshold))
    L1_2 = np.log(max(binom.pmf(c2 - c12, N - c1, p), threshold))
    #likelihood of h2(dependence), using binomial distribution
    L2_1 = np.log(max(binom.pmf(c12, c1, p1), threshold))
    L2_2 = np.log(max(binom.pmf(c2 - c12, N - c1, p2), threshold))
    #likelihood ratio score
    LR = -2 * (L1_1 + L1_2 - L2_1 - L2_2)
    return LR

In [31]:
def dict_to_df(collocations_dict, window_size):
    bigram_counts = collocations_dict["bigram_counts"]
    word_counts = collocations_dict["word_counts"]
    candidate_collocations = collocations_dict["candidate_collocations"]

    data = []

    for bigram in candidate_collocations:
        w1, w2 = bigram
        c_w1w2 = bigram_counts.get(bigram, 0)
        c_w1 = word_counts["word1"].get(w1, 1)  # Avoid division by zero
        c_w2 = word_counts["word2"].get(w2, 1)
        t_score = t_test_score(bigram, bigram_counts, word_counts, N*window_size)
        chi_score = chi_square_score(bigram, bigram_counts, word_counts,  N*window_size)
        g_score = likelihood_ratio_score(bigram, bigram_counts, word_counts, N*window_size)

        # Append data as a row in the list
        data.append([w1+ ' '+ w2, t_score, chi_score, g_score, c_w1w2, c_w1, c_w2])

    # Create a DataFrame
    df = pd.DataFrame(data, columns=["w1w2",  "t-score", "chi-squared", "likelihood", "c(w1w2)", "c(w1)", "c(w2)"])
    
    return df

In [None]:
# create df for both window sizes
df_window_1 = dict_to_df(collocations_dict[1], 1)
df_window_3 = dict_to_df(collocations_dict[3], 3)

In [36]:
# window size: 1, sort by t-score
df_window_1_t_sorted = df_window_1.sort_values(by="t-score", ascending=False, ignore_index=True).reset_index(drop=True)
df_window_1_t_sorted.index += 1
print("Window size 1, sorted by t-score:")
display(df_window_1_t_sorted.head(20).drop(columns=["chi-squared", "likelihood"]))


Window size 1, sorted by t-score:


Unnamed: 0,w1w2,t-score,c(w1w2),c(w1),c(w2)
1,sir thomas,15.531729,242,785,335
2,miss crawford,14.550244,214,1282,615
3,great deal,12.262715,151,1005,214
4,young man,12.195642,150,685,636
5,young lady,11.528192,135,685,1057
6,mr elliot,11.471638,132,479,288
7,lady bertram,10.916508,120,1057,270
8,lady russell,10.841948,118,1057,147
9,captain wentworth,10.713802,115,341,216
10,sir walter,10.52057,111,785,139


In [38]:
# window size: 1, sort by chi-square-score
df_window_1_chi_sorted = df_window_1.sort_values(by="chi-squared", ascending=False, ignore_index=True).reset_index(drop=True)
df_window_1_chi_sorted.index += 1
print("Window size 1, sorted by chi-square score:")
display(df_window_1_chi_sorted.head(20).drop(columns=["t-score", "likelihood"]))

Window size 1, sorted by chi-square score:


Unnamed: 0,w1w2,chi-squared,c(w1w2),c(w1),c(w2)
1,thornton lacey,469979.208427,16,22,17
2,sir thomas,152673.775053,242,785,335
3,colonel brandon,143228.466218,88,265,140
4,captain wentworth,123152.862102,115,341,216
5,mrs clay,102021.953523,35,123,67
6,mr elliot,86556.014982,132,479,288
7,sir walter,77413.550446,111,785,139
8,great deal,72620.14489,151,1005,214
9,kellynch hall,64625.631212,20,72,59
10,lady russell,61401.870217,118,1057,147


In [39]:
# window size: 1, sort by likelihood ratio score
df_window_1_g_sorted = df_window_1.sort_values(by="likelihood", ascending=False, ignore_index=True).reset_index(drop=True)
df_window_1_g_sorted.index += 1
print("Window size 1, sorted by likelihood ratio score:")
display(df_window_1_g_sorted.head(20).drop(columns=["t-score", "chi-squared"]))

Window size 1, sorted by likelihood ratio score:


Unnamed: 0,w1w2,likelihood,c(w1w2),c(w1),c(w2)
1,sir thomas,698.725146,242,785,335
2,great deal,601.336763,151,1005,214
3,lady russell,595.540466,118,1057,147
4,sir walter,586.157584,111,785,139
5,lady middleton,541.401094,86,1057,119
6,miss crawford,537.779431,214,1282,615
7,captain wentworth,530.697003,115,341,216
8,colonel brandon,527.545064,88,265,140
9,mr elliot,526.669955,132,479,288
10,lady bertram,517.326507,120,1057,270


In [42]:
# window size: 3, sort by t-score
df_window_3_t_sorted = df_window_3.sort_values(by="t-score", ascending=False, ignore_index=True).reset_index(drop=True)
df_window_3_t_sorted.index += 1
print("Window size 3, sorted by t-score:")
display(df_window_3_t_sorted.head(20).drop(columns=["chi-squared", "likelihood"]))


Window size 3, sorted by t-score:


Unnamed: 0,w1w2,t-score,c(w1w2),c(w1),c(w2)
1,sir thomas,15.482489,242,2355,1005
2,miss crawford,14.393254,214,3846,1845
3,great deal,12.211735,151,3015,642
4,young man,12.133301,151,2055,1908
5,mr elliot,11.436663,132,1437,864
6,young lady,11.346677,135,2055,3171
7,lady bertram,10.840622,120,3171,810
8,captain wentworth,10.833154,118,1023,648
9,lady russell,10.800284,118,3171,441
10,sir walter,10.490403,111,2355,417


In [43]:
# window size: 3, sort by chi-square-score
df_window_3_chi_sorted = df_window_3.sort_values(by="chi-squared", ascending=False, ignore_index=True).reset_index(drop=True)
df_window_3_chi_sorted.index += 1
print("Window size 3, sorted by chi-square score:")
display(df_window_3_chi_sorted.head(20).drop(columns=["t-score", "likelihood"]))

Window size 3, sorted by chi-square score:


Unnamed: 0,w1w2,chi-squared,c(w1w2),c(w1),c(w2)
1,thornton lacey,156638.40305,16,66,51
2,o clock,105236.109009,32,126,159
3,sir thomas,50569.087679,242,2355,1005
4,colonel brandon,47625.563672,88,795,420
5,captain wentworth,43065.556516,118,1023,648
6,mrs clay,33960.670273,35,369,201
7,lover vow,29175.920009,10,168,42
8,mr elliot,28676.344594,132,1437,864
9,sir walter,25656.741762,111,2355,417
10,great deal,24005.860413,151,3015,642


In [44]:
# window size: 3, sort by likelihood ratio score
df_window_3_g_sorted = df_window_3.sort_values(by="likelihood", ascending=False, ignore_index=True).reset_index(drop=True)
df_window_3_g_sorted.index += 1
print("Window size 3, sorted by likelihood ratio score:")
display(df_window_3_g_sorted.head(20).drop(columns=["t-score", "chi-squared"]))

Window size 3, sorted by likelihood ratio score:


Unnamed: 0,w1w2,likelihood,c(w1w2),c(w1),c(w2)
1,sir thomas,516.384006,242,2355,1005
2,great deal,491.964523,151,3015,642
3,lady russell,488.429531,118,3171,441
4,sir walter,486.354231,111,2355,417
5,miss crawford,478.438999,214,3846,1845
6,captain wentworth,476.844724,118,1023,648
7,lady middleton,476.605369,86,3171,357
8,mr elliot,474.996134,132,1437,864
9,colonel brandon,474.133557,88,795,420
10,lady bertram,472.295012,120,3171,810


### Part 3: Explaining the Statistical Tests

In [45]:
# print values for good wish with window size 1 
good_wish = ("good", "wish")
good_wish_row = df_window_1[df_window_1["w1w2"] == "good wish"]
display(good_wish_row)


# print values for high spirit with window size 1
high_spirit = ("high", "spirit")
high_spirit_row = df_window_1[df_window_1["w1w2"] == "high spirit"]
display(high_spirit_row)


Unnamed: 0,w1w2,t-score,chi-squared,likelihood,c(w1w2),c(w1),c(w2)
87,good wish,3.033072,107.87658,34.25975,11,1198,539


Unnamed: 0,w1w2,t-score,chi-squared,likelihood,c(w1w2),c(w1),c(w2)
86,high spirit,3.979248,3054.456318,138.887736,16,161,354
