In [None]:
import pandas as pd
import numpy as np
import math
import random
import re
import matplotlib.pyplot as plt

# Processing the First Round of Lautonomy's Data

In [None]:
# Collect (word, first-appearance date) pairs from every other line of the file.
# A line contributes only when it contains the "first_appearance=" marker:
#   - the word is everything before the character that precedes the marker,
#   - the date is the 10 characters that follow the marker.
og_words, years = [], []
with open("/content/years.txt") as f:
    count = 0
    for count, line in enumerate(f, start=1):
        if count % 2 == 0:
            # count is 1-based here, so even values are the lines to skip
            continue
        match = re.search("first_appearance=", line)
        if not match:
            continue
        date_start = match.end()
        years.append(line[date_start:date_start + 10])
        og_words.append(line[0:match.start() - 1])
    # total number of lines read
    print(count)

33504


In [None]:
# Preview the parsed words next to their first-appearance dates:
# build a two-row frame (one row per list) and transpose it into two columns.
new_df = [og_words, years]
pd.DataFrame(new_df, index=['words', 'years']).T

Unnamed: 0,words,years
0,the,2020-12-10
1,and,2020-12-10
2,of,2020-12-10
3,to,2020-12-10
4,in,2020-12-10
...,...,...
14562,a7073add1,2020-12-10
14563,a7073,2020-12-10
14564,a7036,2021-12-16
14565,a7030,2021-12-24


In [None]:
# Read the full data file and keep every other line (starting with the first),
# stripped of trailing whitespace.
with open("/content/full_data") as f:
    lines = f.readlines()
count = len(lines)
words = [text.rstrip() for text in lines[::2]]

In [None]:
# Preview the full word list next to the parsed dates. The two lists are paired
# purely by position; the recorded output shows blank years for trailing words.
new_df = [words, years]
pd.DataFrame(new_df, index=['words', 'years']).T

Unnamed: 0,words,years
0,[UNK],2020-12-10
1,the,2020-12-10
2,and,2020-12-10
3,of,2020-12-10
4,to,2020-12-10
...,...,...
18792,10b2,
18793,108th,
18794,104th,
18795,102nd,


In [None]:
# Keep only the entries of `words` that also appear in `og_words`, recording for
# each kept word its date (from `years`) and its position in `words`.
#
# A dict replaces the repeated `word in og_words` / `og_words.index(word)` list
# scans. setdefault keeps the FIRST position of a repeated word, which is what
# list.index() returned.
first_position = {}
for og_index, og_word in enumerate(og_words):
    first_position.setdefault(og_word, og_index)

new_words = []
new_years = []
poly_index = []
for index, word in enumerate(words):
    og_index = first_position.get(word)
    if og_index is None:
        # word has no first-appearance entry
        continue
    new_years.append(years[og_index])
    new_words.append(word)
    poly_index.append(index)

In [None]:
# Assemble the matched words, their dates and their positions in `words`
# into a three-column frame (one row per matched word).
new_df = [new_words, new_years, poly_index]
df = pd.DataFrame(new_df, index=['words', 'years', 'poly_index']).T
df

Unnamed: 0,words,years,poly_index
0,the,2020-12-10,1
1,and,2020-12-10,2
2,of,2020-12-10,3
3,to,2020-12-10,4
4,in,2020-12-10,5
...,...,...,...
12850,a7027,2020-12-07,18600
12851,a65201,2021-12-06,18630
12852,a64295,2020-12-16,18638
12853,a63182,2021-12-24,18641


In [None]:
'''
Making the new meta-main.txt file: one "<poly_index> <word>" line per matched word
'''
# Open in write mode ('w'), not append: with 'a' every re-run of this cell
# added a second copy of all the lines to the existing file.
with open('meta-main-2.txt', 'w') as the_file:
    for word_position, word in zip(poly_index, new_words):
        the_file.write(str(word_position) + ' ' + str(word) + '\n')

In [None]:
# Count how many rows share each distinct value of the 'years' column.
# NOTE(review): the recorded output lists decimal years, so this cell appears to
# have last been run after the date-to-decimal conversion cell — confirm the order.
df['years'].value_counts()

2020.9    7333
2021.0    2039
2021.9     879
2022.9     675
2022.0     410
2021.5     212
2021.7     188
2021.3     183
2023.0     172
2021.6     110
2021.2      91
2022.4      64
2022.5      61
2021.4      58
2022.2      55
2020.7      42
2023.3      41
2021.1      33
2020.8      30
2022.7      27
2022.1      27
2023.1      25
2022.8      24
2022.6      22
2023.4      19
2021.8      18
2023.2      10
2022.3       7
Name: years, dtype: int64

In [None]:
'''
Change the YEAR-MONTH-DAY format to a decimal year rounded to one place.
Every month is approximated as 30 days and the year as 365 days
(e.g., 2020-07-01 -> 2020 + 181/365 -> 2020.5)
'''
for i in df.index:
    year = df.at[i, 'years']
    if not isinstance(year, str):
        # Already converted: slicing a float would raise TypeError, so skip it.
        # This keeps the cell safe to re-run.
        continue
    # characters 5-6 are the month, 8-9 the day, 0-3 the year
    fraction = ((int(year[5:7]) - 1)*30 + int(year[8:10]))/365
    total = int(year[0:4]) + fraction
    df.at[i, 'years'] = round(total, 1)

In [None]:
# Distinct values of the 'years' column in ascending order.
float_years = df['years'].unique()
float_years = np.sort(float_years)
float_years

array([2020.7, 2020.8, 2020.9, 2021.0, 2021.1, 2021.2, 2021.3, 2021.4,
       2021.5, 2021.6, 2021.7, 2021.8, 2021.9, 2022.0, 2022.1, 2022.2,
       2022.3, 2022.4, 2022.5, 2022.6, 2022.7, 2022.8, 2022.9, 2023.0,
       2023.1, 2023.2, 2023.3, 2023.4], dtype=object)

In [None]:
# Map each distinct year value to its rank in the sorted `float_years` array.
# Enumerating the array (not a hard-coded range(0, 28)) keeps the mapping
# correct when the data contains more or fewer distinct years.
dict_float_years = {year_value: position for position, year_value in enumerate(float_years)}
dict_float_years

In [None]:
# Add a 'documents' column holding the rank of each row's year value, then
# rename the word/index columns.
df = (
    df.assign(documents=df['years'].map(dict_float_years))
      .rename(columns={'poly_index': 'polyglot_index', 'words': 'word'})
)
df

Unnamed: 0,word,years,polyglot_index,documents
0,the,2020.9,1,2
1,and,2020.9,2,2
2,of,2020.9,3,2
3,to,2020.9,4,2
4,in,2020.9,5,2
...,...,...,...,...
12850,a7027,2020.9,18600,2
12851,a65201,2021.9,18630,12
12852,a64295,2020.9,18638,2
12853,a63182,2022.0,18641,13


# Implementing the Doc Metric

In [None]:
'''
Load the list of anchor points (index/word pairs) used for the MCPM metric
'''
# can use any correctly formatted "meta-main.txt" file here:
# space-separated, no header row, two fields per line
data = pd.read_csv(
    '/content/meta-main.txt',
    sep=" ",
    header=None,
    index_col=None,
)
data.columns = ["polyglot_index", "word"]
# plain Python list of the words, in file order
words_list = data["word"].tolist()
data

Unnamed: 0,polyglot_index,word
0,0,also_ADV
1,1,one_NUM
2,2,first_ADJ
3,7,use_VERB
4,8,time_NOUN
...,...,...
564,276937,Gmina::Zgierz_PROPN
565,283434,95.00_NUM
566,287247,home_ADJ
567,290653,last_NOUN


In [None]:
'''
SYNTHETIC Generate a list of documents and associate each year with some of them
'''
# initialize document indexes
num_docs = 101
doc_indexes = np.arange(0, num_docs)

# years covered by the synthetic corpus (end_year is exclusive)
start_year = 1980
end_year = 2001
years = np.arange(start_year, end_year)

# A fixed seed makes the synthetic assignment reproducible across kernel restarts.
docs_per_year = 20
rng = np.random.default_rng(0)

# assign docs to years: each year receives `docs_per_year` distinct document indexes
year_docs = {
    year_key: rng.choice(doc_indexes, size=docs_per_year, replace=False).tolist()
    for year_key in years
}

In [None]:
# Display the year -> list-of-document-indexes mapping.
year_docs

In [None]:
# Display the year values as a plain Python list.
years.tolist()

In [None]:
# Extend the anchor table with one (initially empty) column per year.
full_data = data.reindex(columns=[*data.columns.tolist(), *years.tolist()])

In [None]:
# Display the anchor table with its per-year columns (all empty in the recorded output).
full_data

Unnamed: 0,polyglot_index,word,1980,1981,1982,1983,1984,1985,1986,1987,...,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000
0,0,also_ADV,,,,,,,,,...,,,,,,,,,,
1,1,one_NUM,,,,,,,,,...,,,,,,,,,,
2,2,first_ADJ,,,,,,,,,...,,,,,,,,,,
3,7,use_VERB,,,,,,,,,...,,,,,,,,,,
4,8,time_NOUN,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,276937,Gmina::Zgierz_PROPN,,,,,,,,,...,,,,,,,,,,
565,283434,95.00_NUM,,,,,,,,,...,,,,,,,,,,
566,287247,home_ADJ,,,,,,,,,...,,,,,,,,,,
567,290653,last_NOUN,,,,,,,,,...,,,,,,,,,,


In [None]:
# assign words to docs (allow for word to exist in same doc multiple times)
# For every row of `data`, build a comma-separated string of sampled words and
# store it in that row's 'documents' cell.
num_vocab = len(data)
for row_label in data.index:
    # each document can have between 50 and 100 words
    num_words = random.randint(50, 100)
    doc_words = []
    for j in range(1, num_words+1):
        # randrange excludes its upper bound, so the position is always valid
        # for iloc; randint(0, 568) could return one past the last row.
        rank_j_word = random.randrange(num_vocab)
        sampled_word = data.iloc[rank_j_word]["word"]
        # the j-th draw is repeated round(num_words / j) + 1 times
        freq = (1/j) * num_words
        doc_words.extend([sampled_word] * (round(freq) + 1))
    # The repetition loop used to reuse the name `i`, overwriting the row label,
    # so every write landed on the same row. `row_label` is never rebound.
    data.at[row_label, 'documents'] = ','.join(str(x) for x in doc_words)

data
'''
Zipf's Law: https://en.wikipedia.org/wiki/Zipf%27s_law#Occurrences
replace the uniform random assignment of documents with a power law distribution (match frequencies)
'''

"\nZipf's Law: https://en.wikipedia.org/wiki/Zipf%27s_law#Occurrences\nreplace the uniform random assignment of documents with a power law distribution (match frequencies)\n"

In [None]:
# assign words to docs (allow for word to exist in same doc multiple times)

# cache the number of words in each document
# NOTE(review): the cached value is the number of draws (50-100), not
# len(doc_words); each draw is repeated, so the stored list is longer. Confirm
# which of the two the tf denominator should use.
num_words_per_doc = {}
full_doc_words = []
num_vocab = len(data)
for document in range(0, num_docs):
    # each document can have between 50 and 100 words
    num_words = random.randint(50, 100)
    num_words_per_doc[document] = num_words
    doc_words = []
    for j in range(1, num_words+1):
        # randrange excludes its upper bound, so the position is always valid
        # for iloc; randint(0, 568) could return one past the last row.
        rank_j_word = random.randrange(num_vocab)
        sampled_word = data.iloc[rank_j_word]["word"]
        # the j-th draw is repeated round(num_words / j) + 1 times
        freq = (1/j) * num_words
        doc_words.extend([sampled_word] * (round(freq) + 1))
    full_doc_words.append(doc_words)
full_doc_words

In [None]:
# Display the per-document word-count cache.
num_words_per_doc

In [None]:
# Display the year -> list-of-document-indexes mapping.
year_docs

In [None]:
# Display the anchor table.
# NOTE(review): the recorded output shows comma-separated document numbers in
# 'documents', which differs from what the current cells write — confirm which
# version of the column the later cells expect.
data

Unnamed: 0,polyglot_index,word,documents
0,0,also_ADV,"63,100,95,86,62,14,16,48,25,29,79,78,24,81,74,..."
1,1,one_NUM,"68,20,84,3,58,31,5,56,62,76,94,81,60,46,79,7,6..."
2,2,first_ADJ,"79,29,19,24,83,61,37,97,59,34,56,83,3,44,16,19..."
3,7,use_VERB,"81,95,55,30,83,39,28,6,61,20,4,13,38,28,79,68,..."
4,8,time_NOUN,"39,1,93,8,55,68,17,67,93,2,92,96,67,50,64,71,7..."
...,...,...,...
564,276937,Gmina::Zgierz_PROPN,"75,63,81,81,98,97,2,99,16,95,25,16,96,89,64,21..."
565,283434,95.00_NUM,"68,8,32,36,50,69,5,2,6,65,64,86,7,2,43,55,31,8..."
566,287247,home_ADJ,"16,42,28,38,95,43,77,48,83,87,86,43,90,94,27,3..."
567,290653,last_NOUN,"81,94,62,75,53,75,13,40,33,53,97,1,40,47,81,70..."


In [None]:
# Display the year values as a plain Python list.
years.tolist()

In [None]:
# Copy of the anchor table with one extra (initially empty) column per year;
# the per-year scores are accumulated into these columns.
df2 = data.reindex(columns=[*data.columns.tolist(), *years.tolist()])
df2

Unnamed: 0,polyglot_index,word,documents,1980,1981,1982,1983,1984,1985,1986,...,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000
0,0,also_ADV,,,,,,,,,...,,,,,,,,,,
1,1,one_NUM,"cut_VERB,cut_VERB,cut_VERB,cut_VERB,cut_VERB,c...",,,,,,,,...,,,,,,,,,,
2,2,first_ADJ,,,,,,,,,...,,,,,,,,,,
3,7,use_VERB,,,,,,,,,...,,,,,,,,,,
4,8,time_NOUN,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,276937,Gmina::Zgierz_PROPN,,,,,,,,,...,,,,,,,,,,
565,283434,95.00_NUM,,,,,,,,,...,,,,,,,,,,
566,287247,home_ADJ,,,,,,,,,...,,,,,,,,,,
567,290653,last_NOUN,,,,,,,,,...,,,,,,,,,,


In [None]:
'''
!!TEMP!!
Caches used by the per-document tf-idf scores.
tf-idf = (frequency of w in d)/(number of words in d) * log(size of corpus * number of documents containing w)
'''
# TEMP FIX
num_docs = 28
# first we cache the number of documents each word is in
word_doc_freq = dict.fromkeys(words_list, 0)
for i in data.index:
    #word_doc_freq[words_list[i]] = len(data.at[i, "documents"].split(','))
    # TEMP FIX FOR SINGLE YEAR
    word_doc_freq[words_list[i]] = 1

# then for each document, we cache its total number of words:
# the number of rows whose 'documents' value equals the document index
doc_total_words = [
    sum(1 for j in data.index if i == data.at[j, "documents"])
    for i in range(0, num_docs)
]


In [None]:
'''
!! TEMP !!
Per-document list of "<polyglot_index> <scaled score>" strings
'''
scores = []
for doc_id in range(0, num_docs):
    words_in_doc = doc_total_words[doc_id]
    doc_scores = []
    for row_label in data.index:
        if doc_id == data.at[row_label, "documents"]:
            polyglot_index = data.at[row_label, 'polyglot_index']
            word_freq = 1
            numerator = word_freq / words_in_doc
            denominator = np.log(num_docs * 1)
            if_idf = numerator/denominator
            # scale so viz is better later
            doc_scores.append(' '.join([str(polyglot_index), str(if_idf * 10000000)]))
    scores.append(doc_scores)

In [None]:
'''
Caches used by the per-document tf-idf scores.
tf-idf = (frequency of w in d)/(number of words in d) * log(size of corpus * number of documents containing w)
'''
# Parse every row's comma-separated document list once, as a set of ints.
# The list used to be re-split and re-converted for every document index.
doc_ids_per_row = [
    set(map(int, data.at[i, "documents"].split(',')))
    for i in data.index
]

# first we cache the number of documents each word is in
# A document may be listed several times for the same word, so count DISTINCT
# document ids; the raw list length counted occurrences instead.
word_doc_freq = dict((word,0) for word in words_list)
for i, doc_ids in zip(data.index, doc_ids_per_row):
    word_doc_freq[words_list[i]] = len(doc_ids)

# then for each document, we cache its total number of words
# (the number of rows whose document list contains that document index)
doc_total_words = [
    sum(1 for doc_ids in doc_ids_per_row if i in doc_ids)
    for i in range(0, num_docs)
]


In [None]:
'''
Caches used by the per-document tf-idf scores.
tf-idf = (frequency of w in d)/(number of words in d) * log(size of corpus * number of documents containing w)
'''
# first we cache the number of documents each word is in
# Each document's word list is converted to a set once, so a membership test no
# longer scans the whole list for every word.
doc_vocabularies = [set(doc) for doc in full_doc_words]
num_docs_word_in = {}
for word in data['word']:
    num_docs_word_in[word] = sum(1 for vocabulary in doc_vocabularies if word in vocabulary)

# then for each document, we cache its total number of words
num_words_per_doc


In [None]:
# Display the word -> number-of-documents-containing-it cache.
num_docs_word_in

In [None]:
# For every year, add the score of each word of each document assigned to that
# year into the (word, year) cell of df2.
#
# The corpus size is taken from full_doc_words itself: `num_docs` is reassigned
# by the TEMP cell (to 28) and may no longer match the generated documents.
corpus_size = len(full_doc_words)
for year in year_docs:
    for doc in year_docs[year]:
        doc_words = full_doc_words[doc]
        num_words_in_doc = num_words_per_doc[doc]
        for word in set(doc_words):
            num_times_word_in_doc = doc_words.count(word)
            num_docs_tword_in = num_docs_word_in[word]
            # NOTE(review): the textbook idf is log(N / df). This multiplies tf by
            # log(N * df), while the earlier scores cell divides by it — confirm
            # which weighting is intended.
            if_idf = (num_times_word_in_doc/num_words_in_doc) * math.log(corpus_size * num_docs_tword_in)
            # compute if-df of word, add to (word, year) index
            # The year columns start out as NaN and NaN + x is NaN, so treat an
            # empty cell as 0 before adding. Cells never touched stay NaN.
            word_rows = df2['word'] == word
            df2.loc[word_rows, year] = df2.loc[word_rows, year].fillna(0) + if_idf
df2

Unnamed: 0,polyglot_index,word,1980,1981,1982,1983,1984,1985,1986,1987,...,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000
0,0,also_ADV,0.329630,0.700369,0.549225,0.370740,,0.612396,,0.507189,...,0.594536,,0.208541,,,,0.491828,0.753297,0.224583,0.576945
1,1,one_NUM,1.708506,0.152566,1.035966,2.006989,2.390436,0.313515,0.236231,0.777835,...,1.718311,1.114357,0.467489,1.917973,0.559791,0.943615,1.006598,2.407786,1.204129,1.331086
2,2,first_ADJ,,0.141677,0.401552,0.510783,0.226226,0.712215,1.185738,0.848944,...,1.219274,,1.532507,,0.533819,0.731481,0.615177,0.988076,1.185715,0.826531
3,7,use_VERB,,0.432350,0.259741,0.892566,1.250049,1.693732,0.682262,1.374813,...,0.791882,1.221724,0.212516,0.628352,1.068935,1.298411,,0.212516,0.655062,1.290055
4,8,time_NOUN,0.909171,1.392043,7.963866,10.283895,16.340714,4.044943,2.601904,0.992965,...,4.056823,13.016296,0.319383,1.111734,8.751168,12.315678,4.467207,8.467009,11.743801,4.876814
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,276937,Gmina::Zgierz_PROPN,0.683102,,0.403411,0.947554,0.733549,0.761194,0.761194,0.901658,...,0.938036,0.620453,1.921349,0.582925,0.516366,0.674941,0.982566,0.244829,1.209134,1.510995
565,283434,95.00_NUM,0.916133,1.854516,0.796163,,1.258501,0.740789,1.032571,0.291782,...,0.740789,1.599851,1.339311,0.794310,,0.684439,1.147655,1.147655,0.338097,1.171807
566,287247,home_ADJ,1.495043,2.163692,0.340717,1.755443,0.183909,1.921303,2.155376,0.945365,...,1.323713,2.190708,2.270237,1.326404,4.066789,0.215894,1.331985,0.958525,1.835894,2.438378
567,290653,last_NOUN,0.482606,1.038944,0.560528,0.253572,0.407921,1.236722,6.279406,0.194521,...,3.130011,0.617085,2.909326,3.890172,0.588545,,0.596382,3.513392,3.636599,0.156045


In [None]:
## ASK ##
# Preview only (result is not stored): fill gaps along each row by linear
# interpolation between neighbouring year columns, filling forward only.
df2.drop(columns=['polyglot_index', 'word']).interpolate(method='linear', axis=1, limit_direction='forward')

Unnamed: 0,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000
0,0.329630,0.700369,0.549225,0.370740,0.491568,0.612396,0.559792,0.507189,0.317025,0.567489,...,0.594536,0.401538,0.208541,0.279363,0.350185,0.421007,0.491828,0.753297,0.224583,0.576945
1,1.708506,0.152566,1.035966,2.006989,2.390436,0.313515,0.236231,0.777835,0.249654,0.796023,...,1.718311,1.114357,0.467489,1.917973,0.559791,0.943615,1.006598,2.407786,1.204129,1.331086
2,,0.141677,0.401552,0.510783,0.226226,0.712215,1.185738,0.848944,0.988076,0.567467,...,1.219274,1.375890,1.532507,1.033163,0.533819,0.731481,0.615177,0.988076,1.185715,0.826531
3,,0.432350,0.259741,0.892566,1.250049,1.693732,0.682262,1.374813,0.961145,0.951993,...,0.791882,1.221724,0.212516,0.628352,1.068935,1.298411,0.755464,0.212516,0.655062,1.290055
4,0.909171,1.392043,7.963866,10.283895,16.340714,4.044943,2.601904,0.992965,23.842928,4.738441,...,4.056823,13.016296,0.319383,1.111734,8.751168,12.315678,4.467207,8.467009,11.743801,4.876814
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,0.683102,0.543256,0.403411,0.947554,0.733549,0.761194,0.761194,0.901658,1.091991,0.674941,...,0.938036,0.620453,1.921349,0.582925,0.516366,0.674941,0.982566,0.244829,1.209134,1.510995
565,0.916133,1.854516,0.796163,1.027332,1.258501,0.740789,1.032571,0.291782,0.638125,0.577865,...,0.740789,1.599851,1.339311,0.794310,0.739375,0.684439,1.147655,1.147655,0.338097,1.171807
566,1.495043,2.163692,0.340717,1.755443,0.183909,1.921303,2.155376,0.945365,0.927631,1.592899,...,1.323713,2.190708,2.270237,1.326404,4.066789,0.215894,1.331985,0.958525,1.835894,2.438378
567,0.482606,1.038944,0.560528,0.253572,0.407921,1.236722,6.279406,0.194521,2.857959,4.623185,...,3.130011,0.617085,2.909326,3.890172,0.588545,0.592464,0.596382,3.513392,3.636599,0.156045


In [None]:
# Replace every missing value in df2 with 0, modifying df2 in place.
df2.fillna(0, inplace = True)

In [None]:
df2.to_csv('if_idf_scores_fill_with_zeros.csv', index = False)
!cp if_idf_scores_blanks.csv "drive/MyDrive/"

In [None]:
# Remove the 'documents' column, leaving the index/word columns and the year columns.
df2 = df2.drop(columns='documents')

In [None]:
# Display the per-year score table.
# NOTE(review): the recorded output is entirely empty, which does not match a run
# after the scoring/fillna cells — the cells were presumably run out of order.
df2

Unnamed: 0,polyglot_index,word,1980,1981,1982,1983,1984,1985,1986,1987,...,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000
0,0,also_ADV,,,,,,,,,...,,,,,,,,,,
1,1,one_NUM,,,,,,,,,...,,,,,,,,,,
2,2,first_ADJ,,,,,,,,,...,,,,,,,,,,
3,7,use_VERB,,,,,,,,,...,,,,,,,,,,
4,8,time_NOUN,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,276937,Gmina::Zgierz_PROPN,,,,,,,,,...,,,,,,,,,,
565,283434,95.00_NUM,,,,,,,,,...,,,,,,,,,,
566,287247,home_ADJ,,,,,,,,,...,,,,,,,,,,
567,290653,last_NOUN,,,,,,,,,...,,,,,,,,,,


In [None]:
# Build, for every document index, a list of "<polyglot_index> <scaled score>"
# strings covering the words whose 'documents' list contains that index.
#
# Each row's comma-separated document list is split and converted once here; it
# used to be re-parsed for every (document, word) pair.
parsed_rows = []
for j in data.index:
    word_docs = data.at[j, "documents"].split(',')
    parsed_rows.append((j, word_docs, set(map(int, word_docs))))

scores = []
for i in range(0, num_docs):
    temp_arr = []
    words_in_doc = doc_total_words[i]
    for j, word_docs, doc_ids in parsed_rows:
        if i not in doc_ids:
            continue
        polyglot_index = data.at[j,'polyglot_index']
        # how many times document i is listed for this word
        word_freq = word_docs.count(str(i))
        numerator = word_freq / words_in_doc
        denominator = np.log(num_docs * word_doc_freq[data.at[j, "word"]])
        if_idf = numerator/denominator
        temp_arr.append(str(polyglot_index) + ' ' + str(if_idf * 10000000)) # scale so viz is better later
    scores.append(temp_arr)

In [None]:
# Turn every "<index> <score>" string into an [index, score] pair of strings,
# updating each per-document list in place.
for doc_scores in scores:
    doc_scores[:] = [entry.split(' ') for entry in doc_scores]

In [None]:
# Order each document's [index, score] pairs from highest to lowest numeric score.
for doc_scores in scores:
    doc_scores.sort(key=lambda pair: float(pair[1]), reverse=True)

In [None]:
# Glue every [index, score] pair back into a single "<index> <score>" string,
# updating each per-document list in place.
for doc_scores in scores:
    doc_scores[:] = [' '.join(parts) for parts in doc_scores]

In [None]:
# Number of per-document score lists (101 in the recorded output).
len(scores)

101

In [None]:
import os
# Create the output folder and make it the working directory, so files opened
# with a bare relative name afterwards are created inside it.
!mkdir doc_anchor_files
os.chdir('doc_anchor_files')
# print the working directory to confirm the switch
!pwd

/content/doc_anchor_files


In [None]:
# Write one "<document index>.txt" file per score list, one entry per line.
# Iterate over `scores` itself, not range(num_docs): num_docs is reassigned
# elsewhere in the notebook and may not match len(scores), which would either
# skip documents or raise IndexError.
for i, doc_scores in enumerate(scores):
    with open(str(i)+'.txt', 'w') as f:
        f.write('\n'.join(doc_scores))

In [None]:
# Go back up to the parent directory and print it to confirm.
os.chdir('..')
!pwd

/content


In [None]:
# List the contents of the working directory.
!ls

doc_anchor_files  drive  full_data  meta-main-2.txt  sample_data  years.txt


In [None]:
# Recursively copy the generated per-document files to the Drive folder.
cp -r "/content/doc_anchor_files" "/content/drive/MyDrive/check_here"

In [None]:
# String form of the document list assigned to 1998.
# NOTE(review): the recorded output is a NameError, so year_docs was presumably
# not defined in the kernel session that produced it — confirm or remove this cell.
str(year_docs[1998])

NameError: ignored

In [None]:
# Display the decimal-year -> rank mapping.
dict_float_years

{2020.7: 0,
 2020.8: 1,
 2020.9: 2,
 2021.0: 3,
 2021.1: 4,
 2021.2: 5,
 2021.3: 6,
 2021.4: 7,
 2021.5: 8,
 2021.6: 9,
 2021.7: 10,
 2021.8: 11,
 2021.9: 12,
 2022.0: 13,
 2022.1: 14,
 2022.2: 15,
 2022.3: 16,
 2022.4: 17,
 2022.5: 18,
 2022.6: 19,
 2022.7: 20,
 2022.8: 21,
 2022.9: 22,
 2023.0: 23,
 2023.1: 24,
 2023.2: 25,
 2023.3: 26,
 2023.4: 27}

In [None]:
# Write the decimal-year -> rank mapping as "<year> <rank>" lines.
# NOTE(review): the first cell reads "/content/years.txt" and the recorded working
# directory is /content, so this relative path presumably overwrites that input
# file — confirm this is intended.
fout = "years.txt"

# `with` closes the file even if a write fails part-way through
with open(fout, "w") as fo:
    for k, v in dict_float_years.items():
        fo.write(str(k) + ' ' + str(v) + '\n')