In [1]:
%matplotlib inline
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
from matplotlib import pyplot as plt
from matplotlib_venn import venn2
import tqdm
import re
import zipfile
import json

## I. Gather the Preprocessed Data (+ Aggregate Lines)

### 1. ETCSL

In [61]:
#COPYING NIEK VELDHUIS
file = "alltexts.csv"
etcsl = pd.read_csv(file, keep_default_na=False)
etcsl = etcsl.loc[etcsl["lang"].str.contains("sux")]  # throw out non-Sumerian words

In [62]:
#COPYING NIEK VELDHUIS
etcsl["lemma"] = etcsl.apply(lambda r: (r["cf"] + '[' + r["gw"] + ']' + r["pos"]) 
                            if r["cf"] != '' else r['form'] + '[NA]NA', axis=1)
etcsl['lemma'] = [lemma if not lemma == '[NA]NA' else '' for lemma in etcsl['lemma'] ] 
# kick out empty forms
etcsl["lemma"] = etcsl["lemma"].str.lower()

In [63]:
etcsl_lines = etcsl.groupby([etcsl['id_text'], etcsl['id_line'], etcsl['text_name']]).agg({
        'lemma': ' '.join,
        'extent': ''.join
    }).reset_index()

In [64]:
etcsl_lines

Unnamed: 0,id_text,id_line,text_name,lemma,extent
0,c.0.1.1,1,Ur III catalogue from Nibru (N1),dubsaŋ[first]aj,
1,c.0.1.1,2,Ur III catalogue from Nibru (N1),enki[1]dn unu[dwelling]n gal[big]v/i ed[ascend...,
2,c.0.1.1,3,Ur III catalogue from Nibru (N1),anzag[horizon]n,
3,c.0.1.1,4,Ur III catalogue from Nibru (N1),anŋi[eclipse]n zu[know]v/t ama[mother]n tu[inc...,
4,c.0.1.1,5,Ur III catalogue from Nibru (N1),gi[thicket]n tuku[rock]v/t,
5,c.0.1.1,6,Ur III catalogue from Nibru (N1),an[na]na kaš₄[na]na an[na]na kaš₄[na]na me[bat...,
6,c.0.1.1,7,Ur III catalogue from Nibru (N1),mašmaš[sorcerer]n erim[enemy]n kur[different]v/i,
7,c.0.1.1,8,Ur III catalogue from Nibru (N1),ŋiriŋena[path]n enki[1]dn ki[place]n unu[dwell...,
8,c.0.1.1,9,Ur III catalogue from Nibru (N1),šag[heart]n pu₂[na]na 1-kam-ma[1st]nu,
9,c.0.1.1,10,Ur III catalogue from Nibru (N1),dubsaŋ[first]aj,


### 2. DCCLT

In [6]:
#COPYING NIEK VELDHUIS
file = "dcclt_parsed.csv"
lexical = pd.read_csv(file, keep_default_na=False)
lexical = lexical.loc[lexical["lang"].str.contains("sux")]

In [7]:
#COPYING NIEK VELDHUIS
lexical["lemma"] = lexical.apply(lambda r: (r["cf"] + '[' + r["gw"] + ']' + r["pos"]) 
                            if r["cf"] != '' else r['form'] + '[NA]NA', axis=1)
lexical['lemma'] = [lemma if not lemma == '[NA]NA' else '' for lemma in lexical['lemma'] ] 
# kick out empty forms
lexical["lemma"] = lexical["lemma"].str.lower()

In [8]:
#COPYING NIEK VELDHUIS
lexical = lexical[~lexical["field"].isin(["sg", "pr"])] # remove lemmas that derive from the fields "sign" 
# or "pronunciation" in sign lists.

In [9]:
lexical['id_text'] = lexical['id_text'].apply(lambda x: x.replace('dcclt/',''))

In [10]:
lexical_lines = lexical.groupby([lexical['id_text'], lexical['id_line']]).agg({
        'lemma': ' '.join,
        'extent': ''.join
    }).reset_index()

In [11]:
lexical_lines

Unnamed: 0,id_text,id_line,lemma,extent
0,P000723,4,x-bad[na]na,
1,P000723,5,x-|en+ib|[na]na,
2,P000723,7,gada-sukkal[na]na,
3,P000723,8,gal-ga[na]na,
4,P000723,9,tug₂-gara₂[na]na,
5,P000723,12,gal-ŋišgal[na]na,
6,P000723,13,gal-|ga₂×di.me|[na]na,
7,P000723,14,saŋŋa[na]na |ga₂×gar.me|[na]na,
8,P000723,15,x[na]na |ga₂×x|[na]na,
9,P000724,5,x-suhur[na]na,


### 3. OB Royal Inscriptions

In [12]:
#COPYING NIEK VELDHUIS
file = "ribo_babylon0_parsed.csv"
royal = pd.read_csv(file, keep_default_na=False)
royal = royal.loc[royal["lang"].str.contains("sux")]

#COPYING NIEK VELDHUIS
royal["lemma"] = royal.apply(lambda r: (r["cf"] + '[' + r["gw"] + ']' + r["pos"]) 
                            if r["cf"] != '' else r['form'] + '[NA]NA', axis=1)
royal['lemma'] = [lemma if not lemma == '[NA]NA' else '' for lemma in royal['lemma'] ] 
# kick out empty forms
royal["lemma"] = royal["lemma"].str.lower()

royal['id_text'] = royal['id_text'].apply(lambda x: x.replace('ribo-babylon0/',''))

In [13]:
royal_lines = royal.groupby([royal['id_text'], royal['id_line']]).agg({
        'lemma': ' '.join,
        'extent': ''.join
    }).reset_index()

In [14]:
royal_lines

Unnamed: 0,id_text,id_line,lemma,extent
0,Q000525,1,zi[life]n x[na]na,
1,Q000525,2,zi[life]n x[na]na,
2,Q000525,3,zi[life]n x[na]na,
3,Q000525,4,mu[name]n kug[pure]v/i x[na]na,
4,Q000525,5,til[life]n x[na]na,
5,Q000525,6,enki[1]dn an[1]dn x[na]na,
6,Q000525,7,nun[prince]n samsu-iluna[king-of-babylon]rn x[...,
7,Q000525,8,melim[splendor]n daŋal[wide]v/i x[na]na,
8,Q000525,9,guza[chair]n x[na]na enki[1]dn x[na]na,
9,Q000525,10,ŋidru[scepter]n mu[year]n ud[sun]n sud[distant...,


In [15]:
lexical = lexical[~lexical['lemma'].str.contains('\[na\]na')]
etcsl = etcsl[~etcsl['lemma'].str.contains('\[na\]na')]
royal = royal[~royal['lemma'].str.contains('\[na\]na')]

JASON STUFF

## II. Gather Text Info

Gather Text Information. Attributes of a text are:
1. id
2. name
3. period
4. \# of unique words

### 1. Get ETCSL Text List

In [16]:
etcsl_tl = pd.DataFrame(etcsl.groupby(['id_text','text_name']).agg({'lemma':'nunique'})).reset_index()
etcsl_tl['period'] = 'Old Babylonian'
etcsl_tl = etcsl_tl.set_index('id_text')
etcsl_tl.columns = ['designation','uniq_words','period']
#df_tl_nonlex

### 2. Get Lexical Text List

In [17]:
file = "dcclt.zip"
z = zipfile.ZipFile(file) 
st = z.read("dcclt/catalogue.json").decode("utf-8")
j = json.loads(st)
lexical_tl = pd.DataFrame(j["members"]).T
lexical_tl = lexical_tl[["id_text", "period","designation"]] #added designation

In [18]:
df_temp = lexical.groupby(lexical['id_text']).agg({'lemma':'nunique'})
df_temp.columns = ['uniq_words']

In [19]:
lexical_tl = pd.concat([lexical_tl,df_temp],axis=1)
lexical_tl['uniq_words'] = lexical_tl['uniq_words'].fillna(0.0)
lexical_tl

Unnamed: 0,id_text,period,designation,uniq_words
P000001,P000001,Uruk III,"W 06435,a",0.0
P000002,P000002,Uruk III,"W 06435,b",0.0
P000003,P000003,Uruk IV,"W 09123,d",0.0
P000004,P000004,Uruk IV,"W 09169,d",0.0
P000005,P000005,Uruk IV,"W 09206,k",0.0
P000006,P000006,Uruk IV,"W 09656,h1",0.0
P000007,P000007,Uruk IV,"W 09656,x",0.0
P000008,P000008,Uruk III,"W 11985,e",0.0
P000009,P000009,Uruk III,"W 11985,f",0.0
P000010,P000010,Uruk III,"W 11985,g",0.0


### 3. OB Royal Text Info

In [20]:
file = "ribo-babylon0.zip"
z = zipfile.ZipFile(file) 
st = z.read("ribo/babylon0/catalogue.json").decode("utf-8")
j = json.loads(st)
royal_tl = pd.DataFrame(j["members"]).T
royal_tl = royal_tl[["id_composite", "period","designation"]] #added designation
royal_tl.columns = ['id_text','period','designation']

In [21]:
df_temp = royal.groupby(royal['id_text']).agg({'lemma':'nunique'})
df_temp.columns = ['uniq_words']

In [22]:
royal_tl = pd.concat([royal_tl,df_temp],axis=1)
royal_tl['uniq_words'] = royal_tl['uniq_words'].fillna(0.0)
royal_tl

Unnamed: 0,id_text,period,designation,uniq_words
Q000518,Q000518,Old Babylonian,Hammu-rapi A,0.0
Q000519,Q000519,Old Babylonian,Hammu-rapi B,0.0
Q000520,Q000520,Old Babylonian,Hammu-rapi C,0.0
Q000521,Q000521,Old Babylonian,Hammu-rapi D,0.0
Q000522,Q000522,Old Babylonian,Hammu-rapi E,0.0
Q000523,Q000523,Old Babylonian,Hammu-rapi F,0.0
Q000524,Q000524,Old Babylonian,Samsu-iluna A,0.0
Q000525,Q000525,Old Babylonian,Samsu-iluna B,109.0
Q000526,Q000526,Old Babylonian,Samsu-iluna C,0.0
Q000527,Q000527,Old Babylonian,Samsu-iluna D,0.0


### III. Filter Functions

In [96]:
## Argument "period" should be a list

def filter_period(df_inst, df_tl, df_lines, period):
    cat_period = df_tl[df_tl["period"].isin(period)]
    keep = cat_period.index.values
    keep = [id_text for id_text in keep]
    df_inst = df_inst.loc[df_inst["id_text"].isin(keep)]
    df_tl = df_tl.loc[keep]
    df_lines = df_lines[df_lines['id_text'].isin(keep)]
    return df_inst, df_tl, df_lines

def filter_words_minmax(df_inst, df_tl, df_lines, min_words, max_words):
    df_tl = df_tl[(df_tl['uniq_words'] > min_words) & (df_tl['uniq_words'] < max_words)]
    df_inst = df_inst[df_inst['id_text'].isin(df_tl.index)]
    df_lines = df_lines[df_lines['id_text'].isin(df_tl.index)]
    return df_inst, df_tl, df_lines

def filter_genre(df_list,genre):
    return df_list[genre]

def filter_remove_list(df,text_list):
    return df[~df['cat_num'].isin(text_list)]

def filter_word_freqs(word_list,cat_words,min_freq,max_freq):
    return {x for x in word_list if cat_words.loc[x]['freq'] > min_freq and cat_words.loc[x]['freq'] < max_freq}

def filter_all(df_inst,df_tl,df_lines,args):
    if args['periods']:
        df_inst,df_tl,df_lines = filter_period(df_inst,df_tl,df_lines,args['periods'])
    if 'min_words' in args or 'max_words' in args:
        df_inst,df_tl,df_lines = filter_words_minmax(df_inst,df_tl,df_lines,args.get('min_words',0),args.get('max_words',2000))
    if 'qnums_only' in args:
        df_tl = df_tl[df_tl.index.str.startswith('Q')]
        df_inst = df_inst[df_inst['id_text'].isin(df_tl.index)]
        df_lines = df_lines[df_lines['id_text'].isin(df_tl.index)]
    return df_inst, df_tl, df_lines

In [97]:
def compare_single(lex_wl,lit_wl): #inputs are sets
    inter_wl = lex_wl.intersection(lit_wl)
    lex_comp = len(inter_wl) / len(lex_wl)
    lit_comp = len(inter_wl) / len(lit_wl)
    return (lex_comp,lit_comp,len(inter_wl))

In [98]:
#Arguments:
#
#    df_inst_lex    - The Dataframe of lexical word instances
#    df_inst_nonlex - The Dataframe for non-lexical word instances (etcsl, ob royal, etc.)
#    df_tl_lex      - The lexical text list to use
#    df_tl_nonlex   - The non-lexical text list to use
#

def get_comparison_df(df_inst_lex,df_inst_nonlex,df_tl_lex,df_tl_nonlex,df_lex_wl,df_nonlex_wl):
    comps = []
    for lex_t in df_tl_lex.index:
        lex_wl = df_lex_wl.loc[lex_t]['wordlist']     
        
        #print(lex_t + ': ' + str(len(lex_wl)))
        for lit_t in df_tl_nonlex.index: #Make sure to use the dictionary here
            lit_wl = df_nonlex_wl.loc[lit_t]['wordlist']
            #key = lex_t + '::' + lit_t
            lex_comp,lit_comp,match_num = compare_single(lex_wl,lit_wl)
            #lex_comps[key] = lex_comp
            #lit_comps[key] = lit_comp
            comps.append({'lit_t':lit_t,
                          'lit_name':df_tl_nonlex.loc[lit_t]['designation'],
                          'lex_t':lex_t,
                          'lex_name':df_tl_lex.loc[lex_t]['designation'],
                          'lex_comp':lex_comp,
                          'lit_comp':lit_comp,
                          'lit_uniq_words':len(lit_wl),
                          'lex_uniq_words':len(lex_wl),
                          'matches':match_num})
    df_comps = pd.DataFrame(comps)
    return df_comps

In [99]:
def multiword_connect(lex_tl,df_lines_lex,df_lines_nonlex):
    df_lines_onelex = df_lines_lex[df_lines_lex['id_text'] == lex_tl]
    
    lex_vocab = list(set(df_lines_onelex["lemma"]))
    lex_vocab = [lemma for lemma in lex_vocab if not "[na]na" in lemma]
    
    lex_vocab.sort(key=len, reverse=True)
    multiple_word = [entry for entry in lex_vocab if " " in entry]
    for item in tqdm.tqdm(multiple_word):
        findwhat = re.escape(item)
        replacewith = item.replace(" ", "*")
        df_lines_nonlex["lemma"] = df_lines_nonlex["lemma"].str.replace(findwhat, replacewith)
        
    lex_vocab = [lemma.replace(" ", "*") for lemma in lex_vocab]

In [100]:
def ngramify_line(s,n_max):
    ngram_list = []
    words = s.split()
    for n in range(n_max):
        for i in range(len(words)-n):
            ngram = ' '.join(words[i:i+n+1])
            if not '[na]na' in ngram:
                ngram_list.append(ngram)
    return ngram_list

In [101]:
def expand_ngram_df(df):
    d = []
    for i, row in df.iterrows():
        ngram_list = ngramify_line(row['lemma'],5)
        for ngram in ngram_list:
            row['ngram'] = ngram
            d.append(row.copy())
    return pd.DataFrame(d)

In [153]:
df_ngrams_nonlex

NameError: name 'df_ngrams_nonlex' is not defined

In [103]:
'''
df get_wordlists_multi(df_lex,df_lit):
        lex_vocab = list(set(lex_lines["lemma"]))
    lex_vocab = [lemma for lemma in lex_vocab if not "[na]na" in lemma ]
    
    lex_vocab.sort(key=len, reverse=True)
    multiple_word = [entry for entry in lex_vocab if " " in entry]
    for item in tqdm.tqdm(multiple_word):
        findwhat = re.escape(item)
        replacewith = item.replace(" ", "*")
        lit_lines["lemma"] = lit_lines["lemma"].str.replace(findwhat, replacewith)
        
    lex_vocab = [lemma.replace(" ", "*") for lemma in lex_vocab]
    
    lit_words2 = ' '.join(lit_lines["lemma"])
    lit_words_s2 = set(lit_words2.split())
    lit_words_s2 = {lemma for lemma in lit_words_s2 if not '[na]na' in lemma}
    lex_words_s2 = set(lex_vocab)
    return lex_words_s2
'''

'\ndf get_wordlists_multi(df_lex,df_lit):\n        lex_vocab = list(set(lex_lines["lemma"]))\n    lex_vocab = [lemma for lemma in lex_vocab if not "[na]na" in lemma ]\n    \n    lex_vocab.sort(key=len, reverse=True)\n    multiple_word = [entry for entry in lex_vocab if " " in entry]\n    for item in tqdm.tqdm(multiple_word):\n        findwhat = re.escape(item)\n        replacewith = item.replace(" ", "*")\n        lit_lines["lemma"] = lit_lines["lemma"].str.replace(findwhat, replacewith)\n        \n    lex_vocab = [lemma.replace(" ", "*") for lemma in lex_vocab]\n    \n    lit_words2 = \' \'.join(lit_lines["lemma"])\n    lit_words_s2 = set(lit_words2.split())\n    lit_words_s2 = {lemma for lemma in lit_words_s2 if not \'[na]na\' in lemma}\n    lex_words_s2 = set(lex_vocab)\n    return lex_words_s2\n'

In [117]:
tl_asone = etcsl_tl.copy()
tl_asone = 
tl_asone

Unnamed: 0,designation,period,uniq_words
ALL,ALL,ALL,0


In [143]:
def compare_words(lex_args,nonlex_args,word_args):
    df_inst_lex = lexical
    df_tl_lex   = lexical_tl
    df_lines_lex = lexical_lines
    if nonlex_args['corpus'] == 'etcsl':
        df_inst_nonlex = etcsl
        df_tl_nonlex   = etcsl_tl
        df_lines_nonlex = etcsl_lines
    elif nonlex_args['corpus'] == 'royal':
        df_inst_nonlex = royal
        df_tl_nonlex   = royal_tl
        df_lines_nonlex = royal_lines
    
    df_inst_lex_filter, df_tl_lex_filter, df_lines_lex_filter = filter_all(df_inst_lex,df_tl_lex,df_lines_lex,lex_args)
    df_inst_nonlex_filter, df_tl_nonlex_filter, df_lines_nonlex_filter = filter_all(df_inst_nonlex,df_tl_nonlex,df_lines_nonlex,nonlex_args)
    
    #df_inst_lex_filter.to_csv('temp1.csv',sep='\t',encoding='utf-16')
    #df_tl_lex_filter.to_csv('temp2.csv',sep='\t',encoding='utf-16')
    #df_inst_nonlex_filter.to_csv('temp1.csv',sep='\t',encoding='utf-16')
    #df_tl_nonlex_filter.to_csv('temp2.csv',sep='\t',encoding='utf-16')

    #LEX AS ONE
    df_inst_lex_asone = df_inst_lex_filter.copy()
    df_inst_lex_asone['id_text'] = 'ALL_LEX'
    df_tl_lex_asone = pd.DataFrame([{'designation':'ALL_LEX','uniq_words':0,'period':'ALL'}],index=['ALL_LEX'])
    
    #NONLEX AS ONE
    df_inst_nonlex_asone = df_inst_nonlex_filter.copy()
    df_inst_nonlex_asone['id_text'] = 'ALL_NONLEX'
    df_tl_nonlex_asone = pd.DataFrame([{'designation':'ALL_NONLEX','uniq_words':0,'period':'ALL'}],index=['ALL_NONLEX'])

    if word_args.get('multi',''):
        #df_wl_lex,df_wl_nonlex = get_wordlists_multi(df_lines_lex_filter,df_lines_nonlex_filter)
        df_ngrams_nonlex = expand_ngram_df(df_lines_nonlex_filter)
        
        df_wl_nonlex = pd.DataFrame(df_ngrams_nonlex.groupby(df_ngrams_nonlex['id_text']).apply(lambda x: set(x['ngram'])))
        df_wl_nonlex.columns = ['wordlist']
        df_wl_lex = pd.DataFrame(df_lines_lex_filter.groupby(df_lines_lex_filter['id_text']).apply(lambda x: set(x['lemma'])))
        df_wl_lex.columns = ['wordlist']
        
        df_wl_nonlex_asone = pd.DataFrame()

    else:
        df_wl_lex = pd.DataFrame(df_inst_lex_filter.groupby(df_inst_lex_filter['id_text']).apply(lambda x: set(x['lemma'])))
        df_wl_lex.columns = ['wordlist']
        df_wl_nonlex = pd.DataFrame(df_inst_nonlex_filter.groupby(df_inst_nonlex_filter['id_text']).apply(lambda x: set(x['lemma'])))
        df_wl_nonlex.columns = ['wordlist']
        
        df_wl_nonlex_asone = pd.DataFrame(df_inst_nonlex_asone.groupby(df_inst_nonlex_asone['id_text']).apply(lambda x: set(x['lemma'])))
        df_wl_nonlex_asone.columns = ['wordlist']
        
        df_wl_lex_asone = pd.DataFrame(df_inst_lex_asone.groupby(df_inst_lex_asone['id_text']).apply(lambda x: set(x['lemma'])))
        df_wl_lex_asone.columns = ['wordlist']
    
    
    df_comps = get_comparison_df(df_inst_lex_filter,df_inst_nonlex_filter,df_tl_lex_filter,df_tl_nonlex_filter,df_wl_lex,df_wl_nonlex)
    df_comps_nonlex_1 = get_comparison_df(df_inst_lex_filter,df_inst_nonlex_asone,df_tl_lex_filter,df_tl_nonlex_asone,df_wl_lex,df_wl_nonlex_asone)
    df_comps_all = get_comparison_df(df_inst_lex_asone,df_inst_nonlex_asone,df_tl_lex_asone,df_tl_nonlex_asone,df_wl_lex_asone,df_wl_nonlex_asone)
        
    return df_comps, df_comps_nonlex_1, df_comps_all

## IV. Set up the Arguments
And you're ready to go

In [151]:
word_args = {
    'multi': True #Expand to be any selection of n for ngrams
}
lex_args = {
    'periods': ['Old Babylonian'],
    'min_words': 1,
    'qnums_only': True
}

nonlex_args = {
    'corpus': 'royal',
    'periods': ['Old Babylonian'],
    'min_words': 1
}

In [152]:
df_comps,df_comps_nonlex_1, df_comps_all = compare_words(lex_args,nonlex_args,word_args)
#df_comps

UnboundLocalError: local variable 'df_wl_nonlex_asone' referenced before assignment

In [149]:
df_comps_all

Unnamed: 0,lex_comp,lex_name,lex_t,lex_uniq_words,lit_comp,lit_name,lit_t,lit_uniq_words,matches
0,0.542556,ALL_LEX,ALL_LEX,3325,0.387874,ALL_NONLEX,ALL_NONLEX,4651,1804


In [137]:
df_comps

Unnamed: 0,lex_comp,lex_name,lex_t,lex_uniq_words,lit_comp,lit_name,lit_t,lit_uniq_words,matches
0,0.080000,OB Nippur Ura 03,Q000001,350,0.256881,Samsu-iluna B,Q000525,109,28
1,0.025714,OB Nippur Ura 03,Q000001,350,0.391304,Samsu-iluna G,Q000530,23,9
2,0.071429,OB Nippur Ura 03,Q000001,350,0.324675,Abi-ešuh A,Q000532,77,25
3,0.037143,OB Nippur Ura 03,Q000001,350,0.302326,Hammu-rapi 01,Q002180,43,13
4,0.040000,OB Nippur Ura 03,Q000001,350,0.378378,Hammu-rapi 04,Q002182,37,14
5,0.008571,OB Nippur Ura 03,Q000001,350,0.166667,Hammu-rapi 08,Q002184,18,3
6,0.011429,OB Nippur Ura 03,Q000001,350,0.173913,Hammu-rapi 09,Q002185,23,4
7,0.020000,OB Nippur Ura 03,Q000001,350,0.179487,Hammu-rapi 10,Q002186,39,7
8,0.077143,OB Nippur Ura 03,Q000001,350,0.216000,Hammu-rapi 11,Q002187,125,27
9,0.008571,OB Nippur Ura 03,Q000001,350,0.272727,Hammu-rapi 13,Q002189,11,3


In [132]:
#df_comps[df_comps['lit_t'] == 'c.1.6.2'].sort_values(by='matches',ascending=False)

#Which texts contains the highest percentage of a particular lexical text
df_comps[df_comps['lex_t'] == 'Q000041'].sort_values(by='lex_comp',ascending=False)

#Which texts are more composed by words in a particular lexical text
#df_comps[df_comps['lex_t'] == 'Q000041'].sort_values(by='lit_comp',ascending=False)

Unnamed: 0,lex_comp,lex_name,lex_t,lex_uniq_words,lit_comp,lit_name,lit_t,lit_uniq_words,matches
1213,0.233871,OB Nippur Ura 04,Q000041,496,0.151436,Ninurta's exploits: a šir-sud (?) to Ninurta,c.1.6.2,766,116
1237,0.187500,OB Nippur Ura 04,Q000041,496,0.108772,The building of Ninŋirsu's temple (Gudea cylin...,c.2.1.7,855,93
1197,0.179435,OB Nippur Ura 04,Q000041,496,0.155594,Enki and the world order,c.1.1.3,572,89
1524,0.177419,OB Nippur Ura 04,Q000041,496,0.165725,The debate between Winter and Summer,c.5.3.3,531,88
1526,0.159274,OB Nippur Ura 04,Q000041,496,0.169165,The debate between Copper and Silver,c.5.3.6,467,79
1227,0.159274,OB Nippur Ura 04,Q000041,496,0.119335,Lugalbanda in the mountain cave,c.1.8.2.1,662,79
1229,0.157258,OB Nippur Ura 04,Q000041,496,0.142857,Enmerkar and the lord of Aratta,c.1.8.2.3,546,78
1228,0.153226,OB Nippur Ura 04,Q000041,496,0.167033,Lugalbanda and the Anzud bird,c.1.8.2.2,455,76
1235,0.149194,OB Nippur Ura 04,Q000041,496,0.153527,The cursing of Agade,c.2.1.5,482,74
1516,0.149194,OB Nippur Ura 04,Q000041,496,0.106322,The temple hymns,c.4.80.1,696,74


In [138]:
df_comps_nonlex_1

Unnamed: 0,lex_comp,lex_name,lex_t,lex_uniq_words,lit_comp,lit_name,lit_t,lit_uniq_words,matches
0,0.231429,OB Nippur Ura 03,Q000001,350,0.139415,ALL,ALL,581,81
1,0.179673,OB Nippur Ura 01,Q000039,551,0.170396,ALL,ALL,581,99
2,0.158004,OB Nippur Ura 02,Q000040,481,0.130809,ALL,ALL,581,76
3,0.131048,OB Nippur Ura 04,Q000041,496,0.111876,ALL,ALL,581,65
4,0.164835,OB Nippur Ura 05,Q000042,364,0.10327,ALL,ALL,581,60
5,0.583333,OB Nippur Ki-ulutin-bi-še,Q000045,84,0.084337,ALL,ALL,581,49
6,0.222052,OB Nippur Lu,Q000047,653,0.24957,ALL,ALL,581,145
7,0.336842,OB Nippur Kagal,Q000048,380,0.22031,ALL,ALL,581,128
8,0.287425,OB Nippur Izi,Q000050,668,0.330465,ALL,ALL,581,192
9,0.280556,Nippur Nigga,Q000052,360,0.173838,ALL,ALL,581,101


In [None]:
lex_wl = set(lexical['lemma'])
lit_wl = set(etcsl['lemma'])
len(lex_wl),len(lit_wl)

In [None]:
compare_single(lex_wl,lit_wl)

### Get Text Lists

In [None]:
lex_tl = set(lexical['cat_num'])
#lex_tl = {text for text in lex_tl if 'Q' in text}
lit_tl = set(etcsl['id_text'])

In [None]:
lex_tl

In [None]:
len(lex_tl),len(lit_tl)

In [None]:
compare_single(lex_wl,lit_wl)

NTS: rerun the lexical wordlist from dataframe

### Compare Single Lexical Lists to entire Literary Corpus

In [None]:
lit_comps = []
lit_wl = set(etcsl['lemma'])
lit_uniq_words = len(lit_wl)
for lex_t in lex_tl:
    lex_wl = set(lexical[lexical['cat_num'] == lex_t]['lemma'])
    #print(len(lex_wl))
    lex_comp,lit_comp,match_num = compare_single(lex_wl,lit_wl)
    lit_comps.append({'lex_t':lex_t,'lex_name': cat_df.loc[lex_t,'designation'],'lit_comp':lit_comp,'lex_comp':lex_comp,
                     'lex_uniq_words':len(lex_wl),'lit_uniq_words':lit_uniq_words,'matches':match_num})
df_lit_comps = pd.DataFrame(lit_comps)

In [None]:
df_lit_comps.sort_values(by='lit_comp',ascending=False)

In [None]:
df_lit_comps.sort_values(by='lex_comp',ascending=False)

#### Observations
As we might expect, the lexical lists with more unique words overall have higher matching percentages with the literary corpus as a whole.

This is true up until OB Nippur Ura 4, which has more unique words than OB Nippur Ura 2, but less matching. This is also true about OB Nippur Diri and OB Nippur Ura 5, but to a lesser degree

---
Most of the words in Ki-ulutin-bi-še appear in some form or other in literary texts. Given the shortness and the many common words in this list, this fact is not surprising

Despite the many words contained in OB Nippur Ea and OB Nippur Izi, more of their words appear in literary texts compared to other lexical lists. This is also true of OB Nippur Kagal, but the list also has fewer unique words

The Weidner God List has the least amount of hits in the lexical corpus. More than half of the gods in this list are never
attested there.

---

Check Ura 5 to see how field names are lemmatized

### Compare Single Literary Text to Entire Lexical Corpus

In [None]:
lex_comps = []
lex_wl = set(lexical['lemma'])
for lit_t in lit_tl:
    lit_wl = set(etcsl[etcsl['id_text'] == lit_t]['lemma'])
    #print(len(lex_wl))
    lex_comp,lit_comp,match_num = compare_single(lex_wl,lit_wl)
    lex_comps.append({'lit_t':lit_t,'lit_name':etcsl_cat.loc[lit_t]['text_name'],'lex_comp':lex_comp,'lit_comp':lit_comp,
                     'lit_uniq_words':len(lit_wl),'lex_uniq_words':len(lex_wl),'matches':match_num})
df_lex_comps = pd.DataFrame(lex_comps)

In [None]:
df_lex_comps.sort_values(by='lex_comp',ascending=False)

In [None]:
#LIMIT TO > 50 UNIQUE WORDS df_lex_comps[df_lex_comps['lit_uniq_words'] > 50]
#FEWER THAN %90 MATCH RATING df_lex_comps[(df_lex_comps['lit_uniq_words'] > 50) & (df_lex_comps['lit_comp'] < .8)]
df_lex_comps[df_lex_comps['lit_uniq_words'] > 50].sort_values(by='lit_comp',ascending=False)

#### Observations
Generally, the literary texts with more unique words overall match the lexical corpus better than the literary works which have fewer unique words. It is interesting that no single literary composition jumps out as having comparative more words in common with the lexical corpus.

---
The literary texts with few unique words overall also have the highest percentages of matches compared to those texts. It may be wise to remove such texts from further analysis.

Many texts (114 / 319) have a 90% or greater matching score with the lexical corpus. Only 23 texts have matching less than 80%. For example, the Sumerian King List has only a 22% match (the least) but has many (374) unique words. This is probably due to the many proper names. More consideration should go into why some of these other texts have so many words that do not appear in the lexical corpus

---

Why are there so many compositions which have high matching when the overall matching was around 50%? Take a look at texts like SKL etc. Maybe it is only a few texts that are contributing many of these literary terms that do not appear in the lexical lists

### Compare all texts individually

In [None]:
#Remove Ea and Izi df_comps[~df_comps['lex_t'].isin(['Q000050','Q000055'])]
df_comps[~df_comps['lex_t'].isin(['Q000050','Q000055'])].sort_values(by='lit_comp',ascending=False)

In [None]:
#Remove Ki-ulutin-bi-še df_comps[df_comps['lex_t'] != 'Q000045']
df_comps[df_comps['lex_t'] != 'Q000045'].sort_values(by='lex_comp',ascending=False)

#### Observations

It may be wise to look at literary texts individually...

---

It may be wise to remove Ki-ulutin-bi-še from this analysis

### Singular Views

Let us look at Lugal-e (a literary text)

Let us look at OB Nippur Ura 4 (a lexical text)

### Further tasks

* consider overall word frequency
* consider multiword expressions
* consider chopping up some of these lexical lists (like Ura) into sublists
* Include OB Royal Inscriptions
* Pull out the words that are the interesting ones
* Think about what comprises the OB literary corpuse (Gudea??? Tetrad/Decad)
 *   Robsen - House F
 *   Delnero 2016 in CNI 43 (Problems of Canonicity)

In [None]:
Sources

Set Arguments

In [None]:
def