In [2]:
%matplotlib inline
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
from matplotlib import pyplot as plt
from matplotlib_venn import venn2
import tqdm
import re
import zipfile
import json

In [3]:
#COPYING NIEK VELDHUIS
file = "alltexts.csv"
etcsl = pd.read_csv(file, keep_default_na=False)
etcsl = etcsl.loc[etcsl["lang"].str.contains("sux")]  # throw out non-Sumerian words

In [4]:
#COPYING NIEK VELDHUIS
etcsl["lemma"] = etcsl.apply(lambda r: (r["cf"] + '[' + r["gw"] + ']' + r["pos"]) 
                            if r["cf"] != '' else r['form'] + '[NA]NA', axis=1)
etcsl['lemma'] = [lemma if not lemma == '[NA]NA' else '' for lemma in etcsl['lemma'] ] 
# kick out empty forms
etcsl["lemma"] = etcsl["lemma"].str.lower()

In [84]:
#Create ETCSL catalogue
etcsl_cat = pd.DataFrame(etcsl.groupby(['id_text','text_name']).agg('count')).reset_index(level=[0,1])
#etcsl_cat = etcsl_cat[['id_text','text_name']]
etcsl_cat = etcsl_cat.set_index('id_text')
etcsl_cat = pd.DataFrame(etcsl_cat['text_name'])
etcsl_cat

Unnamed: 0_level_0,text_name
id_text,Unnamed: 1_level_1
c.0.1.1,Ur III catalogue from Nibru (N1)
c.0.1.2,Ur III catalogue at Yale (Y1)
c.0.2.01,OB catalogue from Nibru (N2)
c.0.2.02,OB catalogue in the Louvre (L)
c.0.2.03,OB catalogue from Urim (U1)
c.0.2.04,OB catalogue from Urim (U2)
c.0.2.05,OB catalogue from Urim (U3)
c.0.2.06,OB catalogue from Nibru (N3)
c.0.2.07,OB catalogue possibly from Zimbir (B1)
c.0.2.08,OB catalogue from Nibru (N4)


In [5]:
#COPYING NIEK VELDHUIS
file = "parsed.csv"
lexical = pd.read_csv(file, keep_default_na=False)
lexical = lexical.loc[lexical["lang"].str.contains("sux")]

In [6]:
#COPYING NIEK VELDHUIS
lexical["lemma"] = lexical.apply(lambda r: (r["cf"] + '[' + r["gw"] + ']' + r["pos"]) 
                            if r["cf"] != '' else r['form'] + '[NA]NA', axis=1)
lexical['lemma'] = [lemma if not lemma == '[NA]NA' else '' for lemma in lexical['lemma'] ] 
# kick out empty forms
lexical["lemma"] = lexical["lemma"].str.lower()

In [7]:
#COPYING NIEK VELDHUIS
lexical = lexical[~lexical["field"].isin(["sg", "pr"])] # remove lemmas that derive from the fields "sign" 
# or "pronunciation" in sign lists.

In [12]:
file = "dcclt.zip"
z = zipfile.ZipFile(file) 
st = z.read("dcclt/catalogue.json").decode("utf-8")
j = json.loads(st)
cat_df = pd.DataFrame(j["members"]).T
cat_df = cat_df[["id_text", "period","designation"]] #added designation

In [16]:
ob = cat_df[cat_df["period"] == "Old Babylonian"]
#ob[:10]

In [17]:
keep = ob.index.values
keep = ['dcclt/' + id_text for id_text in keep]
lexical = lexical.loc[lexical["id_text"].isin(keep)]

In [38]:
lexical['cat_num'] = lexical['id_text'].apply(lambda x: x.replace('dcclt/',''))

JASON STUFF

In [18]:
etcsl = etcsl[~etcsl['lemma'].str.contains('\[na\]na')]

In [19]:
lexical = lexical[~lexical['lemma'].str.contains('\[na\]na')]

In [20]:
lexical.shape

(39453, 23)

In [52]:
def compare_single(lex_wl,lit_wl):
    inter_wl = lex_wl.intersection(lit_wl)
    lex_comp = len(inter_wl) / len(lex_wl)
    lit_comp = len(inter_wl) / len(lit_wl)
    return (lex_comp,lit_comp,len(inter_wl))

In [22]:
lex_wl = set(lexical['lemma'])
lit_wl = set(etcsl['lemma'])
len(lex_wl),len(lit_wl)

(4051, 4345)

In [53]:
compare_single(lex_wl,lit_wl)

(0.7170658682634731, 0.11024165707710011, 479)

### Get Text Lists

In [39]:
lex_tl = set(lexical['cat_num'])
lex_tl = {text for text in lex_tl if 'Q' in text}
lit_tl = set(etcsl['id_text'])

In [40]:
len(lex_tl),len(lit_tl)

(14, 394)

In [56]:
compare_single(lex_wl,lit_wl)

(0.7170658682634731, 0.11024165707710011, 479)

### Compare Single Lexical Lists to entire Literary Corpus

In [89]:
lit_comps = []
lit_wl = set(etcsl['lemma'])
lit_uniq_words = len(lit_wl)
for lex_t in lex_tl:
    lex_wl = set(lexical[lexical['cat_num'] == lex_t]['lemma'])
    #print(len(lex_wl))
    lex_comp,lit_comp,match_num = compare_single(lex_wl,lit_wl)
    lit_comps.append({'lex_t':lex_t,'lex_name': cat_df.loc[lex_t,'designation'],'lit_comp':lit_comp,'lex_comp':lex_comp,
                     'lex_uniq_words':len(lex_wl),'lit_uniq_words':lit_uniq_words,'matches':match_num})
df_lit_comps = pd.DataFrame(lit_comps)

In [90]:
df_lit_comps.sort_values(by='lit_comp',ascending=False)

Unnamed: 0,lex_comp,lex_name,lex_t,lex_uniq_words,lit_comp,lit_uniq_words,matches
7,0.777143,OB Nippur Ea,Q000055,700,0.125201,4345,544
13,0.717066,OB Nippur Izi,Q000050,668,0.110242,4345,479
2,0.686064,OB Nippur Lu,Q000047,653,0.103107,4345,448
0,0.653358,OB Nippur Ura 01,Q000039,551,0.082854,4345,360
12,0.654886,OB Nippur Ura 02,Q000040,481,0.072497,4345,315
5,0.757895,OB Nippur Kagal,Q000048,380,0.066283,4345,288
4,0.530242,OB Nippur Ura 04,Q000041,496,0.060529,4345,263
3,0.683333,Nippur Nigga,Q000052,360,0.056617,4345,246
8,0.7,OB Nippur Ura 03,Q000001,350,0.056387,4345,245
6,0.621333,OB Nippur Diri,Q000057,375,0.053625,4345,233


In [91]:
df_lit_comps.sort_values(by='lex_comp',ascending=False)

Unnamed: 0,lex_comp,lex_name,lex_t,lex_uniq_words,lit_comp,lit_uniq_words,matches
11,0.916667,OB Nippur Ki-ulutin-bi-še,Q000045,84,0.017722,4345,77
7,0.777143,OB Nippur Ea,Q000055,700,0.125201,4345,544
5,0.757895,OB Nippur Kagal,Q000048,380,0.066283,4345,288
13,0.717066,OB Nippur Izi,Q000050,668,0.110242,4345,479
9,0.704615,OB Lu₂-azlag₂ B-C,Q000302,325,0.052704,4345,229
8,0.7,OB Nippur Ura 03,Q000001,350,0.056387,4345,245
2,0.686064,OB Nippur Lu,Q000047,653,0.103107,4345,448
3,0.683333,Nippur Nigga,Q000052,360,0.056617,4345,246
12,0.654886,OB Nippur Ura 02,Q000040,481,0.072497,4345,315
0,0.653358,OB Nippur Ura 01,Q000039,551,0.082854,4345,360


#### Observations
As we might expect, the lexical lists with more unique words overall have higher matching percentages with the literary corpus as a whole.

This is true up until OB Nippur Ura 4, which has more unique words than OB Nippur Ura 2, but less matching. This is also true about OB Nippur Diri and OB Nippur Ura 5, but to a lesser degree

---
Most of the words in Ki-ulutin-bi-še appear in some form or other in literary texts. Given the shortness and the many common words in this list, this fact is not surprising

Despite the many words contained in OB Nippur Ea and OB Nippur Izi, more of their words appear in literary texts compared to other lexical lists. This is also true of OB Nippur Kagal, but the list also has fewer unique words

The Weidner God List has the least amount of hits in the lexical corpus. More than half of the gods in this list are never attested there.

### Compare Single Literary Text to Entire Lexical Corpus

In [94]:
lex_comps = []
lex_wl = set(lexical['lemma'])
for lit_t in lit_tl:
    lit_wl = set(etcsl[etcsl['id_text'] == lit_t]['lemma'])
    #print(len(lex_wl))
    lex_comp,lit_comp,match_num = compare_single(lex_wl,lit_wl)
    lex_comps.append({'lit_t':lit_t,'lit_name':etcsl_cat.loc[lit_t]['text_name'],'lex_comp':lex_comp,'lit_comp':lit_comp,
                     'lit_uniq_words':len(lit_wl),'lex_uniq_words':len(lex_wl),'matches':match_num})
df_lex_comps = pd.DataFrame(lex_comps)

In [95]:
df_lex_comps.sort_values(by='lex_comp',ascending=False)

Unnamed: 0,lex_comp,lex_uniq_words,lit_comp,lit_name,lit_t,lit_uniq_words,matches
283,0.150333,4051,0.744499,text_name The building of Ninŋirsu's temple...,c.2.1.7,818,609
51,0.148112,4051,0.803213,text_name Ninurta's exploits: a šir-sud (?)...,c.1.6.2,747,600
319,0.126389,4051,0.769925,text_name The lament for Sumer and Urim Nam...,c.2.2.3,665,512
118,0.123673,4051,0.846284,text_name Lugalbanda in the mountain cave N...,c.1.8.2.1,592,501
129,0.120464,4051,0.745038,"text_name The temple hymns Name: c.4.80.1, ...",c.4.80.1,655,488
354,0.117255,4051,0.862069,text_name Enki and the world order Name: c....,c.1.1.3,551,475
345,0.109356,4051,0.884232,text_name A praise poem of Išme-Dagan (Išme...,c.2.5.4.01,501,443
54,0.106147,4051,0.860000,text_name The debate between Winter and Sum...,c.5.3.3,500,430
257,0.105653,4051,0.812144,text_name Enmerkar and the lord of Aratta N...,c.1.8.2.3,527,428
222,0.104912,4051,0.867347,text_name Proverbs: from Urim Name: c.6.2.3...,c.6.2.3,490,425


In [115]:
#LIMIT TO > 50 UNIQUE WORDS df_lex_comps[df_lex_comps['lit_uniq_words'] > 50]
#FEWER THAN %90 MATCH RATING
df_lex_comps[(df_lex_comps['lit_uniq_words'] > 50) & (df_lex_comps['lit_comp'] < .8)].sort_values(by='lit_comp',ascending=False)

Unnamed: 0,lex_comp,lex_uniq_words,lit_comp,lit_name,lit_t,lit_uniq_words,matches
95,0.018761,4051,0.783505,text_name Letter from Aradŋu to Šulgi about...,c.3.1.01,97,76
78,0.011602,4051,0.783333,text_name A balbale to Bau for Šu-Suen (Šu-...,c.2.4.4.1,60,47
27,0.01333,4051,0.782609,text_name A šir-namšub to Ninisina (Ninisin...,c.4.22.2,69,54
370,0.019008,4051,0.777778,text_name A hymn to Nanna (Nanna O) Name: c...,c.4.13.15,99,77
327,0.09331,4051,0.777778,text_name The lament for Urim Name: c.2.2.2...,c.2.2.2,486,378
250,0.03135,4051,0.77439,"text_name Enlil and Ninlil Name: c.1.2.1, d...",c.1.2.1,164,127
133,0.010862,4051,0.77193,text_name A balbale to Inana (Dumuzid-Inana...,c.4.08.01,57,44
319,0.126389,4051,0.769925,text_name The lament for Sumer and Urim Nam...,c.2.2.3,665,512
261,0.014317,4051,0.763158,text_name A balbale to Nanna (Nanna B) Name...,c.4.13.02,76,58
214,0.011355,4051,0.754098,text_name A hymn to the E-kur Name: c.4.80....,c.4.80.4,61,46


#### Observations
Generally, the literary texts with more unique words overall match the lexical corpus better than the literary works which have fewer unique words. It is interesting that no single literary composition jumps out as having comparative more words in common with the lexical corpus.

---
The literary texts with few unique words overall also have the highest percentages of matches compared to those texts. It may be wise to remove such texts from further analysis.

Many texts (114 / 319) have a 90% or greater matching score with the lexical corpus. Only 23 texts have matching less than 80%. For example, the Sumerian King List has only a 22% match (the least) but has many (374) unique words. This is probably due to the many proper names. More consideration should go into why some of these other texts have so many words that do not appear in the lexical corpus

### Compare all texts individually

Kick out lexical texts that have too few words

In [97]:
min_words = 50
lit_tl_d = {}
for lit_t in lit_tl:
    lit_wl = set(etcsl[etcsl['id_text'] == lit_t]['lemma'])
    if len(lit_wl) >= min_words:
        lit_tl_d[lit_t] = lit_wl
        #print(len(lit_wl))

In [105]:
comps = []
for lex_t in lex_tl:
    #Maybe a good idea to omit OB Ki-ulutin-bi-še
    #if lex_t == 'dcclt/Q000045':
    #    continue
        
    ##Optionally omit Ea and Izi
    #if lex_t in ['dcclt/Q000050','dcclt/Q000055']:
    #    continue
    lex_wl = set(lexical[lexical['cat_num'] == lex_t]['lemma'])
    #print(lex_t + ': ' + str(len(lex_wl)))
    for lit_t in lit_tl_d: #Make sure to use the dictionary here
        lit_wl = lit_tl_d[lit_t]
        #key = lex_t + '::' + lit_t
        lex_comp,lit_comp,match_num = compare_single(lex_wl,lit_wl)
        #lex_comps[key] = lex_comp
        #lit_comps[key] = lit_comp
        comps.append({'lit_t':lit_t,'lit_name':etcsl_cat.loc[lit_t]['text_name'],'lex_t':lex_t,'lex_name':cat_df.loc[lex_t]['designation'],
                      'lex_comp':lex_comp,'lit_comp':lit_comp,'lit_uniq_words':len(lit_wl),'lex_uniq_words':len(lex_wl),
                      'matches':match_num})
df_comps = pd.DataFrame(comps)

In [117]:
#Remove Ea and Izi df_comps[~df_comps['lex_t'].isin(['Q000050','Q000055'])]
df_comps[~df_comps['lex_t'].isin(['Q000050','Q000055'])].sort_values(by='lit_comp',ascending=False)

Unnamed: 0,lex_comp,lex_name,lex_t,lex_uniq_words,lit_comp,lit_name,lit_t,lit_uniq_words,matches
708,0.055130,OB Nippur Lu,Q000047,653,0.571429,A hymn to Nibru and Išme-Dagan (Išme-Dagan C),c.2.5.4.03,63,36
962,0.067381,OB Nippur Lu,Q000047,653,0.564103,An adab to Enlil for Būr-Suen (Būr-Suen B),c.2.5.7.2,78,44
658,0.075038,OB Nippur Lu,Q000047,653,0.556818,A šir-gida to Nuska (Nuska B),c.4.29.2,88,49
1782,0.107895,OB Nippur Kagal,Q000048,380,0.554054,An adab to Bau for Luma (Luma A),c.2.3.1,74,41
807,0.055130,OB Nippur Lu,Q000047,653,0.545455,A tigi to Bau for Gudea (Gudea A),c.2.3.2,66,36
1898,0.078947,OB Nippur Kagal,Q000048,380,0.545455,A tigi to Nintur (Nintur A),c.4.26.1,55,30
929,0.045942,OB Nippur Lu,Q000047,653,0.545455,A tigi to Nintur (Nintur A),c.4.26.1,55,30
1776,0.094737,OB Nippur Kagal,Q000048,380,0.545455,A tigi to Bau for Gudea (Gudea A),c.2.3.2,66,36
1618,0.110526,OB Nippur Kagal,Q000048,380,0.545455,A šir-šag-hula to Damgalnuna (Damgalnuna A),c.4.03.1,77,42
805,0.084227,OB Nippur Lu,Q000047,653,0.544554,An adab to Enlil for Išme-Dagan (Išme-Dagan H),c.2.5.4.08,101,55


In [116]:
#Remove Ki-ulutin-bi-še df_comps[df_comps['lex_t'] != 'Q000045']
df_comps[df_comps['lex_t'] != 'Q000045'].sort_values(by='lex_comp',ascending=False)

Unnamed: 0,lex_comp,lex_name,lex_t,lex_uniq_words,lit_comp,lit_name,lit_t,lit_uniq_words,matches
1850,0.426316,OB Nippur Kagal,Q000048,380,0.198044,The building of Ninŋirsu's temple (Gudea cylin...,c.2.1.7,818,162
2496,0.420000,OB Nippur Ea,Q000055,700,0.359413,The building of Ninŋirsu's temple (Gudea cylin...,c.2.1.7,818,294
1657,0.418421,OB Nippur Kagal,Q000048,380,0.212851,Ninurta's exploits: a šir-sud (?) to Ninurta,c.1.6.2,747,159
2949,0.409231,OB Lu₂-azlag₂ B-C,Q000302,325,0.178046,Ninurta's exploits: a šir-sud (?) to Ninurta,c.1.6.2,747,133
2303,0.408571,OB Nippur Ea,Q000055,700,0.382865,Ninurta's exploits: a šir-sud (?) to Ninurta,c.1.6.2,747,286
1880,0.397368,OB Nippur Kagal,Q000048,380,0.227068,The lament for Sumer and Urim,c.2.2.3,665,151
1724,0.381579,OB Nippur Kagal,Q000048,380,0.221374,The temple hymns,c.4.80.1,655,145
4241,0.372754,OB Nippur Izi,Q000050,668,0.333333,Ninurta's exploits: a šir-sud (?) to Ninurta,c.1.6.2,747,249
1907,0.368421,OB Nippur Kagal,Q000048,380,0.254083,Enki and the world order,c.1.1.3,551,140
4434,0.360778,OB Nippur Izi,Q000050,668,0.294621,The building of Ninŋirsu's temple (Gudea cylin...,c.2.1.7,818,241


#### Observations

It may be wise to look at literary texts individually...

---

It may be wise to remove Ki-ulutin-bi-še from this analysis

### Singular Views

Let us look at Lugal-e (a literary text)

In [119]:
df_comps[df_comps['lit_t'] == 'c.1.6.2'].sort_values(by='matches',ascending=False)

Unnamed: 0,lex_comp,lex_name,lex_t,lex_uniq_words,lit_comp,lit_name,lit_t,lit_uniq_words,matches
2303,0.408571,OB Nippur Ea,Q000055,700,0.382865,Ninurta's exploits: a šir-sud (?) to Ninurta,c.1.6.2,747,286
4241,0.372754,OB Nippur Izi,Q000050,668,0.333333,Ninurta's exploits: a šir-sud (?) to Ninurta,c.1.6.2,747,249
688,0.283308,OB Nippur Lu,Q000047,653,0.247657,Ninurta's exploits: a šir-sud (?) to Ninurta,c.1.6.2,747,185
1657,0.418421,OB Nippur Kagal,Q000048,380,0.212851,Ninurta's exploits: a šir-sud (?) to Ninurta,c.1.6.2,747,159
42,0.268603,OB Nippur Ura 01,Q000039,551,0.198126,Ninurta's exploits: a šir-sud (?) to Ninurta,c.1.6.2,747,148
2949,0.409231,OB Lu₂-azlag₂ B-C,Q000302,325,0.178046,Ninurta's exploits: a šir-sud (?) to Ninurta,c.1.6.2,747,133
1011,0.358333,Nippur Nigga,Q000052,360,0.172691,Ninurta's exploits: a šir-sud (?) to Ninurta,c.1.6.2,747,129
3918,0.257796,OB Nippur Ura 02,Q000040,481,0.165997,Ninurta's exploits: a šir-sud (?) to Ninurta,c.1.6.2,747,124
2626,0.334286,OB Nippur Ura 03,Q000001,350,0.156627,Ninurta's exploits: a šir-sud (?) to Ninurta,c.1.6.2,747,117
1334,0.233871,OB Nippur Ura 04,Q000041,496,0.155288,Ninurta's exploits: a šir-sud (?) to Ninurta,c.1.6.2,747,116


Let us look at OB Nippur Ura 4 (a lexical text)

In [120]:
df_comps[df_comps['lex_t'] == 'Q000041'].sort_values(by='matches',ascending=False)

Unnamed: 0,lex_comp,lex_name,lex_t,lex_uniq_words,lit_comp,lit_name,lit_t,lit_uniq_words,matches
1334,0.233871,OB Nippur Ura 04,Q000041,496,0.155288,Ninurta's exploits: a šir-sud (?) to Ninurta,c.1.6.2,747,116
1527,0.187500,OB Nippur Ura 04,Q000041,496,0.113692,The building of Ninŋirsu's temple (Gudea cylin...,c.2.1.7,818,93
1584,0.179435,OB Nippur Ura 04,Q000041,496,0.161525,Enki and the world order,c.1.1.3,551,89
1336,0.177419,OB Nippur Ura 04,Q000041,496,0.176000,The debate between Winter and Summer,c.5.3.3,500,88
1392,0.159274,OB Nippur Ura 04,Q000041,496,0.133446,Lugalbanda in the mountain cave,c.1.8.2.1,592,79
1489,0.159274,OB Nippur Ura 04,Q000041,496,0.188095,The debate between Copper and Silver,c.5.3.6,420,79
1505,0.157258,OB Nippur Ura 04,Q000041,496,0.148008,Enmerkar and the lord of Aratta,c.1.8.2.3,527,78
1371,0.153226,OB Nippur Ura 04,Q000041,496,0.170787,Lugalbanda and the Anzud bird,c.1.8.2.2,445,76
1401,0.149194,OB Nippur Ura 04,Q000041,496,0.112977,The temple hymns,c.4.80.1,655,74
1402,0.149194,OB Nippur Ura 04,Q000041,496,0.156118,The cursing of Agade,c.2.1.5,474,74


### Further tasks

* consider overall word frequency
* consider multiword expressions
* consider chopping up some of these lexical lists (like Ura) into sublists
* Include OB Royal Inscriptions