In [1]:
import dill
import json
import string

import numpy as np
import scipy
import pandas as pd

import seaborn as sns

In [2]:
### UTIL
    
def clean(text):
    text = text.lower()
    text = text.encode("ascii", "ignore").decode("ascii")
    text = text.translate(str.maketrans('', '', string.punctuation))

    return text

In [3]:
processed_data_path = '../data/processed/'

corpus_narr_fn = processed_data_path + 'corpus_narr.pkl'
corpus_else_fn = processed_data_path + 'corpus_else.pkl'
fields_paths = processed_data_path + 'fields.pkl'

In [4]:
corpus_narr = dill.load(open(corpus_narr_fn, 'rb'))
corpus_else = dill.load(open(corpus_else_fn, 'rb'))
fields = dill.load(open(fields_paths, 'rb'))

In [5]:
for f,l in fields:
    print(f)

['birth' 'produce' 'bear' 'child']
['kill' 'beat' 'hit']
['grow' 'elder' 'big']
['prepare' 'boil' 'cook']
['young' 'guy' 'boy' 'son' 'small' 'brother' 'girl' 'sister' 'child']
['village' 'home' 'call']
['woman' 'speak' 'word' 'help' 'talk' 'keep' 'ask' 'story' 'call'
 'language']
['catch' 'grab' 'hold']
['ground' 'country' 'earth' 'land' 'world']
['cut' 'throw' 'fall']
['day' 'next' 'morning' 'tomorrow']
['dead' 'die' 'kill']
['sleep' 'fall' 'lie']
['feel' 'hear' 'listen']
['fishing' 'fish' 'line']
['wife' 'woman' 'grandmother' 'sister' 'mother' 'husband' 'marry']
['pass' 'many' 'happen']
['marry' 'wedding' 'marriage']
['plan' 'think' 'thought']
['sit' 'stay' 'settle']
['childhood' 'small']
['dear' 'friend']
['plant' 'planting']
['day' 'stop']
['face' 'side']
['animal' 'hunt']
['mind' 'thought']
['shill' 'shilling']
['curse' 'scold']
['dad' 'finished']
['hunger' 'hungry']
['bit' 'bite']
['bring' 'carry']
['keep' 'think']
['damage' 'hurt']
['ask' 'find']
['arm' 'hand']
['return' 'turn']

In [6]:
### INDEPENDENT VARIABLES

def freq_field(field, langcorp):
    count = 0
    total = 0
    
    for document in langcorp:
        for sentence in langcorp[document]:
            for word in langcorp[document][sentence]['spc']:
                total += 1
                if clean(word['lemma']) in field:
                    count += 1
    #print(count, '/', total, '=', count / total)
    if total == 0:
        return 0
    return count / total

In [7]:
### DEPENDENT VARIABLES

def h(X):
    p = lambda v: v / np.sum(X)
    return -np.sum([p(x) * np.log(p(x)) for x in X])

def measure_langfield_count(langfield):
    return len(langfield)

def measure_langfield_entropy(langfield):
    return h(list(langfield.values()))

In [8]:
### DATA TABLE

def populate_data(fields, corpus_narr, corpus_else):
    d = {
        'field': list(),
        'lang': list(),
        'narr_freq': list(),
        'narr_freq_log': list(),
        'else_freq': list(), 
        'else_freq_log': list(),
        'lf_count' : list(),
        'lf_count_log': list(),
        'lf_entropy': list()
    }
    
    i = 0
    for field, langfields in fields:
        for lang in langfields:
            
            narr_freq = freq_field(field, corpus_narr[lang])
            else_freq = freq_field(field, corpus_else[lang])
            lf_count = measure_langfield_count(langfields[lang])
            lf_entropy = measure_langfield_entropy(langfields[lang])

            d['field'].append(field)
            d['lang'].append(lang)
            d['narr_freq'].append(narr_freq)
            d['narr_freq_log'].append(np.log(1 + narr_freq * 1e6))
            d['else_freq'].append(else_freq)
            d['else_freq_log'].append(np.log(1 + else_freq * 1e6))
            d['lf_count'].append(lf_count)
            d['lf_count_log'].append(np.log(lf_count + 1))
            d['lf_entropy'].append(lf_entropy)

        i+=1
        print(i, '/', len(fields))
    
    return pd.DataFrame(d)
        

In [9]:
df = populate_data(fields, corpus_narr, corpus_else)

1 / 306
2 / 306
3 / 306
4 / 306
5 / 306
6 / 306
7 / 306
8 / 306
9 / 306
10 / 306
11 / 306
12 / 306
13 / 306
14 / 306
15 / 306
16 / 306
17 / 306
18 / 306
19 / 306


  return -np.sum([p(x) * np.log(p(x)) for x in X])
  return -np.sum([p(x) * np.log(p(x)) for x in X])


20 / 306
21 / 306
22 / 306
23 / 306
24 / 306
25 / 306
26 / 306
27 / 306
28 / 306
29 / 306
30 / 306
31 / 306
32 / 306
33 / 306
34 / 306
35 / 306
36 / 306
37 / 306
38 / 306
39 / 306
40 / 306
41 / 306
42 / 306
43 / 306
44 / 306
45 / 306
46 / 306
47 / 306
48 / 306
49 / 306
50 / 306
51 / 306
52 / 306
53 / 306
54 / 306
55 / 306
56 / 306
57 / 306
58 / 306
59 / 306
60 / 306
61 / 306
62 / 306
63 / 306
64 / 306
65 / 306
66 / 306
67 / 306
68 / 306
69 / 306
70 / 306
71 / 306
72 / 306
73 / 306
74 / 306
75 / 306
76 / 306
77 / 306
78 / 306
79 / 306
80 / 306
81 / 306
82 / 306
83 / 306
84 / 306
85 / 306
86 / 306
87 / 306
88 / 306
89 / 306
90 / 306
91 / 306
92 / 306
93 / 306
94 / 306
95 / 306
96 / 306
97 / 306
98 / 306
99 / 306
100 / 306
101 / 306
102 / 306
103 / 306
104 / 306
105 / 306
106 / 306
107 / 306
108 / 306
109 / 306
110 / 306
111 / 306
112 / 306
113 / 306
114 / 306
115 / 306
116 / 306
117 / 306
118 / 306
119 / 306
120 / 306
121 / 306
122 / 306
123 / 306
124 / 306
125 / 306
126 / 306
127 / 306


In [10]:
# filter dataframe
data = df[(df.narr_freq != 0) & (df.else_freq != 0)]

In [11]:
data.to_csv(processed_data_path + 'data.csv')