In [33]:
import dill
import json
import string

import numpy as np
import scipy
import pandas as pd

import seaborn as sns

In [32]:
### UTIL
    
def clean(text):
    text = text.lower()
    text = text.encode("ascii", "ignore").decode("ascii")
    text = text.translate(str.maketrans('', '', string.punctuation))

    return text

In [4]:
processed_data_path = '../data/processed/'

corpus_narr_fn = processed_data_path + 'corpus_narr.pkl'
corpus_else_fn = processed_data_path + 'corpus_else.pkl'
fields_paths = processed_data_path + 'fields.pkl'

In [5]:
corpus_narr = dill.load(open(corpus_narr_fn, 'rb'))
corpus_else = dill.load(open(corpus_else_fn, 'rb'))
fields = dill.load(open(fields_paths, 'rb'))

In [12]:
for f,l in fields:
    print(f)

['sleep' 'lie' 'fall' 'asleep']
['bad' 'hear' 'voice' 'listen' 'feel']
['fruit' 'birth' 'child' 'produce' 'bear']
['beat' 'dead' 'die' 'death' 'hit' 'kill']
['begin' 'catch' 'fish' 'trap']
['brother' 'elder' 'uncle' 'big' 'grow']
['different' 'change' 'bit']
['cook' 'boil' 'food' 'prepare']
['finished' 'parent' 'boy' 'child' 'law' 'guy' 'mother' 'brother' 'son'
 'small' 'sibling' 'girl' 'grandmother' 'young' 'sister']
['branch' 'tree' 'wood']
['build' 'fire' 'prepare']
['burn' 'fire' 'candle' 'light']
['keep' 'story' 'call' 'word' 'language' 'help' 'ask' 'speak' 'talk']
['home' 'village' 'call']
['hold' 'catch' 'grab']
['world' 'country' 'land' 'earth' 'ground']
['cut' 'ground' 'fall' 'throw']
['doctor' 'traditional' 'healer']
['finish' 'end' 'story']
['farm' 'farmer' 'field']
['pity' 'feel' 'sorry']
['fight' 'fighting' 'war']
['fish' 'line' 'fishing']
['happen' 'pass' 'many']
['thought' 'keep' 'heart' 'think' 'plan']
['marry' 'marriage' 'wedding']
['marry' 'wife' 'married']
['next' 'm

In [7]:
### INDEPENDENT VARIABLES

def freq_field(field, langcorp):
    count = 0
    total = 0
    
    for document in langcorp:
        for sentence in langcorp[document]:
            for word in langcorp[document][sentence]['spc']:
                total += 1
                if clean(word['lemma']) in field:
                    count += 1
    #print(count, '/', total, '=', count / total)
    if total == 0:
        return 0
    return count / total

In [8]:
### DEPENDENT VARIABLES

def h(X):
    p = lambda v: v / np.sum(X)
    return -np.sum([p(x) * np.log(p(x)) for x in X])

def measure_langfield_count(langfield):
    return len(langfield)

def measure_langfield_entropy(langfield):
    return h(list(langfield.values()))

In [9]:
### DATA TABLE

def populate_data(fields, corpus_narr, corpus_else):
    d = {
        'field': list(),
        'lang': list(),
        'narr_freq': list(), 
        'else_freq': list(), 
        'lf_count' : list(),
        'lf_entropy': list()
    }
    
    for field, langfields in fields:
        print(field)
        for lang in langfields:
            #print('\t',lang)
            
            d['field'].append(field)
            d['lang'].append(lang)
            d['narr_freq'].append(freq_field(field, narrcorp[lang]))
            d['else_freq'].append(freq_field(field, elsecorp[lang]))
            d['lf_count'].append(measure_langfield_count(langfields[lang]))
            d['lf_entropy'].append(measure_langfield_entropy(langfields[lang]))

    return pd.DataFrame(d)
        

In [27]:
df = populate_data(fields, corpus_narr, corpus_else)

['sleep' 'lie' 'fall' 'asleep']
['bad' 'hear' 'voice' 'listen' 'feel']
['fruit' 'birth' 'child' 'produce' 'bear']
['beat' 'dead' 'die' 'death' 'hit' 'kill']
['begin' 'catch' 'fish' 'trap']
['brother' 'elder' 'uncle' 'big' 'grow']
['different' 'change' 'bit']
['cook' 'boil' 'food' 'prepare']
['finished' 'parent' 'boy' 'child' 'law' 'guy' 'mother' 'brother' 'son'
 'small' 'sibling' 'girl' 'grandmother' 'young' 'sister']
['branch' 'tree' 'wood']
['build' 'fire' 'prepare']
['burn' 'fire' 'candle' 'light']
['keep' 'story' 'call' 'word' 'language' 'help' 'ask' 'speak' 'talk']
['home' 'village' 'call']
['hold' 'catch' 'grab']
['world' 'country' 'land' 'earth' 'ground']
['cut' 'ground' 'fall' 'throw']
['doctor' 'traditional' 'healer']
['finish' 'end' 'story']
['farm' 'farmer' 'field']
['pity' 'feel' 'sorry']
['fight' 'fighting' 'war']
['fish' 'line' 'fishing']
['happen' 'pass' 'many']
['thought' 'keep' 'heart' 'think' 'plan']
['marry' 'marriage' 'wedding']
['marry' 'wife' 'married']
['next' 'm

In [50]:
data = df[(df.narr_freq != 0) & (df.else_freq != 0)]

data['lf_count_log'] = np.log(data['lf_count'] + 1)
data['narr_freq_log'] = np.log(1e6 * data['narr_freq'])
data['else_freq_log'] = np.log(1e6 * data['else_freq'])

data.to_csv(processed_data_path + 'data.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lf_count_log'] = np.log(data['lf_count'] + 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['narr_freq_log'] = np.log(1e6 * data['narr_freq'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['else_freq_log'] = np.log(1e6 * data['else_freq'])


In [21]:
# run on R

RangeIndex(start=0, stop=1488, step=1)

In [54]:
### STATISTICAL MODEL

import statsmodels.formula.api as smf

count_model = smf.mixedlm("lf_count_log ~ narr_freq_log + else_freq_log", data, groups=data['lang'])
results_cm = count_model.fit()
print(results_cm.summary())

          Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: lf_count_log
No. Observations: 5634    Method:             REML        
No. Groups:       26      Scale:              0.2393      
Min. group size:  56      Log-Likelihood:     -4046.3594  
Max. group size:  373     Converged:          Yes         
Mean group size:  216.7                                   
----------------------------------------------------------
               Coef.  Std.Err.    z    P>|z| [0.025 0.975]
----------------------------------------------------------
Intercept      -3.013    0.124 -24.341 0.000 -3.255 -2.770
narr_freq_log   0.444    0.007  64.212 0.000  0.431  0.458
else_freq_log   0.309    0.007  45.614 0.000  0.296  0.323
Group Var       0.326    0.190                            



In [44]:
entropy_model = smf.mixedlm("lf_entropy ~ narr_freq_log + else_freq_log", data, groups=data['lang'])
results_em = entropy_model.fit()
print(results_em.summary())

          Mixed Linear Model Regression Results
Model:             MixedLM Dependent Variable: lf_entropy
No. Observations:  5634    Method:             REML      
No. Groups:        26      Scale:              0.3270    
Min. group size:   56      Log-Likelihood:     -4922.7964
Max. group size:   373     Converged:          Yes       
Mean group size:   216.7                                 
---------------------------------------------------------
              Coef.  Std.Err.    z    P>|z| [0.025 0.975]
---------------------------------------------------------
Intercept     -2.986    0.132 -22.644 0.000 -3.244 -2.727
narr_freq_log  0.405    0.008  50.026 0.000  0.389  0.421
else_freq_log  0.280    0.008  35.289 0.000  0.264  0.295
Group Var      0.353    0.176                            

