# detection of important sentences over "time"

## languages

In [None]:
import pandas as pd 

In [None]:
## classification by macroarea :)
## input table downloaded from https://glottolog.org/meta/downloads
## (read as a comma-separated file into a DataFrame)

macroarea = pd.read_csv('languages_and_dialects_geo.csv',sep=',')

In [None]:
## drop only the rows that lack one of the two fields used below; a missing
## value in any other column of the table must not discard the language
macroarea.dropna(subset=['isocodes','macroarea'], inplace = True)

In [None]:
## keep just the ISO code and macroarea columns
macroarea = macroarea.loc[:, ['isocodes', 'macroarea']]

In [None]:
## mapping iso_code -> macroarea (a repeated iso_code keeps its last row)

macroarea = {
    iso: area
    for iso, area in zip(macroarea['isocodes'], macroarea['macroarea'])
}

In [None]:
## keep only the languages whose macroarea is one of the two Americas

americas = ('South America', 'North America')
macroarea = {iso: area for iso, area in macroarea.items() if area in americas}

In [None]:
#macroarea

In [None]:
#import zipfile
#with zipfile.ZipFile('udhr.zip', 'r') as zip_ref:
#    zip_ref.extractall()

In [None]:
## corpus UDHR https://www.unicode.org/udhr/index.html
## languages: iso_code -> list of the non-empty, whitespace-stripped lines
## of the file udhr/udhr_<iso_code>.txt

languages={}

for language in macroarea.keys():
    try:
        # explicit utf-8 so decoding does not depend on the platform default;
        # the `with` block closes the handle as soon as the text is read
        with open('udhr/'+'udhr_'+language+'.txt', 'r', encoding='utf-8') as file:
            language_text=file.read().split('\n')
    except FileNotFoundError:
        # no text file for this language in the corpus folder: skip it
        continue
    stripped_lines=[line.strip() for line in language_text]
    languages[language]=[line for line in stripped_lines if len(line)>0]

In [None]:
## number of languages for which a text file was found and loaded
len(languages)

In [None]:
## mapudungun :)
## NOTE(review): further down the graphs are indexed with the code 'arn';
## confirm that 'zro' is really the code meant for Mapudungun here
## shows the first 10 loaded lines of the text

languages['zro'][:10]

## basic statistics: types and tokens

In [None]:
def tokenize(s):
    """Split ``s`` on single space characters.

    Consecutive spaces produce empty-string tokens and an empty input
    yields ``['']``, exactly as ``str.split(' ')`` does.
    """
    separator = ' '
    tokens = s.split(separator)
    return tokens

In [None]:
def clean(L):
    """Return the cleaned, tokenized sentences of language ``L``.

    Every line of ``languages[L]`` is split with ``tokenize``; each token is
    lower-cased and the punctuation characters of the translation table are
    removed from it. Tokens that end up empty, equal to two apostrophes,
    equal to a lone combining tilde, or made only of digits are dropped.

    The first 9 lines are discarded (presumably the header lines of the
    text file -- TODO confirm). A line left without any token is kept as an
    empty list.

    Returns a list of sentences, each sentence being a list of words.
    """
    # Characters to delete. The backslash is written as an explicit '\\'
    # instead of relying on an invalid escape sequence before '¿'.
    table = str.maketrans('', '', '`!"#$%&\\¿()*+,-./:;<=>?@[]_{|}')
    cleaned = []
    for sentence in languages[L]:
        words = [w.lower().translate(table) for w in tokenize(sentence)]
        cleaned.append(
            [w for w in words
             if w and w != "''" and w != '̃' and not w.isdigit()]
        )
    return cleaned[9:]

In [None]:
## clean_languages: iso_code -> cleaned sentences, only for the languages
## that still have at least one sentence after cleaning
clean_languages={}
for language in languages:
    C=clean(language)
    if len(C)>0:
        # reuse the result instead of cleaning the same text a second time
        clean_languages[language]=C

In [None]:
## number of cleaned sentences kept for the language 'kwi'
len(clean_languages['kwi'])

In [None]:
#for language in clean_languages.keys():
#    print(language, clean_languages[language][0])

### simple statistics: types, tokens and entropy

In [None]:
from collections import Counter

In [None]:
## per-language statistics
## words_entropy:    flat list of every token of the language
## words:            [number of tokens, number of types, Counter of the tokens]
## mean_tokens:      total number of tokens (a total, despite the name)
## mean_types:       number of distinct tokens (a total, despite the name)
## number_sentences: number of cleaned sentences
words_entropy={}
words={}
mean_tokens={}
mean_types={}
number_sentences={}
for language, sentences in clean_languages.items():
    # flatten once and reuse, instead of rebuilding the same list per statistic
    flat_tokens=[item for sublist in sentences for item in sublist]
    n_tokens=len(flat_tokens)
    n_types=len(set(flat_tokens))

    number_sentences[language]=len(sentences)
    words_entropy[language]=flat_tokens
    words[language]=[n_tokens,n_types,Counter(flat_tokens)]
    mean_tokens[language]=n_tokens
    mean_types[language]=n_types
    print(language,len(sentences),n_tokens,n_types)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
## histogram over languages of the total number of tokens, saved to tokens.pdf
fig, ax = plt.subplots(dpi=800)

H = list(mean_tokens.values())
data = np.array(H)

hist_options = {
    'bins': 'sturges',
    'cumulative': False,
    'linewidth': 0.75,
    'color': 'gold',
    'alpha': 0.95,
    'histtype': 'stepfilled',
    'stacked': False,
    'density': False,
    'zorder': 5,
    'edgecolor': 'k',
}
ax.hist(data, **hist_options)

# mean number of tokens per language
print(np.mean(data))

ax.set_xlabel(r'tokens', fontsize=15)
ax.set_ylabel(r'frequency', fontsize=15)
plt.rcParams.update({'font.size': 10})
fig.savefig('tokens.pdf', format='pdf', transparent=True, bbox_inches='tight', dpi=800)
plt.show()

In [None]:
## histogram over languages of the number of sentences, saved to sentences.pdf
fig, ax = plt.subplots(dpi=800)

H = list(number_sentences.values())
data = np.array(H)

hist_options = {
    'bins': 'sturges',
    'cumulative': False,
    'linewidth': 0.75,
    'color': 'lime',
    'alpha': 0.95,
    'histtype': 'stepfilled',
    'stacked': False,
    'density': False,
    'zorder': 5,
    'edgecolor': 'k',
}
ax.hist(data, **hist_options)

# mean number of sentences per language
print(np.mean(data))

ax.set_xlabel(r'number of sentences', fontsize=15)
ax.set_ylabel(r'frequency', fontsize=15)
plt.rcParams.update({'font.size': 10})
fig.savefig('sentences.pdf', format='pdf', transparent=True, bbox_inches='tight', dpi=800)
plt.show()

In [None]:
## number of cleaned sentences kept for the language 'ayr'
len(clean_languages['ayr'])

## graphs!

In [None]:
import networkx as nx

In [None]:
## graph of words (word adjacency graph)
## n_parts: number of consecutive chunks the sentences are split into

def GoW(text_clean,n_parts,radius=1):
    """Build one weighted word co-occurrence graph per chunk of the text.

    The sentence indices of ``text_clean`` are split with ``np.array_split``
    into ``n_parts`` consecutive chunks. For each chunk an undirected
    ``nx.Graph`` is built in which two words are linked when they appear in
    the same sentence at a distance of 1 up to ``radius`` positions; the edge
    attribute ``'weight'`` counts how many times the pair was seen.
    Sentences with fewer than two words contribute nothing.

    Returns the list of graphs, one per chunk, in chunk order.
    """
    chunks = np.array_split(range(len(text_clean)), n_parts)

    list_graphs = []
    for chunk in chunks:
        graph = nx.Graph()
        for index in chunk:
            sentence = text_clean[index]
            if len(sentence) <= 1:
                continue
            for r in range(1, radius + 1):
                # pair every word with the word r positions after it
                for left, right in zip(sentence, sentence[r:]):
                    if graph.has_edge(left, right):
                        graph[left][right]['weight'] += 1
                    else:
                        graph.add_edge(left, right, weight=1)
        list_graphs.append(graph)

    return list_graphs

In [None]:
## graphs: iso_code -> {n_parts: list of n_parts graphs}, for n_parts = 1..10
## a language with no more tokens than types keeps an empty dict
graphs = {}

for language in clean_languages:
    graphs[language] = {}
    if mean_tokens[language] <= mean_types[language]:
        continue
    for n_parts in range(1, 11):
        graphs[language][n_parts] = GoW(clean_languages[language], n_parts)

In [None]:
## the list of graphs built for 'arn' when its text is split into 5 parts
graphs['arn'][5]

### algorithms!!!

In [None]:
!pip install python-louvain

In [None]:
import community as community_louvain

In [None]:
## iso_code -> {n_part: {part position: modularity}}, filled in below
modularity_dict = {language: {} for language in graphs}

In [None]:
## modularity of the Louvain partition of every graph that has at least one
## edge; the parts of a text are numbered from 1
for language, graphs_by_parts in graphs.items():
    for n_part, list_graphs in graphs_by_parts.items():
        list_mod = {}
        for position, G in enumerate(list_graphs, start=1):
            if G.number_of_edges() == 0:
                continue
            partition = community_louvain.best_partition(G)
            list_mod[position] = community_louvain.modularity(partition, G)
        modularity_dict[language][n_part] = list_mod

In [None]:
## iso_code -> {n_part: {part position: average core number}}, filled in below
core_dict = {language: {} for language in graphs}

In [None]:
## average core number of every graph; self-loops are removed from the
## stored graphs (in place) before the core numbers are computed
for language, graphs_by_parts in graphs.items():
    for n_part, list_graphs in graphs_by_parts.items():
        list_core = {}
        for position, G in enumerate(list_graphs, start=1):
            G.remove_edges_from(nx.selfloop_edges(G))
            core_numbers = nx.core_number(G)
            list_core[position] = np.mean(list(core_numbers.values()))
        core_dict[language][n_part] = list_core

In [None]:
## iso_code -> len(nx.k_core(G))/len(G) for the whole-text graph, filled in below
k_core_dict = {}

In [None]:
## fraction of the nodes of the whole-text graph (n_parts = 1) that belong to
## the subgraph returned by nx.k_core
for language in graphs.keys():
    # languages with no more tokens than types were given no graphs at all
    whole_text = graphs[language].get(1)
    if not whole_text:
        continue
    G=whole_text[0]
    G.remove_edges_from(nx.selfloop_edges(G))
    # a graph without nodes has no defined ratio (would divide by zero)
    if len(G)==0:
        continue
    k_core_dict[language]=len(nx.k_core(G))/len(G)

In [None]:
## average core number of the whole-text graph against the type-token
## ratio, one point per language; saved to core.pdf
fig, ax = plt.subplots(dpi=800)

# `k_core_size_dict` is not defined anywhere in the visible notebook;
# k_core_dict holds the languages whose whole-text graph was analysed
plotted_languages = list(k_core_dict.keys())
ax.plot([mean_types[L]/mean_tokens[L] for L in plotted_languages],[core_dict[L][1][1] for L in plotted_languages],'H',color='orange',markersize=6,markeredgewidth=0.5,markeredgecolor='k',alpha=0.75,fillstyle='full',clip_on=True)

plt.grid(False)
#plt.legend(loc='best')
plt.ylabel(r'average core number',fontsize=12)
plt.xlabel(r'type-token ratio',fontsize=12)
plt.rcParams.update({'font.size': 10})
plt.savefig('core.pdf', format='pdf', transparent=True, bbox_inches='tight',dpi=800)
plt.show()

In [None]:
## relative size of the k-core of the whole-text graph against the
## type-token ratio, one point per language; saved to corevsG.pdf
fig, ax = plt.subplots(dpi=800)

# `k_core_size_dict` is not defined anywhere in the visible notebook;
# k_core_dict holds exactly the values plotted on the y axis
plotted_languages = list(k_core_dict.keys())
ax.plot([mean_types[L]/mean_tokens[L] for L in plotted_languages],[k_core_dict[L] for L in plotted_languages],'D',color='lime',markersize=6,markeredgewidth=0.5,markeredgecolor='k',alpha=0.75,fillstyle='full',clip_on=True)

plt.grid(False)
#plt.legend(loc='best')
plt.ylabel(r'$|k-core|/|G|$',fontsize=9)
plt.xlabel(r'type-token ratio',fontsize=9)
plt.rcParams.update({'font.size': 10})
plt.savefig('corevsG.pdf', format='pdf', transparent=True, bbox_inches='tight',dpi=800)
plt.show()