In [None]:
import nltk
import pandas as pd
import os
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from shutil import copyfile
from nltk.tokenize import SyllableTokenizer
import numpy as np
import time
import warnings
import random
from multiprocessing import Pool
warnings.filterwarnings("error")
import csv

# 0. Functions

In [None]:
# Character classes handed to NLTK's SyllableTokenizer as its sonority hierarchy.
# The first entry is used as the vowel inventory and every later entry is pooled
# into the consonant inventory when words are analysed below, so a character that
# appears in none of these strings makes the whole word be discarded.
SSP_hierarchy = [
    "aeiouyàáâäæãåāąèéêëēėęîïíīįìôöòóœøōõûüùúūůÿŷűőŵỳẁěýǫ", # vowels
    "ŭwł", # approximants
    "lrř", # liquids
    "mnñńŋň", # nasals
    "ßzvsfçćśŝĉĥhĵšžðđ", # fricatives
    "xjźżĝč", # affricates
    "bcdgtkpqþďť", # occlusives (stops)
]


def paralelized(text):
    """Compute per-word letter and syllable statistics (worker for ``Pool.map``).

    The word is lower-cased, syllabified with NLTK's ``SyllableTokenizer`` using
    ``SSP_hierarchy``, and its characters are counted as vowels (first hierarchy
    group) or consonants (all remaining groups).

    Parameters
    ----------
    text : str
        A single token.

    Returns
    -------
    list or None
        ``[token, n_consonants, n_vowels, n_syllables, n_letters,
        mean_syllable_length, syllable_lengths]`` where ``token`` is the
        lower-cased word and ``syllable_lengths`` is a list with one entry per
        syllable.  ``None`` is returned when the tokenizer raises a
        ``UserWarning`` (warnings are promoted to errors at module level), when
        any character is neither a known vowel nor a known consonant, or when
        no syllable was produced.
    """
    text = text.lower()

    vowel_chars = SSP_hierarchy[0]
    consonant_chars = "".join(SSP_hierarchy[1:])

    try:
        tokenizer = SyllableTokenizer(sonority_hierarchy=SSP_hierarchy)
        syllables = tokenizer.tokenize(text)
    except UserWarning:
        # The word could not be syllabified cleanly, so there are no syllable
        # statistics to report.  Returning here (instead of falling through)
        # avoids touching names that were never assigned.
        return None

    num_syllabes = len(syllables)
    if num_syllabes == 0:
        # Nothing to average over; treat like any other rejected word.
        return None
    sillenghts = [len(syllable) for syllable in syllables]

    total_vowels = sum(1 for char in text if char in vowel_chars)
    total_consonant = sum(1 for char in text if char in consonant_chars)
    total_letters = total_consonant + total_vowels

    # Only keep words made up exclusively of characters from the hierarchy.
    if total_letters != len(text):
        return None

    return [text, total_consonant, total_vowels, num_syllabes, total_letters,
            total_letters / num_syllabes, sillenghts]

# 1. Menzerath-Altmann Class

In [None]:
class CorpusAnalysis:
    """Accumulate word/syllable statistics over books and derive summaries.

    Books are added one at a time with :meth:`add_corpus_analysis`.  Once all
    books are in, :meth:`menzerath_altmann` computes the aggregate tables and
    :meth:`save_results` writes them (plus a plot) under ``results/<lang>/``.

    Attributes set in ``__init__``:
        flag: True until the first book's feature table was saved as a sample.
        consonants, vowels, words, syllabes, books: running totals.
        result_path: output directory, ``"results/<lang>/"``.
        result_menzerathbyBook: path of the per-book file appended to by
            :meth:`add_corpus_analysis`.
        p_consonant, p_vowel, p_silence: symbol probabilities, filled by
            :meth:`menzerath_altmann`.
        menzerath, model: result tables (DataFrames once computed).
        df: list of ``(syllables, mean_syllable_length)`` tuples, one per word.
        syllabeslength: maps a syllable count to the list of per-syllable
            length lists of all words with that many syllables.
        p_syllabeslength: maps a syllable count to the mean length at each
            syllable position.
    """

    def __init__(self, lang):
        """Initialise empty counters and (re)create the per-book output file.

        Parameters
        ----------
        lang : str
            Language identifier used to build the output paths.
        """
        self.flag = True
        self.consonants = 0
        self.vowels = 0
        self.words = 0
        self.syllabes = 0
        self.books = 0

        self.result_path = "results/" + lang + "/"
        self.result_menzerathbyBook = self.result_path + "MALbyBook_" + lang + ".csv"

        # The directory must exist before the file can be opened for writing.
        os.makedirs(os.path.dirname(self.result_path), exist_ok=True)
        # Create the per-book file, or empty it if it is left over from a
        # previous run; rows are appended later.
        with open(self.result_menzerathbyBook, "w", newline=""):
            pass

        self.p_consonant = 0.0
        self.p_vowel = 0.0
        self.p_silence = 0.0
        self.menzerath = []
        self.model = []
        self.df = []
        # Keys 0..20 are pre-created so the saved cluster-size table always
        # contains these rows; longer words get their key on demand.
        self.syllabeslength = {n: [] for n in range(21)}
        self.p_syllabeslength = {n: [] for n in range(21)}

    def save_results(self, out_path):
        """Write all computed tables and the plot to ``self.result_path``.

        :meth:`menzerath_altmann` must have been called first, because the
        Menzerath table and the model table are written with ``to_csv``.

        Parameters
        ----------
        out_path : str
            Suffix inserted into every output file name.
        """
        os.makedirs(os.path.dirname(self.result_path), exist_ok=True)

        self.menzerath.to_csv(self.result_path + "menzerath_altmann_" + out_path + ".csv")

        # Avoid a ZeroDivisionError when no vowel was ever counted.
        syllabes_per_vowel = self.syllabes / self.vowels if self.vowels else float("nan")
        chars_list = [('Total used books', self.books),
                      ('Total used words', self.words),
                      ('Total syllabes', self.syllabes),
                      ('Total consonants', self.consonants),
                      ('Total vowels', self.vowels),
                      ('P(vowel)', self.p_vowel),
                      ('P(consonant)', self.p_consonant),
                      ('P(silence)', self.p_silence),
                      ('Ratio syllabes/vowels', syllabes_per_vowel)
                     ]
        # Save characteristics
        pd.DataFrame(chars_list,
                     columns=['feature', 'value']).to_csv(self.result_path + "characteristics_" + out_path + ".csv")

        # Save HMM model
        self.model.to_csv(self.result_path + "HMM_menzerath_" + out_path + ".csv")

        # Save plot
        self.plot(out_path)

        # Save cluster size probabilities
        pd.DataFrame.from_dict(data=self.p_syllabeslength,
                               orient='index').to_csv(self.result_path + 'clustersizes_' + out_path + '.csv', header=False)

    def add_corpus_analysis(self, token_list, language, method="SSP", processes=9):
        """Analyse one book and fold its statistics into the running totals.

        Parameters
        ----------
        token_list : list of str
            Tokens of the book; entries for which ``str.isalpha()`` is false
            are ignored.
        language : str
            Language identifier; resets ``self.result_path`` to
            ``"results/<language>/"``.
        method : str, optional
            Accepted for interface compatibility; not used by this method.
        processes : int, optional
            Number of worker processes used to analyse the words.
        """
        self.result_path = "results/" + language + "/"
        words = [word for word in token_list if word.isalpha()]

        with Pool(processes) as pool:
            full_features_list = pool.map(paralelized, words)

        # Words the worker rejected come back as None and are dropped.
        features_list = [features for features in full_features_list if features is not None]
        del full_features_list

        # Group the per-syllable lengths by the word's syllable count.
        # setdefault keeps words with more syllables than the pre-created keys
        # from raising KeyError.
        for features in features_list:
            self.syllabeslength.setdefault(features[3], []).append(features[-1])

        featuresDF = pd.DataFrame(features_list, columns=['token', 'consonant', 'vowels', 'syllabes',
                                                          'letters', 'mean_syllabe_length', 'sillengths'])

        # Save a sample the first time of each language
        if self.flag:
            os.makedirs(os.path.dirname(self.result_path), exist_ok=True)
            featuresDF.to_csv(self.result_path + "SAMPLE.csv", index=False)
            self.flag = False

        # Update totals of consonants, vowels, words, syllables and books
        self.consonants += int(featuresDF['consonant'].sum())
        self.vowels += int(featuresDF['vowels'].sum())
        self.words += len(featuresDF)
        self.syllabes += int(featuresDF['syllabes'].sum())
        self.books += 1

        # Keep (syllables, mean syllable length) per word for the global table
        self.df.extend([tuple(x) for x in featuresDF[['syllabes', 'mean_syllabe_length']].to_numpy(dtype='float16')])

        # Menzerath row for this book: entry i holds the mean syllable length
        # of words with i+1 syllables, 0 where the book has no such word.
        # Reindexing keeps each value in the column of its syllable count even
        # when some counts are absent from the book.
        if featuresDF.empty:
            MenzerathBook = np.zeros(20)
        else:
            per_length = featuresDF.groupby("syllabes")["mean_syllabe_length"].mean()
            width = max(20, int(per_length.index.max()))
            MenzerathBook = per_length.reindex(range(1, width + 1), fill_value=0.0).to_numpy(dtype=float)

        # Each book is appended as one line; values are separated by the
        # `newline` string (a single space) because the array is 1-D.
        with open(self.result_menzerathbyBook, "ab") as f:
            f.write(b"\n")
            np.savetxt(f, MenzerathBook, delimiter=",", newline=" ", fmt='%1.4f')

    def menzerath_altmann(self):
        """Compute symbol probabilities, positional means and the Menzerath table.

        Fills ``p_consonant``, ``p_vowel`` and ``p_silence`` (each word
        contributes one "silence" symbol), ``p_syllabeslength``, and
        ``menzerath`` (mean, std and count of the mean syllable length grouped
        by syllable count), then calls :meth:`HMM_model`.
        """
        total_symbols = self.consonants + self.vowels + self.words
        if total_symbols:
            self.p_consonant = self.consonants / total_symbols
            self.p_vowel = self.vowels / total_symbols
            self.p_silence = self.words / total_symbols
        else:
            # Nothing was accumulated; keep well-defined values instead of
            # raising ZeroDivisionError.
            self.p_consonant = self.p_vowel = self.p_silence = 0.0

        # Iterate over the real keys: the dictionary may contain syllable
        # counts beyond the pre-created range.
        for key, samples in self.syllabeslength.items():
            if len(samples) > 0:
                # Rows are words, columns are syllable positions.
                self.p_syllabeslength[key] = np.sum(np.array(samples), 0) / len(samples)

        # Compute Menzerath
        self.menzerath = pd.DataFrame.from_records(
            self.df, columns=['syllabes', 'mean_syllabe_length']
        ).groupby("syllabes").agg({'mean_syllabe_length': ['mean', 'std', 'count']})
        self.HMM_model()

    def HMM_model(self):
        """Compute the model curve from ``p_consonant``.

        For every word length ``m`` in 1..19 the value is
        ``p / (1 - p) / m + 1 / (1 - p)`` with ``p = self.p_consonant``.  The
        result is stored in ``self.model`` as a DataFrame with the columns
        ``feature`` (``m``) and ``value``.
        """
        p = self.p_consonant
        model = [(m, p / (1 - p) / m + 1 / (1 - p)) for m in range(1, 20)]
        self.model = pd.DataFrame(model, columns=['feature', 'value'])

    def plot(self, out_path):
        """Plot the corpus curve against the model curve and save it as PDF.

        The figure is written twice: into ``self.result_path`` and directly
        into ``results/``.

        Parameters
        ----------
        out_path : str
            Suffix inserted into the output file names.
        """
        fig, ax = plt.subplots()
        ax.plot(self.menzerath.index.values, self.menzerath.mean_syllabe_length["mean"].values,
                '--o', lw=2, ms=9, label="Corpus")
        ax.plot(self.model.feature, self.model.value, '--o', lw=2, ms=9, label="HMM model")

        ax.set_xlabel("word length [Syllables]")
        ax.set_ylabel("mean syllable length [Characters]")
        ax.legend()
        fig.savefig(self.result_path + "menzerath_altmann_" + out_path + ".pdf")
        fig.savefig("results/menzerath_altmann_" + out_path + ".pdf")
        plt.close('all')

        
def gutenberg_corpus(metadata, data_path, lang, sizelimit=None, method="SSP", max_books=2500):
    """Analyse every token file of one language and save the results.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Table with at least the columns ``type``, ``language`` and ``id``.
        Rows are kept when ``type == "Text"`` and ``language`` equals the
        string ``"['<lang>']"``.
    data_path : str
        Prefix prepended to ``<id>_tokens.txt`` to locate each token file.
    lang : str
        Language identifier to process.
    sizelimit : int or None, optional
        Upper bound on the number of candidate files; ``None`` means all.
    method : str, optional
        Forwarded unchanged to ``CorpusAnalysis.add_corpus_analysis``.
    max_books : int or None, optional
        Stop once this many books were analysed; ``None`` disables the cap.

    Files that cannot be read or analysed are reported and skipped.  When no
    word at all was accumulated, nothing is computed or saved.
    """
    analysis = CorpusAnalysis(lang)
    text_metadata = metadata[metadata.type == "Text"]
    language_tag = "['" + lang + "']"
    lista_ficheros = [idname + "_tokens.txt"
                      for idname in text_metadata[text_metadata.language == language_tag].id]

    if lang == "en":
        # Fixed-seed shuffle using a private generator, so the module-level
        # random state is left untouched.
        random.Random(2).shuffle(lista_ficheros)

    for index, file in enumerate(lista_ficheros[0:sizelimit]):
        if index % 10 == 0:
            print("Language:" + lang + " ,quedan: " + str(len(lista_ficheros) - index))

        huge_list = None
        try:
            with open(data_path + file, 'r') as token_file:
                huge_list = [token for line in token_file for token in line.split()]
        except OSError:
            print("File " + data_path + file + " not found")
        except UnicodeError as error:
            print("File " + data_path + file + " could not be decoded: " + repr(error))

        # Skip unreadable and empty files instead of counting them as books.
        if huge_list:
            started = time.time()
            try:
                analysis.add_corpus_analysis(huge_list, lang, method)
            except Exception as error:
                # Best effort: report the real cause and go on with the next book.
                print("File " + data_path + file + " could not be analysed: " + repr(error))
            else:
                if index % 10 == 0:
                    print(time.time() - started)

        if lang == "en":
            print(analysis.books)

        if max_books is not None and analysis.books >= max_books:  # Max number of books
            break

    if analysis.words == 0:
        # The summary statistics divide by the accumulated totals, so there is
        # nothing meaningful to compute or save.
        print("Language:" + lang + " ,no words could be analysed; nothing saved")
        return

    analysis.menzerath_altmann()
    analysis.save_results(lang)



# 2. Languages and dataset

In [None]:
# Location of the tokenised books; each file name is appended to this prefix.
data_path = "/path/to/gutenberg/data/tokens/"

# Metadata table used to select the books that belong to each language.
metadata = pd.read_csv("/path/from/metadata.csv")

# Language identifiers to process, matched against the metadata `language` column.
listaidiomas = [
    "en", "fr", "fi", "de", "it", "nl", "es",
    "pt", "hu", "sv", "eo", "la", "da", "tl",
    "ca", "pl", "no", "cs", "cy", "is", "af",
]

# 3. Main analysis

In [None]:
# Run the complete analysis for every language, last list entry first.
for lang in reversed(listaidiomas):
    banner = "Computing language: ****    " + lang + "  ****"
    print(banner)
    gutenberg_corpus(metadata, data_path, lang, method="SSP")