In [1]:
from urllib.parse import urlencode
from urllib.request import Request, urlopen
import json
import os
import spacy
import re
from os import listdir
from os.path import isfile, join
from tqdm import tqdm_notebook
import os.path
import time
import pyphen
def loadDocsFromFolder(folder=""):
    """Read every file found under ``folder`` (recursively) as UTF-8 text.

    :param folder: root directory handed to ``os.walk``.
    :return: dict mapping each file's base name to its full text. Keys are
        bare file names, so files with the same name in different
        sub-folders overwrite each other (the last one visited wins).
        A folder that does not exist or holds no files yields ``{}``.
    """
    allDocs = {}
    for root, subFolders, files in os.walk(folder):
        for filename in files:
            # The context manager closes each handle right after reading,
            # instead of leaving one open descriptor per document.
            with open(os.path.join(root, filename), 'r', encoding="utf8") as handle:
                allDocs[filename] = handle.read()
    return allDocs
def removeCitations(text=""):
    """Return ``text`` with bracketed numeric markers such as ``[12]`` removed.

    Only square brackets that enclose one or more digits are dropped; the
    surrounding whitespace is left as it was.
    """
    citationPattern = re.compile(r'\[\d+\]')
    return citationPattern.sub('', text)

class Tokenizer:
    """Wrapper around a spaCy pipeline (tokens, sentences) and a pyphen
    hyphenator (syllables)."""

    def __init__(self, lang="en", spacy_model=None):
        """
        :param lang: language code passed to ``pyphen.Pyphen``; unless
            ``spacy_model`` is given, its base language part (the text
            before the first ``_`` or ``-``) is also the spaCy model name.
        :param spacy_model: optional explicit name for ``spacy.load``.
        """
        # Previously the spaCy model was always 'en', whatever ``lang`` said;
        # the default lang="en" still resolves to 'en'.
        self.spacy_nlp = spacy.load(self._spacyModelFor(lang, spacy_model))
        self.syllableSplitter = pyphen.Pyphen(lang=lang)

    @staticmethod
    def _spacyModelFor(lang, spacy_model=None):
        """Pick the spaCy model name: the explicit one, else the base language of ``lang``."""
        if spacy_model is not None:
            return spacy_model
        return lang.replace("-", "_").split("_")[0]

    def tokenize(self, input):
        """Return the token texts of ``input``, skipping single-space tokens."""
        return [x.text for x in self.spacy_nlp.tokenizer(input) if x.text != " "]

    def splitIntoSyllables(self, token):
        """Return the pieces of ``token`` between pyphen's hyphenation points.

        The hyphenated form is split on "-", so a token that already
        contains "-" is split at those characters as well.
        """
        return self.syllableSplitter.inserted(token).split("-")

    def split_sentences(self, input):
        """Return the sentence texts of ``input``, skipping single-space sentences."""
        return [x.text for x in self.spacy_nlp(input).sents if x.text != " "]
# Shared module-level instance built with the default lang="en";
# computeStatsForDoc reads this global.
tokenizer = Tokenizer()

In [4]:
def computeStatsForDoc(doc):
    """Compute surface readability statistics for one document.

    The text is split into sentences and then tokens with the module-level
    ``tokenizer``; every token it returns counts as a word.

    :param doc: document text.
    :return: dict with the raw counts (``words``, ``sentences``,
        ``characters``, ``syllables``, ``uniqueWords``), the derived ratios
        (each 0 when its denominator is 0) and the ``flesch-kincaid`` score
        computed from ``wordsPerSentence`` and ``syllablesPerWord``.
    """
    wordCount = 0
    sentenceCount = 0
    charCount = 0
    syllableCount = 0
    vocabulary = set()

    for sentence in tokenizer.split_sentences(doc):
        sentenceCount += 1
        for token in tokenizer.tokenize(sentence):
            wordCount += 1
            syllableCount += len(tokenizer.splitIntoSyllables(token))
            charCount += len(token)
            vocabulary.add(token)

    def perWord(total):
        # Guard against documents without any tokens.
        return total / wordCount if wordCount else 0

    stats = {
        "words": wordCount,
        "sentences": sentenceCount,
        "characters": charCount,
        "syllables": syllableCount,
        "charactersPerWord": perWord(charCount),
        "wordsPerSentence": wordCount / sentenceCount if sentenceCount else 0,
        "uniqueWords": len(vocabulary),
        "uniqueWordsRatio": perWord(len(vocabulary)),
        "syllablesPerWord": perWord(syllableCount),
    }
    stats["flesch-kincaid"] = (
        0.39 * stats["wordsPerSentence"] + 11.8 * stats["syllablesPerWord"] - 15.59
    )
    return stats


In [5]:
def computeStatsForFolder(folder):
    """Average the per-document statistics over every file under ``folder``.

    Prints ``folder``, loads its documents with ``loadDocsFromFolder`` and
    runs ``computeStatsForDoc`` on each one. Every statistic, ratios
    included, is summed per document and divided by the number of
    documents, so ratios are means of per-document ratios rather than
    ratios of the folder totals.

    :param folder: directory whose files are analysed.
    :return: dict with the keys of ``computeStatsForDoc``, each holding the
        mean across documents; an empty dict when no document is found
        (this used to fail with a ``TypeError``).
    """
    print(folder)
    docs = loadDocsFromFolder(folder)

    # Sum into a separate dict so no per-document result is mutated.
    totals = {}
    for docName in tqdm_notebook(docs):
        for key, value in computeStatsForDoc(docs[docName]).items():
            totals[key] = totals[key] + value if key in totals else value

    # With no documents ``totals`` is empty, so nothing is divided.
    docCount = len(docs)
    return {key: total / docCount for key, total in totals.items()}

In [6]:
# Mean per-document statistics for the files under the EN1 folder.
computeStatsForFolder("../data/readabilityCorpora/wizenoze/raw/en/EN1/")

../data/readabilityCorpora/wizenoze/raw/en/EN1/


HBox(children=(IntProgress(value=0, max=151), HTML(value='')))




{'words': 34.66887417218543,
 'sentences': 3.033112582781457,
 'characters': 142.9337748344371,
 'syllables': 46.94039735099338,
 'charactersPerWord': 4.307414937057523,
 'wordsPerSentence': 11.33532008830022,
 'uniqueWords': 27.258278145695364,
 'uniqueWordsRatio': 0.8631695816063625,
 'syllablesPerWord': 1.4041691750728198,
 'flesch-kincaid': 5.3999711002963675}

In [7]:
# Mean per-document statistics for the files under the EN2 folder.
computeStatsForFolder("../data/readabilityCorpora/wizenoze/raw/en/EN2/")

../data/readabilityCorpora/wizenoze/raw/en/EN2/


HBox(children=(IntProgress(value=0, max=43), HTML(value='')))




{'words': 128.27906976744185,
 'sentences': 10.232558139534884,
 'characters': 538.7209302325581,
 'syllables': 182.8372093023256,
 'charactersPerWord': 4.156358809477928,
 'wordsPerSentence': 14.332236101413676,
 'uniqueWords': 78.09302325581395,
 'uniqueWordsRatio': 0.790096803203777,
 'syllablesPerWord': 1.407813321137804,
 'flesch-kincaid': 6.611769268977423}

In [8]:
# Mean per-document statistics for the files under the EN3 folder.
computeStatsForFolder("../data/readabilityCorpora/wizenoze/raw/en/EN3/")

../data/readabilityCorpora/wizenoze/raw/en/EN3/


HBox(children=(IntProgress(value=0, max=94), HTML(value='')))




{'words': 67.0,
 'sentences': 4.9787234042553195,
 'characters': 286.1063829787234,
 'syllables': 95.59574468085107,
 'charactersPerWord': 4.353386921583693,
 'wordsPerSentence': 13.62675155813454,
 'uniqueWords': 47.797872340425535,
 'uniqueWordsRatio': 0.7867133919875493,
 'syllablesPerWord': 1.4350187981930251,
 'flesch-kincaid': 6.657654926350162}

In [9]:
# Mean per-document statistics for the files under the EN4 folder.
computeStatsForFolder("../data/readabilityCorpora/wizenoze/raw/en/EN4/")

../data/readabilityCorpora/wizenoze/raw/en/EN4/


HBox(children=(IntProgress(value=0, max=440), HTML(value='')))




{'words': 266.5977272727273,
 'sentences': 16.62272727272727,
 'characters': 1196.3045454545454,
 'syllables': 403.85,
 'charactersPerWord': 4.477811905010399,
 'wordsPerSentence': 16.055585334905828,
 'uniqueWords': 142.08636363636364,
 'uniqueWordsRatio': 0.6479923348642399,
 'syllablesPerWord': 1.5235489140723022,
 'flesch-kincaid': 8.649555466666436}

In [10]:
# Mean per-document statistics for the files under the EN5 folder.
computeStatsForFolder("../data/readabilityCorpora/wizenoze/raw/en/EN5/")

../data/readabilityCorpora/wizenoze/raw/en/EN5/


HBox(children=(IntProgress(value=0, max=267), HTML(value='')))




{'words': 801.8352059925094,
 'sentences': 35.87265917602996,
 'characters': 3630.4644194756556,
 'syllables': 1221.7191011235955,
 'charactersPerWord': 4.5114086610034825,
 'wordsPerSentence': 21.33254436344231,
 'uniqueWords': 328.6367041198502,
 'uniqueWordsRatio': 0.5521822430529146,
 'syllablesPerWord': 1.5282446345148273,
 'flesch-kincaid': 10.762978989017464}

In [11]:
# Mean per-document statistics for the files under the NL1 folder.
# NOTE(review): the shared `tokenizer` is built by Tokenizer() with lang="en",
# so this "du" corpus (presumably Dutch) is processed with English settings —
# confirm this is intended.
computeStatsForFolder("../data/readabilityCorpora/wizenoze/raw/du/NL1/")

../data/readabilityCorpora/wizenoze/raw/du/NL1/


HBox(children=(IntProgress(value=0, max=21), HTML(value='')))




{'words': 6.0,
 'sentences': 1.2857142857142858,
 'characters': 22.904761904761905,
 'syllables': 7.380952380952381,
 'charactersPerWord': 4.297496776488373,
 'wordsPerSentence': 3.880952380952381,
 'uniqueWords': 5.380952380952381,
 'uniqueWordsRatio': 0.9819927971188476,
 'syllablesPerWord': 1.3577564359076963,
 'flesch-kincaid': 1.9450973722822469}

In [12]:
# Mean per-document statistics for the files under the NL2 folder.
# NOTE(review): the shared `tokenizer` is built by Tokenizer() with lang="en",
# so this "du" corpus (presumably Dutch) is processed with English settings —
# confirm this is intended.
computeStatsForFolder("../data/readabilityCorpora/wizenoze/raw/du/NL2/")

../data/readabilityCorpora/wizenoze/raw/du/NL2/


HBox(children=(IntProgress(value=0, max=58), HTML(value='')))




{'words': 135.55172413793105,
 'sentences': 13.775862068965518,
 'characters': 545.948275862069,
 'syllables': 171.06896551724137,
 'charactersPerWord': 4.655579168067991,
 'wordsPerSentence': 8.543603132007906,
 'uniqueWords': 61.327586206896555,
 'uniqueWordsRatio': 0.7740024664242019,
 'syllablesPerWord': 1.3454759534991498,
 'flesch-kincaid': 3.6186214727730497}

In [13]:
# Mean per-document statistics for the files under the NL3 folder.
# NOTE(review): the shared `tokenizer` is built by Tokenizer() with lang="en",
# so this "du" corpus (presumably Dutch) is processed with English settings —
# confirm this is intended.
computeStatsForFolder("../data/readabilityCorpora/wizenoze/raw/du/NL3/")

../data/readabilityCorpora/wizenoze/raw/du/NL3/


HBox(children=(IntProgress(value=0, max=248), HTML(value='')))




{'words': 95.43951612903226,
 'sentences': 9.298387096774194,
 'characters': 404.0483870967742,
 'syllables': 126.95564516129032,
 'charactersPerWord': 5.101676844019246,
 'wordsPerSentence': 8.078810533166441,
 'uniqueWords': 52.73790322580645,
 'uniqueWordsRatio': 0.8157954283984014,
 'syllablesPerWord': 1.4884258677954736,
 'flesch-kincaid': 5.124161347921511}

In [14]:
# Mean per-document statistics for the files under the NL4 folder.
# NOTE(review): the shared `tokenizer` is built by Tokenizer() with lang="en",
# so this "du" corpus (presumably Dutch) is processed with English settings —
# confirm this is intended.
computeStatsForFolder("../data/readabilityCorpora/wizenoze/raw/du/NL4/")

../data/readabilityCorpora/wizenoze/raw/du/NL4/


HBox(children=(IntProgress(value=0, max=426), HTML(value='')))




{'words': 244.42253521126761,
 'sentences': 22.143192488262912,
 'characters': 1108.4530516431926,
 'syllables': 344.5399061032864,
 'charactersPerWord': 4.574794159842148,
 'wordsPerSentence': 11.659315965377424,
 'uniqueWords': 135.85915492957747,
 'uniqueWordsRatio': 0.6195269439610605,
 'syllablesPerWord': 1.421035898419989,
 'flesch-kincaid': 5.725356827853065}

In [15]:
# Mean per-document statistics for the files under the NL5 folder.
# NOTE(review): the shared `tokenizer` is built by Tokenizer() with lang="en",
# so this "du" corpus (presumably Dutch) is processed with English settings —
# confirm this is intended.
computeStatsForFolder("../data/readabilityCorpora/wizenoze/raw/du/NL5/")

../data/readabilityCorpora/wizenoze/raw/du/NL5/


HBox(children=(IntProgress(value=0, max=242), HTML(value='')))




{'words': 419.2809917355372,
 'sentences': 34.396694214876035,
 'characters': 1976.9256198347107,
 'syllables': 608.0826446280992,
 'charactersPerWord': 4.890408188209587,
 'wordsPerSentence': 11.81659449712277,
 'uniqueWords': 199.73553719008265,
 'uniqueWordsRatio': 0.5597424165306824,
 'syllablesPerWord': 1.4975511885018227,
 'flesch-kincaid': 6.689575878199379}