# Readability. Extra columns
## Add extra columns

In [None]:
import functools
import os
import re

import pandas as pd

from promovolt.readability import sentencesCounter, wordsCounter
from promovolt.syllables import wordSyllablesCounterEn

In [None]:
# Load the base dataset into `df`; the later cells add the extra columns to it.
# NOTE(review): unpickling can execute arbitrary code -- confirm this file
# comes from a trusted source.
df = pd.read_pickle('readability_20201124.pkl')  # 465 examples

### Add `asl_flesch` column

In [None]:
def asl_flesch(text):
    """Return the average sentence length (words per sentence) of ``text``.

    The sentence count reported by ``sentencesCounter`` is increased by one
    for every ':' and ';' character found in ``text``.

    Returns ``None`` when the resulting sentence count is zero.
    """
    word_total, _ = wordsCounter(text, 'en')
    sentence_total = sentencesCounter(text, 'en')[0]

    # Each colon or semicolon counts as one additional sentence boundary.
    sentence_total += sum(1 for char in text if char in (':', ';'))

    if sentence_total == 0:
        return None
    return word_total / sentence_total

In [None]:
# Compute asl_flesch for the text of every row and store it as a new column.
df['asl_flesch'] = df['text'].apply(asl_flesch)

### Add `asw_flesch` column

In [None]:
def asw_flesch(text):
    """Return the average number of syllables per word of ``text``.

    Per-word syllable rules:
      * words of three characters or fewer count as one syllable;
      * longer words ending in 'ed' or 'es' count one syllable less than
        ``wordSyllablesCounterEn`` reports;
      * every other word uses the ``wordSyllablesCounterEn`` value as is.

    Returns ``None`` when ``wordsCounter`` reports zero words.
    """
    word_total, tokens = wordsCounter(text, 'en')

    def syllables_in(token):
        # Short words are never passed to the syllable counter.
        if len(token) <= 3:
            return 1
        counted = wordSyllablesCounterEn(token)
        # The 'ed' / 'es' suffix is discounted by one syllable.
        return counted - 1 if token.endswith(('ed', 'es')) else counted

    syllable_total = sum(syllables_in(token) for token in tokens)

    if word_total == 0:
        return None
    return syllable_total / word_total

In [None]:
# Compute asw_flesch for the text of every row and store it as a new column.
df['asw_flesch'] = df['text'].apply(asw_flesch)

### Add `asl_fog` column

In [None]:
def asl_fog(text):
    """Return the average sentence length (words per sentence) of ``text``.

    Uses the counts from ``wordsCounter`` and ``sentencesCounter`` without
    any adjustment. Returns ``None`` when the sentence count is zero.
    """
    word_total, _ = wordsCounter(text, 'en')
    sentence_total, _ = sentencesCounter(text, 'en')

    return word_total / sentence_total if sentence_total != 0 else None

In [None]:
# Compute asl_fog for the text of every row and store it as a new column.
df['asl_fog'] = df['text'].apply(asl_fog)

### Add `ppw_fog` column

In [None]:
@functools.lru_cache(maxsize=None)
def _load_compound_words_en(path):
    """Return the lines of the compound-word list at ``path`` as a frozenset.

    Cached per path so the file is read once rather than once per text, and
    returned as a set so membership tests are O(1).
    """
    with open(path, 'r', encoding='utf-8') as f:
        return frozenset(f.read().splitlines())


def _is_sentence_initial_only(word, sentences):
    """Return True if ``word`` opens at least one sentence and never occurs
    anywhere else (as a whole word) in any of ``sentences``."""
    # re.escape: the token is data, not a pattern -- metacharacters in it
    # must be matched literally instead of raising or mis-matching.
    pattern = re.compile(r"\b" + re.escape(word) + r"\b")

    opens_a_sentence = False
    for sentence in sentences:
        if not pattern.search(sentence):
            continue
        if not sentence.startswith(word):
            # The word appears, but not in first position.
            return False
        opens_a_sentence = True
        # A further occurrence after the leading one is an inside occurrence.
        if pattern.search(sentence[len(word):]):
            return False
    return opens_a_sentence


def ppw_fog(text):
    """Return the percentage of "hard" words in ``text`` (fog index input).

    A word counts as hard when all of the following hold:
      * ``wordSyllablesCounterEn`` reports three or more syllables, except
        that a word with exactly three syllables ending in 'ed' or 'es' is
        skipped;
      * it contains no hyphen;
      * its lowercase form is not listed in
        ``promovolt/resources/compound_words_en.txt``;
      * it does not start with an uppercase letter, or, if it does, it only
        ever appears at the very start of a sentence (a capitalised word
        seen inside a sentence is skipped).

    Returns ``None`` when ``wordsCounter`` reports zero words.

    Raises:
        OSError: if the compound-word list cannot be opened.
    """
    # Resolved against the current working directory, as before.
    compound_words = _load_compound_words_en(
        os.path.abspath('promovolt/resources/compound_words_en.txt'))

    total_words, words = wordsCounter(text, 'en')
    _, sentences = sentencesCounter(text, 'en')

    total_hard_words = 0
    for word in words:
        syllables = wordSyllablesCounterEn(word)  # computed once per word
        if syllables < 3:
            continue
        if syllables == 3 and word.endswith(('ed', 'es')):
            continue
        if '-' in word or word.lower() in compound_words:
            continue
        if not word[0].isupper() or _is_sentence_initial_only(word, sentences):
            total_hard_words += 1

    if total_words == 0:
        return None
    return 100 * (total_hard_words / total_words)

In [None]:
# Compute ppw_fog for the text of every row and store it as a new column.
df['ppw_fog'] = df['text'].apply(ppw_fog)

### Save to pickle file

In [None]:
# Persist the frame, including the derived columns added above, so the
# analysis below can start from this file.
df.to_pickle('readability_extra_20201124.pkl')

## Pearson correlation coefficient (PCC)

In [None]:
# Reload the frame written by the "Save to pickle file" cell, so this section
# can run without recomputing the extra columns.
df = pd.read_pickle('readability_extra_20201124.pkl')

In [None]:
# Pearson correlation of each numeric/boolean column with `cvr`.
# Non-numeric columns such as `text` are dropped explicitly: since pandas 2.0
# DataFrame.corr no longer discards them silently and raises instead.
# select_dtypes is used rather than numeric_only= so older pandas also works.
df.select_dtypes(include=['number', 'bool']).corr(method='pearson').cvr