# Readability

In [None]:
import os

import pandas as pd
from pandas_profiling import ProfileReport

from promovolt.readability import ari, bormuth, colemanLiau, daleChall, flesch, fleschKincaid, fog, lix, rix, smog, strain
from promovolt.readability import readabilityValuesInterpreter

In [None]:
# Load the input DataFrame from a pickle file in the working directory.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle('clustering_20201116.pkl')  # 465 examples

In [None]:
# Expose the Lancaster-based label under the generic name `label`, then keep
# only the columns this notebook works with.
df['label'] = df['label_lancaster']
df = df[[
    'text', 'language',
    'views', 'conversions', 'cvr',
    'label',
]]

## Step 1: Calculate readability

In [None]:
# Language and localization codes handed to the promovolt readability helpers below.
language = localization = 'en'

# Absolute paths (resolved against the current working directory) of the
# English resource files used by the readability functions.
pathAbbreviationsDictEn = os.path.abspath('promovolt/resources/abbreviations_en.txt')
pathCompoundWordsDictEn = os.path.abspath('promovolt/resources/compound_words_en.txt')
pathDaleChallListEn = os.path.abspath('promovolt/resources/dale-chall_simple_words_en-1.0.txt')

### Add columns with readability features 

In [None]:
def dale_chall_score(text):
    """Return item 1 of ``daleChall``'s result for ``text``.

    The word list at ``pathDaleChallListEn`` is passed through unchanged.
    NOTE(review): index 1 is presumably the score; confirm against the
    return layout of promovolt's ``daleChall``.
    """
    dale_chall_result = daleChall(text, pathDaleChallListEn)
    return dale_chall_result[1]

def fog_score(text):
    """Return item 0 of ``fog``'s result for ``text``.

    ``fog`` is called with the module-level ``language`` and the compound-word
    file at ``pathCompoundWordsDictEn``.
    NOTE(review): index 0 is presumably the score; confirm against promovolt.
    """
    fog_result = fog(text, language, pathCompoundWordsDictEn)
    return fog_result[0]

def lix_score(text):
    """Return item 1 of ``lix``'s result for ``text`` in the module-level ``language``.

    NOTE(review): index 1 is presumably the score; confirm against promovolt.
    """
    lix_result = lix(text, language)
    return lix_result[1]

def rix_score(text):
    """Return item 1 of ``rix``'s result for ``text`` in the module-level ``language``.

    NOTE(review): index 1 is presumably the score; confirm against promovolt.
    """
    rix_result = rix(text, language)
    return rix_result[1]

def smog_score(text):
    """Return item 0 of ``smog``'s result for ``text``.

    ``smog`` is called with the abbreviations file at
    ``pathAbbreviationsDictEn``. The returned value may be ``None``:
    ``smog_class`` explicitly handles that case for the same item.
    """
    smog_result = smog(text, pathAbbreviationsDictEn)
    return smog_result[0]

def ari_class(score):
    """Interpret an ARI ``score`` with ``readabilityValuesInterpreter``.

    The score is passed as the ``ariScore`` keyword together with the
    module-level ``localization``; the interpreter's result is returned as-is.
    """
    interpretation = readabilityValuesInterpreter(localization, ariScore=score)
    return interpretation

def bormuth_class(score):
    """Interpret a Bormuth ``score`` with ``readabilityValuesInterpreter``.

    The score is passed as the ``bormuthScore`` keyword together with the
    module-level ``localization``; the interpreter's result is returned as-is.
    """
    interpretation = readabilityValuesInterpreter(localization, bormuthScore=score)
    return interpretation

def coleman_liau_class(score):
    """Interpret a Coleman-Liau ``score`` with ``readabilityValuesInterpreter``.

    The score is passed as the ``colemanLiauScore`` keyword together with the
    module-level ``localization``; the interpreter's result is returned as-is.
    """
    interpretation = readabilityValuesInterpreter(localization, colemanLiauScore=score)
    return interpretation

def dale_chall_class(score):
    """Interpret a Dale-Chall ``score`` with ``readabilityValuesInterpreter``.

    The score is passed as the ``daleChallScore`` keyword together with the
    module-level ``localization``; the interpreter's result is returned as-is.
    """
    interpretation = readabilityValuesInterpreter(localization, daleChallScore=score)
    return interpretation

def flesch_class(score):
    """Interpret a Flesch ``score`` with ``readabilityValuesInterpreter``.

    The score is passed as the ``fleschScore`` keyword together with the
    module-level ``localization``; the interpreter's result is returned as-is.
    """
    interpretation = readabilityValuesInterpreter(localization, fleschScore=score)
    return interpretation

def flesch_kincaid_class(score):
    """Interpret a Flesch-Kincaid ``score`` with ``readabilityValuesInterpreter``.

    The score is passed as the ``fleschKincaidScore`` keyword together with the
    module-level ``localization``; the interpreter's result is returned as-is.
    """
    interpretation = readabilityValuesInterpreter(localization, fleschKincaidScore=score)
    return interpretation

def fog_class(score):
    """Interpret a Fog ``score`` with ``readabilityValuesInterpreter``.

    The score is passed as the ``fogScore`` keyword together with the
    module-level ``localization``; the interpreter's result is returned as-is.
    """
    interpretation = readabilityValuesInterpreter(localization, fogScore=score)
    return interpretation

def lix_class(score):
    """Interpret a LIX ``score`` with ``readabilityValuesInterpreter``.

    The score is passed as the ``lixScore`` keyword together with the
    module-level ``localization``; the interpreter's result is returned as-is.
    """
    interpretation = readabilityValuesInterpreter(localization, lixScore=score)
    return interpretation

def rix_class(score):
    """Interpret a RIX ``score`` with ``readabilityValuesInterpreter``.

    The score is passed as the ``rixScore`` keyword together with the
    module-level ``localization``; the interpreter's result is returned as-is.
    """
    interpretation = readabilityValuesInterpreter(localization, rixScore=score)
    return interpretation

def smog_class(text):
    """Interpret the SMOG statistics of ``text`` with ``readabilityValuesInterpreter``.

    Unlike the other ``*_class`` helpers this takes the raw text, because the
    interpretation needs more than the score: when item 0 of ``smog``'s result
    is ``None``, item 1 is passed to the interpreter as ``TPWC`` instead.
    NOTE(review): what ``TPWC`` stands for is defined by promovolt; confirm there.
    """
    stats = smog(text, pathAbbreviationsDictEn)
    score = stats[0]
    if score is not None:
        return readabilityValuesInterpreter(localization, smogScore=score)
    # No score available: fall back to the second statistic.
    return readabilityValuesInterpreter(localization, TPWC=stats[1])

def strain_class(score):
    """Interpret a Strain ``score`` with ``readabilityValuesInterpreter``.

    The score is passed as the ``strainScore`` keyword together with the
    module-level ``localization``; the interpreter's result is returned as-is.
    """
    interpretation = readabilityValuesInterpreter(localization, strainScore=score)
    return interpretation

In [None]:
# Every metric contributes two columns: `<metric>_score`, computed from the raw
# text, and `<metric>_class`, the interpreter's result for that score.
# SMOG is the exception: its class is derived from the text itself, because
# `smog_class` needs a second statistic when no score is available.

# ARI
df['ari_score'] = df['text'].apply(ari)
df['ari_class'] = df['ari_score'].apply(ari_class)

# Bormuth (needs the Dale-Chall word list)
df['bormuth_score'] = df['text'].apply(
    lambda text: bormuth(text, pathDaleChallDict=pathDaleChallListEn)
)
df['bormuth_class'] = df['bormuth_score'].apply(bormuth_class)

# Coleman-Liau
df['coleman_liau_score'] = df['text'].apply(colemanLiau)
df['coleman_liau_class'] = df['coleman_liau_score'].apply(coleman_liau_class)

# Flesch
df['flesch_score'] = df['text'].apply(lambda text: flesch(text, language=language))
df['flesch_class'] = df['flesch_score'].apply(flesch_class)

# Flesch-Kincaid
df['flesch_kincaid_score'] = df['text'].apply(fleschKincaid)
df['flesch_kincaid_class'] = df['flesch_kincaid_score'].apply(flesch_kincaid_class)

# Fog
df['fog_score'] = df['text'].apply(fog_score)
df['fog_class'] = df['fog_score'].apply(fog_class)

# LIX
df['lix_score'] = df['text'].apply(lix_score)
df['lix_class'] = df['lix_score'].apply(lix_class)

# Dale-Chall (stored under the `new_dale_chall_*` column names)
df['new_dale_chall_score'] = df['text'].apply(dale_chall_score)
df['new_dale_chall_class'] = df['new_dale_chall_score'].apply(dale_chall_class)

# RIX
df['rix_score'] = df['text'].apply(rix_score)
df['rix_class'] = df['rix_score'].apply(rix_class)

# SMOG (class computed from the text, not from the score column)
df['smog_score'] = df['text'].apply(smog_score)
df['smog_class'] = df['text'].apply(smog_class)

# Strain
df['strain_score'] = df['text'].apply(strain)
df['strain_class'] = df['strain_score'].apply(strain_class)

### Add `readability_score` column

In [None]:
# Overall readability: the unweighted mean of the 11 per-metric class columns.
# The fold starts from `ari_class` and adds the remaining columns left to
# right, so a missing value in any column propagates to the result.
df['readability_score'] = sum(
    (
        df[class_column]
        for class_column in (
            'bormuth_class', 'coleman_liau_class',
            'flesch_class', 'flesch_kincaid_class', 'fog_class',
            'lix_class', 'new_dale_chall_class', 'rix_class',
            'smog_class', 'strain_class',
        )
    ),
    df['ari_class'],
) / 11

### Save to pickle file

In [None]:
# Persist the DataFrame, including the readability columns added above;
# Step 2 reloads this same file.
df.to_pickle('readability_20201124.pkl')

## Step 2: Pandas profiling report

In [None]:
# Reload the DataFrame written at the end of Step 1, so Step 2 can run
# without recomputing the readability columns.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle('readability_20201124.pkl')

In [None]:
# Restrict the frame to `cvr` plus the score/class pair of every metric;
# these are the columns profiled in Step 2 and correlated with `cvr` in Step 3.
df = df[[
    'cvr',
    'ari_score', 'ari_class',
    'bormuth_score', 'bormuth_class',
    'coleman_liau_score', 'coleman_liau_class',
    'flesch_score', 'flesch_class',
    'flesch_kincaid_score', 'flesch_kincaid_class',
    'fog_score', 'fog_class',
    'lix_score', 'lix_class',
    'new_dale_chall_score', 'new_dale_chall_class',
    'rix_score', 'rix_class',
    'smog_score', 'smog_class',
    'strain_score', 'strain_class',
]]

In [None]:
# Build a profiling report over the columns selected in the previous cell.
profile = ProfileReport(df, title='Pandas profiling report')

In [None]:
# Write the profiling report to an HTML file in the working directory.
profile.to_file('readability_20201228.html')

## Step 3: Correlation coefficients for CVR

In [None]:
# Strength of each column's correlation with `cvr`, strongest first (sign dropped).
# `method` accepts: pearson | spearman | kendall
(
    df.corr(method='pearson')['cvr']
    .abs()
    .sort_values(ascending=False)
)

In [None]:
# Signed correlation of each column with `cvr`, from most positive to most negative.
# `method` accepts: pearson | spearman | kendall
(
    df.corr(method='pearson')['cvr']
    .sort_values(ascending=False)
)