In [11]:
import os
import sys
import inspect
# Prepend the directory one level above this code's location to sys.path,
# presumably so the project-local imports that follow can be resolved.
_framePath = os.path.abspath(inspect.getfile(inspect.currentframe()))
currentdir = os.path.split(_framePath)[0]
parentdir = os.path.split(currentdir)[0]
sys.path[:0] = [parentdir]

import pandas as pd
from wordEmbedders import WESCScore
import pickle
from latexTable import LatexTable
from re import sub

In [12]:
# Load the shared naming metadata. The keys read in this notebook are
# 'datasets', 'dimensions', 'aspects' and 'dimensionNames'.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load FinalNames.pickle when it comes from a trusted source.
with open('./FinalNames.pickle', mode='rb') as namesFile:
    # The context manager closes the handle even if unpickling fails.
    finalNames = pickle.load(namesFile)

datasets = finalNames['datasets']
dimensions = finalNames['dimensions']

# Flatten the per-aspect dimension lists into a single list, following the
# iteration order of finalNames['aspects'].
alldims = []
for aspect in finalNames['aspects']:
    alldims += dimensions[aspect]

In [13]:
# Lookup tables for the readability formulas. Each maps an integer score
# threshold to a label string.
# NOTE(review): the keys look like lower bounds of score bands and the values
# like age/grade bands -- confirm against the formula definitions.
# NOTE(review): identifier spellings (fleshKincaid, colmanLiau, forcast, ...)
# are kept unchanged because other cells may reference them.
fleshKincaid = {
    0: '22-23',
    30: '18-19',
    50: '15-18',
    60: '13-15',
    70: '12-13',
    80: '11-12',
    90: '11-',
}

gunningFog = {
    0: '3-7',
    1: '7-11',
    5: '11-14',
    8: '14-17',
    11: '17+',
}
# These formulas share the Gunning-Fog table; every name below is bound to the
# very same dict object, so mutating one would change all of them.
smog = colmanLiau = linsearWrite = spache = forcast = gunningFog

daleChall = {
    0: '3-10',
    5: '10-12',
    6: '12-14',
    7: '14-16',
    8: '16-18',
    9: '18-22',
    10: '22+',
}

lix = {
    0: 'Childrens books',
    25: 'Simple texts',
    30: 'Fiction',
    40: 'Factual information',
    50: 'Technical texts',
    60: 'Difficult texts',
}

# Inclusive (low, high) bounds per formula column; the table-generation cell
# keeps only values inside these bounds before computing summary statistics.
_gradeRange = (0, 12)
formulaRanges = {
    'formulaFleshKincaid': (0, 100),
    'formulaGunningFog': _gradeRange,
    'formulaSMOG': _gradeRange,
    'formulaDaleChall': (0, 11),
    'formulaColemanLiau': _gradeRange,
    'formulaLinsearWrite': _gradeRange,
    'formulaSpacheSAT': _gradeRange,
    'formulaSpacheDaleChall': _gradeRange,
    'formulaLIX': (0, 70),
    'formulaFORCAST': _gradeRange,
}

In [14]:
# Display the aspect keys; iterating them is what determines the order in which
# the per-aspect dimension lists were concatenated into `alldims`.
finalNames['aspects'].keys()

dict_keys(['base', 'postag', 'lexical', 'semantic', 'syntactic', 'postagwords', 'sentiment', 'formulas'])

In [15]:
def _describeDimensions(dims):
    """Compute summary statistics for every dimension after trimming outliers.

    Args:
        dims: DataFrame read from a dataset's Dimensions.csv; it must contain
            one column per entry of `alldims`.

    Returns:
        DataFrame with one column per dimension, holding the output of
        `Series.describe()` for the values that survive the trimming.
    """
    stats = {}
    for dim in alldims:
        col = dims[dim]
        if dim in dimensions['formulas']:
            # Formula columns are restricted to their fixed bounds.
            (low, high) = formulaRanges[dim]
        else:
            # All other columns keep the 1st..99th percentile range.
            high = col.quantile(0.99)
            low = col.quantile(0.01)
        stats[dim] = col[(col <= high) & (col >= low)].describe()
    # Build the frame once instead of inserting one column at a time.
    return pd.DataFrame(stats)


def _formatStat(desc, dim, stat, integerDims):
    """Format one statistic: no decimals for `integerDims`, two otherwise."""
    value = desc.at[stat, dim]
    if dim in integerDims:
        return f"{value:.0f}"
    return f"{value:.2f}"


def _aspectRows(desc, aspects, integerDims):
    """Build the table rows (label cell, name, min, mean, max, std) per aspect.

    The first row of each aspect carries a rotated multirow label spanning all
    of that aspect's rows; the remaining rows have an empty first cell.
    Aspects without dimensions are skipped, since there is no first row to
    attach the label to.
    """
    rows = []
    for aspect in aspects:
        aspectDims = dimensions[aspect]
        if not aspectDims:
            continue
        items = [
            ["", finalNames['dimensionNames'][dim]]
            + [_formatStat(desc, dim, stat, integerDims)
               for stat in ('min', 'mean', 'max', 'std')]
            for dim in aspectDims
        ]
        # Backslashes are escaped explicitly: "\p" and "\m" are invalid escape
        # sequences in a normal string literal and trigger a warning.
        items[0][0] = (
            "\\parbox[t]{2mm}{\\multirow{" + str(len(aspectDims))
            + "}{*}{\\rotatebox[origin=c]{90}{"
            + finalNames['aspects'][aspect] + "}}}"
        )
        rows += items
    return rows


def _renderTable(rows):
    """Render `rows` with LatexTable and soften the rules inside a group.

    A line's "\\hline" is replaced by "\\cline{2-6}" when the following line
    starts with a space, or when it is the last line.
    NOTE(review): presumably lines starting with a space are the rows with an
    empty first cell, i.e. continuations of a multirow group -- confirm
    against LatexTable.render().
    """
    table = LatexTable()
    table.headers = ['Variable', 'Min', 'Mean', 'Max', 'STD']
    table.columnAlignments = ['l', 'l', 'r', 'r', 'r', 'r']
    table.emptyFirstHeader = True
    table.boldIndexColumn = False
    table.rows = rows
    lines = table.render().split("\n")
    out = []
    for i, line in enumerate(lines):
        nxt = lines[i + 1] if i + 1 < len(lines) else None
        # startswith() is safe on an empty line, where nxt[0] would raise.
        if nxt is None or nxt.startswith(' '):
            line = line.replace("\\hline", "\\cline{2-6}")
        out.append(line)
    return "\n".join(out)


# Dimensions whose statistics are printed without decimals.
integerDims = frozenset(dimensions['base'] + dimensions['postag'])

# The appendix is split into two tables per dataset: file suffix -> aspects.
tableAspects = {
    '01': ['base', 'postag', 'lexical'],
    '02': ['semantic', 'syntactic', 'postagwords', 'sentiment', 'formulas'],
}

for dataset in datasets:
    dimsFile = f"../data/{dataset}/Dimensions.csv"
    dims = pd.read_csv(dimsFile)
    desc = _describeDimensions(dims)

    for suffix, aspects in tableAspects.items():
        # Render before opening the file so a failure does not leave an
        # empty .tex file behind.
        rendered = _renderTable(_aspectRows(desc, aspects, integerDims))
        with open(f'./Appendix-Variables-{dataset}-{suffix}.tex', mode="w") as f:
            f.write(rendered)