In [49]:
import os
import sys
import inspect
# Make the project root (the parent of the notebook directory) importable so
# that the local `functions` module can be found.
# The working directory is used instead of inspecting the current frame:
# the frame's filename is only a pseudo-name when the kernel labels cells
# like `<ipython-input-…>`, and if a kernel compiles cells to real temporary
# files it would point at that temp directory instead of the notebook folder.
# The relative '../data' paths used below already assume the working
# directory is the notebook's directory.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Guard the insert so re-running the cell does not stack duplicate entries.
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)

import pandas as pd
from functions import dirs, canonicalNames
import pickle
import chevron
from re import sub

In [50]:
def selectRange(dct, value):
    """Return the label of the bucket that `value` falls into.

    `dct` maps the lower bound of each bucket to its label. The bucket with
    the largest lower bound that is still `<= value` is selected, regardless
    of the order in which the keys were inserted into the dict.

    If no key is `<= value` (for example a value below every bound, or NaN),
    the entry stored under key 0 is returned.

    Raises:
        KeyError: if the fallback is needed and `dct` has no key 0.
    """
    # max() makes the result independent of dict insertion order; default=0
    # keeps the original fallback bucket.
    picked = max((key for key in dct if value >= key), default=0)
    return dct[picked]

In [51]:
# Inputs for the overview table.
# NOTE(review): `dirs` and `canonicalNames` are project helpers; presumably
# they return the dataset folder names under ../data and their display
# names — confirm against functions.py.
datasets = dirs('../data')
names = canonicalNames('../data')
#datasets = ['AirlineTweets']
df = pd.DataFrame()
# Open the pickle in a context manager so the file handle is closed as soon
# as loading finishes.
# NOTE: pickle can execute arbitrary code while loading; only read files
# produced by this project.
with open('../data/Dimensions-All.pickle', mode='rb') as dimensionsFile:
    dimensions = pickle.load(dimensionsFile)
# Names of the formula columns; used below to select columns of
# Dimensions.csv and as row labels of the overview table.
formulas = dimensions['formulas']

In [52]:
# Add one column per dataset to `df`: positive count, negative count, total
# count, followed by the mean of every formula column.
for dataset in datasets:
    dataFile = f'../data/{dataset}/Data-Cleaned.csv'
    dimsFile = f'../data/{dataset}/Dimensions.csv'
    # Stop immediately when one of the two required input files is missing.
    for path, problem in (
        (dataFile, 'has not been cleaned'),
        (dimsFile, 'has no calculated dimensions'),
    ):
        if not os.path.exists(path):
            raise ValueError(f'Dataset {dataset} {problem}')

    data = pd.read_csv(dataFile)
    dims = pd.read_csv(dimsFile)
    total = len(data)
    pos = int((data['sentiment'] == 'positive').sum())
    # Every row whose sentiment is not 'positive' is counted as negative.
    neg = total - pos
    column = [pos, neg, total, *dims[formulas].mean(axis=0).values]
    df[dataset] = column


In [53]:
# Row labels follow the order in which each column was assembled:
# the three counts first, then one row per formula.
df.index = ['pos', 'neg', 'total', *formulas]

In [54]:
# Path of an optional CSV snapshot of the overview table. The two
# commented-out lines below either write `df` to it or reload `df` from it.
dataFile = './Setup-Data-Overview.csv'
#df.to_csv(dataFile)
#df = pd.read_csv(dataFile, index_col=0)

In [55]:
def percentages(col):
    """Format the count rows of one dataset column for the LaTeX table.

    Args:
        col: Series with at least the labels 'pos', 'neg' and 'total'.

    Returns:
        A new object-dtype Series in which 'pos' and 'neg' are replaced by
        their share of 'total' as whole-number percentage strings ending in
        a backslash-escaped percent sign, and 'total' is replaced by its
        value formatted without decimals. All other entries are carried over
        unchanged. The input Series is not modified.
    """
    total = col.loc['total']
    pos = (col.loc['pos'] / total) * 100
    neg = (col.loc['neg'] / total) * 100
    # Work on an object-dtype copy: the caller's data stays intact and the
    # strings are not written into a numeric Series.
    out = col.astype(object)
    # '\\%' is an explicit backslash followed by '%'; the bare '\%' used
    # before is an invalid escape sequence in a normal string literal.
    out.loc['pos'] = f'{pos:.0f}\\%'
    out.loc['neg'] = f'{neg:.0f}\\%'
    out.loc['total'] = f'{total:.0f}'
    return out

In [56]:
# Work on a copy so `df` keeps the raw numbers; every column (one per
# dataset) gets its count rows formatted by `percentages`.
df2 = df.copy().apply(percentages)

In [57]:
# Lookup tables passed to selectRange (via mapRow): each key is the lowest
# score that maps to the label stored as its value. Keys are listed in
# ascending order and every table has a key 0.
# NOTE(review): the labels look like reader-age ranges — TODO confirm the
# source of the thresholds and labels.
fleshKincaid = {
    0: '22-23',
    30: '18-19',
    50: '15-18',
    60: '13-15',
    70: '12-13',
    80: '11-12',
    90: '11-'
}
gunningFog = {
    0: '3-7',
    1: '7-11',
    5: '11-14',
    8: '14-17',
    11: '17+'
}
# Alias, not a copy: smog is the very same dict object as gunningFog.
smog = gunningFog
daleChall = {
    0: '3-10',
    5: '10-12',
    6: '12-14',
    7: '14-16',
    8: '16-18',
    9: '18-22',
    10: '22+'
}
# The following three names are further aliases of the gunningFog table.
colmanLiau = smog
linsearWrite = smog
spache = smog
# Unlike the other tables, LIX scores map to text categories, not ranges.
lix = {
    0: 'Childrens books',
    25: 'Simple texts',
    30: 'Fiction',
    40: 'Factual information',
    50: 'Technical texts',
    60: 'Difficult texts'
}
# Another alias of the gunningFog table.
forcast = smog

In [58]:

def mapRow(df, rowname, dct):
    """Replace every value in row `rowname` of `df` with its bucket label.

    Args:
        df: DataFrame to update in place.
        rowname: index label of the row to translate.
        dct: lookup table handed to `selectRange` for each value.

    The frame passed as `df` is the one that is modified; previously the
    parameter was ignored and the global `df2` was always updated.
    """
    df.loc[rowname] = df.loc[rowname].map(lambda val: selectRange(dct, val))

# Translate each formula row of df2 from a mean score into its label,
# pairing every row with the lookup table that applies to it.
for formulaRow, lookup in [
    ('formulaFleshKincaid', fleshKincaid),
    ('formulaGunningFog', gunningFog),
    ('formulaSMOG', smog),
    ('formulaDaleChall', daleChall),
    ('formulaColemanLiau', colmanLiau),
    ('formulaLinsearWrite', linsearWrite),
    ('formulaSpacheSAT', spache),
    ('formulaSpacheDaleChall', spache),
    ('formulaLIX', lix),
    ('formulaFORCAST', forcast),
]:
    mapRow(df2, formulaRow, lookup)

In [59]:
# Display the overview table after percentage formatting and label mapping.
df2

Unnamed: 0,AirlineTweets2,IMDB
pos,17\%,50\%
neg,83\%,50\%
total,8897,50000
formulaFleshKincaid,11-12,13-15
formulaGunningFog,7-11,14-17
formulaSMOG,7-11,14-17
formulaDaleChall,14-16,14-16
formulaColemanLiau,11-14,11-14
formulaLinsearWrite,7-11,17+
formulaSpacheSAT,7-11,7-11


In [60]:
# Load the name lists; finalNames['dimensions']['formulas'] is read below to
# order the formula rows of the table.
# The context manager closes the file handle once loading is done.
# NOTE: pickle can execute arbitrary code while loading; only read files
# produced by this project.
with open('./FinalNames.pickle', mode='rb') as finalNamesFile:
    finalNames = pickle.load(finalNamesFile)

In [61]:
# Build one table row per formula: the row label with the 'formula' prefix
# stripped, followed by that formula's value for every dataset column.
formulaRows = df2.loc[df2.index.isin(formulas)]
f2 = {}
for index, row in formulaRows.iterrows():
    f2[index] = [sub('formula', '', index), *row]

# Re-order the rows to follow finalNames; a name without a matching row
# yields None at that position.
f2 = [f2.get(formulaName) for formulaName in finalNames['dimensions']['formulas']]

In [62]:
from latexTable import LatexTable

In [63]:
# Inspect the formatted totals that feed the 'Total observations' row below.
list(df2.loc['total'])

['8897', '50000']

In [64]:
#|p{45mm}|p{35mm}|p{35mm}|

# Assemble the dataset overview as a LaTeX table, print it and save it.
# NOTE(review): the headers and the descriptive rows are hard-coded for two
# datasets, while the count and formula rows follow df2's column order —
# confirm that both line up whenever the set of datasets changes.
table = LatexTable()
table.headers = ['Airline tweets', 'IMDB reviews']
table.emptyFirstHeader = True
table.rows = [
    ['Domain', 'Twitter', 'Movie reviews'],
    ['Time period', 'February 2015', 'June 2011'],
    # Spelling corrected from 'assesed'; this text ends up in the table.
    ['Labelling', 'Externally assessed', 'Self-provided'],
    # "!boldLine" is a marker string interpreted by LatexTable
    # (presumably a section separator — confirm in latexTable.py).
    "!boldLine",
    # Totals are stored as digit strings; re-format with thousands separators.
    ['Total observations'] + [f'{int(t):,}' for t in df2.loc['total']],
    ['Positive sentiments'] + list(df2.loc['pos']),
    ['Negative sentiments'] + list(df2.loc['neg']),
    "!boldLine"
] + f2
result = table.render()
print(result)
outputFile = 'Setup-Data-Overview.tex'
# Explicit encoding so the written file does not depend on the platform's
# default text encoding.
with open(outputFile, mode='w', encoding='utf-8') as output:
    output.write(result)

\begin{tabular}{|l|l|l|} \cline{2-3}
\multicolumn{1}{c|}{} & \textbf{Airline tweets} & \textbf{IMDB reviews} \\ \hline
\textbf{Domain} & Twitter & Movie reviews \\ \hline
\textbf{Time period} & February 2015 & June 2011 \\ \hline
\textbf{Labelling} & Externally assesed & Self-provided\\ \hline
 \multicolumn{3}{c}{} \\ [-1.5ex] \hline
\textbf{Total observations} & 8,897 & 50,000 \\ \hline
\textbf{Positive sentiments} & 17\% & 50\% \\ \hline
\textbf{Negative sentiments} & 83\% & 50\%\\ \hline
 \multicolumn{3}{c}{} \\ [-1.5ex] \hline
\textbf{DaleChall} & 14-16 & 14-16 \\ \hline
\textbf{GunningFog} & 7-11 & 14-17 \\ \hline
\textbf{SpacheSAT} & 7-11 & 7-11 \\ \hline
\textbf{SpacheDaleChall} & 7-11 & 7-11 \\ \hline
\textbf{ColemanLiau} & 11-14 & 11-14 \\ \hline
\textbf{LIX} & Fiction & Technical texts \\ \hline
\textbf{SMOG} & 7-11 & 14-17 \\ \hline
\textbf{FORCAST} & 14-17 & 14-17 \\ \hline
\textbf{LinsearWrite} & 7-11 & 17+ \\ \hline
\textbf{FleshKincaid} & 11-12 & 13-15 \\ \hline
\end{tab