In [1]:
import os
import sys
import inspect
import pandas as pd
import pickle

In [2]:
def selectRange(dct, value):
    """Return the label of the band that ``value`` falls into.

    ``dct`` maps the lower bound of each band to that band's label. The band
    chosen is the one with the largest lower bound that is still ``<= value``.

    The lookup no longer depends on the insertion order of ``dct`` (the
    previous loop silently returned the wrong band for unsorted dictionaries)
    nor on the dictionary containing a key ``0``.

    Parameters:
        dct: mapping of numeric lower bound -> label.
        value: the score to classify.

    Returns:
        The label stored under the selected lower bound. When ``value`` lies
        below every lower bound (or is NaN, which compares false against
        everything), the label of the lowest band is returned.

    Raises:
        KeyError: if ``dct`` is empty.
    """
    eligible = [key for key in dct if value >= key]
    if eligible:
        return dct[max(eligible)]
    if not dct:
        raise KeyError('selectRange() needs at least one band')
    # Nothing matched: clamp to the lowest band, as the old `picked = 0`
    # default did for tables whose smallest key is 0.
    return dct[min(dct)]

In [3]:
def percentages(col):
    """Format the class-count entries of one summary column as LaTeX-ready text.

    Parameters:
        col: pandas Series with numeric 'pos', 'neg' and 'total' entries.
            Any other entries are passed through unchanged.

    Returns:
        An object-dtype copy of ``col`` in which 'pos' and 'neg' are whole
        percentages of 'total' followed by an escaped percent sign (``\\%``)
        and 'total' is rendered without decimals. The input Series itself is
        left untouched.
    """
    total = col.loc['total']
    posShare = (col.loc['pos'] / total) * 100
    negShare = (col.loc['neg'] / total) * 100

    # Write the strings into an object-dtype copy: assigning text into a
    # numeric Series is deprecated in recent pandas, and copying keeps the
    # caller's data unmodified.
    formatted = col.astype(object)
    # '\\%' spells the backslash explicitly; the bare '\%' is an invalid
    # escape sequence that newer Python versions warn about.
    formatted.loc['pos'] = f'{posShare:.0f}\\%'
    formatted.loc['neg'] = f'{negShare:.0f}\\%'
    formatted.loc['total'] = f'{total:.0f}'
    return formatted

In [4]:
# Load the shared naming metadata. The file is opened in a `with` block so the
# handle is closed as soon as unpickling finishes.
# NOTE(security): pickle.load can execute arbitrary code -- only load a
# FinalNames.pickle that was produced by this project.
with open('./FinalNames.pickle', mode='rb') as namesFile:
    finalNames = pickle.load(namesFile)
datasets = finalNames['datasets'].keys()   # dataset names, used as folder names and column labels
dimensions = finalNames['dimensions']
formulas = dimensions['formulas']          # column names of the readability formulas

In [5]:
# Summary table under construction; one column per dataset is added to it below.
df = pd.DataFrame()

In [6]:
# Add one summary column per dataset: the class counts followed by the mean of
# every readability formula.
for dataset in datasets:
    dataFile = f'../data/{dataset}/Data-Cleaned.csv'
    dimsFile = f'../data/{dataset}/Dimensions.csv'

    # Both inputs are produced elsewhere; stop with a message naming the
    # missing one.
    if not os.path.exists(dataFile):
        raise ValueError(f'Dataset {dataset} has not been cleaned')
    if not os.path.exists(dimsFile):
        raise ValueError(f'Dataset {dataset} has no calculated dimensions')

    data = pd.read_csv(dataFile)
    dims = pd.read_csv(dimsFile)

    total = len(data)
    pos = int((data['sentiment'] == 'positive').sum())
    # Every row that is not labelled 'positive' is counted as negative.
    neg = total - pos

    # Column layout: pos, neg, total, then the per-formula means in the
    # order given by `formulas`.
    column = [pos, neg, total, *dims[formulas].mean(axis=0).values]
    df[dataset] = column


In [7]:
# Label the rows: the three class-count rows first, then one row per formula,
# in the same order in which each dataset column was assembled.
df.index = ['pos', 'neg', 'total'] + formulas

In [8]:
# Presentation version of the summary: start from a copy of `df` and format
# the count rows of every dataset column with `percentages`.
df2 = df.copy().apply(percentages)

In [9]:
# Band lookup tables for the readability formulas.
# Each key is the lower bound of a score band and each value is the label
# shown for scores in that band (presumably reader age ranges for the numeric
# labels -- TODO confirm against the formula definitions).
fleshKincaid = {
    0: '22-23', 30: '18-19', 50: '15-18', 60: '13-15',
    70: '12-13', 80: '11-12', 90: '11-',
}
gunningFog = {0: '3-7', 1: '7-11', 5: '11-14', 8: '14-17', 11: '17+'}
daleChall = {
    0: '3-10', 5: '10-12', 6: '12-14', 7: '14-16',
    8: '16-18', 9: '18-22', 10: '22+',
}
lix = {
    0: 'Childrens books', 25: 'Simple texts', 30: 'Fiction',
    40: 'Factual information', 50: 'Technical texts', 60: 'Difficult texts',
}

# These formulas share the Gunning-Fog table: every name below is bound to the
# very same dict object, not to a copy.
smog = colmanLiau = linsearWrite = spache = forcast = gunningFog

In [10]:

def mapRow(df, rowname, dct):
    """Replace every score in row ``rowname`` of ``df`` with its band label.

    Parameters:
        df: DataFrame to update; it is modified in place.
        rowname: index label of the row to convert.
        dct: band table (lower bound -> label) handed to ``selectRange``.

    Returns:
        None.
    """
    # Use the frame that was passed in. The previous body ignored `df` and
    # always read from and wrote to the global `df2`.
    df.loc[rowname] = df.loc[rowname].map(lambda val: selectRange(dct, val))

# Convert each formula's mean score in `df2` into its band label, one row at a
# time. The Spache (SAT and Dale-Chall variants) and LIX rows are not mapped
# here, although `spache` and `lix` tables are defined.
for rowname, bands in (
    ('formulaFleshKincaid', fleshKincaid),
    ('formulaGunningFog', gunningFog),
    ('formulaSMOG', smog),
    ('formulaDaleChall', daleChall),
    ('formulaColemanLiau', colmanLiau),
    ('formulaLinsearWrite', linsearWrite),
    ('formulaFORCAST', forcast),
):
    mapRow(df2, rowname, bands)

In [11]:
# Display the formatted summary table.
df2

Unnamed: 0,AirlineTweets2,IMDB
pos,17\%,50\%
neg,83\%,50\%
total,8897,50000
formulaFleshKincaid,11-12,13-15
formulaDaleChall,14-16,16-18
formulaGunningFog,7-11,14-17
formulaColemanLiau,11-14,11-14
formulaLIX,36.965123,50.66418
formulaSMOG,7-11,14-17
formulaFORCAST,14-17,14-17


In [12]:
# Collect one table row per readability formula: the formula's display name
# first, then its value for every dataset column of `df2`.
f2 = {}
for index, row in df2.loc[df2.index.isin(formulas)].iterrows():
    f2[index] = [finalNames['formulaNames'][index], *row]

# From here on `formulas` excludes LIX, and `f2` becomes the list of rows in
# that order (an entry is None when no row was collected for a formula).
formulas = [name for name in finalNames['dimensions']['formulas'] if name != 'formulaLIX']
f2 = [f2.get(name) for name in formulas]

In [13]:
from latexTable import LatexTable

In [14]:
# Render the data-overview table as LaTeX, print it and save it to disk.
# Presumably an alternative column specification, currently unused:
#|p{45mm}|p{35mm}|p{35mm}|

table = LatexTable()
# NOTE(review): the headers and the descriptive rows are hard-coded for two
# datasets in this order, while the count rows follow the columns of `df2`.
table.headers = ['Airline tweets', 'IMDB reviews']
table.emptyFirstHeader = True
table.rows = [
    ['Domain', 'Twitter', 'Movie reviews'],
    ['Time period', 'February 2015', 'June 2011'],
    ['Labelling', 'Externally assessed', 'Self-provided'],  # spelling fixed (was 'assesed')
    # "!boldLine" is a LatexTable directive; in the recorded output it shows
    # up as a separator between the row groups.
    "!boldLine",
    # Totals are stored as text in `df2`; re-parse them to add thousands separators.
    ['Total observations'] + [f'{int(t):,}' for t in df2.loc['total']],
    ['Positive sentiments'] + list(df2.loc['pos']),
    ['Negative sentiments'] + list(df2.loc['neg']),
    "!boldLine"
] + f2
result = table.render()
print(result)
outputFile = 'Setup-Data-Overview.tex'
with open(outputFile, mode='w') as output:
    output.write(result)

\begin{tabular}{|l|l|l|} \cline{2-3}
\multicolumn{1}{c|}{} & \textbf{Airline tweets} & \textbf{IMDB reviews} \\ \hline
\textbf{Domain} & Twitter & Movie reviews \\ \hline
\textbf{Time period} & February 2015 & June 2011 \\ \hline
\textbf{Labelling} & Externally assesed & Self-provided\\ \hline
 \multicolumn{3}{c}{} \\ [-1.5ex] \hline
\textbf{Total observations} & 8,897 & 50,000 \\ \hline
\textbf{Positive sentiments} & 17\% & 50\% \\ \hline
\textbf{Negative sentiments} & 83\% & 50\%\\ \hline
 \multicolumn{3}{c}{} \\ [-1.5ex] \hline
\textbf{Flesh-Kincaid} & 11-12 & 13-15 \\ \hline
\textbf{Dale-Chall} & 14-16 & 16-18 \\ \hline
\textbf{Gunning-Fog} & 7-11 & 14-17 \\ \hline
\textbf{Coleman-Liau} & 11-14 & 11-14 \\ \hline
\textbf{SMOG} & 7-11 & 14-17 \\ \hline
\textbf{FORCAST} & 14-17 & 14-17 \\ \hline
\textbf{Linsear-Write} & 7-11 & 17+ \\ \hline
\end{tabular}
