In [15]:
import os
import sys
import inspect
# Put the parent of the current directory at the front of sys.path so that
# modules located there win on import (presumably where `functions`,
# `wordEmbedders` and `latexTable` live — TODO confirm).
# NOTE(review): inside a notebook the current frame has no real source file, so
# `currentdir` presumably resolves relative to the working directory — verify.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

import pandas as pd
from functions import dirs, readFile, canonicalNames
from wordEmbedders import WESCScore
import pickle
import chevron
from re import sub
from latexTable import LatexTable

In [16]:
# Load one fitted regression result for inspection. The context manager closes
# the file handle as soon as unpickling is done instead of leaking it.
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
with open("../data/AirlineTweets2/Regression-Positive.pickle", mode='rb') as regressionPickle:
    result = pickle.load(regressionPickle)

In [17]:
# Display the fitted coefficients (one value per regressor, plus `const`) of the loaded result.
result.params

const                                1.659492
nrPossesives/nrWords                 6.238790
nrStrongSentimentWords/nrWords      -0.703717
nrConjunctions/nrWords              -6.137505
nrSynsets/nrWords                   -0.058234
nrAdjectives/nrWords                -1.288943
opinionPolarity                     -0.094735
nrDeterminers/nrWords               -3.478717
nrAmbiguousSentimentWords/nrWords   -1.704586
nrDifficultWordsDaleChall/nrWords   -2.341873
nrDifficultWordsSAT/nrWords         -1.743940
nrSlangWords/nrWords                 2.663263
uniquenessMean                       4.854494
nrWords/nrSentences                  0.033311
uniquenessSTD                        6.711025
nrAdverbs/nrWords                    0.092676
nrLetters/nrWords                   -0.439648
nrComplexVerbs/nrWords               3.046090
nrSentences                          0.615171
dtype: float64

In [18]:
def stars(z):
    """Format a coefficient with significance stars.

    Parameters
    ----------
    z : tuple
        A ``(coef, p)`` pair: the coefficient and its p-value.

    Returns
    -------
    str
        The coefficient rounded to three decimals, followed by one ``*`` for
        every threshold the p-value is strictly below: 0.1 (one star),
        0.05 (two stars) and 0.01 (three stars).
    """
    coef, p = z
    thresholds = (0.01, 0.05, 0.1)
    # Each threshold the p-value undercuts contributes exactly one star.
    nrStars = sum(1 for threshold in thresholds if p < threshold)
    return f"{coef:.3f}" + '*' * nrStars

In [19]:
# Build one column per dataset/sentiment pair. Each column holds the model's
# `prsquared` value first, followed by its star-annotated coefficients.
# `dirs` and `canonicalNames` are project helpers; presumably they return the
# dataset directory names and their display names — TODO confirm.
datasets = dirs('../data')
names = canonicalNames('../data')
sentiments = ['Positive', 'Negative']

results = {}
df = pd.DataFrame()
for dataset in datasets:
    for sentiment in sentiments:
        regressionFile = f"../data/{dataset}/Regression-{sentiment}.pickle"
        # Close every pickle file right after reading it; the loop would
        # otherwise leave one open handle per dataset/sentiment pair.
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(regressionFile, mode='rb') as regressionPickle:
            result = pickle.load(regressionPickle)
        # Pair each coefficient with its p-value so `stars` can annotate it.
        coefs = [stars(pair) for pair in zip(result.params, result.pvalues)]
        col = [result.prsquared] + coefs
        df[f"{dataset}-{sentiment}"] = col

In [20]:
# Label the rows: the first entry of every column is the r2 value, the rest
# follow the parameter order of `result`, i.e. the last regression loaded.
# Assumes every regression lists its parameters in the same order — TODO confirm.
df.index = ['r2'] + list(result.params.index)
# Preview the table in presentation order. `reindex` drops every label that is
# not listed, so each fitted parameter (including 'nrSentences') must appear.
df.reindex([
    'r2',
    'nrPossesives/nrWords', 'nrStrongSentimentWords/nrWords',
    'nrConjunctions/nrWords', 'nrSynsets/nrWords', 'nrAdjectives/nrWords',
    'opinionPolarity', 'nrDeterminers/nrWords',
    'nrAmbiguousSentimentWords/nrWords',
    'nrDifficultWordsDaleChall/nrWords', 'nrDifficultWordsSAT/nrWords',
    'nrSlangWords/nrWords', 'nrWords/nrSentences', 'uniquenessMean',
    'uniquenessSTD', 'nrAdverbs/nrWords', 'nrLetters/nrWords',
    'nrComplexVerbs/nrWords',
    'nrSentences',
    'const'
])

Unnamed: 0,AirlineTweets2-Positive,AirlineTweets2-Negative,IMDB-Positive,IMDB-Negative
r2,0.18786,0.169807,0.20689,0.201375
nrPossesives/nrWords,6.239***,-6.876***,2.361**,-2.790
nrStrongSentimentWords/nrWords,-0.704,8.795***,46.573***,14.943***
nrConjunctions/nrWords,-6.138***,8.689***,7.274***,-8.608***
nrSynsets/nrWords,-0.058**,0.013,0.232***,-0.261***
nrAdjectives/nrWords,-1.289,1.140**,6.862***,-3.075*
opinionPolarity,-0.095,0.055*,0.166***,-0.115***
nrDeterminers/nrWords,-3.479***,5.155***,-2.142***,8.419***
nrAmbiguousSentimentWords/nrWords,-1.705,6.300***,6.769***,-1.832
nrDifficultWordsDaleChall/nrWords,-2.342***,1.760***,11.902***,-14.563***


In [23]:
# Order in which the coefficient rows appear in the LaTeX table (intercept last).
order = [
    'nrPossesives/nrWords', 'nrStrongSentimentWords/nrWords',
    'nrConjunctions/nrWords', 'nrSynsets/nrWords', 'nrAdjectives/nrWords',
    'opinionPolarity', 'nrDeterminers/nrWords',
    'nrAmbiguousSentimentWords/nrWords',
    'nrDifficultWordsDaleChall/nrWords', 'nrDifficultWordsSAT/nrWords',
    'nrSlangWords/nrWords', 'nrWords/nrSentences', 'uniquenessMean',
    'uniquenessSTD', 'nrAdverbs/nrWords', 'nrLetters/nrWords',
    'nrComplexVerbs/nrWords',
    'nrSentences',
    'const'
]

# One table row per coefficient: a small-font label followed by that row's cells.
# LaTeX snippets are raw strings so backslashes stay literal; the previous
# non-raw literals relied on invalid escape sequences such as "\s" and "\m".
coefs = [[r"{\scriptsize " + row + "}"] + list(df.loc[row]) for row in order]


table = LatexTable()
table.emptyFirstHeader = True
table.boldHeaders = False
table.nrColumns = 5
# Two-level header: dataset names spanning two columns each, then the sentiment
# names. The explicit "\n" pieces are the line breaks between the header rows.
table.customHeader = (
    r"\multicolumn{1}{c|}{} & \multicolumn{2}{c|}{\textbf{AirlineTweets}} & \multicolumn{2}{c|}{\textbf{IMDB}}\\ \cline{2-5}" "\n"
    r"\multicolumn{1}{c|}{} & \textbf{Positive} & \textbf{Negative} & \textbf{Positive} & \textbf{Negative} \\ \hline" "\n"
)
table.headers = [r'\multicolumn{2}{|c|}{\textbf{AirlineTweets}}', r'\multicolumn{2}{|c|}{\textbf{IMDB}}']
# The R2 row comes first; '!boldLine' is presumably a LatexTable marker that
# renders a thick rule below it — TODO confirm against LatexTable.
table.rows = [
    ['R2'] + [f"{x:.3f}" for x in df.loc['r2']],
    '!boldLine'
] + coefs
results = table.render()
# Persist the rendered table and echo it in the notebook output.
outputFile = 'Results-Regression.tex'
with open(outputFile, mode='w') as output:
    output.write(results)
print(results)

\begin{tabular}{|l|l|l|l|l|} \cline{2-5}
\multicolumn{1}{c|}{} & \multicolumn{2}{c|}{\textbf{AirlineTweets}} & \multicolumn{2}{c|}{\textbf{IMDB}}\\ \cline{2-5}
\multicolumn{1}{c|}{} & \textbf{Positive} & \textbf{Negative} & \textbf{Positive} & \textbf{Negative} \\ \hline
\textbf{R2} & 0.188 & 0.170 & 0.207 & 0.201 \\ \Xhline{3\arrayrulewidth}
\textbf{{\scriptsize nrPossesives/nrWords}} & 6.239*** & -6.876*** & 2.361** & -2.790 \\ \hline
\textbf{{\scriptsize nrStrongSentimentWords/nrWords}} & -0.704 & 8.795*** & 46.573*** & 14.943*** \\ \hline
\textbf{{\scriptsize nrConjunctions/nrWords}} & -6.138*** & 8.689*** & 7.274*** & -8.608*** \\ \hline
\textbf{{\scriptsize nrSynsets/nrWords}} & -0.058** & 0.013 & 0.232*** & -0.261*** \\ \hline
\textbf{{\scriptsize nrAdjectives/nrWords}} & -1.289 & 1.140** & 6.862*** & -3.075* \\ \hline
\textbf{{\scriptsize opinionPolarity}} & -0.095 & 0.055* & 0.166*** & -0.115*** \\ \hline
\textbf{{\scriptsize nrDeterminers/nrWords}} & -3.479*** & 5.155*** & -2