In [5]:
import os
import sys
import inspect
# Put the parent of the notebook's directory on the import path so that the
# local modules imported further down in this cell can be resolved
# (presumably they live one level above this notebook -- confirm).
#
# The working directory is used rather than inspect.getfile(currentframe()):
# inside a notebook kernel the frame's "file" is a synthetic cell name (or, in
# recent ipykernel versions, a file in a temp directory), so its dirname is not
# reliably the notebook's directory. The kernel's working directory is what the
# relative '../data' paths used later in this notebook already depend on.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

import pandas as pd
from functions import dirs, readFile, VIF, canonicalNames, readSet
from wordEmbedders import WESCScore
import pickle
import chevron
from re import sub
from latexTable import LatexTable

In [6]:
# Build a table of VIF values: one column per (dataset, sentiment) pair,
# one row per entry returned by VIF().
datasets = dirs('../data')
names = canonicalNames('../data')
picked = readSet('../data/Dimensions-Picked-Final.txt')
# readSet presumably returns a set (TODO confirm). pandas deprecated sets as
# column indexers in 1.5 and rejects them from 2.0 on, so select with a list.
pickedColumns = list(picked)
sentiments = ['positive', 'negative']
results = pd.DataFrame()
for dataset in datasets:
    dimsFile = f"../data/{dataset}/Dimensions.csv"
    predictionFile = f"../data/{dataset}/Word2Vec-Prediction.csv"
    dims = pd.read_csv(dimsFile)
    pred = pd.read_csv(predictionFile)
    # Index-aligned join: both files are assumed to describe the same rows in
    # the same order -- verify against the code that writes them.
    df = dims.join(pred)
    for sentiment in sentiments:
        # Restrict to the rows whose 'truth' label equals this sentiment.
        data = df[df['truth'] == sentiment]
        vif = VIF(data[pickedColumns])
        # The first assignment fixes the index of `results`; later columns are
        # aligned to it by label.
        results[f'{dataset}-{sentiment}'] = vif['VIF']

In [8]:
# Display the VIF table without its 'const' row (presumably the intercept term
# added by VIF() -- confirm). The result is only shown, not assigned back, so
# `results` itself still contains that row afterwards.
results.drop(labels='const', axis=0)

Unnamed: 0,AirlineTweets2-positive,AirlineTweets2-negative,IMDB-positive,IMDB-negative
opinionPolarity,1.01748,1.010084,1.039026,1.022426
nrDifficultWordsSAT/nrWords,1.036761,1.044613,1.2612,1.224441
nrAmbiguousSentimentWords/nrWords,1.037591,1.040261,1.043062,1.037706
nrSlangWords/nrWords,1.049202,1.068859,1.009201,1.009617
nrComplexVerbs/nrWords,1.120409,1.060648,1.095479,1.103613
nrAdverbs/nrWords,1.239097,1.189919,1.177629,1.188436
nrSynsets/nrWords,1.375087,1.203702,1.333606,1.357769
uniquenessSTD,1.393254,1.472337,3.287188,3.160417
nrDeterminers/nrWords,1.404318,1.264766,1.481571,1.453753
nrStrongSentimentWords/nrWords,1.405019,1.125765,1.331973,1.328463


In [17]:
# Dimensions to report, listed in the row order of the rendered LaTeX table.
order = [
    'nrPossesives/nrWords', 'nrStrongSentimentWords/nrWords',
    'nrConjunctions/nrWords', 'nrSynsets/nrWords', 'nrAdjectives/nrWords',
    'opinionPolarity', 'nrDeterminers/nrWords',
    'nrAmbiguousSentimentWords/nrWords',
    'nrDifficultWordsDaleChall/nrWords', 'nrDifficultWordsSAT/nrWords',
    'nrSlangWords/nrWords', 'nrWords/nrSentences', 'uniquenessMean',
    'uniquenessSTD', 'nrAdverbs/nrWords', 'nrLetters/nrWords',
    'nrComplexVerbs/nrWords'
]


def _formatCells(values):
    """Render each numeric value with two decimals, as used for the table cells."""
    return [f"{value:.2f}" for value in values]


# Only the rows listed in `order` are reported. Selecting them once also keeps
# the 'const' row of `results` out of the Mean row below; averaging the whole
# frame would include it and inflate the means.
reportedVif = results.loc[order]

# Raw strings keep the LaTeX backslashes literal; sequences such as "\s", "\m"
# and "\c" are invalid escapes in a normal string literal and trigger warnings.
rows = [
    [r"{\scriptsize " + dimension + "}"] + _formatCells(reportedVif.loc[dimension])
    for dimension in order
]

headerDatasets = (
    r"\multicolumn{1}{c|}{} & \multicolumn{2}{c|}{\textbf{AirlineTweets}}"
    r" & \multicolumn{2}{c|}{\textbf{IMDB}}\\ \cline{2-5}"
)
headerSentiments = (
    r"\multicolumn{1}{c|}{} & \textbf{Positive} & \textbf{Negative}"
    r" & \textbf{Positive} & \textbf{Negative} \\ \hline"
)

table = LatexTable()
table.emptyFirstHeader = True
table.boldHeaders = False
table.nrColumns = 5
table.customHeader = headerDatasets + "\n" + headerSentiments + "\n"
table.rows = rows + [
    "!boldLine",
    [r"{\scriptsize Mean}"] + _formatCells(reportedVif.mean(axis=0))
]

# Keep the rendered text in its own name: rebinding `results` to a string would
# make this cell (and anything else that needs the DataFrame) fail on re-run.
latex = table.render()
outputFile = 'Results-VIF.tex'
with open(outputFile, mode='w', encoding='utf-8') as output:
    output.write(latex)
print(latex)

\begin{tabular}{|l|l|l|l|l|} \cline{2-5}
\multicolumn{1}{c|}{} & \multicolumn{2}{c|}{\textbf{AirlineTweets}} & \multicolumn{2}{c|}{\textbf{IMDB}}\\ \cline{2-5}
\multicolumn{1}{c|}{} & \textbf{Positive} & \textbf{Negative} & \textbf{Positive} & \textbf{Negative} \\ \hline
\textbf{{\scriptsize nrPossesives/nrWords}} & 2.79 & 1.55 & 1.93 & 1.71 \\ \hline
\textbf{{\scriptsize nrStrongSentimentWords/nrWords}} & 1.41 & 1.13 & 1.33 & 1.33 \\ \hline
\textbf{{\scriptsize nrConjunctions/nrWords}} & 1.53 & 1.27 & 1.29 & 1.28 \\ \hline
\textbf{{\scriptsize nrSynsets/nrWords}} & 1.38 & 1.20 & 1.33 & 1.36 \\ \hline
\textbf{{\scriptsize nrAdjectives/nrWords}} & 1.54 & 1.27 & 1.50 & 1.48 \\ \hline
\textbf{{\scriptsize opinionPolarity}} & 1.02 & 1.01 & 1.04 & 1.02 \\ \hline
\textbf{{\scriptsize nrDeterminers/nrWords}} & 1.40 & 1.26 & 1.48 & 1.45 \\ \hline
\textbf{{\scriptsize nrAmbiguousSentimentWords/nrWords}} & 1.04 & 1.04 & 1.04 & 1.04 \\ \hline
\textbf{{\scriptsize nrDifficultWordsDaleChall/nrWords

AirlineTweets2-positive     7.637811
AirlineTweets2-negative     9.742087
IMDB-positive              48.328242
IMDB-negative              53.925573
dtype: float64