In [1]:
import os
import nltk
import math
import pickle
import pyphen
import itertools
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from nltk.corpus import sentiwordnet as swn
from functions import readSet, columnNames, divide, add
tqdm.pandas()

In [2]:
# Shared lookup resources used by the feature functions below.
# Hyphenation dictionary: words are split at hyphenation points to count syllables.
sylTool = pyphen.Pyphen(lang='en_US')
# Word lists loaded from disk through the project helper `readSet`.
difficultWordsSAT = readSet('./wordlists/difficultWordsSAT.txt')
easyWordsDaleChall = readSet('./wordlists/easyWordsDaleChall.txt')
postags = readSet('./wordlists/postags.txt')
slangWords = readSet('./wordlists/slang.txt')

# Base variables

In [3]:
@columnNames('nrLetters')
def nrLetters(row):
    """Return the total number of characters over all tokens in ``row['words']``."""
    lengths = list(map(len, row['words']))
    return np.sum(lengths)

In [4]:
@columnNames('nrWords')
def nrWords(row):
    """Return how many tokens ``row['words']`` contains."""
    tokens = row['words']
    return len(tokens)

In [5]:
@columnNames('nrSentences')
def nrSentences(row):
    """Return the number of comma-separated pieces in ``row['sentences']``.

    NOTE(review): this presumes the ``sentences`` column stores one sentence per
    comma-separated field -- confirm against the preprocessing step.
    """
    pieces = row['sentences'].split(',')
    return len(pieces)

In [6]:
@columnNames(*postags)
def nrPOSTags(row):
    """Count the part-of-speech tags that ``nltk.pos_tag`` assigns to the row's words.

    Returns a tuple with one count per entry of ``postags``, in the iteration
    order of ``postags`` (the same order the decorator receives the names in).
    Tags that are not listed in ``postags`` are ignored.
    """
    # Counters start as numpy float zeros, one per known tag.
    counts = {tag: zero for tag, zero in zip(postags, np.zeros(len(postags)))}
    for _, tag in nltk.pos_tag(row['words']):
        if tag in postags:
            counts[tag] += 1
    return tuple(counts[tag] for tag in postags)

In [7]:
@columnNames('nrSyllables', 'nrMonoSyllables', 'nrBiSyllables', 'nrPolySyllables')
def nrSyllables(row):
    """Return syllable statistics for the words of a row.

    The syllable count of a word is the number of '-'-separated parts of the
    hyphenated form produced by ``sylTool.inserted``.

    Returns a 4-tuple: total syllables, number of words with exactly one
    syllable, with exactly two syllables, and with three or more syllables.
    """
    perWord = [len(sylTool.inserted(word).split('-')) for word in row['words']]

    total = sum(perWord)
    mono = sum(1 for count in perWord if count == 1)
    bi = sum(1 for count in perWord if count == 2)
    poly = sum(1 for count in perWord if count >= 3)

    return total, mono, bi, poly

In [8]:
@columnNames('nrDifficultWordsSAT')
def nrDifficultWordsSAT(row):
    """Return how many of the row's words appear in ``difficultWordsSAT``."""
    return sum(1 for word in row['words'] if word in difficultWordsSAT)

In [9]:

@columnNames('nrDifficultWordsDaleChall')
def nrDifficultWordsDaleChall(row):
    """Return how many of the row's words are absent from ``easyWordsDaleChall``."""
    return sum(1 for word in row['words'] if word not in easyWordsDaleChall)

In [10]:
@columnNames('nrLongWords')
def nrLongWords(row):
    """Return the number of words in the row that are at least 6 characters long.

    NOTE(review): the LIX formula, which consumes this count, is usually defined
    with long words being *more than* 6 letters -- confirm the threshold.
    """
    return sum(1 for word in row['words'] if len(word) >= 6)

In [11]:
@columnNames('nrSynsets')
def nrSynsets(row):
    """Return the total number of SentiWordNet synsets found for the row's words."""
    total = 0
    for word in row['words']:
        total += len(list(swn.senti_synsets(word)))
    return total

In [12]:
@columnNames('nrSlangWords')
def nrSlangWords(row):
    """Return how many of the row's words appear in ``slangWords``."""
    return sum(1 for word in row['words'] if word in slangWords)

# Lexical metrics

In [13]:
@columnNames('meanUniqueness', 'stdUniqueness')
def uniqueness(row, dct, tfidf):
    """Return the mean and standard deviation of the row's TF-IDF weights.

    ``dct`` converts the row's words to a bag-of-words and ``tfidf`` maps that
    bag-of-words to (id, weight) pairs; only the weights are aggregated.
    """
    bagOfWords = dct.doc2bow(row['words'])
    weights = [entry[1] for entry in tfidf[bagOfWords]]
    mean = np.mean(weights)
    std = np.std(weights)
    return mean, std

# Sentiment Enablers

In [14]:
def sentenceOpinion(text):
    """Classify a sentence's polarity from the SentiWordNet scores of its words.

    Each whitespace-separated word is looked up in SentiWordNet; the
    (positive, negative, objective) scores of its synsets are averaged per word
    and the per-word averages are then averaged over the sentence.

    Returns:
        0  when no word of the sentence has any synset (including empty text),
        1  when the averaged positive score exceeds the averaged negative score,
        -1 otherwise (note that a tie between positive and negative also yields -1).
    """
    wordScores = []
    for word in text.split():
        scores = [(x.pos_score(), x.neg_score(), x.obj_score()) for x in swn.senti_synsets(word)]
        if scores:
            wordScores.append(np.mean(scores, axis=0))

    # Explicit guard instead of averaging an empty list: np.mean([]) returns NaN
    # and emits a RuntimeWarning for every sentence without known words.
    if not wordScores:
        return 0

    score = np.mean(wordScores, axis=0)
    if score[0] > score[1]:
        return 1 #positive
    return -1 #negative

In [15]:
@columnNames('opinionMixScore')
def opinionMixScore(row):
    """Return the ratio between the larger and the smaller opinion count of a row.

    Every comma-separated piece of ``row['sentences']`` is classified with
    ``sentenceOpinion``. Results greater than zero count as positive; everything
    else (including the 0 result) is counted on the negative side. Both counters
    start at 1, so the denominator is never zero.
    """
    opinions = [sentenceOpinion(sent) for sent in row['sentences'].split(',')]
    positives = 1 + sum(1 for op in opinions if op > 0)
    negatives = 1 + sum(1 for op in opinions if not op > 0)
    return max(positives, negatives) / min(positives, negatives)

In [16]:
def ambiguousSentimentWord(word):
    """Return 1 if the word has both a positive and a negative sense, else 0.

    A SentiWordNet synset is taken into account only when its objective score is
    not the largest of its three scores. Such a synset is positive when its
    positive score is strictly greater than its negative score and negative
    otherwise.
    """
    sawPositive = False
    sawNegative = False
    for synset in swn.senti_synsets(word):
        positive, negative, objective = synset.pos_score(), synset.neg_score(), synset.obj_score()
        if objective == max(positive, negative, objective):
            continue  # objective sense, carries no polarity
        if positive > negative:
            sawPositive = True
        else:
            sawNegative = True

    return 1 if (sawPositive and sawNegative) else 0

In [17]:
@columnNames('nrAmbiguousSentimentWords')
def nrAmbiguousSentimentWords(row):
    """Return how many of the row's words are flagged by ``ambiguousSentimentWord``."""
    return sum(ambiguousSentimentWord(word) for word in row['words'])

In [18]:
@columnNames('nrStrongSentimentWords')
def nrStrongSentimentWords(row, strongWords):
    """Return how many of the row's words are contained in ``strongWords``."""
    return sum(1 for word in row['words'] if word in strongWords)

# Readability formulas

In [19]:
@columnNames('formulaLIX')
def formulaLIX(row):
    """LIX readability: words per sentence plus the percentage of long words."""
    wordsPerSentence = row['nrWords/nrSentences']
    longWordRatio = row['nrLongWords/nrWords']
    return wordsPerSentence + (longWordRatio * 100)

In [20]:
@columnNames('formulaFleshKincaid')
def formulaFleshKincaid(row):
    """Readability score from words per sentence and syllables per word.

    The constants (206.835, 1.015, 84.6) are those of the Flesch Reading Ease
    score, so higher values mean easier text.
    """
    wordsPerSentence = row['nrWords/nrSentences']
    syllablesPerWord = row['nrSyllables/nrWords']
    sentencePenalty = 1.015 * wordsPerSentence
    syllablePenalty = 84.6 * syllablesPerWord
    return 206.835 - sentencePenalty - syllablePenalty

In [21]:
@columnNames('formulaSMOG')
def formulaSMOG(row):
    """SMOG grade: 1.043 * sqrt(30 * polysyllables / sentences) + 3.1291.

    Reads the precomputed ``nrPolySyllables/nrSentences`` ratio from the row.
    """
    polyPerSentence = row['nrPolySyllables/nrSentences']
    # The constant 3.1291 is added after the square root, not inside it.
    return 1.043 * math.sqrt(polyPerSentence * 30) + 3.1291

In [22]:
@columnNames('formulaGunningFog')
def formulaGunningFog(row):
    """Gunning fog index: 0.4 * (words per sentence + percentage of complex words).

    Words with three or more syllables (``nrPolySyllables``) are used as the
    complex words.
    """
    wordsPerSentence = row['nrWords/nrSentences']
    complexWordRatio = row['nrPolySyllables/nrWords']
    # The formula expects a percentage, so the 0..1 ratio is scaled by 100.
    return 0.4 * (wordsPerSentence + 100 * complexWordRatio)

In [23]:
@columnNames('formulaDaleChall')
def formulaDaleChall(row):
    """Dale-Chall raw score from the share of difficult words and sentence length.

    NOTE(review): the published formula adds 3.6365 when more than 5% of the
    words are difficult; that adjustment is not applied here -- confirm intent.
    """
    difficultRatio = row['nrDifficultWordsDaleChall/nrWords']
    wordsPerSentence = row['nrWords/nrSentences']
    difficultTerm = 0.1579 * difficultRatio * 100
    lengthTerm = 0.0496 * wordsPerSentence
    return difficultTerm + lengthTerm

In [24]:
@columnNames('formulaColemanLiau')
def formulaColemanLiau(row):
    """Coleman-Liau index from letters per 100 words and sentences per 100 words."""
    lettersPerWord = row['nrLetters/nrWords']
    sentencesPerWord = row['nrSentences'] / row['nrWords']
    lettersTerm = 0.0588 * lettersPerWord * 100
    sentencesTerm = 0.296 * sentencesPerWord * 100
    return lettersTerm - sentencesTerm - 15.8

In [25]:
@columnNames('formulaLinsearWrite')
def formulaLinsearWrite(row):
    """Linsear Write score.

    Words with one or two syllables weigh 1, words with three or more syllables
    weigh 3. The weighted sum per sentence is halved; when it is 20 or less, 1
    is additionally subtracted.
    """
    weighted = row['nrMonoSyllables'] + row['nrBiSyllables'] + row['nrPolySyllables'] * 3
    perSentence = weighted / row['nrSentences']
    halved = perSentence / 2
    return halved if perSentence > 20 else halved - 1

In [26]:
 @columnNames('formulaSpacheSAT', 'formulaSpacheDaleChall')
 def formulaSpache(row):
     """Revised Spache score: 0.121 * words per sentence + 0.082 * % difficult words + 0.659.

     Computed twice, once per definition of "difficult word" (the SAT list and
     the Dale-Chall list). Returns ``(scoreSAT, scoreDaleChall)``.
     """
     wordsPerSentence = row['nrWords/nrSentences']
     # The formula takes the *percentage* of difficult words, so the stored
     # 0..1 ratios are scaled by 100 (as formulaDaleChall already does).
     pctSAT       = row['nrDifficultWordsSAT/nrWords'] * 100
     pctDaleChall = row['nrDifficultWordsDaleChall/nrWords'] * 100
     scoreSAT       = (0.121 * wordsPerSentence) + (0.082 * pctSAT)       + 0.659
     scoreDaleChall = (0.121 * wordsPerSentence) + (0.082 * pctDaleChall) + 0.659
     return scoreSAT, scoreDaleChall

In [27]:
@columnNames('formulaFORCAST')
def formulaFORCAST(row):
    """FORCAST grade: 20 minus a tenth of the one-syllable words per 150 words."""
    monoPer150Words = row['nrMonoSyllables/nrWords'] * 150
    return 20 - (monoPer150Words / 10)

# Applying to datasets

In [28]:
# Names of the dataset folders under ./data to process in the loop below.
# Alternative selections are kept commented out: every folder in ./data, or
# only the Sentiment140 dataset.
#datasets = os.listdir('./data')
datasets = ['AirlineTweets']
#datasets = ['Sentiment140']

In [29]:
# Words treated as strong sentiment markers. The list is the same for every
# dataset, so it is built once instead of on each loop iteration.
positiveWords = ["good", "nice", "cool", "lovely", "wonderful", "great", "awesome", "fantastic", "amazing", "fun", "excellent"]
negativeWords = ["bad", "horrible", "terrible", "awful", "worst", "shitty", "crappy", "sucks", "hate"]
strongWords = set(positiveWords + negativeWords)

for dataset in tqdm(datasets, desc="Datasets"):
    inputFile  = f'./data/{dataset}/Data-Predicted.csv'
    outputFile = f'./data/{dataset}/Data-With-Dimensions.csv'
    tfidfFile  = f'./models/{dataset}/TF-IDF.model'
    dictFile   = f'./models/{dataset}/Dictionary.model'

    # Fail early, with a clear message, when a prerequisite artefact is missing.
    if not os.path.exists(inputFile):
        raise ValueError(f"Dataset {dataset} has not been predicted")

    if not os.path.exists(tfidfFile):
        raise ValueError(f"TFIDF for {dataset} has not been created")

    # The dictionary is loaded right below, so it gets the same check as the TF-IDF model.
    if not os.path.exists(dictFile):
        raise ValueError(f"Dictionary for {dataset} has not been created")

    tfidf = TfidfModel.load(tfidfFile)
    dct   = Dictionary.load(dictFile)

    # Resume from a previous run's output when it exists, otherwise start from
    # the predicted data.
    if os.path.exists(outputFile):
        df = pd.read_csv(outputFile)
    else:
        df = pd.read_csv(inputFile)

    # Temporary column with the whitespace-tokenised text; dropped before saving.
    df['words'] = df.progress_apply(lambda row: row['text'].split(), axis=1)

    # NOTE(review): each decorated feature function, as well as `add` and
    # `divide`, appears to write its column(s) into df and return the column
    # names (the returned values are chained into name lists further down in
    # the notebook) -- confirm in functions.py. The order of the groups below
    # matters: later groups read columns created by earlier ones.

    ###Base variables
    base = [
        nrLetters(df),
        nrSyllables(df), #including mono, bi and poly counts
        nrWords(df),
        nrSentences(df),
        nrPOSTags(df),
        nrDifficultWordsSAT(df),
        nrDifficultWordsDaleChall(df),
        nrLongWords(df),
        nrSynsets(df),
        nrSlangWords(df),
    ]
    postag = [
        add(df, ['CC','IN'],          into='nrConjunctions'),
        add(df, ['JJ','JJR','JJS'],   into='nrAdjectives'),
        add(df, ['RB','RBR','RBS'],   into='nrAdverbs'),
        add(df, ['MD','VBG'],         into='nrComplexVerbs'),
        add(df, ['POS','PRP','PRP$'], into='nrPossesives'),
        add(df, ['DT','PDT'],         into='nrDeterminers'),
    ]
    postagwords = [
        divide(df, list(itertools.chain(*postag)), by='nrWords')
    ]

    ###Lexical metrics
    lexical = [
        uniqueness(df, dct, tfidf),
        divide(df, [
            'nrLetters',
            'nrSyllables',
            'nrMonoSyllables',
            'nrBiSyllables',
            'nrPolySyllables',
            'nrLongWords',],
            by='nrWords'
        ),
    ]

    ###Syntactic metrics
    syntactic = [
        divide(df, [
            'nrLetters',
            'nrWords',
            'nrSyllables',
            'nrMonoSyllables',
            'nrBiSyllables',
            'nrPolySyllables',
            'nrLongWords',],
            by='nrSentences'
        )
    ]

    ###Semantic metrics
    semantic = [
        divide(df, [
            'nrDifficultWordsSAT',
            'nrDifficultWordsDaleChall',
            'nrSynsets'],
            by='nrWords'
        )
    ]

    ###Sentiment Enablers
    sentiment = [
        opinionMixScore(df),
        nrAmbiguousSentimentWords(df),
        nrStrongSentimentWords(df, strongWords),
    ]

    ###Readability formulas
    formulas = [
        formulaFleshKincaid(df),
        formulaGunningFog(df),
        formulaSMOG(df),
        formulaDaleChall(df),
        formulaColemanLiau(df),
        formulaLinsearWrite(df),
        formulaSpache(df),
        formulaLIX(df),
        formulaFORCAST(df),
    ]

    # The token lists were only needed to compute the features.
    df = df.drop(columns=['words'])

    df.to_csv(outputFile, index=False)

Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

In [30]:
# Flatten every feature group (a list of column-name collections) into one
# flat list of column names, keyed by the group's name.
dimensions = {
    label: list(itertools.chain.from_iterable(group))
    for label, group in (
        ('base', base),
        ('postag', postag),
        ('postagwords', postagwords),
        ('lexical', lexical),
        ('syntactic', syntactic),
        ('semantic', semantic),
        ('sentiment', sentiment),
        ('formulas', formulas),
    )
}

In [32]:
# Persist the feature-group -> column-names mapping. The context manager makes
# sure the file is flushed and closed even if pickling fails.
with open('./Data-Dimensions.pickle', mode='wb') as dimensionsFile:
    pickle.dump(dimensions, dimensionsFile)