In [1]:
import pandas as pd

In [2]:
# Read the caption dataset into `data` and preview the first rows.
# Alternative input path kept for reference: "datasets/dataset_caption.csv"
caption_csv = "datasets2/dataset_caption.csv"
data = pd.read_csv(caption_csv)
data.head()

Unnamed: 0,id,rotulo,texto
0,53,FALSO,a bunch of different types of fruit on a table...
1,1136,VERDADE,a bunch of items that are on a table . a bunch...
2,695,FALSO,a birthday cake with a train on it a birthday ...
3,3312,VERDADE,a white dog laying on top of a bed . a white d...
4,1314,VERDADE,a close up of a cake with a train on it a clos...


## Tratamento para começar a extração do texto

In [3]:
# Work on an independent copy holding only the id, the caption text and the
# label (in that order), so later edits never touch `data`.
colunas_base = ['id', 'texto', 'rotulo']
df = data.loc[:, colunas_base].copy()
df.head()

Unnamed: 0,id,texto,rotulo
0,53,a bunch of different types of fruit on a table...,FALSO
1,1136,a bunch of items that are on a table . a bunch...,VERDADE
2,695,a birthday cake with a train on it a birthday ...,FALSO
3,3312,a white dog laying on top of a bed . a white d...,VERDADE
4,1314,a close up of a cake with a train on it a clos...,VERDADE


In [4]:
# Fix the doubled-period punctuation artifact: every ". ." in a caption
# becomes ". "  (same as replace($TEXTO_ORIG$, ". .", ". ")).
df["texto"] = df["texto"].map(lambda legenda: legenda.replace(". .", ". "))
# Number of captions in the dataset.
df.shape[0]

992

## Contagem no Texto

In [5]:
from nltk import tokenize    
import re
from collections import Counter
from nltk.corpus import stopwords
from string import punctuation

def count_letters(text):
    """Return how many non-whitespace characters ``text`` contains.

    Every character that is not whitespace is counted, including digits and
    punctuation.
    """
    return sum(len(trecho) for trecho in text.split())


_STOP_WORDS = None  # filled in by count_words on its first call


def count_words(text):
    """Count the tokens of ``text`` that are neither stop words nor punctuation.

    The text is lower-cased and tokenized with NLTK's ``word_tokenize``. Tokens
    found in the exclusion set (NLTK English stop words plus every character of
    ``string.punctuation``) are ignored.

    The exclusion set is built on the first call and reused afterwards, since
    it does not depend on ``text`` and this function is called once per row.

    Args:
        text: The string to analyse.

    Returns:
        The number of remaining tokens, as an ``int``.
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english') + list(punctuation))

    palavras = tokenize.word_tokenize(text.lower())
    return sum(1 for palavra in palavras if palavra not in _STOP_WORDS)

def count_upper_letters(text):
    """Count the ASCII uppercase letters (A-Z) in ``text``.

    Accented or other non-ASCII capitals are not counted.

    Args:
        text: The string to scan, character by character.

    Returns:
        The number of characters of ``text`` that are in ``A``-``Z``
        (``0`` for an empty string).
    """
    uppers = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # Membership in `uppers` already implies "is alphanumeric", so no extra
    # regex filter is needed.
    return sum(1 for caractere in text if caractere in uppers)

def count_special_char(text, char):
    """Count the characters of ``text`` that occur in ``char``.

    ``char`` may hold a single character or several; each character of
    ``text`` found in it adds one to the total.
    """
    total = 0
    for simbolo in text:
        if simbolo in char:
            total += 1
    return total

    
def weird_division(n, d):
    """Return ``n / d``, or ``0`` when the divisor ``d`` is falsy (e.g. zero)."""
    if not d:
        return 0
    return n / d

# Per-caption surface features:
#   PALAVRAS   - tokens left after removing stop words and punctuation
#   LETRAS     - non-whitespace characters
#   MAIUSCULAS - uppercase letters divided by LETRAS
#   EXCLAMA    - "!" characters divided by LETRAS
#   ASPAS      - '"' characters divided by LETRAS
# The columns are created as floats first so the per-cell writes below keep a
# float dtype.
for coluna in ('PALAVRAS', 'LETRAS', 'MAIUSCULAS', 'EXCLAMA', 'ASPAS'):
    df[coluna] = 0.0

for index, texto in df['texto'].items():
    letters = count_letters(texto)
    df.at[index, 'PALAVRAS'] = count_words(texto)
    df.at[index, 'LETRAS'] = letters
    df.at[index, 'MAIUSCULAS'] = weird_division(count_upper_letters(texto), letters)
    # Use weird_division here as well (not a bare "/"): an empty or
    # whitespace-only caption has letters == 0 and must yield 0 instead of
    # raising ZeroDivisionError.
    df.at[index, 'EXCLAMA'] = weird_division(count_special_char(texto, "!"), letters)
    df.at[index, 'ASPAS'] = weird_division(count_special_char(texto, "\""), letters)

df.head()

Unnamed: 0,id,texto,rotulo,PALAVRAS,LETRAS,MAIUSCULAS,EXCLAMA,ASPAS
0,53,a bunch of different types of fruit on a table...,FALSO,16.0,121.0,0.0,0.0,0.0
1,1136,a bunch of items that are on a table . a bunch...,VERDADE,10.0,96.0,0.0,0.0,0.0
2,695,a birthday cake with a train on it a birthday ...,FALSO,11.0,91.0,0.0,0.0,0.0
3,3312,a white dog laying on top of a bed . a white d...,VERDADE,17.0,98.0,0.0,0.0,0.0
4,1314,a close up of a cake with a train on it a clos...,VERDADE,12.0,100.0,0.0,0.0,0.0


## Extração de Polaridade

In [6]:
# Load the sentiment lexicon into a {word: polarity} mapping.
# On each line, the text before the first space is the word and the single
# character right after that space is its polarity sign:
#   '+' -> 1, '-' -> -1, anything else -> 0.
# If a word appears more than once, the last occurrence wins.
dic_palavra_polaridade = {}

# The context manager closes the lexicon file even if parsing fails.
with open("lexicos/sentilex-EN.txt", "r") as sentilexpt:
    for i in sentilexpt:
        linha = i.strip()
        palavra, separador, resto = linha.partition(" ")
        if not separador:
            # Blank line or line without a space: there is no polarity field.
            # Skip it rather than storing a truncated word with polarity 0.
            continue
        polaridade = resto[:1]
        if polaridade == '+':
            val_pol = 1
        elif polaridade == '-':
            val_pol = -1
        else:
            val_pol = 0
        dic_palavra_polaridade[palavra] = val_pol
    

def score_sentimento(frase):
    """Classify the overall polarity of ``frase`` using the loaded lexicon.

    The sentence is lower-cased and split on whitespace; each token contributes
    its value from ``dic_palavra_polaridade`` (0 when the token is not listed).

    Returns:
        A ``(label, score)`` tuple where ``score`` is the sum of the token
        polarities and ``label`` is ``'Positivo'`` (score > 0), ``'Neutro'``
        (score == 0) or ``'Negativo'`` (score < 0).
    """
    score = sum(
        int(dic_palavra_polaridade.get(termo, 0))
        for termo in frase.lower().split()
    )
    if score == 0:
        return 'Neutro', score
    if score > 0:
        return 'Positivo', score
    return 'Negativo', score

# Score every caption against the lexicon: Score_Pol holds the numeric sum and
# POL the matching label ('Positivo' / 'Neutro' / 'Negativo').
df['Score_Pol'] = 0.0
df['POL'] = ""

for index, texto in df['texto'].items():
    rotulo_pol, pontos = score_sentimento(texto)
    df.at[index, 'POL'] = rotulo_pol
    df.at[index, 'Score_Pol'] = pontos

df.head()


Unnamed: 0,id,texto,rotulo,PALAVRAS,LETRAS,MAIUSCULAS,EXCLAMA,ASPAS,Score_Pol,POL
0,53,a bunch of different types of fruit on a table...,FALSO,16.0,121.0,0.0,0.0,0.0,0.0,Neutro
1,1136,a bunch of items that are on a table . a bunch...,VERDADE,10.0,96.0,0.0,0.0,0.0,3.0,Positivo
2,695,a birthday cake with a train on it a birthday ...,FALSO,11.0,91.0,0.0,0.0,0.0,0.0,Neutro
3,3312,a white dog laying on top of a bed . a white d...,VERDADE,17.0,98.0,0.0,0.0,0.0,2.0,Positivo
4,1314,a close up of a cake with a train on it a clos...,VERDADE,12.0,100.0,0.0,0.0,0.0,-6.0,Negativo


## Normalizando os Valores de Polaridade 

In [7]:
#from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the raw polarity score and store it as POL_ABSOLUTO.
dfn = df[['Score_Pol']].copy()
names = dfn.columns

scaler = MinMaxScaler().fit(dfn)
# The scaled values are rebuilt into a frame that carries df's own index:
# the column assignment below aligns on index labels, so a fresh 0..n-1 index
# would silently misplace values (or insert NaN) if df were ever re-indexed.
dfn = pd.DataFrame(scaler.transform(dfn), columns=names, index=df.index)

df["POL_ABSOLUTO"] = dfn['Score_Pol']

df.head()

Unnamed: 0,id,texto,rotulo,PALAVRAS,LETRAS,MAIUSCULAS,EXCLAMA,ASPAS,Score_Pol,POL,POL_ABSOLUTO
0,53,a bunch of different types of fruit on a table...,FALSO,16.0,121.0,0.0,0.0,0.0,0.0,Neutro,0.75
1,1136,a bunch of items that are on a table . a bunch...,VERDADE,10.0,96.0,0.0,0.0,0.0,3.0,Positivo,0.9
2,695,a birthday cake with a train on it a birthday ...,FALSO,11.0,91.0,0.0,0.0,0.0,0.0,Neutro,0.75
3,3312,a white dog laying on top of a bed . a white d...,VERDADE,17.0,98.0,0.0,0.0,0.0,2.0,Positivo,0.85
4,1314,a close up of a cake with a train on it a clos...,VERDADE,12.0,100.0,0.0,0.0,0.0,-6.0,Negativo,0.45


## Análise gramatical com LIWC 2015

In [8]:
import math
import time
from liwc import Liwc

def lexical_builder(df, dic_path, dic_fields):
    """Build per-text LIWC category rates for the rows of ``df``.

    For each row, the lower-cased ``texto`` is split on single spaces and passed
    to ``Liwc.parse`` (assumed to return per-category counts that read as 0 for
    categories with no match). Each category count is divided by the row's
    ``LETRAS`` value; a row whose ``LETRAS`` is 0 gets 0.0 for every category.

    Args:
        df: DataFrame with ``id``, ``texto`` and ``LETRAS`` columns.
        dic_path: Path of the LIWC dictionary file handed to ``Liwc``.
        dic_fields: Names of the columns to keep in the result, in order
            (``'id'`` and/or LIWC category names).

    Returns:
        A new DataFrame with one row per row of ``df``, in the same order and
        with a fresh 0..n-1 index, restricted to ``dic_fields``.

    Raises:
        KeyError: If ``dic_fields`` names a column that is neither ``'id'`` nor
            a category of the loaded dictionary.
    """
    # Load the LIWC dictionary.
    liwc = Liwc(dic_path)
    categorias = list(liwc.categories.values())
    colunas = ['id'] + categorias

    registros = []
    for _, row in df.iterrows():
        news = str(row['texto']).lower()
        emocoes = liwc.parse(news.split(' '))
        letras = row['LETRAS']
        # Keyed by the real column name ('id') so the frame is built by name,
        # not by relying on the positional order of the dict values.
        registro = {'id': row['id']}
        for categoria in categorias:
            registro[categoria] = emocoes[categoria] / letras if letras else 0.0
        registros.append(registro)

    # Build the frame once from all records: DataFrame.append was removed in
    # pandas 2.0, and growing a frame row by row is quadratic.
    dfout = pd.DataFrame(registros, columns=colunas)
    return dfout[dic_fields].copy()
    

# Grammatical LIWC categories to extract for every caption (plus the id used
# later as the merge key).
campos_gramaticais = [
    'id',
    'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they', 'ipron',
    'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate',
    'verb', 'adj', 'interrog',
]
dfgram = lexical_builder(df, "lexicos/LIWC2015_en-utf8.dic", campos_gramaticais)
dfgram.head()


Unnamed: 0,id,pronoun,ppron,i,we,you,shehe,they,ipron,article,prep,auxverb,adverb,conj,negate,verb,adj,interrog
0,53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049587,0.07438,0.0,0.0,0.0,0.0,0.008264,0.024793,0.0
1,1136,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.0625,0.0625,0.03125,0.0,0.0,0.0,0.041667,0.0,0.0
2,695,0.032967,0.0,0.0,0.0,0.0,0.0,0.0,0.032967,0.076923,0.065934,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3312,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061224,0.061224,0.0,0.0,0.0,0.0,0.030612,0.081633,0.0
4,1314,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.08,0.11,0.0,0.0,0.0,0.0,0.0,0.02,0.0


In [9]:
# Attach the grammatical category columns to the main frame, matching on id.
df = df.merge(dfgram, on='id')
df.head()

Unnamed: 0,id,texto,rotulo,PALAVRAS,LETRAS,MAIUSCULAS,EXCLAMA,ASPAS,Score_Pol,POL,...,ipron,article,prep,auxverb,adverb,conj,negate,verb,adj,interrog
0,53,a bunch of different types of fruit on a table...,FALSO,16.0,121.0,0.0,0.0,0.0,0.0,Neutro,...,0.0,0.049587,0.07438,0.0,0.0,0.0,0.0,0.008264,0.024793,0.0
1,1136,a bunch of items that are on a table . a bunch...,VERDADE,10.0,96.0,0.0,0.0,0.0,3.0,Positivo,...,0.03125,0.0625,0.0625,0.03125,0.0,0.0,0.0,0.041667,0.0,0.0
2,695,a birthday cake with a train on it a birthday ...,FALSO,11.0,91.0,0.0,0.0,0.0,0.0,Neutro,...,0.032967,0.076923,0.065934,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3312,a white dog laying on top of a bed . a white d...,VERDADE,17.0,98.0,0.0,0.0,0.0,2.0,Positivo,...,0.0,0.061224,0.061224,0.0,0.0,0.0,0.0,0.030612,0.081633,0.0
4,1314,a close up of a cake with a train on it a clos...,VERDADE,12.0,100.0,0.0,0.0,0.0,-6.0,Negativo,...,0.01,0.08,0.11,0.0,0.0,0.0,0.0,0.0,0.02,0.0


## Análise de Sentimentos Affect-BR

In [10]:

# Affect-related LIWC categories to extract for every caption (plus the id
# used later as the merge key).
campos_afeto = ['id', 'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad']
dfaffect = lexical_builder(df, "lexicos/LIWC2015_en-utf8.dic", campos_afeto)
dfaffect.head()

Unnamed: 0,id,affect,posemo,negemo,anx,anger,sad
0,53,0.0,0.0,0.0,0.0,0.0,0.0
1,1136,0.0,0.0,0.0,0.0,0.0,0.0
2,695,0.010989,0.010989,0.0,0.0,0.0,0.0
3,3312,0.0,0.0,0.0,0.0,0.0,0.0
4,1314,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Attach the affect category columns to the main frame, matching on id.
df = df.merge(dfaffect, on='id')
df.head()

Unnamed: 0,id,texto,rotulo,PALAVRAS,LETRAS,MAIUSCULAS,EXCLAMA,ASPAS,Score_Pol,POL,...,negate,verb,adj,interrog,affect,posemo,negemo,anx,anger,sad
0,53,a bunch of different types of fruit on a table...,FALSO,16.0,121.0,0.0,0.0,0.0,0.0,Neutro,...,0.0,0.008264,0.024793,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1136,a bunch of items that are on a table . a bunch...,VERDADE,10.0,96.0,0.0,0.0,0.0,3.0,Positivo,...,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,695,a birthday cake with a train on it a birthday ...,FALSO,11.0,91.0,0.0,0.0,0.0,0.0,Neutro,...,0.0,0.0,0.0,0.0,0.010989,0.010989,0.0,0.0,0.0,0.0
3,3312,a white dog laying on top of a bed . a white d...,VERDADE,17.0,98.0,0.0,0.0,0.0,2.0,Positivo,...,0.0,0.030612,0.081633,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1314,a close up of a cake with a train on it a clos...,VERDADE,12.0,100.0,0.0,0.0,0.0,-6.0,Negativo,...,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Normalizando os valores Gramaticais e de Emoções

In [12]:
# Feature columns to min-max scale: the polarity score, the surface ratios,
# the grammatical LIWC categories and the affect LIWC categories.
colunas_normalizar = [
    'POL_ABSOLUTO',
    'MAIUSCULAS', 'EXCLAMA', 'ASPAS',
    'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they', 'ipron',
    'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate',
    'verb', 'adj', 'interrog',
    'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad',
]
dfn = df[colunas_normalizar].copy()

names = dfn.columns

scaler = MinMaxScaler().fit(dfn)
# Rebuild the scaled values on df's own index so that copying columns back
# into df (which aligns on index labels) can never misplace a row.
dfn = pd.DataFrame(scaler.transform(dfn), columns=names, index=df.index)
dfn.head()

Unnamed: 0,POL_ABSOLUTO,MAIUSCULAS,EXCLAMA,ASPAS,pronoun,ppron,i,we,you,shehe,...,negate,verb,adj,interrog,affect,posemo,negemo,anx,anger,sad
0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.108815,0.18595,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.9,0.0,0.0,0.0,0.50625,0.0,0.0,0.0,0.0,0.0,...,0.0,0.548611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.75,0.0,0.0,0.0,0.534066,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.307692,0.307692,0.0,0.0,0.0,0.0
3,0.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.403061,0.612245,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.45,0.0,0.0,0.0,0.162,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Overwrite each feature column of df with its scaled version from dfn.
# Looping over dfn's own columns keeps this in sync with whatever set of
# columns was scaled, instead of repeating the list by hand.
for coluna in dfn.columns:
    df[coluna] = dfn[coluna]

df.head()

Unnamed: 0,id,texto,rotulo,PALAVRAS,LETRAS,MAIUSCULAS,EXCLAMA,ASPAS,Score_Pol,POL,...,negate,verb,adj,interrog,affect,posemo,negemo,anx,anger,sad
0,53,a bunch of different types of fruit on a table...,FALSO,16.0,121.0,0.0,0.0,0.0,0.0,Neutro,...,0.0,0.108815,0.18595,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1136,a bunch of items that are on a table . a bunch...,VERDADE,10.0,96.0,0.0,0.0,0.0,3.0,Positivo,...,0.0,0.548611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,695,a birthday cake with a train on it a birthday ...,FALSO,11.0,91.0,0.0,0.0,0.0,0.0,Neutro,...,0.0,0.0,0.0,0.0,0.307692,0.307692,0.0,0.0,0.0,0.0
3,3312,a white dog laying on top of a bed . a white d...,VERDADE,17.0,98.0,0.0,0.0,0.0,2.0,Positivo,...,0.0,0.403061,0.612245,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1314,a close up of a cake with a train on it a clos...,VERDADE,12.0,100.0,0.0,0.0,0.0,-6.0,Negativo,...,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Grava o Dataset para usar nos Algoritmos de ML

In [14]:
# Remove the raw text and the intermediate columns that are not written to the
# output file, then save the result as CSV without the index.
colunas_descartadas = ['texto', 'Score_Pol', 'POL', 'PALAVRAS', 'LETRAS']
df = df.drop(columns=colunas_descartadas)

# Alternative output path kept for reference: 'datasets/ds_caption.csv'
df.to_csv('datasets2/ds_caption.csv', index=False)