In [1]:
import pandas as pd

In [18]:
data = pd.read_csv("datasets/dataset_caption.csv")
data.head()

Unnamed: 0,id,rotulo,texto,texto_pt
0,626,FALSO,a man is holding a cell phone to his ear . a m...,Um homem está segurando um celular perto do ou...
1,972,FALSO,a man in a suit and tie holding a microphone ....,Um homem de terno e gravata segurando um micro...
2,42,FALSO,a group of young children standing next to eac...,um grupo de crianças pequenas em pé ao lado um...
3,1043,FALSO,a man and a woman laying on a bed . a man and ...,Um homem e uma mulher deitados em uma cama. Um...
4,831,FALSO,a man in a suit and tie holding a microphone ....,Um homem de terno e gravata segurando um micro...


## Tramanto para começar a extração do texto

In [19]:
df = data[['id', 'texto_pt', 'rotulo']].copy()
df['texto'] = df['texto_pt']
df = df.drop('texto_pt', axis=1)
df.head()

Unnamed: 0,id,rotulo,texto
0,626,FALSO,Um homem está segurando um celular perto do ou...
1,972,FALSO,Um homem de terno e gravata segurando um micro...
2,42,FALSO,um grupo de crianças pequenas em pé ao lado um...
3,1043,FALSO,Um homem e uma mulher deitados em uma cama. Um...
4,831,FALSO,Um homem de terno e gravata segurando um micro...


In [20]:
#replace($TEXTO_ORIG$,". ." ,". " )
#Remove acentuação errada
df["texto"] = df["texto"].apply(lambda x: x.replace(". .", ". "))
len(df.index)

251

## Contagem no Texto

In [21]:
from nltk import tokenize    
import re
from collections import Counter
from nltk.corpus import stopwords
from string import punctuation

def count_letters(text):
    nonPunct = re.compile('.*[A-Za-z0-9].*')
    filtered = [w for w in text if nonPunct.match(w)]
    counts = Counter(filtered)
    
    return len(counts)

def count_words(text):
    stop_words = set(stopwords.words('english') + list(punctuation))
    palavras = tokenize.word_tokenize(text.lower())
    palavras_sem_stopwords = [palavra for palavra in palavras if palavra not in stop_words]
    
    return len(palavras_sem_stopwords)

def count_upper_letters(text):
    uppers = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    nonPunct = re.compile('.*[A-Za-z0-9].*')
    filtered = [w for w in text if nonPunct.match(w) and w in uppers]
    counts = Counter(filtered)
    
    return len(counts)

def count_exclamation(text):
    uppers = "!"
    nonPunct = re.compile('.*[A-Za-z0-9].*')
    filtered = [w for w in text if nonPunct.match(w) and w in uppers]
    counts = Counter(filtered)
    
    return len(counts)

def count_quote(text):
    quote = "\""
    nonPunct = re.compile('.*[A-Za-z0-9].*')
    filtered = [w for w in text if nonPunct.match(w) and w in quote]
    counts = Counter(filtered)
    
    return len(counts)
    
    
df['PALAVRAS'] = 0.0
df['LETRAS'] = 0.0
df['MAIUSCULAS'] = 0.0
df['EXCLAMA'] = 0.0
df['ASPAS'] = 0.0
    
for (index, row) in df.iterrows():
    #print("words: ",count_words(row['texto'])) 
    #print("letters: ",count_letters(row['texto']))
    df.at[index,'PALAVRAS'] = count_words(row['texto'])
    df.at[index,'LETRAS'] = count_letters(row['texto'])
    df.at[index,'MAIUSCULAS'] = count_upper_letters(row['texto']) / count_letters(row['texto'])
    df.at[index,'EXCLAMA'] = count_exclamation(row['texto'])
    df.at[index,'ASPAS'] = count_quote(row['texto']) / count_letters(row['texto'])

df.head()

Unnamed: 0,id,rotulo,texto,PALAVRAS,LETRAS,MAIUSCULAS,EXCLAMA,ASPAS
0,626,FALSO,Um homem está segurando um celular perto do ou...,24.0,18.0,0.055556,0.0,0.0
1,972,FALSO,Um homem de terno e gravata segurando um micro...,30.0,18.0,0.055556,0.0,0.0
2,42,FALSO,um grupo de crianças pequenas em pé ao lado um...,28.0,19.0,0.0,0.0,0.0
3,1043,FALSO,Um homem e uma mulher deitados em uma cama. Um...,27.0,15.0,0.066667,0.0,0.0
4,831,FALSO,Um homem de terno e gravata segurando um micro...,28.0,17.0,0.058824,0.0,0.0


## Extração de Polaridade

In [22]:
sentilexpt = open("lexicos/SentiLex-lem-PT02.txt","r")
dic_palavra_polaridade = {}

for i in sentilexpt.readlines():
    linha = i.strip()
    pos_ponto = linha.find(" ")
    palavra = (linha[:pos_ponto])
    polaridade = (linha[pos_ponto+1:pos_ponto+2])
    val_pol = 0;
    if polaridade == '+':
        val_pol = 1
    elif polaridade == '-':
        val_pol = -1
    else:
        val_pol = 0
    #print(linha, polaridade, val_pol )    
    dic_palavra_polaridade[palavra] = val_pol
    

def score_sentimento(frase):
    frase = frase.lower()
    l_sentimento = []
    for p in frase.split():
        l_sentimento.append(int(dic_palavra_polaridade.get(p, 0)))
    score = sum(l_sentimento)
    if score > 0:
        return 'Positivo', score
    elif score == 0:
        return 'Neutro', score
    else:
        return 'Negativo', score

df['Score_Pol'] = 0.0 
df['POL'] = "" 

for (index, row) in df.iterrows():
    pol, score = score_sentimento(row['texto'])
    df.at[index,'Score_Pol'] = score
    df.at[index,'POL'] = pol

df.head()    


Unnamed: 0,id,rotulo,texto,PALAVRAS,LETRAS,MAIUSCULAS,EXCLAMA,ASPAS,Score_Pol,POL
0,626,FALSO,Um homem está segurando um celular perto do ou...,24.0,18.0,0.055556,0.0,0.0,0.0,Neutro
1,972,FALSO,Um homem de terno e gravata segurando um micro...,30.0,18.0,0.055556,0.0,0.0,0.0,Neutro
2,42,FALSO,um grupo de crianças pequenas em pé ao lado um...,28.0,19.0,0.0,0.0,0.0,0.0,Neutro
3,1043,FALSO,Um homem e uma mulher deitados em uma cama. Um...,27.0,15.0,0.066667,0.0,0.0,0.0,Neutro
4,831,FALSO,Um homem de terno e gravata segurando um micro...,28.0,17.0,0.058824,0.0,0.0,0.0,Neutro


## Normalizando os Valores de Polaridade 

In [23]:
#from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

dfn =  df[['Score_Pol']].copy()
names = dfn.columns

scaler = MinMaxScaler().fit(dfn)
dfn = scaler.transform(dfn)
dfn = pd.DataFrame(dfn, columns= names)  

df["POL_ABSOLUTO"] = dfn['Score_Pol']

df.head()

Unnamed: 0,id,rotulo,texto,PALAVRAS,LETRAS,MAIUSCULAS,EXCLAMA,ASPAS,Score_Pol,POL,POL_ABSOLUTO
0,626,FALSO,Um homem está segurando um celular perto do ou...,24.0,18.0,0.055556,0.0,0.0,0.0,Neutro,0.0
1,972,FALSO,Um homem de terno e gravata segurando um micro...,30.0,18.0,0.055556,0.0,0.0,0.0,Neutro,0.0
2,42,FALSO,um grupo de crianças pequenas em pé ao lado um...,28.0,19.0,0.0,0.0,0.0,0.0,Neutro,0.0
3,1043,FALSO,Um homem e uma mulher deitados em uma cama. Um...,27.0,15.0,0.066667,0.0,0.0,0.0,Neutro,0.0
4,831,FALSO,Um homem de terno e gravata segurando um micro...,28.0,17.0,0.058824,0.0,0.0,0.0,Neutro,0.0


## Análise gramatical com LIWC 2015

In [24]:
import math
import time
from liwc import Liwc

def lexical_builder(df, dic_path, dic_fields):
    #Carrega a biblioteca do LIWC
    LIWC_FILEPATH = dic_path
    liwc = Liwc(LIWC_FILEPATH)

    colunas = "id,"   
    colunas += ','.join(liwc.categories.values())
    colunas = colunas.split(',')

    #Create empty table
    dfout = pd.DataFrame(columns=(colunas))

    result = []
    for index, row in df.iterrows():
        indice = row['id']
        news = str(row['texto']).lower()    
        emocoes = (liwc.parse(news.split(' ')))
        # Cria matriz esparsa
        avalia = dict.fromkeys(liwc.categories.values(),1 )
        for categ in liwc.categories:
            avalia[liwc.categories[categ]] = emocoes[liwc.categories[categ]] / row['LETRAS']
        temp = {"indice": indice}
        dest = {**temp, ** avalia}    
        result.append(dest)

    for dados in result:
        #print (dados.values())
        b = pd.DataFrame([dados.values()], columns=(colunas))
        dfout = dfout.append(b , ignore_index=True)

    dfout = dfout[dic_fields].copy()
    return dfout
    

dfgram = lexical_builder(df,
                         "lexicos/LIWC2015_pt-utf8.dic",
                         ['id','pronoun','ppron','i','we','you','shehe','they','ipron','article','prep',
                          'auxverb','adverb','conj','negate','verb','adj','interrog'])
dfgram.head()


Unnamed: 0,id,pronoun,ppron,i,we,you,shehe,they,ipron,article,prep,auxverb,adverb,conj,negate,verb,adj,interrog
0,626,0.388889,0.055556,0.0,0.0,0.0,0.055556,0.0,0.388889,0.5,0.166667,0.166667,0.111111,0.0,0.0,0.166667,0.0,0.0
1,972,0.277778,0.0,0.0,0.0,0.0,0.0,0.0,0.277778,0.333333,0.333333,0.055556,0.0,0.166667,0.0,0.055556,0.0,0.0
2,42,0.315789,0.052632,0.0,0.0,0.0,0.052632,0.0,0.315789,0.368421,0.421053,0.0,0.0,0.0,0.0,0.052632,0.157895,0.0
3,1043,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.6,0.2,0.0,0.0,0.2,0.0,0.0,0.0,0.0
4,831,0.352941,0.0,0.0,0.0,0.0,0.0,0.0,0.352941,0.352941,0.176471,0.058824,0.0,0.176471,0.0,0.058824,0.0,0.0


In [25]:
df = pd.merge(df, dfgram,  left_on='id', right_on='id')
df.head()

Unnamed: 0,id,rotulo,texto,PALAVRAS,LETRAS,MAIUSCULAS,EXCLAMA,ASPAS,Score_Pol,POL,...,ipron,article,prep,auxverb,adverb,conj,negate,verb,adj,interrog
0,626,FALSO,Um homem está segurando um celular perto do ou...,24.0,18.0,0.055556,0.0,0.0,0.0,Neutro,...,0.388889,0.5,0.166667,0.166667,0.111111,0.0,0.0,0.166667,0.0,0.0
1,972,FALSO,Um homem de terno e gravata segurando um micro...,30.0,18.0,0.055556,0.0,0.0,0.0,Neutro,...,0.277778,0.333333,0.333333,0.055556,0.0,0.166667,0.0,0.055556,0.0,0.0
2,42,FALSO,um grupo de crianças pequenas em pé ao lado um...,28.0,19.0,0.0,0.0,0.0,0.0,Neutro,...,0.315789,0.368421,0.421053,0.0,0.0,0.0,0.0,0.052632,0.157895,0.0
3,1043,FALSO,Um homem e uma mulher deitados em uma cama. Um...,27.0,15.0,0.066667,0.0,0.0,0.0,Neutro,...,0.6,0.6,0.2,0.0,0.0,0.2,0.0,0.0,0.0,0.0
4,831,FALSO,Um homem de terno e gravata segurando um micro...,28.0,17.0,0.058824,0.0,0.0,0.0,Neutro,...,0.352941,0.352941,0.176471,0.058824,0.0,0.176471,0.0,0.058824,0.0,0.0


## Análise de Sentimentos Affect-BR

In [26]:

dfaffect = lexical_builder(df,
                         "lexicos/LIWC2015_pt-utf8.dic",
                         ['id','affect','posemo','negemo','anx','anger','sad'])
dfaffect.head()

Unnamed: 0,id,affect,posemo,negemo,anx,anger,sad
0,626,0.0,0.0,0.0,0.0,0.0,0.0
1,972,0.0,0.0,0.0,0.0,0.0,0.0
2,42,0.0,0.0,0.0,0.0,0.0,0.0
3,1043,0.0,0.0,0.0,0.0,0.0,0.0
4,831,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
#df = df.drop('affect', axis=1)
#df = df.drop('posemo', axis=1)
#df = df.drop('negemo', axis=1)
#df = df.drop('anx', axis=1)
#df = df.drop('anger', axis=1)
#df = df.drop('sad', axis=1)

df = pd.merge(df, dfaffect,  left_on='id', right_on='id')
df.head()

Unnamed: 0,id,rotulo,texto,PALAVRAS,LETRAS,MAIUSCULAS,EXCLAMA,ASPAS,Score_Pol,POL,...,negate,verb,adj,interrog,affect,posemo,negemo,anx,anger,sad
0,626,FALSO,Um homem está segurando um celular perto do ou...,24.0,18.0,0.055556,0.0,0.0,0.0,Neutro,...,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,972,FALSO,Um homem de terno e gravata segurando um micro...,30.0,18.0,0.055556,0.0,0.0,0.0,Neutro,...,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,42,FALSO,um grupo de crianças pequenas em pé ao lado um...,28.0,19.0,0.0,0.0,0.0,0.0,Neutro,...,0.0,0.052632,0.157895,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1043,FALSO,Um homem e uma mulher deitados em uma cama. Um...,27.0,15.0,0.066667,0.0,0.0,0.0,Neutro,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,831,FALSO,Um homem de terno e gravata segurando um micro...,28.0,17.0,0.058824,0.0,0.0,0.0,Neutro,...,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Normalizando os valores Gramticais e Emoções

In [30]:
dfn = df[[
    'POL_ABSOLUTO',
    'MAIUSCULAS',
    'EXCLAMA',
    'ASPAS',
    'pronoun',
    'ppron',
    'i',
    'we',
    'you',
    'shehe',
    'they',
    'ipron',
    'article',
    'prep',
    'auxverb',
    'adverb',
    'conj',
    'negate',
    'verb',
    'adj',
    'interrog',
    'affect',
    'posemo',
    'negemo',
    'anx',
    'anger',
    'sad']].copy()

names = dfn.columns

scaler = MinMaxScaler().fit(dfn)
dfn = scaler.transform(dfn)
dfn = pd.DataFrame(dfn, columns= names)  
dfn.head()

Unnamed: 0,POL_ABSOLUTO,MAIUSCULAS,EXCLAMA,ASPAS,pronoun,ppron,i,we,you,shehe,...,negate,verb,adj,interrog,affect,posemo,negemo,anx,anger,sad
0,0.0,0.291667,0.0,0.0,0.341085,0.090278,0.0,0.0,0.0,0.090278,...,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.291667,0.0,0.0,0.170543,0.0,0.0,0.0,0.0,0.0,...,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.228886,0.085526,0.0,0.0,0.0,0.085526,...,0.0,0.157895,0.368421,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.35,0.0,0.0,0.665116,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.308824,0.0,0.0,0.28591,0.0,0.0,0.0,0.0,0.0,...,0.0,0.176471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
df['POL_ABSOLUTO'] = dfn['POL_ABSOLUTO']
df['MAIUSCULAS'] = dfn['MAIUSCULAS']
df['EXCLAMA'] = dfn['EXCLAMA']
df['ASPAS'] = dfn['ASPAS']
df['pronoun'] = dfn['pronoun']
df['ppron'] = dfn['ppron']
df['i'] = dfn['i']
df['we'] = dfn['we']
df['you'] = dfn['you']
df['shehe'] = dfn['shehe']
df['they'] = dfn['they']
df['ipron']= dfn['ipron']
df['article'] = dfn['article']
df['prep'] = dfn['prep']
df['auxverb'] = dfn['auxverb']
df['adverb'] = dfn['adverb']
df['conj'] = dfn['conj']
df['negate'] = dfn['negate']
df['verb'] = dfn['verb']
df['adj'] = dfn['adj']
df['interrog'] = dfn['interrog']
df['affect'] = dfn['affect']
df['posemo'] = dfn['posemo']
df['negemo'] = dfn['negemo']
df['anx'] = dfn['anx']
df['anger'] = dfn['anger']
df['sad'] = dfn['sad']

df.head()

Unnamed: 0,id,rotulo,texto,PALAVRAS,LETRAS,MAIUSCULAS,EXCLAMA,ASPAS,Score_Pol,POL,...,negate,verb,adj,interrog,affect,posemo,negemo,anx,anger,sad
0,626,FALSO,Um homem está segurando um celular perto do ou...,24.0,18.0,0.291667,0.0,0.0,0.0,Neutro,...,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,972,FALSO,Um homem de terno e gravata segurando um micro...,30.0,18.0,0.291667,0.0,0.0,0.0,Neutro,...,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,42,FALSO,um grupo de crianças pequenas em pé ao lado um...,28.0,19.0,0.0,0.0,0.0,0.0,Neutro,...,0.0,0.157895,0.368421,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1043,FALSO,Um homem e uma mulher deitados em uma cama. Um...,27.0,15.0,0.35,0.0,0.0,0.0,Neutro,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,831,FALSO,Um homem de terno e gravata segurando um micro...,28.0,17.0,0.308824,0.0,0.0,0.0,Neutro,...,0.0,0.176471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Grava o Dataset para usar nos Algoritmos de ML

In [32]:
df = df.drop('texto',axis=1)
df = df.drop('Score_Pol',axis=1)
df = df.drop('POL',axis=1)
df = df.drop('PALAVRAS',axis=1)
df = df.drop('LETRAS',axis=1)

df.to_csv('datasets/ds_caption_pt.csv', index=False) 