In [72]:
import numpy as np
import pandas as pd
import re

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from nltk.corpus import wordnet

In [73]:
# Load the data: raw sentences and the ratings from reader.tsv, then join
# the two tables on their shared `id` column (pandas' default inner join).
data_sentence = pd.read_table('./Emobank-master/corpus/raw.tsv')
data_vad = pd.read_table('./Emobank-master/corpus/reader.tsv')
data_merged = data_sentence.merge(data_vad, on='id')

# Row counts before and after the join, to see how many ids matched.
print(*(len(frame) for frame in (data_sentence, data_vad, data_merged)))

10358 10325 10142


In [74]:
# Build the normalised text column `reg` from the raw `sentence` column.

# Collapse every run of digits into a single "0".
p_num = re.compile(r'[0-9]+')

# Match a whole URL up to the next whitespace. The earlier pattern's character
# class did not contain "/", so for "http://host/path" only "http:" was removed
# and "//host/path" leaked into the normalised text.
p_url = re.compile(r'https?://\S+')

# Characters that are replaced by a single space: . , ! ' " and curly quotes.
# Other punctuation (e.g. "?" or ":") is not touched by this step.
_PUNCT_TO_SPACE = str.maketrans({ch: " " for ch in ".,!'\"“”"})


def _normalize_sentence(text):
    """Return `text` with digits collapsed, URLs and selected punctuation blanked, lower-cased."""
    text = p_num.sub("0", text)
    # URLs must be removed before the punctuation step, which would otherwise
    # split them at every "." and make them unrecognisable.
    text = p_url.sub(" ", text)
    text = text.translate(_PUNCT_TO_SPACE)
    return text.lower()


ser = data_merged['sentence'].apply(_normalize_sentence)

# Keep the original columns and add the normalised text as `reg`.
data_preprocessed = data_merged.assign(
    reg = ser
)


In [75]:
from nltk import word_tokenize

# Add `words`: the number of NLTK tokens in each normalised sentence (`reg`).
# e.g. word_tokenize("This is my text, this is a nice way  to input text. I'm angry")
data_preprocessed = data_preprocessed.assign(
    words=lambda frame: frame['reg'].map(lambda text: len(word_tokenize(text)))
)

In [76]:
# Persist the preprocessed frame (index included) as a UTF-16 encoded CSV.
data_preprocessed.to_csv(path_or_buf='data_preprocessed.csv', encoding="utf-16")

In [77]:
# Preview the first rows of the preprocessed frame, including `reg` and `words`.
data_preprocessed.head()

Unnamed: 0,id,sentence,Arousal,Dominance,Valence,sd.Arousal,sd.Dominance,sd.Valence,freq,reg,words
0,Acephalous-Cant-believe_4_47,I can't believe I wrote all that last year.,3.4,3.2,3.0,0.8,0.4,0.0,5,i can t believe i wrote all that last year,10
1,Acephalous-Cant-believe_83_354,Because I've been grading all damn day and am ...,3.2,3.2,2.8,0.4,0.4,0.4,5,because i ve been grading all damn day and am ...,49
2,Acephalous-Cant-believe_355_499,"However, when I started looking through my arc...",3.0,3.2,3.4,0.0,0.4,0.489898,5,however when i started looking through my arc...,26
3,Acephalous-Cant-believe_500_515,What do I mean?,3.0,3.0,3.0,0.0,0.0,0.0,5,what do i mean?,5
4,Acephalous-Cant-believe_517_626,The posts I consider foundational to my curren...,3.0,3.0,3.0,0.0,0.0,0.0,5,the posts i consider foundational to my curren...,18


In [78]:
# Valence-only view: id, rating, raw sentence, rating spread and normalised text.
valence_columns = ['id', 'Valence', 'sentence', 'sd.Valence', 'reg']
# .copy() gives an independent frame, as the concat of Series did before.
data_valence = data_preprocessed[valence_columns].copy()
data_valence.head()

Unnamed: 0,id,Valence,sentence,sd.Valence,reg
0,Acephalous-Cant-believe_4_47,3.0,I can't believe I wrote all that last year.,0.0,i can t believe i wrote all that last year
1,Acephalous-Cant-believe_83_354,2.8,Because I've been grading all damn day and am ...,0.4,because i ve been grading all damn day and am ...
2,Acephalous-Cant-believe_355_499,3.4,"However, when I started looking through my arc...",0.489898,however when i started looking through my arc...
3,Acephalous-Cant-believe_500_515,3.0,What do I mean?,0.0,what do i mean?
4,Acephalous-Cant-believe_517_626,3.0,The posts I consider foundational to my curren...,0.0,the posts i consider foundational to my curren...


In [79]:
# Share of rows whose Valence, Arousal and Dominance all lie in [2.8, 3.2]
# (both bounds inclusive).
in_band = (
    data_preprocessed['Valence'].between(2.8, 3.2)
    & data_preprocessed['Arousal'].between(2.8, 3.2)
    & data_preprocessed['Dominance'].between(2.8, 3.2)
)
num = int(in_band.sum())
print(num / len(data_preprocessed))

0.3974561230526523


In [80]:
# True for rows whose three ratings all fall inside [2.8, 3.2] (inclusive).
vad_mask = (
    data_preprocessed[['Valence', 'Arousal', 'Dominance']]
    .apply(lambda ratings: ratings.between(2.8, 3.2))
    .all(axis=1)
)

# data_1: rows inside the band on every dimension; data_2: all remaining rows.
data_1 = data_preprocessed[vad_mask]
data_2 = data_preprocessed[~vad_mask]

In [81]:
# Sanity check: the two subsets together account for every row.
len(data_1) + len(data_2) == len(data_preprocessed)

True

In [82]:
# Keep a random 20% of the in-band rows (data_1) and set the other 80% aside.
# random_state is fixed so that a kernel restart reproduces the same subsample;
# without it every run selected different rows.
data_use, data_no_use = train_test_split(data_1, test_size=0.8, random_state=42)

# Fraction of data_1 that was kept.
len(data_use)/len(data_1)

0.19995038451997024

In [83]:
# Down-sampled dataset: the kept in-band rows plus every out-of-band row.
data_cut = pd.concat([data_use, data_2])


def _band_share(frame, column):
    """Fraction of rows in `frame` whose `column` value lies in [2.8, 3.2]."""
    inside = (frame[column] >= 2.8) & (frame[column] <= 3.2)
    return len(frame[inside]) / len(frame)


# For each dimension, print the share before and after the down-sampling.
for column in ('Valence', 'Arousal'):
    for frame in (data_preprocessed, data_cut):
        print(_band_share(frame, column))

0.620193255768093
0.44311117536504263
0.6566752119897457
0.49660257336995806


In [84]:
# Split into train/dev/test = 6/3/1: first hold out 40%, then split that
# hold-out 75/25 into dev and test.
# random_state is fixed so the split membership written to disk later is
# reproducible; without it every run produced a different partition.
train, dev_test = train_test_split(data_cut, test_size=0.4, random_state=42)
dev, test = train_test_split(dev_test, test_size=0.25, random_state=42)

In [85]:
# Tag every row with the name of the split it belongs to.
train, dev, test = (
    part.assign(data_type=label)
    for part, label in zip((train, dev, test), ('train', 'dev', 'test'))
)

In [86]:
# Stack the labelled splits back into one frame (train, then dev, then test).
data_cut = pd.concat((train, dev, test), axis=0)

In [87]:
# Print the fraction of rows labelled train and dev (expected about 0.6 and 0.3).
for split_name in ('train', 'dev'):
    print(len(data_cut[data_cut['data_type'] == split_name]) / len(data_cut))

0.5999710857308082
0.2999855428654041


In [88]:
# Normalise to [-1, 1]: linearly map ratings so that a -> -1 and b -> 1.
a=1
b=5
data_cut = data_cut.assign(**{
    col + '_reg': 2 * (data_cut[col] - a) / (b - a) - 1
    for col in ('Valence', 'Arousal', 'Dominance')
})


In [89]:
# Preview the final frame with the split label and the rescaled *_reg columns.
data_cut.head()

Unnamed: 0,id,sentence,Arousal,Dominance,Valence,sd.Arousal,sd.Dominance,sd.Valence,freq,reg,words,data_type,Valence_reg,Arousal_reg,Dominance_reg
448,detroit_15065_15155,Otters in post-Valdez Alaska are clawing their...,3.4,3.4,2.0,1.019804,0.8,0.894427,5,otters in post-valdez alaska are clawing their...,13,train,-0.5,0.2,0.2
6658,20000410_nyt-NEW_47_77,By Scott Montgomery Washington,2.0,3.0,3.0,1.0,0.0,0.0,2,by scott montgomery washington,4,train,0.0,-0.5,0.0
3528,hotel-california_12145_12198,I would immerse myself in new virtual reality ...,3.4,3.2,3.2,0.489898,0.4,0.4,5,i would immerse myself in new virtual reality ...,9,train,0.1,0.2,0.1
9570,SemEval_918,Barbaro's legacy: saving other horses,3.4,3.0,3.4,0.489898,0.632456,0.489898,5,barbaro s legacy: saving other horses,7,train,0.2,0.2,0.0
5871,118CWL050_3607_3658,"When you give to Big Sisters, you can get 50% ...",3.6,3.0,3.4,0.489898,0.0,0.489898,5,when you give to big sisters you can get 0% b...,12,train,0.2,0.3,0.0


In [90]:
# Persist the final frame (index included) as a UTF-16 encoded CSV.
data_cut.to_csv(path_or_buf='data_cut.csv', encoding="utf-16")

In [70]:
# Check the Python type of one `sentence` value, looked up by index label.
# `.ix` was deprecated and later removed from pandas; `.loc` is the label-based
# equivalent. NOTE(review): label 1074 is only present if that row survived the
# random down-sampling, so this lookup can raise KeyError on a fresh run.
type(data_cut.loc[1074, 'sentence'])

str