In [1]:
# Load the CSV file
import pandas as pd
import numpy as np
from tqdm import tqdm

# Read the raw CSV into a DataFrame and preview its first rows.
csv_path = './TroFi_example.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,***absorb***,Unnamed: 1,Unnamed: 2
0,*nonliteral cluster*,,
1,wsj02:2251,U,Another option will be to try to curb the grow...
2,wsj03:2839,N,But in the short-term it will absorb a lot of ...
3,wsj03:9412,U,"That merger , valued at about $ 1 billion , he..."
4,wsj04:10022,N,Shocks from one-time changes in the terms of t...


In [2]:
# Replace the auto-generated column headers with readable names.
column_names = ["index", "Label", "Text"]
df.columns = column_names

In [3]:
# Preview the first rows to confirm the new column names.
df.head()

Unnamed: 0,index,Label,Text
0,*nonliteral cluster*,,
1,wsj02:2251,U,Another option will be to try to curb the grow...
2,wsj03:2839,N,But in the short-term it will absorb a lot of ...
3,wsj03:9412,U,"That merger , valued at about $ 1 billion , he..."
4,wsj04:10022,N,Shocks from one-time changes in the terms of t...


In [4]:
# Count the missing values in each column.
df.isna().sum()

index    149
Label    348
Text     348
dtype: int64

In [5]:
# Check whether rows containing NaN can safely be dropped:
# show the rows whose Text is missing, skipping the first 300 of them.
missing_text_mask = df['Text'].isna()
df[missing_text_mask][300:]

Unnamed: 0,index,Label,Text
5854,***stick***,,
5855,*nonliteral cluster*,,
6007,,,
6008,*literal cluster*,,
6039,,,
6040,********************,,
6041,,,
6042,***strike***,,
6043,*nonliteral cluster*,,
6104,,,


In [6]:
# Remove every row that contains at least one NaN value (modifies df in place).
df.dropna(axis=0, how='any', inplace=True)

In [8]:
# Continue on an independent copy of the cleaned frame.
df_copy = df.copy(deep=True)

In [9]:
# Remove the "index" column from the working copy (in place).
df_copy.drop(columns=["index"], inplace=True)

In [10]:
# Legend for the Label codes, followed by the number of rows per code.
"""
U: Unannotated
N: Non-literal
L: Literal
"""
label_column = df_copy['Label']
label_column.value_counts()

U    2699
N    2145
L    1592
Name: Label, dtype: int64

In [11]:
# Preview the working copy after the "index" column has been removed.
df_copy.head()

Unnamed: 0,Label,Text
1,U,Another option will be to try to curb the grow...
2,N,But in the short-term it will absorb a lot of ...
3,U,"That merger , valued at about $ 1 billion , he..."
4,N,Shocks from one-time changes in the terms of t...
5,U,"R.J. Reynolds Tobacco Co. has been a rich , fa..."


In [13]:
## Preprocessing
# 1. Decide how to handle NaN values
# 2. Remove the rows whose Label is U
# 3. Remove simple special characters

In [14]:
# Renumber the rows from 0 and discard the old index labels (in place).
df_copy.reset_index(drop=True, inplace=True)

In [21]:
# Check whether the example sentence from the paper is present in the dataset.
## Label scheme: N -> positive / L -> negative / U -> removed
# Iterate positionally over both columns so the search does not depend on the
# index labels being exactly 0..n-1 (label lookups such as df_copy['Text'][i]
# raise KeyError or return the wrong row otherwise).
for i, (text, label) in enumerate(zip(df_copy['Text'], df_copy['Label'])):
    if 'Ever since' in text:
        print(text)   # the matching sentence
        print(i)      # its row position
        print(label)  # its label code

Ever since , Banner has been besieged by hundreds of thrill-seeking callers ./.
417
N
Ever since your scoop ran on the wire , the stock market has been flooded with buy orders ./.
2492
U
Ever since President Reagan in February 1986 called for U.S. development of an `` Orient Express , '' a futuristic aircraft that would fly from New York to Tokyo in two hours , MITI 's aerospace mavens have been itching to get Japan into the action ./.
2774
L


In [22]:
# Remove the rows whose Label is U.
is_label_U = df_copy['Label'] == 'U'
idx_label_U = df_copy[is_label_U].index
new_df = df_copy.drop(index=idx_label_U)

In [23]:
# Preview the frame after the U rows have been removed.
new_df.head()

Unnamed: 0,Label,Text
1,N,But in the short-term it will absorb a lot of ...
3,N,Shocks from one-time changes in the terms of t...
6,N,"During the past 18 months , the Houston-based ..."
7,N,"In the years since 1853 , when 4 , 058 steambo..."
11,N,Mr. Wyss of Data Resources suggested that fact...


In [28]:
# Remove simple special characters: delete every './.' occurrence from each
# sentence and trim the surrounding whitespace.
preprocessed_sen = [txt.replace('./.', '').strip() for txt in new_df['Text']]

In [29]:
# Replace the Text column with the cleaned sentences.
new_df['Text'] = preprocessed_sen

In [30]:
# Check that the marker was removed properly (row with index label 1).
new_df.loc[1, 'Text']

"But in the short-term it will absorb a lot of top management 's energy and attention , '' says Philippe Haspeslagh , a business professor at the European management school , Insead , in Paris"

In [39]:
# Display the filtered DataFrame.
# NOTE(review): this cell's execution counter (In [39]) is higher than that of the
# reset_index cell that follows it (In [33]), so the cells were not executed in the
# order in which they appear.
new_df

Unnamed: 0,Label,Text
0,N,But in the short-term it will absorb a lot of ...
1,N,Shocks from one-time changes in the terms of t...
2,N,"During the past 18 months , the Houston-based ..."
3,N,"In the years since 1853 , when 4 , 058 steambo..."
4,N,Mr. Wyss of Data Resources suggested that fact...
...,...,...
3732,N,But the OTC market was withering badly near th...
3733,N,"But the ANC has grown , and black supremacist ..."
3734,N,The combination was supposed to make First Rep...
3735,N,"Indeed , thanks to a well-oiled political mach..."


In [33]:
# Renumber the remaining rows from 0 and discard the old index labels (in place).
new_df.reset_index(drop=True, inplace=True)

In [35]:
# Count how many rows remain for each label.
new_df['Label'].value_counts()

N    2145
L    1592
Name: Label, dtype: int64

In [36]:
# Convert the Label values to integers: N -> 1, L -> 0.
label_to_id = {'N': 1, 'L': 0}

# Fail early on any label outside the mapping. Silently skipping such a row
# would leave `labels` shorter than the DataFrame it is assigned to afterwards.
unexpected_labels = set(new_df['Label']) - set(label_to_id)
if unexpected_labels:
    raise ValueError("Unexpected Label values: {!r}".format(unexpected_labels))

# Plain list of ints, in row order.
labels = [label_to_id[l] for l in new_df['Label']]

In [40]:
# Create the final dataset frame as an independent copy of new_df.
df_idiom = new_df.copy(deep=True)

In [41]:
# Overwrite the Label column with the integer labels.
df_idiom['Label'] = labels

In [43]:
# Count the rows per integer label to verify the conversion.
df_idiom['Label'].value_counts()

1    2145
0    1592
Name: Label, dtype: int64

In [45]:
# Display the frame with the integer-encoded labels.
df_idiom

Unnamed: 0,Label,Text
0,1,But in the short-term it will absorb a lot of ...
1,1,Shocks from one-time changes in the terms of t...
2,1,"During the past 18 months , the Houston-based ..."
3,1,"In the years since 1853 , when 4 , 058 steambo..."
4,1,Mr. Wyss of Data Resources suggested that fact...
...,...,...
3732,1,But the OTC market was withering badly near th...
3733,1,"But the ANC has grown , and black supremacist ..."
3734,1,The combination was supposed to make First Rep...
3735,1,"Indeed , thanks to a well-oiled political mach..."


In [53]:
# Additional preprocessing: delete every "''/''" and "?/." occurrence from each
# sentence, then trim the surrounding whitespace.
add_preprocess = [
    txt.replace("''/''", '').replace("?/.", '').strip()
    for txt in df_idiom['Text']
]

In [57]:
# Spot-check one cleaned sentence (list position 9).
add_preprocess[9]

'Mr. Ennis said he believes Rockwood got into financial difficulty because its expansion efforts `` outstripped its ability to absorb those activities .'

In [58]:
# Replace the Text column with the additionally cleaned sentences.
df_idiom['Text'] = add_preprocess

In [60]:
# Check that the cleaned text was stored correctly (row with index label 9).
df_idiom.loc[9, 'Text']

'Mr. Ennis said he believes Rockwood got into financial difficulty because its expansion efforts `` outstripped its ability to absorb those activities .'

In [61]:
# Export the processed dataset to CSV without writing the row index.
# NOTE(review): the destination is a hardcoded path under the user's home directory.
df_idiom.to_csv(path_or_buf='~/Desktop/TroFi_dataset.csv', index=False)

In [67]:
# Also save the DataFrame as a pickle file.
import pickle

with open('./TroFi_dataset.pkl', mode='wb') as pkl_file:
    pickle.dump(df_idiom, pkl_file)