# 안경 박사 데이터 전처리 (colab)
## 1. 드라이브 연결

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive so the
# corpus file stored on Drive can be read like a local file.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## 2. 코퍼스 불러오기
- Pandas 사용
- 1052개의 데이터로 구성 돼 있음 



In [None]:
import spacy
import os
import pandas as pd

# Load spaCy's small English pipeline once; it is a module-level object
# shared by the later cells that call nlp(...).
nlp = spacy.load("en_core_web_sm")

# Read the tab-separated corpus.
# on_bad_lines='warn' (pandas >= 1.3) replaces error_bad_lines=False, which
# was deprecated in pandas 1.3 and removed in 2.0. As before, a row with an
# unexpected number of fields is skipped and reported instead of aborting
# the whole load.
text = pd.read_csv('./gdrive/MyDrive/NLP(자연어처리)/data/face_corpus.tsv',
                   sep='\t', on_bad_lines='warn')

print(text)

     filename                                        description label
0       0.png  She is a Westerner who appears to be in her 30...     A
1       0.png  She looks like 30-year-old. She has medium rou...     C
2       0.png  She is a middle-aged Western woman. She has ey...     A
3       1.png  Looking middle-aged, he has grayish brown hair...     N
4       1.png  He looks like 40-year-old. He has thick flat e...     D
...       ...                                                ...   ...
1047  348.png  He is an oriental middle-aged man. He has bear...     B
1048  348.png  He is a middle-aged man. He has deep eyes and ...     B
1049  349.png  She looks like late 20s to early 30s. It is a ...     B
1050  349.png  She is a middle-aged Western woman. She has ey...     A
1051  349.png  She is a middle-aged woman. She has thick doub...     B

[1052 rows x 3 columns]


b'Skipping line 31: expected 3 fields, saw 4\n'


0번째 행의 description 출력

In [None]:
# Display the full description text stored under index label 0.
text['description'][0]

"She is a Westerner who appears to be in her 30s. It has white, red, and yellow skin. It has long brown straight hair. her skin is good. They are thin eyebrows with arches and have double eyelids and thick eyelashes. There's a fat under my eye. It has an iris mixed with brown and green. She has a high nose with a slight view of her nostrils. She has a smile that reveals her teeth. She has front cheekbones, round face, and small ears."

## 3. DataFrame to List

In [None]:
# Copy the 'description' column into a plain Python list, keeping row order.
myList = text['description'].tolist()

0번째 행의 문장 출력

In [None]:
# Print the first description as it is stored in the list.
print(myList[0])

She is a Westerner who appears to be in her 30s. It has white, red, and yellow skin. It has long brown straight hair. her skin is good. They are thin eyebrows with arches and have double eyelids and thick eyelashes. There's a fat under my eye. It has an iris mixed with brown and green. She has a high nose with a slight view of her nostrils. She has a smile that reveals her teeth. She has front cheekbones, round face, and small ears.


## 4. 문장별로 리스트로 분리하기
2차원 배열

In [None]:
# Split every description into sentences using the boundaries spaCy reports
# in doc.sents. The result is two-dimensional: mlist[k] is the list of
# sentence strings for myList[k].
mlist = []

for description in myList:
    mlist.append([sent.text for sent in nlp(description).sents])

0번째 행의 분리된 문장 출력

In [None]:
# Print the sentence list produced for the first description.
print(mlist[0])

['She is a Westerner who appears to be in her 30s.', 'It has white, red, and yellow skin.', 'It has long brown straight hair.', 'her skin is good.', 'They are thin eyebrows with arches and have double eyelids and thick eyelashes.', "There's a fat under my eye.", 'It has an iris mixed with brown and green.', 'She has a high nose with a slight view of her nostrils.', 'She has a smile that reveals her teeth.', 'She has front cheekbones, round face, and small ears.']


## 5. 필요한 단어가 들어간 문장 추출
`face`, `jawline`, `jaw` 단어가 들어간 문장만 사용

In [None]:
# Tokens that mark a sentence as describing the face shape.
FACE_KEYWORDS = {'face', 'jawline', 'jaw'}

# faceList[k] is [kept_sentences, label] for the k-th description, where
# kept_sentences is the concatenation of every sentence containing one of
# the keywords as an exact token (each sentence followed by one space, or an
# empty string when no sentence matches).
faceList = []
# enumerate() supplies each paragraph's own position. Looking the position up
# with mlist.index(paragraph) is quadratic and returns the FIRST equal
# paragraph, which attaches the wrong label when two descriptions split into
# identical sentence lists.
for row, paragraph in enumerate(mlist):
    # any() keeps a matching sentence exactly once, even when it contains
    # several keyword tokens (e.g. both "face" and "jawline").
    kept = [
        sentence
        for sentence in paragraph
        if any(token.text in FACE_KEYWORDS for token in nlp(sentence))
    ]
    word = ''.join(sentence + ' ' for sentence in kept)
    # Read the label once per paragraph, outside the sentence loop, so a
    # description with no sentences cannot inherit the previous row's label.
    data_index = text['label'].iloc[row]
    faceList.append([word, data_index])

0번째 행의 description과 label 출력

In [None]:
# Print the [kept sentences, label] pair for the first description.
print(faceList[0])

['She has front cheekbones, round face, and small ears. ', 'A']


## 6. 문장 깔끔하게 만들기
- 1) **[ 's ] 삭제** *(ex) It's*
- 2) **영문자 이외의 문자 삭제**
- 3) **대문자 -> 소문자**
- 4) **불용어 제거**
- 5) **표제어 추출** *(ex) making -> make*

In [None]:
import re
def cleaning(data):
    """Reduce English text to a string of lower-case lemmas.

    The steps run in this order:

    1. every ``'s`` is replaced by a space;
    2. every character that is not an ASCII letter is replaced by a space;
    3. the text is lower-cased and split on whitespace;
    4. words found in spaCy's English stop-word set are discarded;
    5. the remaining words are run through the module-level ``nlp``
       pipeline and each token is replaced by its lemma.

    Returns a string in which every lemma is followed by a single space,
    so a non-empty result ends with a space and an input with no surviving
    words yields ``''``.
    """
    # 1) strip the 's suffix
    without_suffix = re.sub("'s", ' ', data)
    # 2) keep letters only
    letters_only = re.sub('[^a-zA-Z]', ' ', without_suffix)
    # 3) lower-case, then tokenise on whitespace
    words = letters_only.lower().split()
    # 4) drop stop words; each kept word is followed by one space
    stop_words = spacy.lang.en.stop_words.STOP_WORDS
    kept = ''.join(w + ' ' for w in words if w not in stop_words)
    # 5) lemmatise what is left
    return ''.join(tok.lemma_ + ' ' for tok in nlp(kept))

In [None]:
# Pair each label with its cleaned description. Note the order flips from
# faceList's [sentences, label] to [label, cleaned_text].
Clean_text = [[label, cleaning(sentences)] for sentences, label in faceList]

깔끔하게 만든 0번째 행의 description과 label 출력

In [None]:
# Print the [label, cleaned description] pair for the first row.
print(Clean_text[0])

['A', 'cheekbone round face small ear ']


## 7. List to DataFrame

In [None]:
# Turn the list of [label, cleaned description] pairs back into a DataFrame
# with the columns named in that same order.
text_ = pd.DataFrame(Clean_text, columns=['label', 'description'])
print(text_)

     label                                        description
0        A                    cheekbone round face small ear 
1        C            rectangular face shape large cheekbone 
2        A      skin color bright mole face face round shape 
3        N                         form angle face large ear 
4        D                                 oblong face shape 
...    ...                                                ...
1047     B                                    face egg shape 
1048     B  eye black wrinkles eye lot freckle face skin l...
1049     B  face shape oval shape color face cool sand che...
1050     A                                  face round shape 
1051     B            face shape egg shape bright white skin 

[1052 rows x 2 columns]


## 8. 데이터 정리하기
### 8.1 비어있는 데이터 삭제
불필요한 데이터 삭제

1052개 -> 1027개

In [None]:
# dropna() removes rows that contain a missing (NaN) value. A description
# that is an empty string '' is not a missing value, so those rows are kept
# here and are handled in the relabelling step instead.
text_drop = text_.dropna()

In [None]:
# Display the frame after rows with missing values were dropped.
text_drop

Unnamed: 0,label,description
0,A,cheekbone round face small ear
1,C,rectangular face shape large cheekbone
2,A,skin color bright mole face face round shape
3,N,form angle face large ear
4,D,oblong face shape
...,...,...
1047,B,face egg shape
1048,B,eye black wrinkles eye lot freckle face skin l...
1049,B,face shape oval shape color face cool sand che...
1050,A,face round shape


### 8.2 잘못 입력된 데이터 바꾸기

In [None]:
# Work on an independent copy. Plain assignment only aliases the dropna()
# result, so the .loc write below raises pandas' SettingWithCopyWarning and
# also modifies text_drop.
text_change = text_drop.copy()
# A row whose cleaned description is empty has no face-related sentence, so
# its label is replaced by 'N'.
text_change.loc[text_change['description'] == '', 'label'] = 'N'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Display the frame after the empty-description rows were relabelled.
text_change

Unnamed: 0,label,description
0,A,cheekbone round face small ear
1,C,rectangular face shape large cheekbone
2,A,skin color bright mole face face round shape
3,N,form angle face large ear
4,D,oblong face shape
...,...,...
1047,B,face egg shape
1048,B,eye black wrinkles eye lot freckle face skin l...
1049,B,face shape oval shape color face cool sand che...
1050,A,face round shape


문장이 비어 있음에도 불구하고 label이 지정되어 있는 경우 label을 'N'으로 바꿈

In [None]:
# Inspect the row with index label 1027 in the frame built before relabelling.
text_.loc[1027]

label          C
description     
Name: 1027, dtype: object

In [None]:
# Inspect index label 1027 again, this time in the relabelled frame.
text_change.loc[1027]

label          N
description     
Name: 1027, dtype: object

## 9. Shuffle (섞기)

In [None]:
# Fixed seed so the shuffle, and therefore the train/test split derived from
# it, is the same on every run.
SHUFFLE_SEED = 42
# frac=1 draws every row exactly once, i.e. a full permutation of the rows.
# reset_index(drop=True) replaces the old labels with a fresh 0..n-1 index.
df_shuffled = text_change.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df_shuffled

Unnamed: 0,label,description
0,B,face egg shape
1,N,doesn t fat face
2,C,face shape square
3,B,round egg shape face wrinkle face color yellow...
4,D,beard long face
...,...,...
1022,A,square chin blunt face shape tip chin
1023,E,wear large square frame black horn frame large...
1024,A,bread round face
1025,N,cheeks chin chubby jawline clear


## 10. Train과 Test 데이터 생성
train : test = 9 : 1 비율로 나눔

In [None]:
train_ratio = 0.9

# train dataset: the first train_ratio share of the shuffled rows
s, e = 0, int(df_shuffled.shape[0] * train_ratio)  # e = number of train rows
df_train = df_shuffled.iloc[s:e][['label', 'description']].copy()
print("index for train: %d ~ %d" % (s, e))

# test dataset: every remaining row.
# Sizing it as int(n * (1.0 - train_ratio)) truncates a second time, so the
# two parts may not add up to n and the last row can end up in neither set.
s, e = e, df_shuffled.shape[0]
print("index for test: %d ~ %d" % (s, e))
df_test = df_shuffled.iloc[s:e][['label', 'description']].copy()

index for train: 0 ~ 924
index for train: 924 ~ 1026


각 데이터의 개수

In [None]:
# Print (rows, columns) of the train and test frames.
print(df_train.shape)
print(df_test.shape)

(924, 2)
(102, 2)


## 11. 전처리한 데이터 저장하기

In [None]:
# Write both splits to Drive as tab-separated files. header=False and
# index=False mean each output line holds only the values of one row,
# with no column names and no index column.
df_train.to_csv('./gdrive/MyDrive/NLP(자연어처리)/data/GD_train.tsv', 
                header=False, index=False, sep='\t')
df_test.to_csv('./gdrive/MyDrive/NLP(자연어처리)/data/GD_test.tsv', 
               header=False, index=False, sep='\t')