# 이미지 크기 정규화

TODO
* 이상치를 제외한 각 단어별 이미지 정규화
* 라벨링 데이터 만들기

In [7]:
import pandas as pd
from PIL import Image

# Load the per-word image metadata, including the measured sizes and the
# outlier flag, from the CSV file in the working directory.
word_data = pd.read_csv(
    filepath_or_buffer='word_data_with_size_outlier.csv',
)

In [8]:
# Display the loaded table to inspect its columns and a sample of rows.
word_data

Unnamed: 0,image_file_name,sex,age,age_range,word,word_len,width,height,aspect_ratio,is_outlier
0,IMG_OCR_53_4PR_09305_1.png,female,41,40s,강원도,3,186,66,2.818182,False
1,IMG_OCR_53_4PR_09305_2.png,female,41,40s,경상북도,4,235,69,3.405797,False
2,IMG_OCR_53_4PR_09305_3.png,female,41,40s,경기도,3,181,62,2.919355,False
3,IMG_OCR_53_4PR_09305_4.png,female,41,40s,인제군,3,157,64,2.453125,False
4,IMG_OCR_53_4PR_09305_5.png,female,41,40s,영광군,3,162,65,2.492308,False
...,...,...,...,...,...,...,...,...,...,...
462888,IMG_OCR_53_4PR_99941_28.png,male,22,20s,서운면,3,177,85,2.082353,False
462889,IMG_OCR_53_4PR_99941_29.png,male,22,20s,복세편살,4,246,94,2.617021,False
462890,IMG_OCR_53_4PR_99941_30.png,male,22,20s,버카충,3,181,80,2.262500,False
462891,IMG_OCR_53_4PR_99941_31.png,male,22,20s,별다줄,3,191,81,2.358025,False


## 이상치를 제외한 각 단어별 이미지 정규화

### 2글자

| | 평균 | 표준편차 |
|--|--|--|
| 가로 | 109.970811 | 29.449407 |
| 세로 | 68.390976 | 18.969999 |
| 비율 | 1.654096 | 0.384947 |

-> `가로` = 세로 평균 * 비율 평균 = $113.125$, `세로` = $68.391$

$\therefore$ `가로` = 113, `세로` = 68

### 3글자

| | 평균 | 표준편차 |
|--|--|--|
| 가로 | 176.854643 | 45.656841 |
| 세로 | 76.079599 | 19.289802 |
| 비율 | 2.369767 | 0.479966 |

-> `가로` = 세로 평균 * 비율 평균 = $180.291$, `세로` = $76.080$

$\therefore$ `가로` = 180, `세로` = 76

### 4글자

| | 평균 | 표준편차 |
|--|--|--|
| 가로 | 214.001306 | 52.740179 |
| 세로 | 71.883658 | 18.842780 |
| 비율 | 3.042366 | 0.58456 |

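-> `가로` = 세로 평균 * 비율 평균 = $218.696$ (원래 $209.696$ 으로 기록되어 있었으나 $71.883658 \times 3.042366 \approx 218.696$), `세로` = $71.884$

$\therefore$ `가로` = 210, `세로` = 72 (주의: 위 계산대로라면 `가로`는 219가 되어야 한다. 아래 `size_dict`는 210을 사용하므로 의도한 값인지 확인 필요)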

In [9]:
# Remove the raw size columns (width, height, aspect ratio) in place, then
# display the slimmed-down table.
word_data.drop(
    columns=['width', 'height', 'aspect_ratio'],
    inplace=True,
)
word_data

Unnamed: 0,image_file_name,sex,age,age_range,word,word_len,is_outlier
0,IMG_OCR_53_4PR_09305_1.png,female,41,40s,강원도,3,False
1,IMG_OCR_53_4PR_09305_2.png,female,41,40s,경상북도,4,False
2,IMG_OCR_53_4PR_09305_3.png,female,41,40s,경기도,3,False
3,IMG_OCR_53_4PR_09305_4.png,female,41,40s,인제군,3,False
4,IMG_OCR_53_4PR_09305_5.png,female,41,40s,영광군,3,False
...,...,...,...,...,...,...,...
462888,IMG_OCR_53_4PR_99941_28.png,male,22,20s,서운면,3,False
462889,IMG_OCR_53_4PR_99941_29.png,male,22,20s,복세편살,4,False
462890,IMG_OCR_53_4PR_99941_30.png,male,22,20s,버카충,3,False
462891,IMG_OCR_53_4PR_99941_31.png,male,22,20s,별다줄,3,False


In [10]:
# Discard every row whose is_outlier flag is True, modifying the frame in
# place, then display what remains.
word_data.drop(
    index=word_data[word_data['is_outlier'].eq(True)].index,
    inplace=True,
)
word_data

Unnamed: 0,image_file_name,sex,age,age_range,word,word_len,is_outlier
0,IMG_OCR_53_4PR_09305_1.png,female,41,40s,강원도,3,False
1,IMG_OCR_53_4PR_09305_2.png,female,41,40s,경상북도,4,False
2,IMG_OCR_53_4PR_09305_3.png,female,41,40s,경기도,3,False
3,IMG_OCR_53_4PR_09305_4.png,female,41,40s,인제군,3,False
4,IMG_OCR_53_4PR_09305_5.png,female,41,40s,영광군,3,False
...,...,...,...,...,...,...,...
462888,IMG_OCR_53_4PR_99941_28.png,male,22,20s,서운면,3,False
462889,IMG_OCR_53_4PR_99941_29.png,male,22,20s,복세편살,4,False
462890,IMG_OCR_53_4PR_99941_30.png,male,22,20s,버카충,3,False
462891,IMG_OCR_53_4PR_99941_31.png,male,22,20s,별다줄,3,False


In [11]:
# Target (width, height) passed to Image.resize, keyed by word length.
# NOTE(review): the notes above derive the 4-character width from
# 71.883658 * 3.042366, which is about 218.70 rather than the recorded
# 209.696; 210 is kept here unchanged -- confirm the intended value.
size_dict = {
    2: (113, 68),
    3: (180, 76),
    4: (210, 72),
}

In [12]:
from pathlib import Path

# Resize every remaining word image to the target size for its word length
# and write the result, under the same file name, into the output directory.
source_dir = Path('./Preprocess_Word')
target_dir = Path('./Final_Image_Data')
# Image.save does not create missing directories, so make sure it exists
# before the first write.
target_dir.mkdir(parents=True, exist_ok=True)

# Only two columns are needed, so iterate over them directly instead of
# materializing a full Series for every row.
for file_name, word_len in zip(word_data['image_file_name'], word_data['word_len']):
    # A word_len that is not a key of size_dict raises KeyError here.
    target_size = size_dict[word_len]
    # The context manager closes the source file once the resized copy,
    # which is an independent image, has been produced.
    with Image.open(source_dir / file_name) as image:
        resized = image.resize(target_size)
    resized.save(target_dir / file_name)