# Corpus Merging.

## CSAT 만들기

CSAT의 경우, 자체적으로 POS 값을 달기 애매하기 때문에 E-lexicon proj에서 제공하는 POS 값을 우선적으로 가져와서 붙이자. 아마 대부분의 단어가 E-lexicon에 있지 않을까?

### Module Import

In [1]:
import pandas as pd # 데이터 처리
import numpy as np # 데이터 처리
import seaborn as sns # 데이터 시각화
from tqdm import tqdm # 데이터 처리
import re # 정규식으로 특수기호 및 char 처리
import json # 데이터 가공 후 저장할 형식
from typing import Any, Dict, List  # python data-type 정의
from pathlib import Path # 파일 경로 처리
import os # 파일 경로 및 처리
from json_handler import JsonFileHandler
from preprocessor import Preprocessor, TargetCorpusPreprocessor

### file path setting

In [4]:
# Resolve the project directories relative to the notebook's working directory.
proj_path = os.path.dirname(os.path.dirname(Path.cwd()))  # two levels above cwd
corpora_dir = os.path.join(proj_path, 'Corpora')
CSAT_json_path = save_path = os.path.join(corpora_dir, 'CSAT-latest')

# Validate the paths up front so later cells fail early with a clear message.
if not os.path.exists(proj_path):
    raise FileNotFoundError(f"Project path not found at {proj_path}")
if not os.path.exists(CSAT_json_path):
    raise FileNotFoundError(f"CSAT json path not found at {CSAT_json_path}")
if not any(file.endswith('.json') for file in os.listdir(CSAT_json_path)):
    # A missing input is a FileNotFoundError; FileExistsError signals the
    # opposite situation (a file that is unexpectedly already present).
    raise FileNotFoundError(f"No .json file found in {CSAT_json_path}")
print("All file paths are valid.")

All file paths are valid.


### Utils: generate_word_list
중복을 허용하여 단어를 extend 한 list 생성.
이후 pandas의 다른 method(ex: value_counts)를 사용해서 빈도수 계산.

In [5]:
def generate_word_list(document_list: list, document_dir: str):
    """Collect the tokens of every listed JSON document into one flat list.

    Each file name in ``document_list`` is joined onto ``document_dir`` and
    loaded with ``JsonFileHandler.load_data``. A loaded document may be a
    single object or a list of objects; in both cases the ``'tokens'`` entry
    of each object is appended. Duplicates are kept, so the result can be fed
    straight into a frequency count.

    Args:
        document_list: File names of the JSON documents to read.
        document_dir: Directory that contains those files.

    Returns:
        A list with all tokens, in file order and then object order.
    """
    loader = JsonFileHandler()
    tokens = []

    for file_name in document_list:
        payload = loader.load_data(os.path.join(document_dir, file_name))
        # Treat a single object as a one-element list so both layouts share one path.
        records = payload if isinstance(payload, list) else [payload]
        for record in records:
            tokens.extend(record['tokens'])

    return tokens

### CSAT(only): 수능 + 모의고사

In [6]:
# List the JSON files of the CSAT corpus folder and split them by the
# 'reading' / 'listening' keyword contained in the file name.
json_handler = JsonFileHandler()
CSAT_path = os.path.join(corpora_dir, 'CSAT-latest')
corpus_list = [name for name in os.listdir(CSAT_path) if name.endswith('.json')]
listening_corpus = [name for name in corpus_list if 'listening' in name]
reading_corpus = [name for name in corpus_list if 'reading' in name]

len(reading_corpus), len(listening_corpus)

(52, 49)

In [7]:
# Tokens of the exam corpus only: listening files first, then reading files.
listening_word_list = generate_word_list(listening_corpus, CSAT_path)
reading_word_list = generate_word_list(reading_corpus, CSAT_path)
CSAT_only_word_list = pd.Series([*listening_word_list, *reading_word_list])

In [8]:
# Total number of tokens (duplicates included) in the listening + reading series.
CSAT_only_word_list.shape

(482265,)

In [9]:
# Frequency of each distinct token; the recorded output shows punctuation tokens are counted too.
CSAT_only_word_list.value_counts()

.            32877
,            21944
the          21540
to           12665
of           10010
             ...  
arabian          1
sapphire         1
sandstone        1
khyber           1
doubly           1
Name: count, Length: 17438, dtype: int64

In [10]:
# Build the Word/Freq table and export it.
# The index and the value column are named explicitly instead of renaming the
# default labels produced by reset_index(), because those defaults
# ('index' / 'count') changed between pandas versions and a non-matching
# rename() would silently leave the columns unnamed as intended.
CSAT_only_word_list_df = (
    CSAT_only_word_list.value_counts()
    .rename_axis('Word')
    .reset_index(name='Freq')
)
CSAT_only_word_list_df.to_excel(os.path.join(corpora_dir, 'merged_corpora', 'CSAT_only.xlsx'), index=False)
CSAT_only_word_list_df

Unnamed: 0,Word,Freq
0,.,32877
1,",",21944
2,the,21540
3,to,12665
4,of,10010
...,...,...
17433,arabian,1
17434,sapphire,1
17435,sandstone,1
17436,khyber,1


### textbook(only)

In [11]:
# List the JSON files of the corpus folder and keep those whose name
# contains 'textbook'.
json_handler = JsonFileHandler()
CSAT_path = os.path.join(corpora_dir, 'CSAT-latest')
corpus_list = [name for name in os.listdir(CSAT_path) if name.endswith('.json')]
textbook_corpus = [name for name in corpus_list if 'textbook' in name]

len(textbook_corpus)

71

In [12]:
# Tokens of the textbook files only, wrapped in a Series for counting.
textbook_word_list = generate_word_list(document_list=textbook_corpus, document_dir=CSAT_path)
textbook_only_word_list = pd.Series(textbook_word_list)

In [13]:
# Total number of tokens (duplicates included) in the textbook series.
textbook_only_word_list.shape

(352760,)

In [14]:
# Frequency of each distinct textbook token.
textbook_only_word_list.value_counts()

.                 22088
the               16763
,                 16722
to                 9000
and                7844
                  ...  
elegance              1
geographically        1
breakfasts            1
whiteboard            1
maturity              1
Name: count, Length: 14822, dtype: int64

In [15]:
# Build the Word/Freq table for the textbook tokens and export it.
# Index and value column are named explicitly so the result does not depend on
# the default labels of reset_index(), which differ between pandas versions.
textbook_only_word_list_df = (
    textbook_only_word_list.value_counts()
    .rename_axis('Word')
    .reset_index(name='Freq')
)
textbook_only_word_list_df.to_excel(os.path.join(corpora_dir, 'merged_corpora', 'textbook_only.xlsx'), index=False)
textbook_only_word_list_df

Unnamed: 0,Word,Freq
0,.,22088
1,the,16763
2,",",16722
3,to,9000
4,and,7844
...,...,...
14817,elegance,1
14818,geographically,1
14819,breakfasts,1
14820,whiteboard,1


### CSAT Total

In [16]:
# List the JSON files of the corpus folder and split them into the three
# sub-corpora by the keyword contained in the file name.
json_handler = JsonFileHandler()
CSAT_path = os.path.join(corpora_dir, 'CSAT-latest')
corpus_list = [name for name in os.listdir(CSAT_path) if name.endswith('.json')]
textbook_corpus = [name for name in corpus_list if 'textbook' in name]
listening_corpus = [name for name in corpus_list if 'listening' in name]
reading_corpus = [name for name in corpus_list if 'reading' in name]

len(reading_corpus), len(listening_corpus), len(textbook_corpus)

(52, 49, 71)

In [17]:
# Tokens of the full corpus: listening, reading and textbook, in that order.
listening_word_list = generate_word_list(listening_corpus, CSAT_path)
reading_word_list = generate_word_list(reading_corpus, CSAT_path)
textbook_word_list = generate_word_list(textbook_corpus, CSAT_path)
CSAT_word_list = pd.Series([*listening_word_list, *reading_word_list, *textbook_word_list])

In [58]:
# Total number of tokens (duplicates included) across all three sub-corpora.
CSAT_word_list.shape

(835025,)

In [59]:
# Frequency of each distinct token in the combined corpus.
CSAT_word_list.value_counts()

.             54965
,             38666
the           38303
to            21665
a             17505
              ...  
tersely           1
trimly            1
imbricate         1
sepals            1
regretting        1
Name: count, Length: 22147, dtype: int64

In [60]:
# Build the Word/Freq table for the combined corpus and export it.
# Index and value column are named explicitly so the result does not depend on
# the default labels of reset_index(), which differ between pandas versions.
CSAT_word_list_df = (
    CSAT_word_list.value_counts()
    .rename_axis('Word')
    .reset_index(name='Freq')
)
CSAT_word_list_df.to_excel(os.path.join(corpora_dir, 'merged_corpora', 'CSAT.xlsx'), index=False)
CSAT_word_list_df

Unnamed: 0,Word,Freq
0,.,54965
1,",",38666
2,the,38303
3,to,21665
4,a,17505
...,...,...
22142,tersely,1
22143,trimly,1
22144,imbricate,1
22145,sepals,1


## E-lexicon proj

E-lexicon proj를 보면, a의 orthographic N이 1이다. 즉, b,c,d,e 같이 치환된 케이스 중 비단어는 제외했다는 말이니, CSAT도 그대로 사용하기는 무리. 따라서 E-lexicon이랑 intersection을 먼저 시켜봐야 할 듯.

In [21]:
# Folder that holds the English Lexicon Project files inside the corpora directory.
E_lexicon_proj_path = os.path.join(corpora_dir, 'E-lexicon-proj')

In [22]:
# Load the English Lexicon Project item list.
# keep_default_na=False stops pandas from parsing word tokens that look like
# missing-value markers (e.g. the word "null") as NaN. The recorded info()
# output shows every other column fully populated under the default parsing,
# so only the Word column is affected by this switch.
E_lexicon_df = pd.read_csv(
    os.path.join(E_lexicon_proj_path, 'English Lexicon Project Items original.csv'),
    keep_default_na=False,
)
E_lexicon_df.head()

Unnamed: 0,Word,Length,Freq_HAL,Log_Freq_HAL,SUBTLWF,LgSUBTLWF,SUBTLCD,LgSUBTLCD,Ortho_N,OLD,POS
0,a,1,10610626,16.177,20415.27,6.018,99.93,3.923,1,1.45,minor|NN
1,aah,3,222,5.403,52.71,3.43,7.56,2.803,2,1.85,#
2,Aaron,5,10806,9.288,14.65,2.874,1.93,2.212,3,1.85,NN
3,aback,5,387,5.958,0.29,1.204,0.18,1.204,0,1.95,RB
4,abacus,6,513,6.24,0.24,1.114,0.12,1.041,0,2.9,NN


In [23]:
# The lexicon contains the word "null"; unless a type is forced it is classified as a null value.
# NOTE(review): astype(str) turns a value that is already missing into the string 'nan',
# not back into the original spelling — confirm the affected rows hold the intended word.
E_lexicon_df['Word'] = E_lexicon_df['Word'].astype(str)
E_lexicon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40481 entries, 0 to 40480
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Word          40481 non-null  object 
 1   Length        40481 non-null  int64  
 2   Freq_HAL      40481 non-null  object 
 3   Log_Freq_HAL  40481 non-null  float64
 4   SUBTLWF       40481 non-null  object 
 5   LgSUBTLWF     40481 non-null  object 
 6   SUBTLCD       40481 non-null  object 
 7   LgSUBTLCD     40481 non-null  object 
 8   Ortho_N       40481 non-null  int64  
 9   OLD           40481 non-null  object 
 10  POS           40481 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 3.4+ MB


E-lexicon proj에서도 Word는 lower 적용.

In [24]:
# Lower-case the lexicon words as well.
lowercased_words = E_lexicon_df['Word'].str.lower()
E_lexicon_df['Word'] = lowercased_words
E_lexicon_df.head()

Unnamed: 0,Word,Length,Freq_HAL,Log_Freq_HAL,SUBTLWF,LgSUBTLWF,SUBTLCD,LgSUBTLCD,Ortho_N,OLD,POS
0,a,1,10610626,16.177,20415.27,6.018,99.93,3.923,1,1.45,minor|NN
1,aah,3,222,5.403,52.71,3.43,7.56,2.803,2,1.85,#
2,aaron,5,10806,9.288,14.65,2.874,1.93,2.212,3,1.85,NN
3,aback,5,387,5.958,0.29,1.204,0.18,1.204,0,1.95,RB
4,abacus,6,513,6.24,0.24,1.114,0.12,1.041,0,2.9,NN


In [25]:
# Re-check column dtypes and non-null counts after lower-casing.
E_lexicon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40481 entries, 0 to 40480
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Word          40481 non-null  object 
 1   Length        40481 non-null  int64  
 2   Freq_HAL      40481 non-null  object 
 3   Log_Freq_HAL  40481 non-null  float64
 4   SUBTLWF       40481 non-null  object 
 5   LgSUBTLWF     40481 non-null  object 
 6   SUBTLCD       40481 non-null  object 
 7   LgSUBTLCD     40481 non-null  object 
 8   Ortho_N       40481 non-null  int64  
 9   OLD           40481 non-null  object 
 10  POS           40481 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 3.4+ MB


In [26]:
from preprocessor import TargetCorpusPreprocessor

# Run the project preprocessor over the lexicon table and export the result.
# (The progress output indicates it cleans the table column by column.)
lexicon_preprocessor = TargetCorpusPreprocessor(E_lexicon_df)
E_lexicon_df = lexicon_preprocessor.preprocess()
E_lexicon_df.to_excel(
    os.path.join(corpora_dir, 'merged_corpora', 'E_lexicon_preprocessed.xlsx'),
    index=False,
)

Cleaning information col by col:   0%|          | 0/11 [00:00<?, ?it/s]

Freq_HAL
SUBTLWF


Cleaning information col by col:  45%|████▌     | 5/11 [00:02<00:02,  2.45it/s]

LgSUBTLWF


Cleaning information col by col:  55%|█████▍    | 6/11 [00:04<00:03,  1.30it/s]

SUBTLCD


Cleaning information col by col:  64%|██████▎   | 7/11 [00:06<00:04,  1.07s/it]

LgSUBTLCD


Cleaning information col by col: 100%|██████████| 11/11 [00:08<00:00,  1.28it/s]

OLD
POS





In [78]:
# Show the preprocessed lexicon table.
E_lexicon_df

Unnamed: 0,Word,Length,Freq_HAL,Log_Freq_HAL,SUBTLWF,LgSUBTLWF,SUBTLCD,LgSUBTLCD,Ortho_N,OLD,POS
0,a,1,10610626,16.177,20415.27,6.018,99.930,3.923,1,1.45,minor|NN
1,aah,3,222,5.403,52.71,3.430,7.560,2.803,2,1.85,
2,aaron,5,10806,9.288,14.65,2.874,1.930,2.212,3,1.85,NN
3,aback,5,387,5.958,0.29,1.204,0.180,1.204,0,1.95,RB
4,abacus,6,513,6.240,0.24,1.114,0.120,1.041,0,2.90,NN
...,...,...,...,...,...,...,...,...,...,...,...
40476,zoom,4,4920,8.501,3.55,2.260,1.290,2.037,4,1.70,VB|NN
40477,zooming,7,523,6.260,0.63,1.518,0.310,1.431,3,1.85,VB
40478,zooms,5,385,5.953,0.06,0.602,0.040,0.602,3,1.80,VB
40479,zucchini,8,314,5.749,0.96,1.699,0.250,1.342,0,3.75,NN


### CSAT_edit

In [96]:
# Reload the exported frequency tables.
# keep_default_na=False stops pandas from turning word tokens that look like
# missing-value markers (e.g. "null", "nan", "n/a") into NaN on load, which
# would silently drop those words from the merges on 'Word' that follow.
CSAT_only_df = pd.read_excel(os.path.join(corpora_dir, 'merged_corpora', 'CSAT_only.xlsx'), keep_default_na=False)
textbook_only_df = pd.read_excel(os.path.join(corpora_dir, 'merged_corpora', 'textbook_only.xlsx'), keep_default_na=False)
CSAT_df = pd.read_excel(os.path.join(corpora_dir, 'merged_corpora', 'CSAT.xlsx'), keep_default_na=False)

# overlap: 1562 (original note; NOTE(review): unclear which sets this count refers to — confirm)
CSAT_df.shape, CSAT_only_df.shape,textbook_only_df.shape

((22147, 2), (17438, 2), (14822, 2))

In [97]:
# Attach the lexicon POS to the exam-only words (inner join on Word) and
# keep only the rows that actually carry a POS value.
CSAT_only_POS_df = (
    CSAT_only_df.merge(E_lexicon_df, on='Word', how='inner')
    .loc[:, ['Word', 'Freq', 'POS']]
    .dropna(subset=['POS'])
    .reset_index(drop=True)
)
CSAT_only_POS_df

Unnamed: 0,Word,Freq,POS
0,the,21540,minor
1,to,12665,minor
2,of,10010,minor
3,a,9773,minor|NN
4,and,8974,minor
...,...,...,...
14004,fortress,1,NN
14005,arabian,1,NN|JJ
14006,sapphire,1,NN
14007,sandstone,1,NN


In [98]:
# Attach the lexicon POS to the textbook-only words (inner join on Word) and
# keep only the rows that actually carry a POS value.
textbook_only_POS_df = (
    textbook_only_df.merge(E_lexicon_df, on='Word', how='inner')
    .loc[:, ['Word', 'Freq', 'POS']]
    .dropna(subset=['POS'])
    .reset_index(drop=True)
)
textbook_only_POS_df

Unnamed: 0,Word,Freq,POS
0,the,16763,minor
1,to,9000,minor
2,and,7844,minor
3,a,7732,minor|NN
4,of,7161,minor
...,...,...,...
11796,deathbed,1,NN
11797,elegance,1,NN
11798,geographically,1,RB
11799,breakfasts,1,NN|VB


각각 E-lexicon이랑 합치고 POS 달기

In [99]:
# Attach the lexicon POS to the combined-corpus words (inner join on Word) and
# keep only the rows that actually carry a POS value.
CSAT_POS_df = (
    CSAT_df.merge(E_lexicon_df, on='Word', how='inner')
    .loc[:, ['Word', 'Freq', 'POS']]
    .dropna(subset=['POS'])
    .reset_index(drop=True)
)
CSAT_POS_df

Unnamed: 0,Word,Freq,POS
0,the,38303,minor
1,to,21665,minor
2,a,17505,minor|NN
3,of,17171,minor
4,and,16818,minor
...,...,...,...
16349,vomit,1,VB|NN
16350,surpluses,1,NN
16351,resented,1,VB|JJ
16352,winded,1,JJ|VB


add Ortho_N, OLD20

In [65]:
from neighborhood import orthographic_N, OLD20

# Neighbourhood measures for the combined corpus, computed against the
# corpus' own word list (passed as the second argument of each helper).
lexicon = CSAT_POS_df['Word'].to_list()

CSAT_POS_df['Ortho_N_CSAT'] = CSAT_POS_df['Word'].apply(orthographic_N, args=(lexicon,))
CSAT_POS_df['OLD20_CSAT'] = CSAT_POS_df['Word'].apply(OLD20, args=(lexicon,))
CSAT_POS_df

Unnamed: 0,Word,Freq,POS,Ortho_N_CSAT,OLD20_CSAT
0,a,17505.0,minor|NN,1,1.35
1,aaron,7.0,NN,3,1.75
2,aback,,RB,0,1.85
3,abacus,,NN,0,2.75
4,abandon,7.0,VB|NN,0,2.75
...,...,...,...,...,...
39898,zoom,,VB|NN,4,1.60
39899,zooming,,VB,3,1.75
39900,zooms,,VB,3,1.70
39901,zucchini,,NN,0,3.55


In [66]:
# Neighbourhood measures for the exam-only words, computed against the
# exam-only word list (passed as the second argument of each helper).
lexicon = CSAT_only_POS_df['Word'].to_list()

CSAT_only_POS_df['Ortho_N_CSAT(only)'] = CSAT_only_POS_df['Word'].apply(orthographic_N, args=(lexicon,))
CSAT_only_POS_df['OLD20_CSAT(only)'] = CSAT_only_POS_df['Word'].apply(OLD20, args=(lexicon,))
CSAT_only_POS_df

Unnamed: 0,Word,Freq,POS,Ortho_N_CSAT(only),OLD20_CSAT(only)
0,a,9773.0,minor|NN,1,1.35
1,aaron,1.0,NN,3,1.75
2,aback,,RB,0,1.85
3,abacus,,NN,0,2.75
4,abandon,5.0,VB|NN,0,2.75
...,...,...,...,...,...
39898,zoom,,VB|NN,4,1.60
39899,zooming,,VB,3,1.75
39900,zooms,,VB,3,1.70
39901,zucchini,,NN,0,3.55


In [67]:
# Neighbourhood measures for the textbook-only words, computed against the
# textbook-only word list (passed as the second argument of each helper).
lexicon = textbook_only_POS_df['Word'].to_list()

textbook_only_POS_df['Ortho_N_Textbook'] = textbook_only_POS_df['Word'].apply(orthographic_N, args=(lexicon,))
textbook_only_POS_df['OLD20_Textbook'] = textbook_only_POS_df['Word'].apply(OLD20, args=(lexicon,))
textbook_only_POS_df

Unnamed: 0,Word,Freq,POS,Ortho_N_Textbook,OLD20_Textbook
0,a,7732.0,minor|NN,1,1.35
1,aaron,6.0,NN,3,1.75
2,aback,,RB,0,1.85
3,abacus,,NN,0,2.75
4,abandon,2.0,VB|NN,0,2.75
...,...,...,...,...,...
39898,zoom,,VB|NN,4,1.60
39899,zooming,,VB,3,1.75
39900,zooms,,VB,3,1.70
39901,zucchini,,NN,0,3.55


In [68]:
# Export the three POS + neighbourhood tables, one workbook each.
for pos_table, workbook_name in (
    (CSAT_only_POS_df, 'CSAT_only_POS_N.xlsx'),
    (textbook_only_POS_df, 'textbook_only_POS_N.xlsx'),
    (CSAT_POS_df, 'CSAT_POS_N.xlsx'),
):
    pos_table.to_excel(os.path.join(corpora_dir, 'merged_corpora', workbook_name), index=False)

In [69]:
# Relabel both tables (positionally) so their columns stay distinguishable
# after merging, then keep the words present in both (inner join on Word).
csat_only_labels = ['Word', 'CSAT_only_Freq', 'POS', 'Ortho_N_CSAT(only)', 'OLD20_CSAT(only)']
textbook_only_labels = ['Word', 'Textbook_only_Freq', 'POS', 'Ortho_N_Textbook(only)', 'OLD20_Textbook(only)']
CSAT_only_POS_df.columns = csat_only_labels
textbook_only_POS_df.columns = textbook_only_labels

shared_words_df = CSAT_only_POS_df.merge(textbook_only_POS_df, on='Word', how='inner')
# Both sides carry POS; keep the left copy under the plain name.
CSAT_vs_Textbook_df = shared_words_df.drop(columns=['POS_y']).rename(columns={'POS_x': 'POS'})
CSAT_vs_Textbook_df

Unnamed: 0,Word,CSAT_only_Freq,POS,Ortho_N_CSAT(only),OLD20_CSAT(only),Textbook_only_Freq,Ortho_N_Textbook(only),OLD20_Textbook(only)
0,a,9773.0,minor|NN,1,1.35,7732.0,1,1.35
1,aaron,1.0,NN,3,1.75,6.0,3,1.75
2,aback,,RB,0,1.85,,0,1.85
3,abacus,,NN,0,2.75,,0,2.75
4,abandon,5.0,VB|NN,0,2.75,2.0,0,2.75
...,...,...,...,...,...,...,...,...
39898,zoom,,VB|NN,4,1.60,,4,1.60
39899,zooming,,VB,3,1.75,,3,1.75
39900,zooms,,VB,3,1.70,,3,1.70
39901,zucchini,,NN,0,3.55,,0,3.55


In [70]:
# Same comparison table, but keeping words that occur in either table (outer join on Word).
all_words_df = CSAT_only_POS_df.merge(textbook_only_POS_df, on='Word', how='outer')
# Both sides carry POS; keep the left copy under the plain name.
CSAT_vs_Textbook_df = all_words_df.drop(columns=['POS_y']).rename(columns={'POS_x': 'POS'})
CSAT_vs_Textbook_df

Unnamed: 0,Word,CSAT_only_Freq,POS,Ortho_N_CSAT(only),OLD20_CSAT(only),Textbook_only_Freq,Ortho_N_Textbook(only),OLD20_Textbook(only)
0,a,9773.0,minor|NN,1,1.35,7732.0,1,1.35
1,aaron,1.0,NN,3,1.75,6.0,3,1.75
2,aback,,RB,0,1.85,,0,1.85
3,abacus,,NN,0,2.75,,0,2.75
4,abandon,5.0,VB|NN,0,2.75,2.0,0,2.75
...,...,...,...,...,...,...,...,...
39898,zoom,,VB|NN,4,1.60,,4,1.60
39899,zooming,,VB,3,1.75,,3,1.75
39900,zooms,,VB,3,1.70,,3,1.70
39901,zucchini,,NN,0,3.55,,0,3.55


In [71]:
# Group the paired measures side by side, then export the comparison table.
comparison_columns = [
    'Word',
    'CSAT_only_Freq', 'Textbook_only_Freq',
    'Ortho_N_CSAT(only)', 'Ortho_N_Textbook(only)',
    'OLD20_CSAT(only)', 'OLD20_Textbook(only)',
    'POS',
]
CSAT_vs_Textbook_df = CSAT_vs_Textbook_df.loc[:, comparison_columns]
CSAT_vs_Textbook_df.to_excel(
    os.path.join(corpora_dir, 'merged_corpora', 'CSAT_vs_Textbook_POS_N_outer.xlsx'),
    index=False,
)
CSAT_vs_Textbook_df

Unnamed: 0,Word,CSAT_only_Freq,Textbook_only_Freq,Ortho_N_CSAT(only),Ortho_N_Textbook(only),OLD20_CSAT(only),OLD20_Textbook(only),POS
0,a,9773.0,7732.0,1,1,1.35,1.35,minor|NN
1,aaron,1.0,6.0,3,3,1.75,1.75,NN
2,aback,,,0,0,1.85,1.85,RB
3,abacus,,,0,0,2.75,2.75,NN
4,abandon,5.0,2.0,0,0,2.75,2.75,VB|NN
...,...,...,...,...,...,...,...,...
39898,zoom,,,4,4,1.60,1.60,VB|NN
39899,zooming,,,3,3,1.75,1.75,VB
39900,zooms,,,3,3,1.70,1.70,VB
39901,zucchini,,,0,0,3.55,3.55,NN


In [72]:
# Relabel the combined-corpus table (positionally) and move POS to the last column.
csat_total_labels = ['Word', 'CSAT_Freq', 'POS', 'Ortho_N_CSAT', 'OLD20_CSAT']
CSAT_POS_df.columns = csat_total_labels
CSAT_POS_df = CSAT_POS_df.loc[:, ['Word', 'CSAT_Freq', 'Ortho_N_CSAT', 'OLD20_CSAT', 'POS']]
CSAT_POS_df


Unnamed: 0,Word,CSAT_Freq,Ortho_N_CSAT,OLD20_CSAT,POS
0,a,17505.0,1,1.35,minor|NN
1,aaron,7.0,3,1.75,NN
2,aback,,0,1.85,RB
3,abacus,,0,2.75,NN
4,abandon,7.0,0,2.75,VB|NN
...,...,...,...,...,...
39898,zoom,,4,1.60,VB|NN
39899,zooming,,3,1.75,VB
39900,zooms,,3,1.70,VB
39901,zucchini,,0,3.55,NN


In [73]:
# Combine the combined-corpus table with the exam-vs-textbook comparison table
# (outer join on Word); POS is taken from the comparison table only.
CSAT_df = pd.merge(CSAT_POS_df.drop(columns=['POS']), CSAT_vs_Textbook_df, on='Word', how='outer')[['Word', 'CSAT_Freq', 'CSAT_only_Freq', 'Textbook_only_Freq', 'Ortho_N_CSAT', 'Ortho_N_CSAT(only)', 'Ortho_N_Textbook(only)', 'OLD20_CSAT', 'OLD20_CSAT(only)', 'OLD20_Textbook(only)', 'POS']]
# Word length in characters.
CSAT_df['Length'] = CSAT_df['Word'].apply(lambda x: len(x))
# NOTE(review): each *_RFreq column is the raw count divided by a fixed 1,000,000,
# not by the corpus token total — confirm this is the intended "relative frequency".
relative_unit = 1000000
CSAT_df['CSAT_RFreq'] = CSAT_df['CSAT_Freq'] / relative_unit
CSAT_df['CSAT_only_RFreq'] = CSAT_df['CSAT_only_Freq'] / relative_unit
CSAT_df['Textbook_only_RFreq'] = CSAT_df['Textbook_only_Freq'] / relative_unit
# Final column order of the exported sheet.
CSAT_df = CSAT_df[['Word', 'Length', 'CSAT_Freq', 'CSAT_only_Freq', 'Textbook_only_Freq', 'CSAT_RFreq', 'CSAT_only_RFreq', 'Textbook_only_RFreq', 'Ortho_N_CSAT', 'Ortho_N_CSAT(only)', 'Ortho_N_Textbook(only)', 'OLD20_CSAT', 'OLD20_CSAT(only)', 'OLD20_Textbook(only)', 'POS']]
CSAT_df.to_excel(os.path.join(corpora_dir, 'merged_corpora', 'CSAT_final.xlsx'), index=False)
CSAT_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39903 entries, 0 to 39902
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Word                    39903 non-null  object 
 1   Length                  39903 non-null  int64  
 2   CSAT_Freq               16354 non-null  float64
 3   CSAT_only_Freq          14009 non-null  float64
 4   Textbook_only_Freq      11801 non-null  float64
 5   CSAT_RFreq              16354 non-null  float64
 6   CSAT_only_RFreq         14009 non-null  float64
 7   Textbook_only_RFreq     11801 non-null  float64
 8   Ortho_N_CSAT            39903 non-null  int64  
 9   Ortho_N_CSAT(only)      39903 non-null  int64  
 10  Ortho_N_Textbook(only)  39903 non-null  int64  
 11  OLD20_CSAT              39903 non-null  float64
 12  OLD20_CSAT(only)        39903 non-null  float64
 13  OLD20_Textbook(only)    39903 non-null  float64
 14  POS                     39903 non-null

### 어떤 단어가 HAL에 없을까?

In [74]:
# Cast POS to str; a later cell selects rows by comparing POS with the string 'nan'.
CSAT_df['POS'] = CSAT_df['POS'].astype(str)
CSAT_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39903 entries, 0 to 39902
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Word                    39903 non-null  object 
 1   Length                  39903 non-null  int64  
 2   CSAT_Freq               16354 non-null  float64
 3   CSAT_only_Freq          14009 non-null  float64
 4   Textbook_only_Freq      11801 non-null  float64
 5   CSAT_RFreq              16354 non-null  float64
 6   CSAT_only_RFreq         14009 non-null  float64
 7   Textbook_only_RFreq     11801 non-null  float64
 8   Ortho_N_CSAT            39903 non-null  int64  
 9   Ortho_N_CSAT(only)      39903 non-null  int64  
 10  Ortho_N_Textbook(only)  39903 non-null  int64  
 11  OLD20_CSAT              39903 non-null  float64
 12  OLD20_CSAT(only)        39903 non-null  float64
 13  OLD20_Textbook(only)    39903 non-null  float64
 14  POS                     39903 non-null

In [75]:
# Summary statistics of the numeric columns of the final table.
CSAT_df.describe()

Unnamed: 0,Length,CSAT_Freq,CSAT_only_Freq,Textbook_only_Freq,CSAT_RFreq,CSAT_only_RFreq,Textbook_only_RFreq,Ortho_N_CSAT,Ortho_N_CSAT(only),Ortho_N_Textbook(only),OLD20_CSAT,OLD20_CSAT(only),OLD20_Textbook(only)
count,39903.0,16354.0,14009.0,11801.0,16354.0,14009.0,11801.0,39903.0,39903.0,39903.0,39903.0,39903.0,39903.0
mean,7.984187,43.378806,29.258048,25.382679,4.3e-05,2.9e-05,2.5e-05,1.293988,1.293988,1.293988,2.696753,2.696753,2.696753
std,2.454677,496.466347,307.840872,249.774131,0.000496,0.000308,0.00025,2.737511,2.737511,2.737511,0.976649,0.976649,0.976649
min,1.0,1.0,1.0,1.0,1e-06,1e-06,1e-06,0.0,0.0,0.0,0.95,0.95,0.95
25%,6.0,2.0,1.0,1.0,2e-06,1e-06,1e-06,0.0,0.0,0.0,1.85,1.85,1.85
50%,8.0,4.0,3.0,3.0,4e-06,3e-06,3e-06,0.0,0.0,0.0,2.6,2.6,2.6
75%,10.0,15.0,11.0,10.0,1.5e-05,1.1e-05,1e-05,1.0,1.0,1.0,3.35,3.35,3.35
max,21.0,38303.0,21540.0,16763.0,0.038303,0.02154,0.016763,25.0,25.0,25.0,9.55,9.55,9.55


In [76]:
# Rows whose POS equals the string 'nan' (POS was cast to str in an earlier cell).
CSAT_minus_HAL = CSAT_df[CSAT_df['POS'] == 'nan']
# Export of this subset is currently disabled:
#CSAT_minus_HAL.to_excel(os.path.join(corpora_dir, 'merged_corpora', 'CSAT_minus_HAL.xlsx'), index=False)
CSAT_minus_HAL

Unnamed: 0,Word,Length,CSAT_Freq,CSAT_only_Freq,Textbook_only_Freq,CSAT_RFreq,CSAT_only_RFreq,Textbook_only_RFreq,Ortho_N_CSAT,Ortho_N_CSAT(only),Ortho_N_Textbook(only),OLD20_CSAT,OLD20_CSAT(only),OLD20_Textbook(only),POS


In [77]:
# Key conclusion: the words that received no POS through HAL all came from the textbook only!
# -> POS-tag those words separately (nltk).
# NOTE(review): the recorded output below is an empty frame (count 0), so this
# conclusion may stem from an earlier run — confirm before relying on it.
CSAT_minus_HAL.describe()

Unnamed: 0,Length,CSAT_Freq,CSAT_only_Freq,Textbook_only_Freq,CSAT_RFreq,CSAT_only_RFreq,Textbook_only_RFreq,Ortho_N_CSAT,Ortho_N_CSAT(only),Ortho_N_Textbook(only),OLD20_CSAT,OLD20_CSAT(only),OLD20_Textbook(only)
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,,,,,,,,,,,,,
std,,,,,,,,,,,,,
min,,,,,,,,,,,,,
25%,,,,,,,,,,,,,
50%,,,,,,,,,,,,,
75%,,,,,,,,,,,,,
max,,,,,,,,,,,,,


### HAL

In [45]:
# Every word is expected to have a HAL frequency: select the rows where
# Freq_HAL is missing (the result should be empty).
# isna() is required here; `== None` is an element-wise comparison that never
# matches, so it would return an empty frame even if values were missing.
E_lexicon_df[E_lexicon_df['Freq_HAL'].isna()]

Unnamed: 0,Word,Length,Freq_HAL,Log_Freq_HAL,SUBTLWF,LgSUBTLWF,SUBTLCD,LgSUBTLCD,Ortho_N,OLD,POS


In [46]:
# HAL_df is the lexicon table without the SUBTLEX-derived columns.
subtlex_columns = ['SUBTLWF', 'LgSUBTLWF', 'SUBTLCD', 'LgSUBTLCD']
HAL_df = E_lexicon_df.drop(columns=subtlex_columns)
HAL_df.to_excel(
    os.path.join(corpora_dir, 'merged_corpora', 'HAL.xlsx'),
    index=False,
)
HAL_df

Unnamed: 0,Word,Length,Freq_HAL,Log_Freq_HAL,Ortho_N,OLD,POS
0,a,1,10610626,16.177,1,1.45,minor|NN
1,aah,3,222,5.403,2,1.85,
2,aaron,5,10806,9.288,3,1.85,NN
3,aback,5,387,5.958,0,1.95,RB
4,abacus,6,513,6.240,0,2.90,NN
...,...,...,...,...,...,...,...
40476,zoom,4,4920,8.501,4,1.70,VB|NN
40477,zooming,7,523,6.260,3,1.85,VB
40478,zooms,5,385,5.953,3,1.80,VB
40479,zucchini,8,314,5.749,0,3.75,NN


In [47]:
# Column dtypes and non-null counts of the HAL table.
HAL_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40481 entries, 0 to 40480
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Word          40481 non-null  object 
 1   Length        40481 non-null  int64  
 2   Freq_HAL      40481 non-null  int64  
 3   Log_Freq_HAL  40481 non-null  float64
 4   Ortho_N       40481 non-null  int64  
 5   OLD           39382 non-null  float64
 6   POS           39903 non-null  object 
dtypes: float64(2), int64(3), object(2)
memory usage: 2.2+ MB


### SUBTLEX

In [48]:
# SUBTLEX에는 없는 단어들 존재. 
E_lexicon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40481 entries, 0 to 40480
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Word          40481 non-null  object 
 1   Length        40481 non-null  int64  
 2   Freq_HAL      40481 non-null  int64  
 3   Log_Freq_HAL  40481 non-null  float64
 4   SUBTLWF       35515 non-null  float64
 5   LgSUBTLWF     35515 non-null  float64
 6   SUBTLCD       35515 non-null  object 
 7   LgSUBTLCD     35515 non-null  object 
 8   Ortho_N       40481 non-null  int64  
 9   OLD           39382 non-null  float64
 10  POS           39903 non-null  object 
dtypes: float64(4), int64(3), object(4)
memory usage: 3.4+ MB


In [49]:
# SUBTLEX_df keeps only the words that have a SUBTLEX frequency and drops
# the HAL frequency columns.
SUBTLEX_df = (
    E_lexicon_df
    .dropna(subset=['SUBTLWF'])
    .reset_index(drop=True)
    .drop(columns=['Freq_HAL', 'Log_Freq_HAL'])
)
SUBTLEX_df.to_excel(
    os.path.join(corpora_dir, 'merged_corpora', 'SUBTLEX.xlsx'),
    index=False,
)
SUBTLEX_df

Unnamed: 0,Word,Length,SUBTLWF,LgSUBTLWF,SUBTLCD,LgSUBTLCD,Ortho_N,OLD,POS
0,a,1,20415.27,6.018,99.930,3.923,1,1.45,minor|NN
1,aah,3,52.71,3.430,7.560,2.803,2,1.85,
2,aaron,5,14.65,2.874,1.930,2.212,3,1.85,NN
3,aback,5,0.29,1.204,0.180,1.204,0,1.95,RB
4,abacus,6,0.24,1.114,0.120,1.041,0,2.90,NN
...,...,...,...,...,...,...,...,...,...
35510,zoom,4,3.55,2.260,1.290,2.037,4,1.70,VB|NN
35511,zooming,7,0.63,1.518,0.310,1.431,3,1.85,VB
35512,zooms,5,0.06,0.602,0.040,0.602,3,1.80,VB
35513,zucchini,8,0.96,1.699,0.250,1.342,0,3.75,NN


In [50]:
# Column dtypes and non-null counts of the SUBTLEX table.
SUBTLEX_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35515 entries, 0 to 35514
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Word       35515 non-null  object 
 1   Length     35515 non-null  int64  
 2   SUBTLWF    35515 non-null  float64
 3   LgSUBTLWF  35515 non-null  float64
 4   SUBTLCD    35515 non-null  object 
 5   LgSUBTLCD  35515 non-null  object 
 6   Ortho_N    35515 non-null  int64  
 7   OLD        35515 non-null  float64
 8   POS        35368 non-null  object 
dtypes: float64(3), int64(2), object(4)
memory usage: 2.4+ MB


## CSAT + E-lexicon-proj

In [51]:
# Outer-join the corpus table with the full lexicon table on Word.
# POS and Length are dropped from the corpus side so the lexicon's versions are used.
csat_e_lexicon_columns = [
    'Word', 'Length',
    'CSAT_Freq', 'CSAT_only_Freq', 'Textbook_only_Freq', 'Freq_HAL', 'SUBTLWF',
    'CSAT_RFreq', 'CSAT_only_RFreq', 'Textbook_only_RFreq',
    'Ortho_N_CSAT', 'Ortho_N_CSAT(only)', 'Ortho_N_Textbook(only)', 'Ortho_N',
    'OLD20_CSAT', 'OLD20_CSAT(only)', 'OLD20_Textbook(only)', 'OLD',
    'SUBTLCD', 'POS',
]
CSAT_E_lexicon_df = CSAT_df.drop(columns=['POS', 'Length']).merge(E_lexicon_df, how='outer', on='Word')
CSAT_E_lexicon_df = CSAT_E_lexicon_df[csat_e_lexicon_columns]
CSAT_E_lexicon_df.to_excel(
    os.path.join(corpora_dir, 'merged_corpora', 'CSAT_E_lexicon.xlsx'),
    index=False,
)
CSAT_E_lexicon_df

Unnamed: 0,Word,Length,CSAT_Freq,CSAT_only_Freq,Textbook_only_Freq,Freq_HAL,SUBTLWF,CSAT_RFreq,CSAT_only_RFreq,Textbook_only_RFreq,Ortho_N_CSAT,Ortho_N_CSAT(only),Ortho_N_Textbook(only),Ortho_N,OLD20_CSAT,OLD20_CSAT(only),OLD20_Textbook(only),OLD,SUBTLCD,POS
0,a,1,17505.0,9773.0,7732.0,10610626,20415.27,0.017505,0.009773,0.007732,1.0,1.0,1.0,1,1.45,1.45,1.45,1.45,99.930,minor|NN
1,aah,3,,,,222,52.71,,,,,,,2,,,,1.85,7.560,
2,aaron,5,7.0,1.0,6.0,10806,14.65,0.000007,0.000001,0.000006,1.0,1.0,0.0,3,1.85,1.95,2.00,1.85,1.930,NN
3,aback,5,,,,387,0.29,,,,,,,0,,,,1.95,0.180,RB
4,abacus,6,,,,513,0.24,,,,,,,0,,,,2.90,0.120,NN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40476,zoom,4,,,,4920,3.55,,,,,,,4,,,,1.70,1.290,VB|NN
40477,zooming,7,,,,523,0.63,,,,,,,3,,,,1.85,0.310,VB
40478,zooms,5,,,,385,0.06,,,,,,,3,,,,1.80,0.040,VB
40479,zucchini,8,,,,314,0.96,,,,,,,0,,,,3.75,0.250,NN


In [52]:
# Column labels of the merged corpus + lexicon table.
CSAT_E_lexicon_df.columns

Index(['Word', 'Length', 'CSAT_Freq', 'CSAT_only_Freq', 'Textbook_only_Freq',
       'Freq_HAL', 'SUBTLWF', 'CSAT_RFreq', 'CSAT_only_RFreq',
       'Textbook_only_RFreq', 'Ortho_N_CSAT', 'Ortho_N_CSAT(only)',
       'Ortho_N_Textbook(only)', 'Ortho_N', 'OLD20_CSAT', 'OLD20_CSAT(only)',
       'OLD20_Textbook(only)', 'OLD', 'SUBTLCD', 'POS'],
      dtype='object')

In [53]:
# Column dtypes and non-null counts of the merged corpus + lexicon table.
CSAT_E_lexicon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40481 entries, 0 to 40480
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Word                    40481 non-null  object 
 1   Length                  40481 non-null  int64  
 2   CSAT_Freq               16354 non-null  float64
 3   CSAT_only_Freq          14009 non-null  float64
 4   Textbook_only_Freq      11801 non-null  float64
 5   Freq_HAL                40481 non-null  int64  
 6   SUBTLWF                 35515 non-null  float64
 7   CSAT_RFreq              16354 non-null  float64
 8   CSAT_only_RFreq         14009 non-null  float64
 9   Textbook_only_RFreq     11801 non-null  float64
 10  Ortho_N_CSAT            16354 non-null  float64
 11  Ortho_N_CSAT(only)      14009 non-null  float64
 12  Ortho_N_Textbook(only)  11801 non-null  float64
 13  Ortho_N                 40481 non-null  int64  
 14  OLD20_CSAT              16354 non-null

지금 정리를 좀 해보면, SUBTLEX는 HAL에 완전포함 관계이니, 삼단계로 나눠서 
(CSAT, HAL, SUBTLEX)
- CSAT < SUBTLEX < HAL 

1. CSAT O SUBTLEX X HAL X 
2. CSAT O SUBTLEX X HAL O
3. CSAT O SUBTLEX O HAL O

이 차집합 공간에 들어가는 단어들을 분석해보는 걸로,,

In [None]:
# Rows that have no SUBTLEX frequency.
# SUBTLWF is a float column, so missing values are NaN and must be selected
# with isna(); comparing with the string 'nan' never matches, and the mask
# must be passed directly (not wrapped in a list) to act as a row filter.
CSAT_E_lexicon_df[CSAT_E_lexicon_df['SUBTLWF'].isna()]

NameError: name 'CSAT_E_lexicon_df' is not defined

## CSAT + HAL

In [250]:
# Outer-join the corpus table with the HAL table on Word.
# POS and Length are dropped from the corpus side so HAL's versions are used.
csat_hal_columns = [
    'Word', 'Length',
    'CSAT_Freq', 'CSAT_only_Freq', 'Textbook_only_Freq', 'Freq_HAL',
    'CSAT_RFreq', 'CSAT_only_RFreq', 'Textbook_only_RFreq',
    'Ortho_N_CSAT', 'Ortho_N_CSAT(only)', 'Ortho_N_Textbook(only)', 'Ortho_N',
    'OLD20_CSAT', 'OLD20_CSAT(only)', 'OLD20_Textbook(only)', 'OLD',
    'POS',
]
CSAT_HAL_df = CSAT_df.drop(columns=['POS', 'Length']).merge(HAL_df, how='outer', on='Word')
CSAT_HAL_df = CSAT_HAL_df[csat_hal_columns]
CSAT_HAL_df.to_excel(
    os.path.join(corpora_dir, 'merged_corpora', 'CSAT_HAL.xlsx'),
    index=False,
)
CSAT_HAL_df

Unnamed: 0,Word,Length,CSAT_Freq,CSAT_only_Freq,Textbook_only_Freq,Freq_HAL,CSAT_RFreq,CSAT_only_RFreq,Textbook_only_RFreq,Ortho_N_CSAT,Ortho_N_CSAT(only),Ortho_N_Textbook(only),Ortho_N,OLD20_CSAT,OLD20_CSAT(only),OLD20_Textbook(only),OLD,POS
0,a,1,1284.0,1098.0,186.0,10610626,0.001284,0.001098,0.000186,1.0,1.0,1.0,1,1.5,1.5,1.65,1.45,minor|NN
1,aah,3,,,,222,,,,,,,2,,,,1.85,
2,aaron,5,,,,10806,,,,,,,3,,,,1.85,NN
3,aback,5,,,,387,,,,,,,0,,,,1.95,RB
4,abacus,6,,,,513,,,,,,,0,,,,2.90,NN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40476,zoom,4,,,,4920,,,,,,,4,,,,1.70,VB|NN
40477,zooming,7,,,,523,,,,,,,3,,,,1.85,VB
40478,zooms,5,,,,385,,,,,,,3,,,,1.80,VB
40479,zucchini,8,,,,314,,,,,,,0,,,,3.75,NN


## CSAT + SUBTLEX

In [None]:
# Outer-join the corpus table with the SUBTLEX table on Word.
# POS and Length are dropped from the corpus side so SUBTLEX's versions are used.
csat_subtlex_columns = [
    'Word', 'Length',
    'CSAT_Freq', 'CSAT_only_Freq', 'Textbook_only_Freq', 'SUBTLWF',
    'CSAT_RFreq', 'CSAT_only_RFreq', 'Textbook_only_RFreq',
    'Ortho_N_CSAT', 'Ortho_N_CSAT(only)', 'Ortho_N_Textbook(only)', 'Ortho_N',
    'OLD20_CSAT', 'OLD20_CSAT(only)', 'OLD20_Textbook(only)', 'OLD',
    'SUBTLCD', 'POS',
]
CSAT_SUBTLEX_df = CSAT_df.drop(columns=['POS', 'Length']).merge(SUBTLEX_df, how='outer', on='Word')
CSAT_SUBTLEX_df = CSAT_SUBTLEX_df[csat_subtlex_columns]
CSAT_SUBTLEX_df.to_excel(
    os.path.join(corpora_dir, 'merged_corpora', 'CSAT_SUBTLEX.xlsx'),
    index=False,
)
CSAT_SUBTLEX_df

Unnamed: 0,Word,Length,CSAT_Freq,CSAT_only_Freq,Textbook_only_Freq,SUBTLWF,CSAT_RFreq,CSAT_only_RFreq,Textbook_only_RFreq,Ortho_N_CSAT,Ortho_N_CSAT(only),Ortho_N_Textbook(only),Ortho_N,OLD20_CSAT,OLD20_CSAT(only),OLD20_Textbook(only),OLD,SUBTLCD,POS
0,a,1.0,1284.0,1098.0,186.0,20415.27,0.001284,0.001098,0.000186,1.0,1.0,1.0,1.0,1.5,1.5,1.65,1.45,99.930,minor|NN
1,aah,3.0,,,,52.71,,,,,,,2.0,,,,1.85,7.560,
2,aaron,5.0,,,,14.65,,,,,,,3.0,,,,1.85,1.930,NN
3,aback,5.0,,,,0.29,,,,,,,0.0,,,,1.95,0.180,RB
4,abacus,6.0,,,,0.24,,,,,,,0.0,,,,2.90,0.120,NN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35598,zoom,4.0,,,,3.55,,,,,,,4.0,,,,1.70,1.290,VB|NN
35599,zooming,7.0,,,,0.63,,,,,,,3.0,,,,1.85,0.310,VB
35600,zooms,5.0,,,,0.06,,,,,,,3.0,,,,1.80,0.040,VB
35601,zucchini,8.0,,,,0.96,,,,,,,0.0,,,,3.75,0.250,NN
