In [1]:
import pandas as pd
import urllib3
from bs4 import BeautifulSoup

In [2]:
# Download the TOPIK word-list page and parse it into a BeautifulSoup tree.
http = urllib3.PoolManager()
r = http.request('GET', 'https://trilingual.jp/jako/20190416-1632/')
# Fail loudly on an error response instead of parsing an error page as if it
# were the word list (which would only surface later as a missing table).
if r.status != 200:
    raise RuntimeError('Unexpected HTTP status %s while fetching the word list page' % r.status)
soup = BeautifulSoup(r.data, 'lxml')

In [3]:
# Locate the vocabulary table by its CSS class.
table = soup.find('table', class_='tablepress-id-7')

In [4]:
def parse_table(table):
    """Extract the text of every data cell of an HTML table, row by row.

    Parameters
    ----------
    table : tag-like object
        Element exposing ``find`` and ``find_all`` (for example the result
        of ``soup.find('table', ...)``).

    Returns
    -------
    list of list of str
        One list per ``<tr>`` that contains ``<td>`` cells, holding the
        stripped text of its non-empty cells.

    Raises
    ------
    ValueError
        If ``table`` is ``None`` (the lookup that produced it found nothing).
    """
    if table is None:
        raise ValueError('parse_table() received None: the table was not found in the page')

    # Not every table carries an explicit <tbody>; fall back to the table itself.
    table_body = table.find('tbody')
    if table_body is None:
        table_body = table

    data = []
    for row in table_body.find_all('tr'):
        cells = row.find_all('td')
        if not cells:
            # Rows without <td> cells (e.g. header rows made of <th>) hold no data.
            continue
        texts = (cell.text.strip() for cell in cells)
        # Empty cells are dropped, so a row with a blank cell yields a shorter
        # list whose remaining values shift left.
        data.append([text for text in texts if text])
    return data

In [5]:
# Turn the scraped rows into a frame with one column per table column.
topik_rows = parse_table(table)
topik_level_1 = pd.DataFrame(topik_rows, columns=['index', 'word', 'meaning'])

In [6]:
topik_level_1.head()

Unnamed: 0,index,word,meaning
0,1,가게,"store, shop"
1,2,가격,price
2,3,가구,furniture
3,4,가깝다,close
4,5,가끔,sometimes


In [7]:
import os

In [8]:
# Build one `summary.csv` per deck directory under SOURCE by concatenating
# every tab-separated `.csv` file found in that directory. Directories that
# already contain a `summary.csv` are left untouched.
SOURCE = 'memrise_raw_crawl/'
# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of each summary reproducible between runs and machines. The sort is
# lexicographic, so e.g. '10.csv' comes before '2.csv'.
for name in sorted(os.listdir(SOURCE)):
    deck_dir = os.path.join(SOURCE, name)
    if not os.path.isdir(deck_dir):
        continue
    deck_files = sorted(os.listdir(deck_dir))
    if 'summary.csv' in deck_files:
        continue

    d = []
    for filename in deck_files:
        if not filename.endswith('.csv'):
            continue
        path = os.path.join(deck_dir, filename)
        print(path)
        df = pd.read_csv(path, sep='\t', header=None)
        # Flag files that do not have exactly two columns by printing the count.
        if len(df.columns) != 2:
            print(len(df.columns))
        d.append(df)

    # Only write a summary when at least one CSV file was read.
    if d:
        df = pd.concat(d)
        df.to_csv(os.path.join(deck_dir, 'summary.csv'))

In [9]:
# Deck names grouped by textbook level (level_1 .. level_6) and by TOPIK-style
# tier (beginner / intermediate / advanced). Each name is used verbatim as a
# directory name under SOURCE when the decks are loaded, so the strings must
# not be "corrected".
# NOTE(review): 'Sogang Korean 2B (w' and 'Sogang Korean 3B (w' look truncated;
# presumably they match the directory names on disk -- verify before changing.
level_1 = [
    '"Sogang Korean New Series 1A" Vocabulary',
    '"Sogang Korean New Series 1B" Vocabulary',
    'Integrated Korean Beginning 1',
    'KIIP Beginner Level 1',
    'Yonsei Korean 1 Vocabulary 2017 (연세 한국어 1급 단어)',
    'Fun Fun Korean 1',
    '성균관어학 1급',
]

level_2 = [
    'Integrated Korean Beginning 2',
    'KIIP Beginner Level 2',
    'Sogang Korean 2A Vocabulary',
    'Sogang Korean 2B (w',
    'Yonsei Level 2 Vocabulary (연세 한국어 2급 단어)',
    'Fun Fun Korean 2',
    '성균관 어학 2급',
]

level_3 = [
    'Sogang 3A vocabulary',
    'Sogang Korean 3B (w',
    'Integrated Korean Intermediate 1',
    'KIIP 중급1 (Level 3)',
    'Hanyang Workbook - 중급 1',
    'Yonsei 3-1 연세 한국어',
    'Yonsei 3-2 연세 한국어',
    '성균관어학 3급',
]

level_4 = [
    'Integrated Korean Intermediate 2 - First Edition',
    'KIIP 2016 Level 4 ~ 중급2 NO TYPING',
    'KIIP 4 - 2016',
    'Sogang 4A',
    'Sogang 4B',
    '한양대 중급2',
    'Seoul National University Korean Level 4',
    '연세 한국어 4-1 [Yonsei Korean 4-1]',
    'Yonsei Korean 4-2 (Units 6-10) + Proverbs',
    '가나다 intermediate 2',
    '서울대 한국어 4B',
    '성균관 어학 4급',
]

level_5 = [
    'KIIP Level 5 (new book 2016)',
    '한양대 고급1',
    'Ewha Korean 5',
    '서울대 한국어 5A',
    '서울대 5B 한국어 교과서',
    '성균관 어학 5급',
]

level_6 = [
    '서울대 한국어 6A 교과서',
    '한양 고급2',
]

beginner = [
    'TOPIK Beginner Vocabulary List',
    'Yonsei Korean Vocab Beginner',
    '쏙쏙 40 days TOPIK 초급',
]

intermediate = [
    'Korean Vocabulary Practice (INTERMEDIATE)',
    'TOPIK Essential Vocab 1000 (Intermediate)',
    'TOPIK in 30 Days+ Intermediate Vocabulary',
    'TOPIK 필수어위 1000 중급',
    '쏙쏙 TOPIK 한국어 어휘 중급 (day by day)',
    '2000 Essential Korean Words - Intermediate',
]

advanced = [
    'TOPIK Advanced Vocabulary',
    'Korean Vocabulary Practice for Foreigners Advanced',
    'New TOPIK II 필수어휘 Vocabulary for Levels 5 & 6',
    '토픽 어휘로 잡아라! [고급]',
    '한국어 TOPIK 어휘 고급 (50 days) advanced TOPIK vocab',
]

In [10]:
# Ordered (label, keywords) rules; the first rule with a keyword contained in
# the lower-cased deck name wins, so the order below is significant.
_SOURCE_RULES = (
    ('sogang', ('sogang',)),
    ('yonsei', ('yonsei', '연세')),
    ('hanyang', ('한양', 'hanyang')),
    # The English spelling is needed for 'Seoul National University Korean
    # Level 4', which would otherwise be labelled 'other'.
    ('SNU', ('서울대', 'seoul national university')),
    ('sungkyunkwan', ('성균관',)),
    ('ewha', ('ewha', '이화')),
    ('kiip', ('kiip',)),
    ('integrated', ('integrated',)),
    ('topik', ('topik',)),
)


def find_source(filename):
    """Map a deck name to a short label for the textbook series it belongs to.

    Matching is a case-insensitive substring search against ``_SOURCE_RULES``.

    Parameters
    ----------
    filename : str
        Deck (directory) name.

    Returns
    -------
    str
        The label of the first matching rule, or ``'other'`` if none matches.
    """
    f = filename.lower()
    for label, keywords in _SOURCE_RULES:
        if any(keyword in f for keyword in keywords):
            return label
    return 'other'

In [11]:
frequencies = pd.read_csv('frequency_list.csv', index_col=0)

In [12]:
frequencies.head()

Unnamed: 0,word,frequency
0,일부러,1903
1,시작하다,80
2,잡지,1471
3,끼우다,2704
4,두께,3730


In [13]:
from wordfreq import zipf_frequency

In [14]:
def get_full_list(list_):
    """Load the ``summary.csv`` of every deck in ``list_`` into one annotated frame.

    For each deck name the file ``SOURCE/<deck>/summary.csv`` is read with the
    columns ``ko`` and ``en`` and then enriched with:

    * ``word_count`` -- number of space-separated tokens in ``ko``
    * ``source``     -- label returned by ``find_source`` for the deck name
    * ``filepath``   -- the deck name itself
    * ``zipf``       -- ``zipf_frequency(ko, 'ko')``
    * ``word`` / ``frequency`` -- left-joined from the ``frequencies`` frame

    Parameters
    ----------
    list_ : iterable of str
        Deck (directory) names under ``SOURCE``.

    Returns
    -------
    pandas.DataFrame
        The per-deck frames concatenated in the given order; an empty frame
        with the same columns when ``list_`` is empty.
    """
    frames = []
    for filename in list_:
        summary_path = os.path.join(SOURCE, filename, 'summary.csv')
        df = pd.read_csv(summary_path, index_col=0, names=['ko', 'en'], header=0)
        # Rows without a Korean entry can be neither tokenised nor scored and
        # would otherwise crash on `.split`; drop them up front.
        df = df.dropna(subset=['ko']).copy()
        df['word_count'] = df['ko'].map(lambda x: len(x.split(' ')))
        df['source'] = find_source(filename)
        df['filepath'] = filename
        df['zipf'] = df['ko'].map(lambda x: zipf_frequency(x, 'ko'))
        df = pd.merge(df, frequencies, left_on='ko', right_on='word', how='left')
        frames.append(df)

    if not frames:
        # pd.concat([]) raises; return an empty frame with the expected columns.
        return pd.DataFrame(columns=['ko', 'en', 'word_count', 'source',
                                     'filepath', 'zipf', 'word', 'frequency'])
    return pd.concat(frames)

In [15]:
def find_differences(level_1_vocabulary):
    """Print how the single-word vocabularies of every pair of sources overlap.

    For each unordered pair of distinct values in the ``source`` column (in
    sorted order) three lines are printed: the pair, the number of words found
    only in the first source, and the number of words shared by both.

    Parameters
    ----------
    level_1_vocabulary : pandas.DataFrame
        Frame with the columns ``source``, ``word_count`` and ``ko``; only
        rows with ``word_count == 1`` are compared.

    Returns
    -------
    None
    """
    from itertools import combinations

    categories = sorted(level_1_vocabulary.source.value_counts().index)
    single_words = level_1_vocabulary[level_1_vocabulary.word_count == 1]
    # Build each source's vocabulary once instead of once per pair.
    vocab_by_source = {
        source: set(single_words[single_words.source == source].ko.to_list())
        for source in categories
    }
    # combinations() over the sorted labels yields exactly the pairs with
    # source < source2, in the same order as a nested loop would.
    for source, source2 in combinations(categories, 2):
        s = vocab_by_source[source]
        s2 = vocab_by_source[source2]
        print(source, 'und', source2)
        print('\tdifferent vocabulary', len(s.difference(s2)))
        print('\tsame vocabulary', len(s.intersection(s2)))

In [16]:
# Beginner tier: the TOPIK beginner decks plus textbook levels 1 and 2.
beginner_vocab = pd.concat(
    [get_full_list(decks) for decks in (beginner, level_1, level_2)]
).assign(level='beginner')
# Export with the most frequent words (highest zipf score) first.
beginner_vocab.sort_values(by='zipf', ascending=False).to_csv('beginner.csv')

In [17]:
# Intermediate tier: the intermediate decks plus textbook levels 3 and 4.
intermediate_vocab = pd.concat(
    [get_full_list(decks) for decks in (intermediate, level_3, level_4)]
).assign(level='intermediate')
# Export with the most frequent words (highest zipf score) first.
intermediate_vocab.sort_values(by='zipf', ascending=False).to_csv('intermediate.csv')

In [18]:
# Advanced tier: the advanced decks plus textbook levels 5 and 6.
advanced_vocab = pd.concat(
    [get_full_list(decks) for decks in (advanced, level_5, level_6)]
).assign(level='advanced')
# Export with the most frequent words (highest zipf score) first.
advanced_vocab.sort_values(by='zipf', ascending=False).to_csv('advanced.csv')

In [19]:
all_vocab = pd.concat([beginner_vocab, intermediate_vocab, advanced_vocab])

In [20]:
# Keep only entries whose Korean side is a single space-separated token.
single_word = all_vocab.loc[all_vocab['word_count'] == 1]

In [21]:
import re

In [22]:
# Count entries that contain at least one non-word character (\W).
# A raw string avoids the invalid "\W" escape of a plain string literal, and
# re.search stops at the first hit instead of collecting every match.
single_word.ko.map(lambda x: re.search(r'\W', x) is not None).value_counts()

False    42969
True       253
Name: ko, dtype: int64

In [23]:
# Frequency of each entry that is free of non-word characters (\W).
# A raw string avoids the invalid "\W" escape of a plain string literal.
single_word[~single_word.ko.map(lambda x: re.search(r'\W', x) is not None)].ko.value_counts()

맡다      23
쓰다      22
바르다     20
싸다      20
걸리다     20
        ..
남방셔츠     1
승리자      1
원님       1
법무부      1
명동       1
Name: ko, Length: 15291, dtype: int64

In [24]:
# Drop every entry that contains a non-word character (\W).
# A raw string avoids the invalid "\W" escape of a plain string literal.
single_word = single_word[~single_word.ko.map(lambda x: re.search(r'\W', x) is not None)]

In [25]:
# One row per Korean word with the set of sources it appears in.
# Selecting the column before apply() avoids running apply over whole groups
# (including the grouping column), which recent pandas versions deprecate.
unified_df = (
    single_word.groupby('ko')['source']
    .apply(lambda sources: set(sources.to_list()))
    .reset_index(name='source')
)

In [26]:
# Attach the de-duplicated list of English translations of each word.
# Selecting the column before apply() avoids running apply over whole groups
# (including the grouping column), which recent pandas versions deprecate.
unified_df = unified_df.merge(
    single_word.groupby('ko')['en']
    .apply(lambda translations: list(set(translations.to_list())))
    .reset_index(name='translation'),
    on='ko')

In [27]:
# Attach the set of difficulty levels each word was seen at.
# Selecting the column before apply() avoids running apply over whole groups
# (including the grouping column), which recent pandas versions deprecate.
unified_df = unified_df.merge(
    single_word.groupby('ko')['level']
    .apply(lambda levels: set(levels.to_list()))
    .reset_index(name='level'),
    on='ko')

In [28]:
# Attach the set of deck names each word was found in.
# Selecting the column before apply() avoids running apply over whole groups
# (including the grouping column), which recent pandas versions deprecate.
unified_df = unified_df.merge(
    single_word.groupby('ko')['filepath']
    .apply(lambda decks: set(decks.to_list()))
    .reset_index(name='memerisedeck'),
    on='ko')

In [29]:
# Drop entries that consist solely of digits (the pattern also matches the
# empty string). Raw string avoids the invalid "\d" escape; `is None` is the
# correct identity test for a failed match.
unified_df = unified_df[unified_df.ko.map(lambda x: re.match(r'^\d*$', x) is None)]

In [30]:
unified_df[unified_df.level.map(lambda x: 'beginner' in x)]

Unnamed: 0,ko,source,translation,level,memerisedeck
4,1인실,{other},[single room],{beginner},{Fun Fun Korean 2}
6,1주일,{sogang},[one week],{beginner},{Sogang Korean 2A Vocabulary}
7,1층,{yonsei},[1st floor],{beginner},{Yonsei Korean 1 Vocabulary 2017 (연세 한국어 1급 단어)}
20,5개월,{sogang},[five months],{beginner},{Sogang Korean 2A Vocabulary}
24,AS센터,{other},[after-sales service center],{beginner},{Fun Fun Korean 2}
...,...,...,...,...,...
15273,흰색,"{other, integrated, sungkyunkwan, hanyang, sog...","[White, white, white color]","{intermediate, beginner}","{Fun Fun Korean 2, 성균관 어학 4급, Sogang 3A vocabu..."
15280,힘,"{topik, yonsei, other}","[Strength, strength, energy, power (not 기...),...","{intermediate, beginner}","{쏙쏙 40 days TOPIK 초급, 2000 Essential Korean Wo..."
15284,힘내다,"{topik, other}","[to cheer up, pluck up one's heart]","{intermediate, beginner}","{Fun Fun Korean 2, 쏙쏙 TOPIK 한국어 어휘 중급 (day by ..."
15285,힘들다,"{other, sungkyunkwan, topik, yonsei, kiip}","[To be tough, to have a hard time, difficult, ...",{beginner},"{성균관 어학 2급, Fun Fun Korean 1, KIIP Beginner Le..."


In [31]:
unified_df[unified_df.level.map(lambda x: 'intermediate' in x)]

Unnamed: 0,ko,source,translation,level,memerisedeck
2,1등,"{other, sogang}",[first place],{intermediate},"{가나다 intermediate 2, Sogang Korean 3B (w}"
3,1인당,"{yonsei, kiip}","[per capita, per person, each person]",{intermediate},"{KIIP 2016 Level 4 ~ 중급2 NO TYPING, 연세 한국어 4-1..."
5,1인용,{sogang},[one person use],{intermediate},{Sogang 4A}
8,20쌍,{kiip},"[20 pairs, couples (스물)]",{intermediate},{KIIP 중급1 (Level 3)}
9,20인치,{sogang},[twenty inches],{intermediate},{Sogang Korean 3B (w}
...,...,...,...,...,...
15282,힘껏,"{topik, sungkyunkwan}","[with all one's strength, With all the force o...","{intermediate, advanced}","{쏙쏙 TOPIK 한국어 어휘 중급 (day by day), 한국어 TOPIK 어휘..."
15284,힘내다,"{topik, other}","[to cheer up, pluck up one's heart]","{intermediate, beginner}","{Fun Fun Korean 2, 쏙쏙 TOPIK 한국어 어휘 중급 (day by ..."
15287,힘듦,{sungkyunkwan},[Laboriousness ],{intermediate},{성균관 어학 4급}
15288,힘쓰다,"{topik, other, SNU, sungkyunkwan}","[to strive, try hard, strive, To make an effort]","{intermediate, advanced}","{서울대 5B 한국어 교과서, 쏙쏙 TOPIK 한국어 어휘 중급 (day by da..."


In [32]:
# Left-join the frequency list onto the word table; the join key from the
# right-hand side (`word`) duplicates `ko` and is dropped again.
unified_df = (
    unified_df
    .merge(frequencies, left_on='ko', right_on='word', how='left')
    .drop(columns='word')
)

In [33]:
unified_df['zipf'] = unified_df.ko.map(lambda x: zipf_frequency(x, 'ko'))

In [34]:
unified_df = unified_df.sort_values(by='zipf', ascending=False)

In [35]:
unified_df.head(20)

Unnamed: 0,ko,source,translation,level,memerisedeck,frequency,zipf
10326,이,"{other, integrated, sungkyunkwan, topik, yonse...","[2 (Sino-Korean), two, Tooth, tooth, 2, teeth,...","{intermediate, beginner}","{성균관어학 1급, 성균관어학 3급, KIIP Beginner Level 1, 쏙쏙...",5117.0,7.49
10207,을,{hanyang},[the weaker person in a contract],{advanced},{한양 고급2},,7.35
9035,에,{integrated},"[at, in, on (time), to (destination), in, at, ...",{beginner},"{Integrated Korean Beginning 1, Integrated Kor...",2725.0,7.22
10190,은,{other},[silver],{intermediate},{2000 Essential Korean Words - Intermediate},4439.0,7.2
934,고,{kiip},[put in front of someone's name to show they'v...,{advanced},{KIIP Level 5 (new book 2016)},,7.17
2904,다,"{other, integrated, topik, yonsei, sogang}","[all, all, everything (not 전...)]","{intermediate, beginner}","{TOPIK in 30 Days+ Intermediate Vocabulary, Fu...",3271.0,7.11
10255,의,"{yonsei, integrated, sungkyunkwan}","[of (particle), Possessive particle, dash (-)]",{beginner},{Yonsei Korean 1 Vocabulary 2017 (연세 한국어 1급 단어...,,6.99
8846,어,"{other, integrated, sogang}","[oh, oh!]",{beginner},"{Sogang Korean 2B (w, Fun Fun Korean 2, Integr...",331.0,6.98
3401,도,"{yonsei, other, integrated, sungkyunkwan}","[a degree (temperature), Too, also, also, too ...","{intermediate, beginner}",{Yonsei Korean 1 Vocabulary 2017 (연세 한국어 1급 단어...,2972.0,6.98
3776,들,"{yonsei, integrated, SNU}","[plural particle, Field]","{beginner, advanced}",{Yonsei Korean 1 Vocabulary 2017 (연세 한국어 1급 단어...,2243.0,6.96


In [36]:
set(frequencies.word.to_list()).issubset(set(unified_df.ko.to_list()))

False

In [37]:
# Words of the frequency list that never occur in any deck.
frequent_but_not_in_list = set(frequencies.word) - set(unified_df.ko)

In [38]:
len(frequent_but_not_in_list)

693

In [39]:
# Deck words that are absent from the frequency list.
less_frequent_in_list = set(unified_df.ko) - set(frequencies.word)

In [40]:
len(less_frequent_in_list)

10516

In [41]:
# Rows of the frequency list for the words missing from the decks.
# NOTE(review): this rebinds `frequent_but_not_in_list` from a set of words to
# a DataFrame, so re-running this cell on its own would pass a DataFrame to
# isin() -- re-run the cell that builds the set first.
frequent_but_not_in_list = frequencies.loc[
    frequencies.word.isin(frequent_but_not_in_list)
].copy()

In [42]:
# Is the word 가입자 one of the deck words?
# set('가입자') would build a set of the three characters {'가', '입', '자'} and
# test each syllable separately; test the whole word instead.
'가입자' in set(unified_df.ko.to_list())

False

In [43]:
from konlpy.tag import Okt

In [44]:
okt = Okt()

In [45]:
# Tag each missing word with Okt and keep only the tag names, joined by a
# space (element 1 of every item returned by okt.pos).
frequent_but_not_in_list['pos'] = frequent_but_not_in_list.word.map(
    lambda word: ' '.join(morpheme[1] for morpheme in okt.pos(word))
)

In [46]:
frequent_but_not_in_list.pos.value_counts()

Noun                    457
Verb                     79
Noun Verb                50
Adjective                34
Noun Suffix              23
Adverb                   22
Determiner Noun          11
Noun Josa                 5
Modifier                  2
Eomi                      2
Josa                      1
Adverb Verb               1
VerbPrefix Verb           1
Conjunction               1
Determiner Noun Verb      1
Noun Noun                 1
Verb Noun                 1
Exclamation               1
Name: pos, dtype: int64

In [47]:
frequent_but_not_in_list[frequent_but_not_in_list.pos != 'Noun'].sample(10)

Unnamed: 0,word,frequency,pos
5041,충격적,3646,Noun Suffix
694,만들어지다,832,Verb
3488,잘살다,4353,Verb
738,건설되다,4145,Noun Verb
2431,트이다,2117,Verb
387,발음하다,5371,Noun Verb
3502,저곳,5464,Determiner Noun
2311,소중히,4588,Adjective
1796,하,4961,Exclamation
5189,약해지다,3885,Verb


In [48]:
# Tag each deck word with Okt and keep only the tag names, joined by a space
# (element 1 of every item returned by okt.pos).
unified_df['pos'] = unified_df.ko.map(
    lambda word: ' '.join(morpheme[1] for morpheme in okt.pos(word))
)

In [49]:
unified_df.head(10)

Unnamed: 0,ko,source,translation,level,memerisedeck,frequency,zipf,pos
10326,이,"{other, integrated, sungkyunkwan, topik, yonse...","[2 (Sino-Korean), two, Tooth, tooth, 2, teeth,...","{intermediate, beginner}","{성균관어학 1급, 성균관어학 3급, KIIP Beginner Level 1, 쏙쏙...",5117.0,7.49,Noun
10207,을,{hanyang},[the weaker person in a contract],{advanced},{한양 고급2},,7.35,Josa
9035,에,{integrated},"[at, in, on (time), to (destination), in, at, ...",{beginner},"{Integrated Korean Beginning 1, Integrated Kor...",2725.0,7.22,Josa
10190,은,{other},[silver],{intermediate},{2000 Essential Korean Words - Intermediate},4439.0,7.2,Noun
934,고,{kiip},[put in front of someone's name to show they'v...,{advanced},{KIIP Level 5 (new book 2016)},,7.17,Noun
2904,다,"{other, integrated, topik, yonsei, sogang}","[all, all, everything (not 전...)]","{intermediate, beginner}","{TOPIK in 30 Days+ Intermediate Vocabulary, Fu...",3271.0,7.11,Adverb
10255,의,"{yonsei, integrated, sungkyunkwan}","[of (particle), Possessive particle, dash (-)]",{beginner},{Yonsei Korean 1 Vocabulary 2017 (연세 한국어 1급 단어...,,6.99,Noun
8846,어,"{other, integrated, sogang}","[oh, oh!]",{beginner},"{Sogang Korean 2B (w, Fun Fun Korean 2, Integr...",331.0,6.98,Eomi
3401,도,"{yonsei, other, integrated, sungkyunkwan}","[a degree (temperature), Too, also, also, too ...","{intermediate, beginner}",{Yonsei Korean 1 Vocabulary 2017 (연세 한국어 1급 단어...,2972.0,6.98,Noun
3776,들,"{yonsei, integrated, SNU}","[plural particle, Field]","{beginner, advanced}",{Yonsei Korean 1 Vocabulary 2017 (연세 한국어 1급 단어...,2243.0,6.96,Verb


In [50]:
frequent_but_not_in_list[frequent_but_not_in_list.pos == "Determiner Noun Verb"]

Unnamed: 0,word,frequency,pos
2570,그제서야,3269,Determiner Noun Verb


In [51]:
frequent_but_not_in_list

Unnamed: 0,word,frequency,pos
12,준비되다,3989,Noun Verb
37,조미료,1375,Noun
39,도덕,3016,Noun
44,크림,3399,Noun
52,시리즈,1229,Noun
...,...,...,...
5407,때로,1059,Noun Josa
5426,비상,3959,Noun
5438,눈앞,1528,Noun
5440,생산력,2274,Noun


In [121]:
import urllib.parse


'%ED%81%AC%EB%A6%BC'

In [143]:
import json
import urllib3
from bs4 import BeautifulSoup

In [157]:
def get_naver_dict_definition(word):
    """Return the first translation Naver's dictionary autocomplete offers for ``word``.

    The translation is also printed when one is found.

    Parameters
    ----------
    word : str
        Query term; it is percent-encoded before being appended to the URL.

    Returns
    -------
    The value found at ``['items'][0][0][1][0]`` of the JSON response, or
    ``None`` when the response is not valid JSON or lacks that entry.
    Network errors raised by the request itself are not caught.
    """
    from urllib.parse import quote

    STEM = 'http://ac.dict.naver.com/enkodict/ac?st=11001&r_lt=11001&q='
    http = urllib3.PoolManager()
    # Percent-encode the query so non-ASCII words form a valid URL.
    r = http.request('GET', STEM + quote(word))
    try:
        # Decode the body as JSON directly; routing it through an HTML parser
        # could alter payloads that contain markup-like characters.
        translation = json.loads(r.data.decode('utf-8'))['items'][0][0][1][0]
    except (ValueError, LookupError, TypeError):
        # ValueError: undecodable or non-JSON body; LookupError/TypeError: the
        # expected nesting is missing (no suggestion for this word).
        return None
    print(translation)
    return translation

In [161]:
# Look up a translation for every missing word (one HTTP request per word).
translations = frequent_but_not_in_list.word.map(get_naver_dict_definition)

In [164]:
frequent_but_not_in_list['en'] = translations

In [166]:
frequent_but_not_in_list

Unnamed: 0,word,frequency,pos,en
12,준비되다,3989,Noun Verb,be prepared; be ready
37,조미료,1375,Noun,"seasoning,condiment"
39,도덕,3016,Noun,"ethics,morals,morality"
44,크림,3399,Noun,"cream, cream"
52,시리즈,1229,Noun,series
...,...,...,...,...
5407,때로,1059,Noun Josa,"sometimes,occasionally,at times,from time to t..."
5426,비상,3959,Noun,emergency
5438,눈앞,1528,Noun,
5440,생산력,2274,Noun,"productive capacity, production power, product..."


In [169]:
frequent_but_not_in_list.sort_values(by='frequency').to_csv('frequent_but_not_in_books.csv')

In [171]:
unified_df.to_pickle('all_words.pkl')