In [1]:
import pandas as pd
import urllib3
from bs4 import BeautifulSoup

In [2]:
http = urllib3.PoolManager()
r = http.request('GET', 'https://trilingual.jp/jako/20190416-1632/')
soup = BeautifulSoup(r.data, 'lxml')

In [3]:
table = soup.find('table', attrs={'class': 'tablepress-id-7'})

In [4]:
def parse_table(table):
    table_body = table.find('tbody')
    data = []
    rows = table_body.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])
    return data

In [5]:
topik_level_1 = pd.DataFrame(parse_table(table), columns=['index', 'word', 'meaning'])

In [6]:
topik_level_1.head()

Unnamed: 0,index,word,meaning
0,1,가게,"store, shop"
1,2,가격,price
2,3,가구,furniture
3,4,가깝다,close
4,5,가끔,sometimes


In [7]:
import os

In [8]:
SOURCE = 'memrise-scraper/memrise_scraper/'
for name in os.listdir(SOURCE):
    if os.path.isdir(SOURCE+name):
        d = []
        if 'summary.csv' not in os.listdir(SOURCE+name):    
            for filename in os.listdir(SOURCE+name):

                if filename[-4:] == '.csv':
                    print(os.path.join(SOURCE, name, filename))
                    df = pd.read_csv(os.path.join(SOURCE, name, filename), sep='\t', header=None)
                    if df.columns.__len__() != 2 : print(df.columns.__len__())
                    d.append(df)
            if len(d) > 0:
                df = pd.concat(d)
                df.to_csv(os.path.join(SOURCE, name, 'summary.csv'))

In [9]:
level_1 = ['"Sogang Korean New Series 1A" Vocabulary', 
           '"Sogang Korean New Series 1B" Vocabulary',
           'Integrated Korean Beginning 1', 
           'KIIP Beginner Level 1', 
           'Yonsei Korean 1 Vocabulary 2017 (연세 한국어 1급 단어)', 
           'Fun Fun Korean 1', 
           '성균관어학 1급'
          ]

level_2 = ['Integrated Korean Beginning 2', 
           'KIIP Beginner Level 2',
           'Sogang Korean 2A Vocabulary',
           'Sogang Korean 2B (w', 
           'Yonsei Level 2 Vocabulary (연세 한국어 2급 단어)', 
           'Fun Fun Korean 2',
           '성균관 어학 2급'
          ]

level_3 = ['Sogang 3A vocabulary', 
           'Sogang Korean 3B (w',
           'Integrated Korean Intermediate 1', 
           'KIIP 중급1 (Level 3)',
           'Hanyang Workbook - 중급 1',
           'Yonsei 3-1 연세 한국어',
           'Yonsei 3-2 연세 한국어',
           '성균관어학 3급'
          ]

level_4 = ['Integrated Korean Intermediate 2 - First Edition', 
           'KIIP 2016 Level 4 ~ 중급2 NO TYPING', 
           'KIIP 4 - 2016',
           'Sogang 4A', 
           'Sogang 4B', 
           '한양대 중급2', 
           'Seoul National University Korean Level 4', 
           '연세 한국어 4-1 [Yonsei Korean 4-1]',
           'Yonsei Korean 4-2 (Units 6-10) + Proverbs',
           '가나다 intermediate 2',
           '서울대 한국어 4B',
           '성균관 어학 4급'
          ]

level_5 = ['KIIP Level 5 (new book 2016)', 
           '한양대 고급1', 
           'Ewha Korean 5', 
           '서울대 한국어 5A',
           '서울대 5B 한국어 교과서',
           '성균관 어학 5급'
          ]

level_6 = ['서울대 한국어 6A 교과서',
           '한양 고급2']

beginner = ['TOPIK Beginner Vocabulary List', 
            'Yonsei Korean Vocab Beginner', 
            '쏙쏙 40 days TOPIK 초급']

intermediate = ['Korean Vocabulary Practice (INTERMEDIATE)', 
                'TOPIK Essential Vocab 1000 (Intermediate)',
                'TOPIK in 30 Days+ Intermediate Vocabulary',
                'TOPIK 필수어위 1000 중급',
                '쏙쏙 TOPIK 한국어 어휘 중급 (day by day)',
                '2000 Essential Korean Words - Intermediate'
               ]

advanced = ['TOPIK Advanced Vocabulary',
            'Korean Vocabulary Practice for Foreigners Advanced', 
            'New TOPIK II 필수어휘 Vocabulary for Levels 5 & 6', 
            '토픽 어휘로 잡아라! [고급]',
            '한국어 TOPIK 어휘 고급 (50 days) advanced TOPIK vocab'
           ]

In [10]:
def find_source(filename):
    f = filename.lower()
    if 'sogang' in f:
        return 'sogang'
    elif 'yonsei' in f or '연세' in f:
        return 'yonsei'
    elif '한양' in f or 'hanyang' in f:
        return 'hanyang'
    elif '서울대' in f:
        return 'SNU'
    elif '성균관' in f:
        return 'sungkyunkwan'
    elif 'ewha' in f or '이화'in f:
        return 'ewha'
    elif 'kiip' in f:
        return 'kiip'
    elif 'integrated'in f:
        return 'integrated'
    elif 'topik' in f:
        return 'topik'
    else:
        return 'other'

In [11]:
frequencies = pd.read_csv('frequency_list.csv', index_col=0)

In [12]:
frequencies.head()

Unnamed: 0,word,frequency
0,일부러,1903
1,시작하다,80
2,잡지,1471
3,끼우다,2704
4,두께,3730


In [13]:
from wordfreq import zipf_frequency

In [14]:
def get_full_list(list_):
    d = []
    for filename in list_:
        df = pd.read_csv(os.path.join(SOURCE, filename, 'summary.csv'), index_col=0, names=['ko', 'en'], header=0)
        df['word_count'] = df['ko'].map(lambda x: x.split(' ').__len__())
        df['source'] = find_source(filename)
        df['filepath'] = filename
        df['zipf'] = df.ko.map(lambda x: zipf_frequency(x, 'ko'))
        df = pd.merge(df, frequencies, left_on='ko', right_on='word', how='left')
        d.append(df)
    return pd.concat(d)

In [15]:
def find_differences(level_1_vocabulary):
    categories = level_1_vocabulary.source.value_counts().index
    categories = sorted(categories)
    for source in categories:
        for source2 in categories:
            if source < source2:
                s = set(level_1_vocabulary[(level_1_vocabulary.source == source) & (level_1_vocabulary.word_count == 1)].ko.to_list())
                s2 = set(level_1_vocabulary[(level_1_vocabulary.source == source2) & (level_1_vocabulary.word_count == 1)].ko.to_list())
                print(source, 'und', source2)
                print('\tdifferent vocabulary', len(s.difference(s2)))
                print('\tsame vocabulary', len(s.intersection(s2)))

In [16]:
beginner_vocab = pd.concat([get_full_list(beginner), get_full_list(level_1), get_full_list(level_2)])
beginner_vocab['level'] = 'beginner'
beginner_vocab.sort_values(by='zipf', ascending=False).to_csv('beginner.csv')

In [17]:
intermediate_vocab = pd.concat([get_full_list(intermediate), get_full_list(level_3), get_full_list(level_4)])
intermediate_vocab['level'] = 'intermediate'
intermediate_vocab.sort_values(by='zipf', ascending=False).to_csv('intermediate.csv')

In [18]:
advanced_vocab = pd.concat([get_full_list(advanced), get_full_list(level_5), get_full_list(level_6)])
advanced_vocab['level'] = 'advanced'
advanced_vocab.sort_values(by='zipf', ascending=False).to_csv('advanced.csv')

In [19]:
all_vocab = pd.concat([beginner_vocab, intermediate_vocab, advanced_vocab])

In [20]:
single_word = all_vocab[(all_vocab.word_count == 1)]

In [21]:
import re

In [22]:
single_word.ko.map(lambda x: len(re.findall('[\W]', x)) > 0).value_counts()

False    42969
True       253
Name: ko, dtype: int64

In [23]:
single_word[~single_word.ko.map(lambda x: len(re.findall('[\W]', x)) > 0)].ko.value_counts()

맡다      23
쓰다      22
바르다     20
싸다      20
걸리다     20
        ..
외사촌      1
제기       1
건조증      1
청소년일     1
황하       1
Name: ko, Length: 15291, dtype: int64

In [24]:
single_word = single_word[~single_word.ko.map(lambda x: len(re.findall('[\W]', x)) > 0)]

In [53]:
unified_df = single_word.groupby('ko').apply(lambda x: set(x.source.to_list())).reset_index(name='source')

In [54]:
unified_df = unified_df.merge(
    single_word.groupby('ko').apply(lambda x: list(set(x.en.to_list()))).reset_index(name='translation'),
    on='ko')

In [55]:
unified_df = unified_df.merge(
    single_word.groupby('ko').apply(lambda x: set(x.level.to_list())).reset_index(name='level'),
    on='ko')

In [57]:
unified_df = unified_df.merge(
    single_word.groupby('ko').apply(lambda x: set(x.filepath.to_list())).reset_index(name='memerisedeck'),
    on='ko')

In [58]:
unified_df = unified_df[unified_df.ko.map(lambda x: re.match('^[\d]*$', x) == None)]

In [59]:
unified_df[unified_df.level.map(lambda x: 'beginner' in x)]

Unnamed: 0,ko,source,translation,level,memerisedeck
4,1인실,{other},[single room],{beginner},{Fun Fun Korean 2}
6,1주일,{sogang},[one week],{beginner},{Sogang Korean 2A Vocabulary}
7,1층,{yonsei},[1st floor],{beginner},{Yonsei Korean 1 Vocabulary 2017 (연세 한국어 1급 단어)}
20,5개월,{sogang},[five months],{beginner},{Sogang Korean 2A Vocabulary}
24,AS센터,{other},[after-sales service center],{beginner},{Fun Fun Korean 2}
...,...,...,...,...,...
15273,흰색,"{sogang, integrated, sungkyunkwan, other, hany...","[white, White, white color]","{intermediate, beginner}","{Hanyang Workbook - 중급 1, 성균관 어학 4급, Integrate..."
15280,힘,"{other, yonsei, topik}","[strength, energy, power (not 기...), Strength,...","{intermediate, beginner}","{Yonsei Korean Vocab Beginner, TOPIK in 30 Day..."
15284,힘내다,"{topik, other}","[to cheer up, pluck up one's heart]","{intermediate, beginner}","{쏙쏙 TOPIK 한국어 어휘 중급 (day by day), Fun Fun Kore..."
15285,힘들다,"{sungkyunkwan, other, yonsei, kiip, topik}","[to be tough, to be painful/tough, to be hard/...",{beginner},"{Yonsei Korean Vocab Beginner, KIIP Beginner L..."


In [60]:
unified_df[unified_df.level.map(lambda x: 'intermediate' in x)]

Unnamed: 0,ko,source,translation,level,memerisedeck
2,1등,"{sogang, other}",[first place],{intermediate},"{가나다 intermediate 2, Sogang Korean 3B (w}"
3,1인당,"{yonsei, kiip}","[per person, each person, per capita]",{intermediate},"{KIIP 2016 Level 4 ~ 중급2 NO TYPING, 연세 한국어 4-1..."
5,1인용,{sogang},[one person use],{intermediate},{Sogang 4A}
8,20쌍,{kiip},"[20 pairs, couples (스물)]",{intermediate},{KIIP 중급1 (Level 3)}
9,20인치,{sogang},[twenty inches],{intermediate},{Sogang Korean 3B (w}
...,...,...,...,...,...
15282,힘껏,"{sungkyunkwan, topik}","[With all the force one can summon, with all ...","{intermediate, advanced}","{성균관 어학 5급, 쏙쏙 TOPIK 한국어 어휘 중급 (day by day), 한..."
15284,힘내다,"{topik, other}","[to cheer up, pluck up one's heart]","{intermediate, beginner}","{쏙쏙 TOPIK 한국어 어휘 중급 (day by day), Fun Fun Kore..."
15287,힘듦,{sungkyunkwan},[Laboriousness ],{intermediate},{성균관 어학 4급}
15288,힘쓰다,"{topik, sungkyunkwan, SNU, other}","[to strive, try hard, strive, To make an effort]","{intermediate, advanced}","{TOPIK 필수어위 1000 중급, 서울대 5B 한국어 교과서, 쏙쏙 TOPIK ..."


In [62]:
unified_df = unified_df.merge(frequencies, left_on='ko', right_on='word', how='left')
del unified_df['word']

In [63]:
unified_df['zipf'] = unified_df.ko.map(lambda x: zipf_frequency(x, 'ko'))

In [64]:
unified_df = unified_df.sort_values(by='zipf', ascending=False)

In [70]:
unified_df.head(20)

Unnamed: 0,ko,source,translation,level,memerisedeck,frequency,zipf
10326,이,"{sogang, sungkyunkwan, other, integrated, yons...","[2, two (sino), 2 (Sino-Korean), this, Tooth, ...","{intermediate, beginner}","{성균관어학 3급, TOPIK in 30 Days+ Intermediate Voca...",5117.0,7.49
10207,을,{hanyang},[the weaker person in a contract],{advanced},{한양 고급2},,7.35
9035,에,{integrated},"[in, at, on (indicates static location), to (d...",{beginner},"{Integrated Korean Beginning 2, Integrated Kor...",2725.0,7.22
10190,은,{other},[silver],{intermediate},{2000 Essential Korean Words - Intermediate},4439.0,7.2
934,고,{kiip},[put in front of someone's name to show they'v...,{advanced},{KIIP Level 5 (new book 2016)},,7.17
2904,다,"{sogang, other, integrated, yonsei, topik}","[all, all, everything (not 전...)]","{intermediate, beginner}","{TOPIK in 30 Days+ Intermediate Vocabulary, Fu...",3271.0,7.11
10255,의,"{yonsei, sungkyunkwan, integrated}","[of (particle), Possessive particle, dash (-)]",{beginner},"{Integrated Korean Beginning 1, 성균관어학 1급, Yons...",,6.99
8846,어,"{sogang, other, integrated}","[oh, oh!]",{beginner},"{Integrated Korean Beginning 1, Sogang Korean ...",331.0,6.98
3401,도,"{yonsei, sungkyunkwan, other, integrated}","[degree, also, too (particle), Too, also, a de...","{intermediate, beginner}","{Integrated Korean Beginning 1, 2000 Essential...",2972.0,6.98
3776,들,"{yonsei, SNU, integrated}","[Field, plural particle]","{beginner, advanced}","{Integrated Korean Beginning 1, 서울대 한국어 6A 교과서...",2243.0,6.96


In [77]:
ad = unified_df[unified_df.level.map(lambda x: 'advanced' in x)].sort_values(by='frequency')

In [78]:
ad.to_json('advanced.json')

In [85]:
ad[ad.source.map(len) > 3]

Unnamed: 0,ko,source,translation,level,memerisedeck,frequency,zipf
3317,대하다,"{topik, sungkyunkwan, other, hanyang}","[to treat, address, receive (not 여...), To dea...","{intermediate, beginner, advanced}","{TOPIK in 30 Days+ Intermediate Vocabulary, 한양...",10.0,4.63
6544,사회,"{sogang, sungkyunkwan, other, integrated, yons...","[society, community (society), society, the co...","{intermediate, advanced}","{TOPIK Essential Vocab 1000 (Intermediate), TO...",19.0,5.17
848,경우,"{sungkyunkwan, other, hanyang, yonsei, kiip, t...","[a case, case, circumstances that have set sta...","{intermediate, advanced}","{Seoul National University Korean Level 4, TOP...",38.0,5.20
7908,시대,"{sungkyunkwan, other, hanyang, yonsei, topik}","[a time period, Era, age, an era, era, period,...","{intermediate, advanced}","{TOPIK in 30 Days+ Intermediate Vocabulary, Ha...",69.0,5.12
9839,운동,"{integrated, other, hanyang, yonsei, topik}","[exercise (sport), movement, campaign, exercis...","{intermediate, beginner, advanced}","{Yonsei Korean Vocab Beginner, TOPIK in 30 Day...",76.0,5.19
...,...,...,...,...,...,...,...
3956,뛰쳐나가다,"{sogang, sungkyunkwan, topik, hanyang}","[To storm out, run out, dash out, storm out, t...","{intermediate, advanced}","{Sogang 4A, TOPIK in 30 Days+ Intermediate Voc...",,0.00
11157,장바구니,"{SNU, other, ewha, yonsei, kiip}","[shopping bag, a shopping cart; a shopping bas...","{intermediate, advanced}","{KIIP 4 - 2016, Yonsei 3-1 연세 한국어, KIIP 2016 L...",,0.00
11151,장마철,"{sogang, SNU, other, yonsei, topik}","[monsoon season, rainy season, the rainy seaso...","{intermediate, beginner, advanced}","{Yonsei 3-2 연세 한국어, 서울대 5B 한국어 교과서, 쏙쏙 40 days...",,0.00
4292,매콤하다,"{sogang, sungkyunkwan, SNU, hanyang, topik}","[spicy, to be moderately spicy, To be spicy an...","{intermediate, advanced}","{성균관어학 3급, 서울대 5B 한국어 교과서, 한양대 중급2, 쏙쏙 TOPIK 한...",,0.00
