In [1]:
import regex
import pandas as pd
from konlpy.tag import Okt

In [2]:
from wordfreq import zipf_frequency
import dict_helper

In [3]:
import urllib3
from bs4 import BeautifulSoup

In [4]:
import re

In [5]:
# Single shared Okt tagger instance; get_rel_words() uses it for POS tagging and stemming.
okt = Okt()

In [6]:
def get_rel_words(test):
    """Return the stemmed content words found in one piece of text.

    The text is POS-tagged with the module-level ``okt`` tagger. Tokens tagged
    Noun, Verb, Adjective or Adverb that are longer than one character are
    kept, and each kept token is re-analysed with
    ``okt.morphs(token, norm=True, stem=True)``; the resulting morphemes are
    joined with a single space.

    Args:
        test: The text to analyse (passed straight to ``okt.pos``).

    Returns:
        list: One string per kept token, in the order the tagger produced
        them; an empty list when no token qualifies.
    """
    relevant_word_types = {'Noun', 'Verb', 'Adjective', 'Adverb'}
    # Plain iteration instead of building a DataFrame on every call: same
    # result, no per-line pandas overhead, and an empty token list simply
    # yields [] instead of going through a row-wise apply on an empty frame.
    return [
        ' '.join(okt.morphs(word, norm=True, stem=True))
        for word, pos in okt.pos(test)
        if pos in relevant_word_types and len(word) > 1
    ]

In [7]:
# Read the whole subtitle file; the context manager closes the handle even if read() raises.
# NOTE(review): no encoding is passed, so the platform default is used — confirm the
# file's encoding and pass encoding=... explicitly.
with open('subtitles/signal_ep2.txt') as f:
    text = f.read()

def text_processing(text):
    """Strip bracketed annotations from subtitle text and split it into non-blank lines.

    Parenthesised ``(...)`` and square-bracketed ``[...]`` spans are removed
    when they contain only word characters and spaces; spans containing
    anything else (punctuation, line breaks) are left untouched.

    Args:
        text: The complete subtitle text as a single string.

    Returns:
        list: The remaining lines in their original order, excluding lines
        that are empty or whitespace-only. Kept lines are not stripped.
    """
    # Raw strings: '\(' , '\w' and '\[' inside a normal string literal are
    # invalid escape sequences (DeprecationWarning, SyntaxWarning from 3.12).
    text = re.sub(r'\([\w ]*\)', '', text)
    text = re.sub(r'\[[\w ]*\]', '', text)

    lines = text.split('\n')
    return [line for line in lines if line.strip()]

# Replace the raw file contents with the cleaned list of lines.
# NOTE(review): rebinding `text` makes this cell non-idempotent — running it a
# second time passes a list to re.sub inside text_processing.
text = text_processing(text)

In [8]:
def process_text(text):
    """Build a word-frequency table from a list of subtitle lines.

    The first element of ``text`` is skipped; every remaining line is passed
    through ``get_rel_words`` and the returned words are counted.

    Args:
        text: Sequence of subtitle lines.

    Returns:
        pandas.DataFrame with a ``ko`` column (the word) and a ``count``
        column (number of occurrences), in ``value_counts`` order.
    """
    # NOTE(review): element 0 is always dropped — presumably a header line; confirm.
    all_words = []
    for line in text[1:]:
        all_words.extend(get_rel_words(line))

    word_counts = pd.DataFrame(all_words, columns=['ko']).value_counts()
    return word_counts.reset_index(name='count')

In [9]:
# Word-frequency table (columns 'ko' and 'count') for the subtitle lines loaded above.
frequency_list = process_text(text)

In [10]:
# Load a frequency word list, using the first CSV column as the index.
# Its `word` column is merged into the known-vocabulary set in a later cell.
frequent_5900 = pd.read_csv('vocabulary list/frequency_list.csv', index_col=0)

In [11]:
# NOTE(review): unpickling can execute arbitrary code embedded in the file —
# only load 'all_words.pkl' if it comes from a trusted source.
all_textbook_vocabulary = pd.read_pickle('all_words.pkl')
# Known vocabulary: the pickled frame's `ko` column united with the frequency list's `word` column.
set_textbook = set(all_textbook_vocabulary.ko.to_list()).union(set(frequent_5900.word.to_list()))

In [12]:
def find_interessting_vocab(frequency_list, set_textbook):
    """Select counted words missing from the known vocabulary and attach definitions.

    Args:
        frequency_list: DataFrame with a ``ko`` column of words.
        set_textbook: Set of already-known words to exclude.

    Returns:
        A copy of the rows of ``frequency_list`` whose ``ko`` value is not in
        ``set_textbook``, with an added ``en`` column holding whatever
        ``dict_helper.get_naver_dict_definition`` returns for each word.
    """
    unknown_words = set(frequency_list['ko'].tolist()) - set_textbook
    is_unknown = frequency_list['ko'].isin(unknown_words)

    # Copy first so that adding the column never writes into the caller's frame.
    result = frequency_list.loc[is_unknown].copy()
    # get_naver_dict_definition lives in dict_helper (not shown here) and is called once per row.
    result['en'] = result['ko'].map(dict_helper.get_naver_dict_definition)
    return result

In [13]:
# Subtitle words absent from the known-vocabulary set, with an `en` definition column added.
interessting_vocab = find_interessting_vocab(frequency_list, set_textbook)

In [14]:
# Preview the first 10 rows that have a non-null `en` value.
interessting_vocab[interessting_vocab.en.notna()].head(10)

Unnamed: 0,ko,count,en
13,미제,20,"conundrum, being unresolved"
27,경위,13,judgment; discernment
29,공소,13,"indictment,prosecution, appeal to a higher court"
31,시효,13,prescription
35,유괴,12,"kidnap,abduction,kidnap,abduct"
36,일로,12,"a straight road, one road, over here; to this ..."
51,이재,9,"management of financial affairs, economy, finance"
58,박해,9,"persecution,oppression, persecute,oppress"
61,연쇄,9,"a chain, a link, a series, a connection"
62,무전,9,"radio, wireless, pennilessness"


In [15]:
# Add a `zipf` column computed with wordfreq's zipf_frequency for language 'ko'.
# Only rows with a non-null `en` are mapped; index alignment leaves the other rows as NaN.
# NOTE(review): this mutates the frame created in an earlier cell in place.
interessting_vocab['zipf'] = interessting_vocab[interessting_vocab.en.notna()].ko.map(lambda x: zipf_frequency(x, 'ko'))

In [16]:
# Show rows that have a non-null `en` and a zipf value below 4.
interessting_vocab[interessting_vocab.en.notna()& (interessting_vocab.zipf < 4)]

Unnamed: 0,ko,count,en,zipf
13,미제,20,"conundrum, being unresolved",3.10
27,경위,13,judgment; discernment,3.80
29,공소,13,"indictment,prosecution, appeal to a higher court",3.42
31,시효,13,prescription,3.44
35,유괴,12,"kidnap,abduction,kidnap,abduct",3.65
...,...,...,...,...
798,써다,1,subside,0.00
801,싸가지,1,,3.64
804,실없다,1,"untrustworthy, unreliable, insincere, idle, va...",0.00
825,악랄하다,1,"wicked,atrocious,vicious,brutal,heinous",3.59


In [17]:
# Read the second subtitle file; the context manager closes the handle even if read() raises.
# NOTE(review): no encoding is passed, so the platform default is used — confirm the
# file's encoding and pass encoding=... explicitly.
with open('subtitles/hosplital_playlist_ep8.txt') as f:
    text = f.read()

In [18]:
# BeautifulSoup is already imported in the imports cells at the top of the notebook,
# so the duplicate mid-notebook import is dropped. Parse the subtitle markup with lxml.
soup = BeautifulSoup(text, 'lxml')

In [19]:
# Collect every element whose tag name is 'c.korean' from the parsed markup.
subtitles = soup.find_all('c.korean')

In [20]:
# Keep the first 'c.bg_transparent' descendant of each Korean cue.
# NOTE(review): [0] raises IndexError when a cue has no such descendant, and
# rebinding `subtitles` means this cell cannot safely be re-run on its own.
subtitles = [s.find_all('c.bg_transparent')[0] for s in subtitles]

In [21]:
# Join the cue texts one per line, dropping the first character of each.
# NOTE(review): the [1:] presumably strips a leading marker character — confirm against the subtitle file.
text = '\n'.join([s.text[1:] for s in subtitles])

In [22]:
# Run the same clean -> count -> filter-unknown pipeline on the second subtitle file.
text = text_processing(text)
frequency = process_text(text)
voca_list = find_interessting_vocab(frequency, set_textbook)

In [23]:
# Keep only rows with a non-null `en`; .copy() yields an independent frame.
voca_list = voca_list[voca_list.en.notna()].copy()

In [24]:
# Add wordfreq's Zipf frequency (language 'ko') for each word.
voca_list['zipf'] = voca_list.ko.map(lambda x: zipf_frequency(x, 'ko'))

In [30]:
# Show the 20 highest-count rows among words whose zipf value is below 5.
voca_list[voca_list.zipf < 5].sort_values(by='count', ascending=False).head(20)

Unnamed: 0,ko,count,en,zipf
61,전이,10,"metastasis,spread,metastasize",4.2
132,송화,6,"the flowers of the pine, pine pollen",0.0
187,명태,5,pollack,3.09
189,어유,5,fish oil,3.4
201,흉부,4,"chest,breast,thorax",3.28
202,대요,4,"a summary, the gist, substance, a syllabus, a ...",4.62
212,판막,4,valves,0.0
291,에피,3,"Effie, Wheat Stalk Bread",3.65
314,스톱,3,stop,3.43
293,감봉,3,cut[reduce dock] one's wages[pay salary],0.0
