# 의대 프로젝트 외래 초진(first) 파싱 (Categorize)
*를 활용해서 파싱

In [1]:
import pandas as pd
import json
import numpy as np
import io
import math

In [2]:
import re

def atof(text):
    """Return ``text`` converted to ``float``, or ``text`` itself if it is not numeric.

    Only ``ValueError`` from ``float()`` is treated as "not numeric"; any other
    exception propagates to the caller.
    """
    try:
        return float(text)
    except ValueError:
        return text

def natural_keys(text):
    """Build a sort key that orders embedded numbers numerically.

    ``text`` is split on decimal numbers; the numeric pieces are turned into
    floats by ``atof`` and the remaining pieces stay as strings.  An optional
    leading sign sits outside the capture group, so it is consumed by the
    split and does not appear in the resulting key.
    """
    pieces = re.split(r'[+-]?([0-9]+(?:[.][0-9]*)?|[.][0-9]+)', text)
    return list(map(atof, pieces))

In [7]:
# Load the case export and keep only the 케이스번호 (case number), dtype and
# 서식내용 (form content) columns.
df_firsts = pd.DataFrame(pd.read_csv('220405goutdata/entire_cases.csv'))[['케이스번호', 'dtype', '서식내용']]
# Keep only rows whose 'dtype' column equals 'firsts'
# (presumably the first-visit records named in the notebook title -- confirm).
condition = (df_firsts.dtype == 'firsts')
df_firsts = df_firsts[condition]

# Merge all text rows of a case into a single newline-joined string per case.
df_firsts = df_firsts.groupby('케이스번호', as_index= False)['서식내용'].apply(lambda x: '\n'.join(x))
# Keep the merged text of the first four cases as sample inputs for later cells.
test1 = df_firsts.iloc[0]['서식내용']
test2 = df_firsts.iloc[1]['서식내용']
test3 = df_firsts.iloc[2]['서식내용']
test4 = df_firsts.iloc[3]['서식내용']

# Preview the first rows of the per-case table.
df_firsts.head()

  df_firsts = pd.DataFrame(pd.read_csv('220405goutdata/entire_cases.csv'))[['케이스번호', 'dtype', '서식내용']]


Unnamed: 0,케이스번호,서식내용
0,Case 1,7년전 처음 attack (+) \n\n연간 1-2회 정도 attack있었다\n\n...
1,Case 10,3년 전 Rt ankle에 attack 1주정도 아팠다\n\n이후 한번 더 아팠다\...
2,Case 11,EGD 상 EGC 소견 보여 pre-op w/u 중이심\n\nunderlying d...
3,Case 12,Rt 1st toe 어제부터 아프다\n\n좀 심하게 부었다\n\n\n\nRt 1st...
4,Case 13,2021년 5월에 과음 다음날에 Lt 1st toe가 심하게 붓고 아팠었다\n\n정...


#### 텍스트 전처리

In [4]:
def split_by_line(string):
    """Split ``string`` on newlines and drop filler lines.

    A line is dropped when, after stripping surrounding whitespace, it is
    empty or consists only of the full-width marker '－' or '＋'.  Lines that
    are kept are returned unchanged (they are not stripped).
    """
    ignored = ('', '－', '＋')
    return [line for line in string.split('\n') if line.strip() not in ignored]

# Replace the first sample's raw text with its list of non-filler lines.
test1 = split_by_line(test1)

In [5]:
# Quick check of the KoNLPy Okt tagger on a sample Korean sentence.
from konlpy.tag import Okt
okt = Okt()
text = "아 나는 역시 밤에 일이 잘된다."

# Print the morphemes of the sample sentence, with the stem option enabled.
print(okt.morphs(text, stem=True))


['아', '나', '는', '역시', '밤', '에', '일이', '잘', '되다', '.']


### TextRank 구현
https://github.com/lovit/textrank/ 참고

아래 셀은 모두 TextRank 구현본이다.
-> TextRank를 사용한 핵심 단어 및 핵심문장 추출

In [None]:
from collections import Counter

def scan_vocabulary(sents, tokenize, min_count = 2):
    """Collect the vocabulary of ``sents`` ordered by descending frequency.

    Args:
        sents: iterable of sentences.
        tokenize: callable mapping one sentence to an iterable of tokens.
        min_count: tokens seen fewer than this many times are discarded.

    Returns:
        ``(idx_to_vocab, vocab_to_idx)``: the list of kept tokens, most
        frequent first (ties keep first-seen order), and the inverse mapping
        from token to its position in that list.
    """
    frequencies = Counter()
    for sent in sents:
        for token in tokenize(sent):
            frequencies[token] += 1

    # most_common() is a stable descending sort, so filtering afterwards keeps
    # the same relative order as filtering first and then sorting.
    idx_to_vocab = [
        token for token, count in frequencies.most_common() if count >= min_count
    ]
    vocab_to_idx = {token: position for position, token in enumerate(idx_to_vocab)}
    return idx_to_vocab, vocab_to_idx

In [None]:
from collections import defaultdict
from scipy.sparse import csr_matrix

def dict_to_mat(d, n_rows, n_cols):
    """Convert a ``{(row, col): value}`` dict into a SciPy CSR matrix.

    Args:
        d: mapping whose keys are ``(row, col)`` pairs and whose values are
            the matrix entries.
        n_rows: number of rows of the resulting matrix.
        n_cols: number of columns of the resulting matrix.

    Returns:
        A ``csr_matrix`` of shape ``(n_rows, n_cols)``.
    """
    entries = list(d.items())
    row_idx = [i for (i, _j), _value in entries]
    col_idx = [j for (_i, j), _value in entries]
    values = [value for _key, value in entries]
    return csr_matrix((values, (row_idx, col_idx)), shape=(n_rows, n_cols))


def cooccurrence(tokens, vocab_to_idx, window=2, min_cooccurence=2):
    """Count word co-occurrences and return them as a sparse matrix.

    Args:
        tokens: list of tokenized sentences (each an iterable of tokens).
        vocab_to_idx: mapping from token to vocabulary index; tokens missing
            from it are ignored.
        window: context size.  A value ``<= 0`` means every other word of the
            same sentence counts as context; otherwise positions
            ``max(0, i - window)`` up to (but excluding) ``i + window`` are
            used for the word at position ``i``.
        min_cooccurence: pairs counted fewer than this many times are dropped.

    Returns:
        Square sparse matrix (built by ``dict_to_mat``) with one row/column
        per vocabulary entry, holding the retained pair counts.
    """
    counter = defaultdict(int)
    for sent_tokens in tokens:
        vocabs = [vocab_to_idx[w] for w in sent_tokens if w in vocab_to_idx]
        n = len(vocabs)
        for i, v in enumerate(vocabs):
            # Non-positive window = whole sentence.  The threshold is 0, not 9:
            # with 9 every small window (including the default of 2) was
            # silently ignored and the whole sentence was always used.
            if window <= 0:
                b, e = 0, n
            else:
                b = max(0, i - window)
                e = min(i + window, n)
            for j in range(b, e):
                if i == j:
                    continue
                # Count both directions for every visited pair.
                counter[(v, vocabs[j])] += 1
                counter[(vocabs[j], v)] += 1
    counter = {k: v for k, v in counter.items() if v >= min_cooccurence}
    n_vocabs = len(vocab_to_idx)
    return dict_to_mat(counter, n_vocabs, n_vocabs)

In [None]:
def word_graph(sents, tokenize=None, min_count=2, window=2, min_cooccurrence =2):
    """Build the word co-occurrence graph of ``sents``.

    The vocabulary is scanned with ``scan_vocabulary``, every sentence is
    tokenized, and the co-occurrence counts come from ``cooccurrence``.

    Returns:
        ``(graph, idx_to_vocab)``: the co-occurrence matrix and the list that
        maps each matrix index back to its word.
    """
    vocab_list, vocab_index = scan_vocabulary(sents, tokenize, min_count)
    tokenized = [tokenize(sent) for sent in sents]
    graph = cooccurrence(tokenized, vocab_index, window, min_cooccurrence)
    return graph, vocab_list

In [None]:
from sklearn.preprocessing import normalize

def pagerank(x, df=0.85, max_iter=30, bias=None):
    """Run a fixed number of PageRank iterations on the graph ``x``.

    Args:
        x: square adjacency matrix (sparse or dense); it is L1-normalized
            along ``axis=0`` before iterating.
        df: damping factor, strictly between 0 and 1.
        max_iter: number of update steps (no convergence check is made).
        bias: optional per-node preference vector with one entry per node.
            It is rescaled so its entries sum to the number of nodes, which
            makes a uniform bias equivalent to ``bias=None``.

    Returns:
        ``numpy.ndarray`` of shape ``(n_nodes, 1)`` with the rank of each node.

    Raises:
        ValueError: if ``bias`` has the wrong length or a non-positive sum.
    """
    assert 0 < df < 1

    # initialize
    A = normalize(x, axis=0, norm='l1')
    n_nodes = A.shape[0]
    R = np.ones(n_nodes).reshape(-1, 1)

    if bias is None:
        bias = (1 - df) * np.ones(n_nodes).reshape(-1, 1)
    else:
        bias = np.asarray(bias, dtype=float).reshape(-1, 1)
        if bias.shape[0] != n_nodes:
            raise ValueError(
                'bias must have {} entries but has {}'.format(n_nodes, bias.shape[0]))
        total = bias.sum()
        if total <= 0:
            raise ValueError('bias must have a positive sum')
        bias = (1 - df) * n_nodes * bias / total

    # iteration
    for _ in range(max_iter):
        # .dot() is a matrix product for sparse and dense inputs alike, whereas
        # `A * R` would multiply element-wise if A were a plain ndarray.
        R = df * A.dot(R) + bias

    return R

def textrank_keyword(sents, tokenize, min_count, window, min_cooccurrence, df=0.85, max_iter=30, topk=30):
    """Return the ``topk`` highest-ranked words of ``sents`` with their scores.

    A word graph is built with ``word_graph`` and ranked with ``pagerank``.

    Returns:
        List of ``(word, rank)`` tuples ordered from highest to lowest rank.
    """
    graph, vocab = word_graph(sents, tokenize, min_count, window, min_cooccurrence)
    scores = pagerank(graph, df, max_iter).reshape(-1)
    # argsort is ascending, so take the tail and flip it.
    best_first = scores.argsort()[-topk:][::-1]
    return [(vocab[i], scores[i]) for i in best_first]

In [None]:
from scipy.sparse import csr_matrix

def sent_graph(sents, tokenize, similarity, min_count=2, min_sim=0.3):
    """Build a sentence-similarity graph as a sparse matrix.

    Each sentence is tokenized and reduced to the tokens kept by
    ``scan_vocabulary``.  ``similarity`` is evaluated once for every pair
    ``i < j``; pairs scoring below ``min_sim`` are skipped.

    Returns:
        ``csr_matrix`` of shape ``(n_sents, n_sents)`` whose entry ``(i, j)``
        with ``i < j`` holds the similarity.  Only the upper triangle is
        populated.
    """
    vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)[1]

    filtered = []
    for sent in sents:
        filtered.append([w for w in tokenize(sent) if w in vocab_to_idx])

    n_sents = len(filtered)
    src, dst, weights = [], [], []
    for i in range(n_sents):
        for j in range(i + 1, n_sents):
            score = similarity(filtered[i], filtered[j])
            if not score < min_sim:
                src.append(i)
                dst.append(j)
                weights.append(score)
    return csr_matrix((weights, (src, dst)), shape=(n_sents, n_sents))

def textrank_sent_sim(s1, s2):
    """TextRank sentence similarity between two token lists.

    Computes the number of distinct shared tokens divided by
    ``log(len(s1)) + log(len(s2))``.  Returns 0 when either list has at most
    one token, which also avoids a zero denominator.
    """
    n1, n2 = len(s1), len(s2)
    if min(n1, n2) <= 1:
        return 0
    shared = set(s1) & set(s2)
    return len(shared) / (math.log(n1) + math.log(n2))

def cosine_sent_sim(s1, s2):
    """Cosine similarity between the token-count vectors of two sentences.

    Args:
        s1: list of tokens of the first sentence.
        s2: list of tokens of the second sentence.

    Returns:
        0 if either sentence is empty, otherwise the dot product of the two
        token-frequency vectors divided by the product of their Euclidean
        norms.
    """
    if (not s1) or (not s2):
        return 0

    counts1 = Counter(s1)
    counts2 = Counter(s2)
    norm1 = math.sqrt(sum(v ** 2 for v in counts1.values()))
    norm2 = math.sqrt(sum(v ** 2 for v in counts2.values()))
    prod = sum(v * counts2.get(k, 0) for k, v in counts1.items())
    # The original computed these values but never returned them, so every
    # call with non-empty input yielded None.
    return prod / (norm1 * norm2)

In [None]:
class KeywordSummarizer:
    """Extract keywords from sentences by ranking a word co-occurrence graph.

    Args:
        sents: optional list of sentences; when given, the model is trained
            immediately.
        tokenize: callable mapping a sentence to a list of tokens.
        min_count: minimum token frequency passed to ``word_graph``.
        window: co-occurrence window passed to ``word_graph``.
        min_cooccurrence: minimum pair count passed to ``word_graph``.
        vocab_to_idx: stored on the instance only; it is not forwarded
            because ``word_graph`` as defined in this file takes no such
            argument and always scans its own vocabulary.
        df: PageRank damping factor.
        max_iter: number of PageRank iterations.
        verbose: if true, print a short message after training.
    """

    def __init__(self, sents=None, tokenize=None, min_count=2,
                 window=-1, min_cooccurrence=2, vocab_to_idx=None,
                 df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.window = window
        self.min_cooccurrence = min_cooccurrence
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """Build the word graph of ``sents`` and store the ranks in ``self.R``.

        ``bias`` is forwarded to ``pagerank`` as a keyword argument only when
        it is not None, so the default path works with a ``pagerank`` that
        takes just ``(x, df, max_iter)``.
        """
        # word_graph takes exactly these five arguments; the previous call
        # also passed vocab_to_idx and verbose and therefore raised TypeError.
        g, self.idx_to_vocab = word_graph(
            sents, self.tokenize, self.min_count, self.window,
            self.min_cooccurrence)
        extra = {} if bias is None else {'bias': bias}
        self.R = pagerank(g, self.df, self.max_iter, **extra).reshape(-1)
        if self.verbose:
            print('trained TextRank. n words = {}'.format(self.R.shape[0]))

    def keywords(self, topk=30):
        """Return the ``topk`` ``(word, rank)`` pairs, highest rank first.

        Raises:
            RuntimeError: if the model has not been trained yet.
        """
        if not hasattr(self, 'R'):
            raise RuntimeError('Train textrank first or use summarize function')
        idxs = self.R.argsort()[-topk:]
        keywords = [(self.idx_to_vocab[idx], self.R[idx]) for idx in reversed(idxs)]
        return keywords

    def summarize(self, sents, topk=30):
        """Train on ``sents`` and return the ``topk`` keywords."""
        self.train_textrank(sents)
        return self.keywords(topk)


class KeysentenceSummarizer:
    """Select key sentences by ranking a sentence-similarity graph.

    Args:
        sents: optional list of sentences; when given, the model is trained
            immediately.
        tokenize: callable mapping a sentence to a list of tokens.
        min_count: minimum token frequency passed to ``sent_graph``.
        min_sim: minimum similarity for an edge, passed to ``sent_graph``.
        similarity: callable ``(tokens_a, tokens_b) -> number``.  When None,
            ``textrank_sent_sim`` is used.
        vocab_to_idx: stored on the instance only; it is not forwarded
            because ``sent_graph`` as defined in this file takes no such
            argument.
        df: PageRank damping factor.
        max_iter: number of PageRank iterations.
        verbose: if true, print a short message after training.
    """

    def __init__(self, sents=None, tokenize=None, min_count=2,
                 min_sim=0.3, similarity=None, vocab_to_idx=None,
                 df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.min_sim = min_sim
        self.similarity = similarity
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """Build the sentence graph of ``sents`` and store the ranks in ``self.R``.

        ``bias`` is forwarded to ``pagerank`` as a keyword argument only when
        it is not None, so the default path works with a ``pagerank`` that
        takes just ``(x, df, max_iter)``.
        """
        # A None similarity is not callable, so fall back to the TextRank measure.
        similarity = textrank_sent_sim if self.similarity is None else self.similarity
        # sent_graph's signature is (sents, tokenize, similarity, min_count,
        # min_sim); the previous call passed the arguments in a different
        # order plus two extra ones and therefore raised TypeError.
        g = sent_graph(sents, self.tokenize, similarity,
                       self.min_count, self.min_sim)
        extra = {} if bias is None else {'bias': bias}
        self.R = pagerank(g, self.df, self.max_iter, **extra).reshape(-1)
        if self.verbose:
            print('trained TextRank. n sentences = {}'.format(self.R.shape[0]))

    def summarize(self, sents, topk=30, bias=None):
        """Train on ``sents`` and return the ``topk`` key sentences.

        Args:
            sents: list of sentences.
            topk: number of sentences to return.
            bias: None, or a ``numpy.ndarray`` of shape ``(len(sents),)``.

        Returns:
            List of ``(index, rank, sentence)`` tuples, highest rank first.

        Raises:
            ValueError: if ``bias`` is neither None nor an array of the
                expected shape.
        """
        n_sents = len(sents)
        if isinstance(bias, np.ndarray):
            if bias.shape != (n_sents,):
                raise ValueError('The shape of bias must be (n_sents,) but {}'.format(bias.shape))
        elif bias is not None:
            raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias)))

        self.train_textrank(sents, bias)
        idxs = self.R.argsort()[-topk:]
        keysents = [(idx, self.R[idx], sents[idx]) for idx in reversed(idxs)]
        return keysents