In [64]:
import json
import re
import pandas as pd
import numpy as np
from collections import Counter
from konlpy.tag import Kkma
from tqdm import tqdm

In [75]:
def tokenize(text: str) -> dict:
    """Run the Kkma morphological analyser on *text*.

    The analyser is invoked once; the morpheme list is taken from the
    (morpheme, tag) pairs so that ``tokens[i]`` and ``pos[i]`` always refer
    to the same morpheme. Downstream helpers rely on that alignment when
    they reuse token indices to slice the POS list.

    A new ``Kkma`` instance is created on every call.

    Returns:
        A dict with keys ``'text'`` (the input string), ``'tokens'`` (list of
        morphemes) and ``'pos'`` (list of ``(morpheme, tag)`` pairs).
    """
    kkma = Kkma()
    pos_tokens = kkma.pos(text)
    # Deriving the morphemes from the tagged output avoids analysing the
    # same text a second time via kkma.morphs().
    tokens = [morpheme for morpheme, _tag in pos_tokens]
    return {'text': text, 'tokens': tokens, 'pos': pos_tokens}

def get_string_length(text: str) -> int:
    """Return the number of characters in *text*."""
    n_chars = len(text)
    return n_chars
def get_num_tokens(tokens: list) -> int:
    """Return how many tokens the list holds."""
    n_tokens = len(tokens)
    return n_tokens
def find_sent_id(tokens: list) -> list:
    """Return the indices of tokens that contain '.', '?' or '!'.

    Those indices are used by the callers as sentence-end positions.
    """
    return [
        position
        for position, token in enumerate(tokens)
        if re.search('[.?!]+', token) is not None
    ]
def split_sentence(sent_ids: list, tokens: list) -> list:
    """Cut *tokens* into sentences ending at each index in *sent_ids*.

    Each sentence includes its terminating token. Tokens located after the
    last index in *sent_ids* are not part of any returned sentence.
    """
    # Exclusive end offset of every sentence; each one is also the start
    # offset of the sentence that follows it.
    ends = [last + 1 for last in sent_ids]
    return [tokens[begin:end] for begin, end in zip([0] + ends, ends)]
def split_pos_sentence(sent_ids: list, pos: list) -> list:
    """Return the POS tags of each sentence delimited by *sent_ids*.

    *pos* holds pairs whose second element is the tag; only that element is
    kept. Entries after the last index in *sent_ids* are not returned.
    """
    # Exclusive end offset of every sentence; doubles as the next start.
    ends = [last + 1 for last in sent_ids]
    return [
        [pair[1] for pair in pos[begin:end]]
        for begin, end in zip([0] + ends, ends)
    ]
def get_num_sentences(tokens: list, pos: list):
    """Split the token and POS lists into sentences.

    Returns:
        A 3-tuple ``(count, sentences, pos_sentences)`` where *count* is the
        number of sentence-end tokens found, *sentences* is the per-sentence
        token lists and *pos_sentences* the per-sentence tag lists.
    """
    boundaries = find_sent_id(tokens)
    return (
        len(boundaries),
        split_sentence(boundaries, tokens),
        split_pos_sentence(boundaries, pos),
    )
def get_num_token_sentences(sentences: list) -> list:
    """Return the token count of every sentence, in order."""
    return [len(sent) for sent in sentences]

def simple_analysis(data):
    """Compute basic length statistics and attach sentence splits to *data*.

    *data* must provide ``'text'``, ``'tokens'`` and ``'pos'``. It is modified
    in place: ``'sentence_tokens'`` and ``'sentence_pos'`` are added.

    Returns:
        ``(data, stats)`` where *stats* holds ``'string_length'``,
        ``'num_tokens'``, ``'num_sentences'`` and ``'num_token_sentences'``.
    """
    text, tokens, pos = data['text'], data['tokens'], data['pos']

    char_count = get_string_length(text)
    token_count = get_num_tokens(tokens)
    sentence_count, sent_tokens, sent_tags = get_num_sentences(tokens, pos)

    stats = {
        'string_length': char_count,
        'num_tokens': token_count,
        'num_sentences': sentence_count,
        'num_token_sentences': get_num_token_sentences(sent_tokens),
    }

    # Keep the sentence-level views next to the raw tokens for later steps.
    data['sentence_tokens'] = sent_tokens
    data['sentence_pos'] = sent_tags

    return data, stats
def get_rel_freq(freq: dict):
    """Convert absolute counts into shares of the total.

    Returns a dict with the same keys whose values are ``count / sum``.
    """
    counts = pd.Series(freq, dtype="float64")
    shares = counts.div(counts.sum())
    return shares.to_dict()
def get_pos_freq(article: list[list]) -> dict:
    """Count POS tags per sentence and over the whole article.

    Args:
        article: One list of POS tags per sentence.

    Returns:
        A dict with four entries:
        ``'s_freq'`` (list of per-sentence tag counts),
        ``'s_rel_freq'`` (list of per-sentence relative frequencies),
        ``'a_freq'`` (tag counts summed over all sentences) and
        ``'a_rel_freq'`` (relative frequencies of those totals).
    """
    per_sentence = [dict(Counter(tags)) for tags in article]
    per_sentence_rel = [get_rel_freq(counts) for counts in per_sentence]

    # Sum the sentence-level counts into article-level totals.
    totals = Counter()
    for counts in per_sentence:
        totals.update(counts)
    article_counts = dict(totals)

    return {
        's_freq': per_sentence,
        's_rel_freq': per_sentence_rel,
        'a_freq': article_counts,
        'a_rel_freq': get_rel_freq(article_counts),
    }
def calRelPosition(sentence_pos, num_token_sentences, *, per_sentence=False):
    """Relative position of the first occurrence of each POS tag.

    For every sentence, each distinct tag is mapped to
    ``index_of_first_occurrence / num_token_sentences[i]``.

    Args:
        sentence_pos: One sequence of POS tags per sentence.
        num_token_sentences: Token count of each sentence, indexed like
            *sentence_pos*.
        per_sentence: When true, return a list with one mapping per sentence.
            When false (default), return only the mapping of the LAST
            sentence, which is what this function has always returned.
            NOTE(review): the last-sentence-only default looks unintended;
            callers that want every sentence should pass
            ``per_sentence=True``.

    Returns:
        A dict (default) or a list of dicts (``per_sentence=True``). Empty
        input yields ``{}`` / ``[]`` instead of raising.
    """
    positions = []
    for i, sentence in enumerate(sentence_pos):
        # setdefault keeps the first index seen for each tag, in one pass.
        first_index = {}
        for idx, tag in enumerate(sentence):
            first_index.setdefault(tag, idx)
        length = num_token_sentences[i]
        positions.append({tag: idx / length for tag, idx in first_index.items()})

    if per_sentence:
        return positions
    return positions[-1] if positions else {}
def get_more_features(data, all_analysis):
    """Add ``'token_variety'`` and ``'rel_position'`` to *all_analysis*.

    ``'token_variety'`` is, per sentence, the number of distinct POS tags
    divided by the number of tokens in that sentence (a NumPy array with one
    entry per sentence). ``'rel_position'`` is whatever ``calRelPosition``
    returns for the sentence tag lists.

    Requires ``all_analysis['num_token_sentences']`` and
    ``all_analysis['pos_freq']['s_freq']`` to be present, and
    ``data['sentence_pos']``. *all_analysis* is modified in place and
    returned.
    """
    num_token_sentences = np.array(all_analysis['num_token_sentences'])

    # One count dict per sentence; its size is the number of distinct tags.
    # (Taking len() of the outer list would give the sentence count instead.)
    distinct_tags = np.array(
        [len(tag_counts) for tag_counts in all_analysis['pos_freq']['s_freq']]
    )
    all_analysis['token_variety'] = distinct_tags / num_token_sentences

    sentence_pos = data['sentence_pos']
    all_analysis['rel_position'] = calRelPosition(sentence_pos, num_token_sentences)
    return all_analysis

def analysis(data):
    """Run the full feature pipeline on tokenised *data*.

    Chains the basic statistics, the POS frequency tables and the derived
    features, and returns ``(data, features)``.
    """
    data, features = simple_analysis(data)
    features['pos_freq'] = get_pos_freq(data['sentence_pos'])
    return data, get_more_features(data, features)

In [83]:
# Display the per-sentence POS tag lists that simple_analysis() stores
# under 'sentence_pos'.
data['sentence_pos']

[['NNG',
  'JKS',
  'NNG',
  'NNG',
  'NNG',
  'NNG',
  'JKM',
  'NNG',
  'JKO',
  'NNG',
  'XSV',
  'ETD',
  'NNG',
  'JKO',
  'VV',
  'ETD',
  'NNG',
  'NNG',
  'NNG',
  'NNG',
  'JKM',
  'VV',
  'ECS',
  'NNG',
  'NNG',
  'JKM',
  'VV',
  'EPT',
  'EFN',
  'SF'],
 ['NNG',
  'NNG',
  'NNG',
  'NNG',
  'JKM',
  'JX',
  'NNG',
  'NNG',
  'JKS',
  'VV',
  'ETD',
  'NNG',
  'NNG',
  'NNG',
  'JKM',
  'NNG',
  'JKO',
  'NNG',
  'XSV',
  'ETD',
  'NNG',
  'JKO',
  'VV',
  'ETD',
  'OL',
  'NNG',
  'JKO',
  'NNG',
  'NNG',
  'XSV',
  'EPT',
  'EFN',
  'SF'],
 ['OL',
  'NNG',
  'JX',
  'NNG',
  'XSN',
  'NNG',
  'NNG',
  'NNG',
  'JKO',
  'VV',
  'ECD',
  'NNG',
  'NR',
  'NNM',
  'NR',
  'NNM',
  'JX',
  'JX',
  'NNG',
  'XSV',
  'ETD',
  'NNB',
  'VV',
  'EPT',
  'EPT',
  'ETD',
  'SS',
  'NNG',
  'NNG',
  'JKM',
  'JKG',
  'NNG',
  'NNG',
  'NNG',
  'NNG',
  'NNG',
  'NNG',
  'SS',
  'NNG',
  'NNG',
  'JKS',
  'NNG',
  'XSV',
  'ETD',
  'NNG',
  'JKM',
  'JX',
  'NNG',
  'XSV',
  'ECD',
 

In [81]:
# Call calRelPosition directly on the analysed article. With its default
# arguments it returns a single dict (the mapping for the last sentence).
calRelPosition(data['sentence_pos'], new_analysis['num_token_sentences'])

{'JC': 0.5675675675675675,
 'XSV': 0.8918918918918919,
 'OL': 0.0,
 'JKO': 0.3783783783783784,
 'EPT': 0.43243243243243246,
 'VV': 0.40540540540540543,
 'NNG': 0.02702702702702703,
 'SP': 0.4864864864864865,
 'JKM': 0.16216216216216217,
 'EFN': 0.9459459459459459,
 'JX': 0.05405405405405406,
 'ECE': 0.4594594594594595,
 'SF': 0.972972972972973,
 'NNB': 0.7297297297297297,
 'NR': 0.10810810810810811,
 'NNM': 0.13513513513513514}

In [84]:
# Sample input record: raw Korean text under 'text' plus a 'category' label.
# Both keys are read by the driver cell that tokenises and analyses the text.
new_input  = {}
new_input['text']="경찰이 미승인 의료기술로 환자를 수술한 의혹을 받는 유명 관절전문병원에 대해 강제수사에 나섰습니다.서울 방배경찰서는 승인 기간이 지난 줄기세포 치료법으로 환자를 수술한 의혹을 받는 A 병원을 압수수색했습니다.A 병원은 제한적 의료기술 승인을 받아 재작년 4월 30일까지만 시술할 수 있었던 '근골격계 질환에서의 자가 지방 줄기세포 치료술'을 기한이 종료된 후에도 지속해서 시술했다는 의혹을 받고 있습니다.앞서 서민민생대책위원회는 지난해 8월 국민건강보험법 위반과 사기, 의료법 위반 등 혐의로 A 병원장을 서울 서부지방검찰청에 고발했습니다.A 병원은 재작년 7월에도 대리수술 혐의로 경찰 수사를 받았고, A 병원장과 의료기구업체 영업사원 등 16명을 불구속 송치했습니다."
new_input['category'] = "articles"

In [85]:
# Alternative input path (disabled): read a file name from stdin and load the
# record from that JSON file with json.load instead of using the dict above.

# Pull the raw text and its category label out of the input record.
data, category = new_input['text'], new_input['category']

# Tokenise the text, then compute all features; `data` ends up as the
# token dict enriched with the per-sentence splits.
data, new_analysis = analysis(tokenize(data))

In [89]:
# List the feature names produced by analysis().
new_analysis.keys()

dict_keys(['string_length', 'num_tokens', 'num_sentences', 'num_token_sentences', 'pos_freq', 'token_variety', 'rel_position'])

In [102]:
def list_flatten(lst):
    """Concatenate the elements of every iterable in *lst* into one list."""
    return [item for chunk in lst for item in chunk]
def dict_flatten(dct):
    """Collect the ``(key, value)`` pairs of ``dct.items()`` into a new dict.

    A key seen for the first time keeps its value as is. If the same key is
    yielded again, the values are gathered in a list. For a plain ``dict``
    (unique keys) the result is therefore a shallow copy.
    """
    merged = {}
    for key, value in dct.items():
        if key not in merged.keys():
            merged[key] = value
            continue
        current = merged[key]
        if isinstance(current, list):
            current.append(value)
        else:
            merged[key] = [current, value]
    return merged
def list_of_dict(lst):
    """Merge a list of dicts into one dict, grouping values by key.

    A key that occurs in only one dict keeps its single value; a key that
    occurs in several dicts maps to a list of its values in encounter order.
    """
    merged = {}
    for mapping in lst:
        for key, value in mapping.items():
            if key not in merged.keys():
                merged[key] = value
                continue
            current = merged[key]
            if isinstance(current, list):
                current.append(value)
            else:
                merged[key] = [current, value]
    return merged
def mean_of_dict(dct):
    """Replace every value of *dct* with its ``np.mean`` and return *dct*.

    The dict is modified in place; the returned object is the same dict.
    """
    for key in list(dct):
        dct[key] = np.mean(dct[key])
    return dct


In [105]:
# Display the article-level relative frequency of each POS tag.
new_analysis['pos_freq']['a_rel_freq']

{'NNG': 0.4358974358974359,
 'JKS': 0.015384615384615385,
 'JKM': 0.05641025641025641,
 'JKO': 0.05128205128205128,
 'XSV': 0.046153846153846156,
 'ETD': 0.046153846153846156,
 'VV': 0.046153846153846156,
 'ECS': 0.005128205128205128,
 'EPT': 0.041025641025641026,
 'EFN': 0.02564102564102564,
 'SF': 0.02564102564102564,
 'JX': 0.041025641025641026,
 'OL': 0.02564102564102564,
 'XSN': 0.005128205128205128,
 'ECD': 0.010256410256410256,
 'NR': 0.02564102564102564,
 'NNM': 0.02564102564102564,
 'NNB': 0.015384615384615385,
 'SS': 0.010256410256410256,
 'JKG': 0.005128205128205128,
 'ECE': 0.010256410256410256,
 'VXV': 0.005128205128205128,
 'MAG': 0.005128205128205128,
 'JC': 0.010256410256410256,
 'SP': 0.010256410256410256}

In [106]:
# Article-level summary of the features computed by analysis().
main_var = {
    # Mean over sentences of the per-sentence token variety.
    "token_variety": np.mean(new_analysis['token_variety']),
    # Previously this entry repeated the token_variety mean (copy-paste);
    # it now carries the relative-position feature stored under
    # 'rel_position'.
    # NOTE(review): confirm whether further aggregation is wanted here.
    "rel_position": new_analysis['rel_position'],
    # Mean number of tokens per sentence.
    "num_token_sentences": np.mean(new_analysis['num_token_sentences']),
    # Per-tag mean of the sentence-level relative frequencies; a tag is
    # averaged only over the sentences in which it occurs.
    "s_rel_freq": mean_of_dict(list_of_dict(new_analysis['pos_freq']['s_rel_freq'])),
    "a_rel_freq": new_analysis['pos_freq']['a_rel_freq'],
    }