# Goal: generate a feature matrix (feature sets for more than 120,000 Korean words) for use with supervised machine learning techniques

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# def flatten_list(lists):
#     return functools.reduce(operator.iconcat, lists, [])

# def counter_sort(counter):
#     return sorted(counter.items(), key=lambda item: item[1], reverse=True)

# import time
# start_time = time.time()
# main()
# print("--- %s seconds ---" % (time.time() - start_time))

## Function Definitions

In [2]:
import functools
import itertools
import operator
from collections import Counter
import numpy as np
import pandas as pd
import json
import string
import csv


# transform data json to text file
def write_data(json_file, N1, N2, to_file):
    """Convert a range of sentences from a tagged JSON corpus to a token-per-line text file.

    The JSON document must have a top-level "sentence" list. Each sentence is
    read through its "morp" list (items with "lemma" and "type") and its "NE"
    list (items with "text" and "type").

    Every morpheme becomes one line ``lemma<TAB>morpheme type<TAB>entity tag``.
    The entity tag is the "type" of the first "NE" item whose "text" contains
    the lemma, or 'O' when no item matches. Sentences are separated by one
    blank line, and the file ends with a single newline.

    Args:
        json_file: Path of the UTF-8 JSON corpus to read.
        N1: 1-based number of the first sentence to export.
        N2: 1-based number of the last sentence to export (inclusive).
        to_file: Path of the UTF-8 text file to create or overwrite.

    Raises:
        IndexError: If the requested range reaches past the last sentence.
        KeyError: If an expected key is missing from the JSON data.
    """
    with open(json_file, "r", encoding='utf-8') as read_file:
        sentences = json.load(read_file)["sentence"]

    # Collect the pieces in a list and join once instead of growing a string.
    pieces = []
    for n in range(N1 - 1, N2):
        sentence = sentences[n]
        for morph in sentence["morp"]:
            # NOTE(review): this is a substring test against the entity text, so
            # a short lemma can match an entity it is not actually part of --
            # confirm this is the intended tagging rule.
            ne_tag = next(
                (entity["type"] for entity in sentence["NE"]
                 if morph["lemma"] in entity["text"]),
                'O',
            )
            pieces.append(morph["lemma"] + '\t' + morph['type'] + '\t' + ne_tag + '\n')
        # Blank line marks the end of the sentence.
        pieces.append('\n')

    # Trim trailing whitespace so the file ends with exactly one newline.
    text = ''.join(pieces).rstrip() + '\n'
    with open(to_file, 'w', encoding='utf-8') as f:
        f.write(text)
    print("Successfully written to:", to_file)

In [3]:
def wordshape(text):
    """Return the shape of ``text``: one class code per character, concatenated.

    Codes, tested in this order (first match wins):
        'd' - digit (``str.isdigit``)
        'p' - ASCII punctuation (``string.punctuation``)
        'e' - ASCII lowercase letter
        'E' - ASCII uppercase letter
        'K' - anything else (Hangul syllables end up here)
    """
    def classify(char):
        if char.isdigit():
            return 'd'
        if char in string.punctuation:
            return 'p'
        if char in string.ascii_lowercase:
            return 'e'
        if char in string.ascii_uppercase:
            return 'E'
        return 'K'

    return ''.join(map(classify, text))

def features_without_pos(word):
    """Compute the surface features of ``word`` that do not need a POS tag.

    Reads the module-level ``gazetteer`` collection at call time.

    Args:
        word: The token text.

    Returns:
        A list in this order:
        [ContainsNumber, AllNumber, ContainsPunctuation, Length, Gazetteer,
        FirstLetter, LastLetter, Shape], where the first three and Gazetteer
        are 0/1 flags, Length is ``len(word)``, FirstLetter/LastLetter are the
        first/last character ('' for an empty word) and Shape is
        ``wordshape(word)``.
    """
    containsNumber = int(any(char.isdigit() for char in word))
    # all() over an empty word is True, which would flag '' as "all digits"
    # while ContainsNumber stays 0; require at least one character.
    allNumber = int(bool(word) and all(char.isdigit() for char in word))
    containsPunctuation = int(any(char in string.punctuation for char in word))
    length = len(word)
    gaz = int(word in gazetteer)
    # Slices rather than indexing, so an empty word yields '' instead of raising.
    firstLetter = word[:1]
    lastLetter = word[-1:]
    shape = wordshape(word)
    return [containsNumber, allNumber, containsPunctuation, length, gaz, firstLetter, lastLetter, shape]

def get_features_from_data(file_from, file_to=0):
    """Build the feature matrix for a token-per-line data file.

    Each non-blank line of ``file_from`` is split on whitespace; the first
    three fields are taken as word, POS tag and NER tag. The word's surface
    features come from ``features_without_pos``.

    Args:
        file_from: Path of the UTF-8 input file.
        file_to: Optional path of a CSV file to write. Any falsy value
            (the default ``0``) skips writing.

    Returns:
        A ``pandas.DataFrame`` with one row per token and the columns
        Word, ContainsNumber, AllNumber, ContainsPunctuation, Length,
        Gazetteer, FirstLetter, LastLetter, Shape, POS, NER. The numeric
        feature columns keep an integer dtype.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    columns = ['Word', 'ContainsNumber', 'AllNumber', 'ContainsPunctuation', 'Length', 'Gazetteer',
               'FirstLetter', 'LastLetter', 'Shape', 'POS', 'NER']

    rows = []
    # The context manager guarantees the input file is closed after reading.
    with open(file_from, encoding='utf-8') as source:
        for line in source:
            fields = line.split()
            if not fields:
                # Blank (or whitespace-only) lines only separate sentences.
                continue
            word, pos, ner = fields[0], fields[1], fields[2]
            rows.append([word, *features_without_pos(word), pos, ner])

    # Build the frame directly from the rows; inserting the header as a data
    # row and dropping it afterwards would force every column to object dtype.
    df = pd.DataFrame(rows, columns=columns)

    if file_to:
        df.to_csv(file_to, index=False, quoting=csv.QUOTE_NONNUMERIC)
        print('Successfully written to:', file_to)
    return df

## Global Variables

In [4]:
# Gazetteer entries, one per line of the file; features_without_pos() tests
# word membership against this list. The context manager closes the file
# as soon as it has been read.
# NOTE(review): membership tests on a list are linear in its size; a set would
# be faster if nothing else relies on the list order or indexing -- confirm.
with open('gazetteer_fromTrain.txt', encoding='utf-8') as gazetteer_file:
    gazetteer = [line.rstrip('\n') for line in gazetteer_file]

## Demo Starts Here

In [5]:
# Step 1: export sentences 1..3555 of the tagged corpus as token-per-line text.
# Step 2: derive the feature matrix from that text and save it as CSV.
corpus_json, token_file, feature_csv = (
    'NEtaggedCorpus_train.json',
    'Classifiers.data',
    'Classifiers_full_features.data',
)
first_sentence, last_sentence = 1, 3555

write_data(corpus_json, first_sentence, last_sentence, token_file)
print('---- Generating feature matrix from data ----')
get_features_from_data(token_file, feature_csv)

Successfully written to: Classifiers.data
---- Generating feature matrix from data ----
Successfully written to: Classifiers_full_features.data


Unnamed: 0,Word,ContainsNumber,AllNumber,ContainsPunctuation,Length,Gazetteer,FirstLetter,LastLetter,Shape,POS,NER
0,한편,0,0,0,2,0,한,편,KK,NNG,O
1,",",0,0,1,1,1,",",",",p,SP,O
2,AFC,0,0,0,3,1,A,C,EEE,SL,O
3,챔피언스,0,0,0,4,0,챔,스,KKKK,NNG,O
4,리그,0,0,0,2,1,리,그,KK,NNG,O
...,...,...,...,...,...,...,...,...,...,...,...
121003,하락,0,0,0,2,0,하,락,KK,NNG,O
121004,하,0,0,0,1,1,하,하,K,XSV,O
121005,았,0,0,0,1,0,았,았,K,EP,O
121006,다,0,0,0,1,1,다,다,K,EF,O
