In [75]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from numpy.linalg import norm
from collections import namedtuple
import gensim
import json
import re
from textsplit.tools import get_penalty, get_segments
from textsplit.algorithm import split_optimal, get_total, get_gains
from nltk.corpus import stopwords
import nltk

import pickle

path = "./txt/"
seg_path = "./segmented/"

In [73]:
def preprocessing(input_path):
    wlem = nltk.WordNetLemmatizer()
    stopwords_list = stopwords.words('english') #nltk에서 제공하는 불용어사전 이용
    
    f = open(path + input_path, 'r')
    texts = f.read().replace("\n", "")
    texts = texts.replace("   ", " ")
    text = texts.replace("  ", " ")


    # 문장별로 잘려서 들어가있는 본문
    splitted_text = list()
    original_text = list()

    text = re.sub("[^ ㄱ-ㅣ가-힣0-9a-zA-Z\.|\?|\!|\n]+", "", text).lower()
    sents = re.split(r"[\?|\.|\!|\n]", text)

    for i in range(len(sents)):
        """
        토크나이징을 통한 불용어 제거 및 어근만 남기기
        """
        word_tokens = nltk.word_tokenize(sents[i])
        tokens_pos = nltk.pos_tag(word_tokens)
        
        words = []
        for word, pos in tokens_pos:
            words.append(word)

        # lemma 사용하기
        lemmatized_words = []
        for word in words:
            new_word = wlem.lemmatize(word)
            lemmatized_words.append(new_word)

            
        #print('stopwords: ', stopwords_list)
        unique_NN_words = set(lemmatized_words)
        final_NN_words = lemmatized_words

        # 불용어 제거
        for word in unique_NN_words:
            if word in stopwords_list:
                while word in final_NN_words: final_NN_words.remove(word)

        splitted_text.append(" ".join(final_NN_words) + "</s>" )
        original_text.append(sents[i] + "</s>")

    return splitted_text, original_text

In [78]:
def get_vector(wrdvec_path):
    model = gensim.models.KeyedVectors.load_word2vec_format(wrdvec_path, binary=True, unicode_errors='ignore')
    wrdvecs = pd.DataFrame(model.vectors, index=model.vocab)

    return wrdvecs


def segmentation(wrdvecs, sentenced_text):
    # sklearn CountVectorizer 클래스: 문서를 token count matrix로 변환하는 클래스
    vecr = CountVectorizer(vocabulary=wrdvecs.index)

    sentence_vectors = vecr.transform(sentenced_text).dot(wrdvecs)

    print(len(sentenced_text))
    segment_len = len(sentenced_text) // 10  # segment target length in sentences

    print("segment_len: ", segment_len)
    try:
        penalty = get_penalty([sentence_vectors], segment_len)
    except ZeroDivisionError:
        return 0
    try:
        optimal_segmentation = split_optimal(sentence_vectors, penalty, seg_limit=segment_len)
    except AssertionError:
        return 0

    segmented_text = get_segments(sentenced_text, optimal_segmentation)

    return segmented_text

In [6]:

wrdvec_path = './ko/ko.bin'
wrdvecs = get_vector(wrdvec_path)

file_list = os.listdir(path)


In [48]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /Users/hbae/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/hbae/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /Users/hbae/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /Users/hbae/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     /Users/hbae/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     /Users/hbae/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to
[nltk_data]    |     /Users/hbae/nltk_data...
[nltk_data]    |

[nltk_data]    |   Unzipping corpora/sinica_treebank.zip.
[nltk_data]    | Downloading package smultron to
[nltk_data]    |     /Users/hbae/nltk_data...
[nltk_data]    |   Unzipping corpora/smultron.zip.
[nltk_data]    | Downloading package state_union to
[nltk_data]    |     /Users/hbae/nltk_data...
[nltk_data]    |   Unzipping corpora/state_union.zip.
[nltk_data]    | Downloading package stopwords to
[nltk_data]    |     /Users/hbae/nltk_data...
[nltk_data]    |   Package stopwords is already up-to-date!
[nltk_data]    | Downloading package subjectivity to
[nltk_data]    |     /Users/hbae/nltk_data...
[nltk_data]    |   Unzipping corpora/subjectivity.zip.
[nltk_data]    | Downloading package swadesh to
[nltk_data]    |     /Users/hbae/nltk_data...
[nltk_data]    |   Unzipping corpora/swadesh.zip.
[nltk_data]    | Downloading package switchboard to
[nltk_data]    |     /Users/hbae/nltk_data...
[nltk_data]    |   Unzipping corpora/switchboard.zip.
[nltk_data]    | Downloading package t

True

In [87]:
for i in file_list[:100]:
    title = i[:-5]
    sentenced_text, original_text = preprocessing(i)

    segmented_text = segmentation(wrdvecs, sentenced_text)
    index = [len(segment) for segment in segmented_text]
    
    final_segment = list()
    for i in index:
        final_segment.append(original_text[:i])
        original_text = original_text[i:]

    with open(seg_path + 'seg+' + title + '.txt', 'wt') as f:
        for i, segment_sentences in enumerate(segmented_text):
            segment_str = ' // '.join(segment_sentences)
            print(segment_str + '\n<' + "=" * 30, file=f)


1567
segment_len:  156
157
segment_len:  15
4982
segment_len:  498
3046
segment_len:  304
1108
segment_len:  110
4157
segment_len:  415
276
segment_len:  27
376
segment_len:  37
5932
segment_len:  593
1113
segment_len:  111
1638
segment_len:  163
20
segment_len:  2
7036
segment_len:  703
2905
segment_len:  290
4709
segment_len:  470
175
segment_len:  17
655
segment_len:  65
7689
segment_len:  768
451
segment_len:  45
2145
segment_len:  214
7896
segment_len:  789
512
segment_len:  51
166
segment_len:  16
4589
segment_len:  458
3631
segment_len:  363
3851
segment_len:  385
4596
segment_len:  459
7142
segment_len:  714
2881
segment_len:  288
3703
segment_len:  370
2039
segment_len:  203
7817
segment_len:  781
4763
segment_len:  476
2030
segment_len:  203
9023
segment_len:  902
2319
segment_len:  231
115
segment_len:  11
2901
segment_len:  290
5502
segment_len:  550
1630
segment_len:  163
5603
segment_len:  560
4569
segment_len:  456
1157
segment_len:  115
968
segment_len:  96
669
segment_

In [90]:
import os
import pandas as pd

new_path = "./eng_segment/"
file_list = os.listdir(seg_path)


index = 0
documents = dict()
for i in file_list:
    seg = list()
    f = open(seg_path + i, 'r')
    line = f.readlines()
    title = i[4:-9]

    for j in range(len(line)):
        if " // " in line[j]:
            segments = line[j].split(" // ")
            para = " ".join(segments)
            seg.append(para)
    
    full_text = "".join(seg)
    
    with open(new_path + title + '.txt', 'w', encoding='utf-8') as newf:
        newf.write(full_text)

            
    f.close()
    newf.close()