In [31]:
import re as regex, string
from typing import List
from enchant.checker import SpellChecker
import numba
from tqdm import tnrange, tqdm_notebook
from time import sleep
import nltk

In [32]:
import logging
import sys
# Route log records to stdout so they appear in the cell output, each one
# prefixed with the name of the function that emitted it. Root level: INFO.
logging.basicConfig(format='[%(funcName)s]: %(message)s',
                     level=logging.INFO, stream=sys.stdout)
# Named logger used by the notebook. Its own level is DEBUG, so its debug
# records are emitted even though the root level above is INFO.
logger = logging.getLogger("logger")
logger.setLevel(logging.DEBUG)
# Smoke test: one INFO and one DEBUG record; both should be printed.
logger.info('Hello world!')
logger.debug("Hello deubg")

[<module>]: Hello world!
[<module>]: Hello deubg


## 读取文件

In [33]:
# Path of the raw input file; every line of the file becomes one string.
filen = 'data/5/testData.txt'
# Read through a context manager so the file handle is closed as soon as the
# lines are in memory (a bare `list(open(...))` leaves it open).
with open(filen) as f:
    fdata = list(f)
# Keep only the first 1000 lines -- presumably to keep the slow steps that
# follow manageable; TODO confirm and move the limit to a named constant.
fdata = fdata[0:1000]

## 分词

In [34]:
from nltk.tokenize import word_tokenize
def split_words(fdata: List[str]) -> List[List[str]]:
    """Tokenize every line of text into a list of tokens.

    Args:
        fdata: The lines to tokenize, one string per entry. Each entry is
            passed to ``word_tokenize`` as-is.

    Returns:
        One token list per input line, in the same order as ``fdata``.
    """
    # The progress bar wraps the input directly, so it advances once per line.
    return [word_tokenize(row) for row in tqdm_notebook(fdata)]

In [35]:
# Tokenize the loaded lines: `passage` holds one list of tokens per line.
passage = split_words(fdata)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [36]:
# Total number of tokens over all rows, right after tokenization.
print(sum(len(row) for row in passage))

387969


## 去除标点、特殊符号、HTML标签等非英文内容

In [37]:
def remove_punc(row_of_words: List[List])->List[List]:
    """Drop every token that is not made up of letters only.

    A token is kept when ``str.isalpha()`` is true for it. That check is
    Unicode-aware, so letters outside ASCII count as letters too.

    Args:
        row_of_words: Rows of string tokens.

    Returns:
        New rows, in the same order, holding only the alphabetic tokens.
        Rows that end up empty are kept as empty lists.
    """
    return [
        [token for token in tokens if token.isalpha()]
        for tokens in tqdm_notebook(row_of_words)
    ]

In [38]:
# Replace `passage` with a copy that keeps only purely alphabetic tokens.
passage = remove_punc(passage)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


['its', 'as', 'though', 'an', 'alien', 'made', 'this', 'movie', 'its', 'really', 'disturbing', 'i', 'loved', 'it', 'some', 'great', 'directing', 'very', 'great', 'acting', 'this', 'film', 'is', 'practically', 'theatrical', 'a', 'great', 'script', 'a', 'great', 'plot', 'lots', 'of', 'symbolism', 'some', 'really', 'heavy', 'humour', 'very', 'english', 'and', 'pompous', 'and', 'rude', 'this', 'film', 'is', 'drenched', 'in', 'scarlet', 'sin', 'after', 'you', 'watch', 'it', 'you', 'feel', 'as', 'though', 'your', 'soul', 'and', 'has', 'turned', 'all', 'murky', 'but', 'it', 'was', 'still', 'very', 'much', 'worth', 'watching', 'a', 'good', 'laugh', 'for', 'those', 'special', 'few', 'who', 'are', 'open', 'minded', 'enough', 'to', 'enjoy', 'this', 'really', 'true', 'masterpiece', 'i', 'would', 'not', 'know', 'for', 'sure', 'which', 'genre', 'to', 'place', 'it', 'a', 'cult', 'classic', 'maybe', 'this', 'is', 'basically', 'the', 'story', 'of', 'the', 'owner', 'of', 'a', 'french', 'restaurant', 'h

In [40]:
# Total number of tokens over all rows, after the non-alphabetic filter.
print(sum(len(row) for row in passage))

331751


## 去除停用词

In [None]:
from nltk.corpus import stopwords
def remove_stop_words(passage):
    """Remove English stop words from every row of tokens.

    The stop list is NLTK's English list plus the extra entry ``'us'``.
    Tokens are compared in lower case, but the surviving tokens keep their
    original casing.

    Args:
        passage: Rows of string tokens.

    Returns:
        A new list of rows without the stop words; the input is not modified.
    """
    blocked = set(stopwords.words('english')) | {'us'}
    filtered = []
    for row in passage:
        filtered.append([w for w in row if w.lower() not in blocked])
    return filtered


## 拼写检查

In [44]:
import autocorrect

def words_spell_check(fdata)->List[List]:
    """Count how many words the spell checker would change, and log the rate.

    Every word is passed to ``autocorrect.spell``; a word counts as an error
    when the suggestion differs from it. The summary is logged at INFO level
    through the root ``logging`` module.

    Args:
        fdata: Rows of string tokens.

    Returns:
        A copy of the rows holding the ORIGINAL words. The suggestions are
        only counted, never substituted.
    """
    ret = []
    err = 0
    cnt = 0
    for row in tqdm_notebook(fdata):
        kept_row = []
        for word in row:
            if autocorrect.spell(word) != word:
                err += 1
            cnt += 1
            # NOTE(review): the original word is kept on purpose here to
            # preserve existing behavior; use `correct_words` to actually
            # apply the suggestions. Confirm this is the intended contract.
            kept_row.append(word)
        ret.append(kept_row)
    # Empty input (or only empty rows) must not crash with ZeroDivisionError;
    # report a rate of 0.0 in that case.
    rate = err / cnt if cnt else 0.0
    logging.info("There are {} errors in {} words, error rate : {}".format(err, cnt, rate))
    return ret

import os, time, random
def correct_words(passage):
    """Replace every word by the spelling ``autocorrect.spell`` suggests.

    Prints the id of the process running the task first.

    Args:
        passage: Rows of string tokens.

    Returns:
        New rows, same shape as the input, holding the suggested spellings.
    """
    print('Run task (%s)...' % (os.getpid()))
    corrected = []
    for row in passage:
        corrected.append([autocorrect.spell(token) for token in row])
    return corrected

In [45]:
from multiprocessing import Pool
from typing import List, NoReturn, Callable
def list_multiprocess(lst: List, func: Callable[[List],List], n: int)-> List:
    """Apply ``func`` to ``lst`` in ``n`` chunks, one worker process per chunk.

    The list is cut into ``n`` contiguous slices of ``len(lst) // n`` items;
    the last slice also takes the remainder. Each slice is handed to ``func``
    in a pool of ``n`` processes and the returned lists are concatenated in
    slice order, so the output order matches the input order.

    Args:
        lst: The items to process.
        func: Takes a list and returns a list. It is sent to worker
            processes, so it has to be usable from them.
        n: Number of chunks and of worker processes.

    Returns:
        The concatenation of ``func(chunk)`` over all chunks. When ``lst`` has
        fewer than ``n`` items, ``func(lst)`` is called directly in the
        current process and its result is returned as-is.

    Raises:
        Whatever ``func`` raises in a worker is re-raised here when the
        result is collected.
    """
    if len(lst) < n:
        # Too few items to give every worker something: run in-process.
        return func(lst)
    p = Pool(n)
    try:
        psize = len(lst) // n
        lists = [lst[i * psize: (i + 1) * psize] for i in range(n - 1)]
        lists.append(lst[(n - 1) * psize:])
        pending = [p.apply_async(func, args=(chunk,)) for chunk in lists]
        print('Waiting for all subprocesses done...')
        p.close()
        p.join()
        last = []
        for result in pending:
            last += result.get()
    finally:
        # Make sure no worker outlives the call if anything above raised
        # (e.g. the cell was interrupted while waiting). After a normal
        # close()/join() this is a no-op clean-up.
        p.terminate()
    print('All subprocesses done.')
    return last

In [46]:
# Lower the notebook logger to INFO, then run the spell check over the rows
# with 6 chunks / worker processes.
# NOTE(review): words_spell_check returns the input words unchanged, so this
# reassignment keeps the same words in `passage`; only the error rate is new.
logger.setLevel(logging.INFO)
passage = list_multiprocess(passage, words_spell_check, 6)

Waiting for all subprocesses done...

[words_spell_check]: There are 1280 errors in 53425 words, error rate : 0.02395882077678989

[words_spell_check]: There are 1310 errors in 59600 words, error rate : 0.02197986577181208

[words_spell_check]: There are 1323 errors in 52671 words, error rate : 0.025118186478327732

[words_spell_check]: There are 1355 errors in 53657 words, error rate : 0.02525299588124569

[words_spell_check]: There are 1195 errors in 51877 words, error rate : 0.023035256472039634

[words_spell_check]: There are 1386 errors in 60521 words, error rate : 0.022901141752449564
All subprocesses done.


## 词性标注

In [48]:
from nltk.corpus import wordnet
def get_wordnet_pos(treebank_tag):
    """Map a POS tag to the matching ``wordnet`` part-of-speech constant.

    Only the first letter of the tag decides: ``J`` -> ``wordnet.ADJ``,
    ``V`` -> ``wordnet.VERB``, ``N`` -> ``wordnet.NOUN``, ``R`` ->
    ``wordnet.ADV``.

    Args:
        treebank_tag: The tag string to translate.

    Returns:
        The wordnet constant, or ``None`` when the tag starts with none of
        the four letters above.
    """
    # (tag prefix, name of the wordnet constant), checked in this order.
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_table:
        if treebank_tag.startswith(prefix):
            # Looked up only on a match, so `wordnet` is not touched otherwise.
            return getattr(wordnet, attr_name)
    return None

## 词形还原

In [49]:
# NOTE(review): this experiment does not work in this environment -- the
# recorded output of the cell is "RuntimeError: generator raised
# StopIteration" (possibly the PEP 479 generator change in newer Python
# versions; confirm). The NLTK WordNetLemmatizer is used for lemmatization
# afterwards, so this cell is a candidate for removal.
from pattern.en import lemma
lemma('countries')

RuntimeError: generator raised StopIteration

In [50]:
from nltk.stem import WordNetLemmatizer
def lemma_passage(passage):
    """Lemmatize every token, using its POS tag to choose the lemma.

    Each row is tagged with ``nltk.pos_tag``; the tag is translated with
    ``get_wordnet_pos`` and the token is lemmatized with NLTK's
    ``WordNetLemmatizer``. Tags with no wordnet equivalent fall back to the
    noun part of speech.

    Args:
        passage: Rows of string tokens.

    Returns:
        A new list of rows, same order and shape, holding the lemmas. The
        input is not modified.
    """
    lemmatizer = WordNetLemmatizer()
    ret = []
    # Wrap the rows themselves rather than `enumerate(...)`: an enumerate
    # object has no length, which leaves the progress bar without a total.
    for row in tqdm_notebook(passage):
        ret.append([
            lemmatizer.lemmatize(w, pos=get_wordnet_pos(pos) or wordnet.NOUN)
            for w, pos in nltk.pos_tag(row)
        ])
    return ret

In [51]:
# POS-tag and lemmatize every row. The result gets its own name; `passage`
# itself is left as it was.
passage_lemma = lemma_passage(passage)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [73]:
import csv

# Write the token rows to disk, one CSV record per row.
# newline='' is required by the csv module: without it the writer's own line
# endings get translated again on some platforms, producing blank rows.
with open("output.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(passage)

In [72]:
# NOTE(review): `passage_try` is not defined anywhere in the visible
# notebook, so this cell fails on a fresh kernel -- define it or point this
# at the intended variable.
# newline='' is required by the csv module so the writer's line endings are
# not translated a second time (blank rows on some platforms).
with open("data/5/testData_processed_compressed.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(passage_try)