# Xử lý ngôn ngữ tự nhiên thống kê để phân tích cảm xúc

## Làm sạch dữ liệu

Dữ liệu đầu vào

In [135]:
raw_docs = [
    "Here are some very simple basic sentences.", # Đây là một số câu cơ bản rất đơn giản.
    "They won’t be very interesting, I’m afraid.", # Tôi e rằng chúng sẽ không thú vị cho lắm.
    "The point of these examples is to _learn how basic text \ cleaning works_ on *very simple* data." # Mục đích của những ví dụ này là _tìm hiểu cách thức hoạt động của văn bản cơ bản \ làm sạch_ trên dữ liệu *rất đơn giản*.
]

Cài đặt NLTK

In [136]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Chuyển đổi dữ liệu dưới dạng chuỗi thành vectơ từ

In [137]:
from nltk.tokenize import word_tokenize

tokenized_docs = [word_tokenize(doc) for doc in raw_docs]
print(tokenized_docs)

[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences', '.'], ['They', 'won', '’', 't', 'be', 'very', 'interesting', ',', 'I', '’', 'm', 'afraid', '.'], ['The', 'point', 'of', 'these', 'examples', 'is', 'to', '_learn', 'how', 'basic', 'text', '\\', 'cleaning', 'works_', 'on', '*', 'very', 'simple', '*', 'data', '.']]


Tìm kiếm trong dữ liệu các ký hiệu dấu câu, ký tự đặc biệt và loại bỏ chúng

In [138]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [139]:
import re
import string
regex = re.compile('[%s]' % re.escape(string.punctuation))
tokenized_docs_no_punctuation = []
for review in tokenized_docs:
    new_review = []
    for token in review:
        new_token = regex.sub(u'', token)
        if not new_token == u'':
            new_review.append(new_token)
    tokenized_docs_no_punctuation.append(new_review)
print(tokenized_docs_no_punctuation)

[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences'], ['They', 'won', '’', 't', 'be', 'very', 'interesting', 'I', '’', 'm', 'afraid'], ['The', 'point', 'of', 'these', 'examples', 'is', 'to', 'learn', 'how', 'basic', 'text', 'cleaning', 'works', 'on', 'very', 'simple', 'data']]


Tạo gốc và xác định gốc

In [140]:
from nltk.stem.porter import PorterStemmer
# from nltk.stem.snowball import SnowballStemmer
# from nltk.stem.wordnet import WordNetLemmatizer
porter = PorterStemmer()
# snowball = SnowballStemmer("english")
# wordnet = WordNetLemmatizer()
# each of the following commands perform stemming on word
# porter.stem(word)
# snowball.stem(word)
# wordnet.lemmatize(word)

Một quy trình làm sạch dữ liệu rất hữu ích khác bao gồm xóa các thẻ và thực thể HTML

In [141]:
import nltk
from bs4 import BeautifulSoup
test_string ="<p>While many of the stories tugged at the heartstrings, I never felt manipulated by the authors. (Note: Part of the reason why I don’t like the ’Chicken Soup for the Soul’ series is that I feel that the authors are just dying to make the reader clutch for the box of tissues.)</a>"
soup = BeautifulSoup(test_string, 'html.parser')
print("Original text:")
print(test_string)
print("Cleaned text:")
print(soup.get_text())

Original text:
<p>While many of the stories tugged at the heartstrings, I never felt manipulated by the authors. (Note: Part of the reason why I don’t like the ’Chicken Soup for the Soul’ series is that I feel that the authors are just dying to make the reader clutch for the box of tissues.)</a>
Cleaned text:
While many of the stories tugged at the heartstrings, I never felt manipulated by the authors. (Note: Part of the reason why I don’t like the ’Chicken Soup for the Soul’ series is that I feel that the authors are just dying to make the reader clutch for the box of tissues.)


## Trình bày văn bản

Ví dụ về biểu diễn BoW cho hai văn bản

In [142]:
mydoclist = [
    'Mireia loves me more than Hector loves me',
    'Sergio likes me more than Mireia loves me',
    'He likes basketball more than football',
]
from collections import Counter
for doc in mydoclist:
    tf = Counter()
    for word in doc.split():
        tf[word] += 1
    print(tf.items())

dict_items([('Mireia', 1), ('loves', 2), ('me', 2), ('more', 1), ('than', 1), ('Hector', 1)])
dict_items([('Sergio', 1), ('likes', 1), ('me', 2), ('more', 1), ('than', 1), ('Mireia', 1), ('loves', 1)])
dict_items([('He', 1), ('likes', 1), ('basketball', 1), ('more', 1), ('than', 1), ('football', 1)])


In [143]:
c = Counter() # a new, empty counter
c = Counter('gallahad') # a new counter from an iterable

Trả về số không cho các mục bị thiếu thay vì tăng Key Error (lỗi chính).

In [144]:
c = Counter(['eggs', 'ham'])
c['bacon']

0

Một ví dụ để tính toán trình đánh giá tính năng dựa trên tần số từ

In [145]:
def build_lexicon(corpus):
    # define a set with all possible words included in all the sentences or "corpus"
    # xác định một tập hợp với tất cả các từ có thể có trong tất cả các câu hoặc "ngữ liệu"
    lexicon = set()
    for doc in corpus:
        lexicon.update([word for word in doc.split ()])
    return lexicon
def tf(term, document):
    return freq(term, document)
def freq(term, document):
    return document.split().count(term)
vocabulary = build_lexicon(mydoclist)
doc_term_matrix = []
print ('Our vocabulary vector is [' + ', '.join(list(vocabulary)) + ']')
for doc in mydoclist:
    print('The doc is "' + doc + '"')
    tf_vector = [tf(word, doc) for word in vocabulary]
    tf_vector_string = ', '.join(format(freq, 'd') for freq in tf_vector)
    print('The tf vector for Document %d is [%s]' % ((mydoclist.index(doc)+1), tf_vector_string))
    doc_term_matrix.append(tf_vector)
print("All combined, here is our master document term matrix: ")
print(doc_term_matrix)

Our vocabulary vector is [Hector, football, He, likes, more, loves, basketball, Mireia, than, Sergio, me]
The doc is "Mireia loves me more than Hector loves me"
The tf vector for Document 1 is [1, 0, 0, 0, 1, 2, 0, 1, 1, 0, 2]
The doc is "Sergio likes me more than Mireia loves me"
The tf vector for Document 2 is [0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 2]
The doc is "He likes basketball more than football"
The tf vector for Document 3 is [0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0]
All combined, here is our master document term matrix: 
[[1, 0, 0, 0, 1, 2, 0, 1, 1, 0, 2], [0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 2], [0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0]]


Cần thực hiện một số chuẩn hóa vectơ

In [146]:
import math
import numpy as np
def l2_normalizer(vec):
    denom = np.sum([el**2 for el in vec])
    return [(el / math.sqrt(denom)) for el in vec]
doc_term_matrix_l2 = []
for vec in doc_term_matrix:
    doc_term_matrix_l2.append(l2_normalizer(vec))
print('A regular old document term matrix:')
print(np.matrix(doc_term_matrix))
print('\nA document term matrix with row-wise L2 norm:')
print(np.matrix(doc_term_matrix_l2))

A regular old document term matrix:
[[1 0 0 0 1 2 0 1 1 0 2]
 [0 0 0 1 1 1 0 1 1 1 2]
 [0 1 1 1 1 0 1 0 1 0 0]]

A document term matrix with row-wise L2 norm:
[[0.28867513 0.         0.         0.         0.28867513 0.57735027
  0.         0.28867513 0.28867513 0.         0.57735027]
 [0.         0.         0.         0.31622777 0.31622777 0.31622777
  0.         0.31622777 0.31622777 0.31622777 0.63245553]
 [0.         0.40824829 0.40824829 0.40824829 0.40824829 0.
  0.40824829 0.         0.40824829 0.         0.        ]]


Thử trọng số mỗi từ bằng tần số tài liệu nghịch đảo của nó

In [147]:
def numDocsContaining(word, doclist):
    doccount = 0
    for doc in doclist:
        if freq(word, doc) > 0:
            doccount += 1
        return doccount
def idf(word, doclist):
    n_samples = len(doclist)
    df = numDocsContaining(word, doclist)
    return np.log(n_samples / df) if df != 0 else 0
my_idf_vector = [idf(word, mydoclist) for word in vocabulary]
print('Our vocabulary vector is [' + ', '.join(list (vocabulary)) + ']')
print('The inverse document frequency vector is [' + ', '.join(format(freq, 'f') for freq in my_idf_vector) + ']')

Our vocabulary vector is [Hector, football, He, likes, more, loves, basketball, Mireia, than, Sergio, me]
The inverse document frequency vector is [1.098612, 0.000000, 0.000000, 0.000000, 1.098612, 1.098612, 0.000000, 1.098612, 1.098612, 0.000000, 1.098612]


Chuyển đổi vectơ IDF của chúng tôi thành một ma trận trong đó đường chéo là vectơ IDF

In [148]:
def build_idf_matrix(idf_vector):
    idf_mat = np.zeros((len(idf_vector), len(idf_vector)))
    np.fill_diagonal(idf_mat, idf_vector)
    return idf_mat
my_idf_matrix = build_idf_matrix(my_idf_vector)
print(my_idf_matrix)

[[1.09861229 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         1.09861229 0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         1.09861229
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         1.09861229 0.         0.         0.        ]
 [0.         0.         0.         0.         0.

Chuẩn hóa từng tài liệu bằng cách sử dụng định mức L2


In [149]:
doc_term_matrix_tfidf = []
#performing tf-idf matrix multiplication
for tf_vector in doc_term_matrix:
    doc_term_matrix_tfidf.append(np.dot(tf_vector, my_idf_matrix))
#normalizing
doc_term_matrix_tfidf_l2 = []
for tf_vector in doc_term_matrix_tfidf:
    doc_term_matrix_tfidf_l2.append(l2_normalizer(tf_vector))
print(vocabulary)
# np.matrix() just to make it easier to look at
print(np.matrix(doc_term_matrix_tfidf_l2))

{'Hector', 'football', 'He', 'likes', 'more', 'loves', 'basketball', 'Mireia', 'than', 'Sergio', 'me'}
[[0.28867513 0.         0.         0.         0.28867513 0.57735027
  0.         0.28867513 0.28867513 0.         0.57735027]
 [0.         0.         0.         0.         0.35355339 0.35355339
  0.         0.35355339 0.35355339 0.         0.70710678]
 [0.         0.         0.         0.         0.70710678 0.
  0.         0.         0.70710678 0.         0.        ]]


## Các trường hợp thực tế

### Chuẩn bị dữ liệu và cài đặt

Cài đặt TextBlog

In [167]:
# Chạy lệnh dưới trên terminal
# pip install -U textblob
# python -m textblob.download_corpora

Code ví dụ

In [152]:
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.classify import NaiveBayesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from unidecode import unidecode
def BoW(text):
    # Tokenizing text
    text_tokenized = [word_tokenize(doc) for doc in text]
    # Removing punctuation
    regex = re.compile('[%s]' % re.escape(string. punctuation))
    tokenized_docs_no_punctuation = []
    for review in text_tokenized:
        new_review = []
        for token in review:
            new_token = regex.sub(u'', token)
            if not new_token == u'':
                new_review.append(new_token)
        tokenized_docs_no_punctuation.append(new_review)
    # Stemming and Lemmatizing
    porter = PorterStemmer()
    preprocessed_docs = []
    for doc in tokenized_docs_no_punctuation:
        final_doc = ''
        for word in doc:
            final_doc = final_doc + ' ' + porter. stem(word)
        preprocessed_docs.append(final_doc)
    return preprocessed_docs

# read your train text data here
# textTrain = ReadTrainDataText()
# preprocessed_docs = BoW(textTrain) # for train data # Computing TIDF word space
# tfidf_vectorizer = TfidfVectorizer(min_df = 1)
# trainData = tfidf_vectorizer.fit_transform(preprocessed_docs)
# textTest = ReadTestDataText() #read your test text data here
# prepro_docs_test = BoW(textTest) # for test data testData = tfidf_vectorizer.transform(prepro_docs_test)

### Phân tích tình cảm khi đánh giá phim