# Xử lý ngôn ngữ tự nhiên thống kê để phân tích cảm xúc

## Làm sạch dữ liệu

Dữ liệu đầu vào

In [116]:
raw_docs = [
    "Here are some very simple basic sentences.", # Đây là một số câu cơ bản rất đơn giản.
    "They won’t be very interesting, I’m afraid.", # Tôi e rằng chúng sẽ không thú vị cho lắm.
    "The point of these examples is to _learn how basic text \ cleaning works_ on *very simple* data." # Mục đích của những ví dụ này là _tìm hiểu cách thức hoạt động của văn bản cơ bản \ làm sạch_ trên dữ liệu *rất đơn giản*.
]

Cài đặt NLTK

In [117]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Chuyển đổi dữ liệu dưới dạng chuỗi thành vectơ từ

In [118]:
from nltk.tokenize import word_tokenize

tokenized_docs = [word_tokenize(doc) for doc in raw_docs]
print(tokenized_docs)

[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences', '.'], ['They', 'won', '’', 't', 'be', 'very', 'interesting', ',', 'I', '’', 'm', 'afraid', '.'], ['The', 'point', 'of', 'these', 'examples', 'is', 'to', '_learn', 'how', 'basic', 'text', '\\', 'cleaning', 'works_', 'on', '*', 'very', 'simple', '*', 'data', '.']]


Tìm kiếm trong dữ liệu các ký hiệu dấu câu, ký tự đặc biệt và loại bỏ chúng

In [119]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [120]:
import re
import string
regex = re.compile('[%s]' % re.escape(string.punctuation))
tokenized_docs_no_punctuation = []
for review in tokenized_docs:
    new_review = []
    for token in review:
        new_token = regex.sub(u'', token)
        if not new_token == u'':
            new_review.append(new_token)
    tokenized_docs_no_punctuation.append(new_review)
print(tokenized_docs_no_punctuation)

[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences'], ['They', 'won', '’', 't', 'be', 'very', 'interesting', 'I', '’', 'm', 'afraid'], ['The', 'point', 'of', 'these', 'examples', 'is', 'to', 'learn', 'how', 'basic', 'text', 'cleaning', 'works', 'on', 'very', 'simple', 'data']]


Tạo gốc và xác định gốc

In [121]:
from nltk.stem.porter import PorterStemmer
# from nltk.stem.snowball import SnowballStemmer
# from nltk.stem.wordnet import WordNetLemmatizer
porter = PorterStemmer()
# snowball = SnowballStemmer("english")
# wordnet = WordNetLemmatizer()
# each of the following commands perform stemming on word
# porter.stem(word)
# snowball.stem(word)
# wordnet.lemmatize(word)

Một quy trình làm sạch dữ liệu rất hữu ích khác bao gồm xóa các thẻ và thực thể HTML

In [122]:
import nltk
from bs4 import BeautifulSoup
test_string ="<p>While many of the stories tugged at the heartstrings, I never felt manipulated by the authors. (Note: Part of the reason why I don’t like the ’Chicken Soup for the Soul’ series is that I feel that the authors are just dying to make the reader clutch for the box of tissues.)</a>"
soup = BeautifulSoup(test_string, 'html.parser')
print("Original text:")
print(test_string)
print("Cleaned text:")
print(soup.get_text())

Original text:
<p>While many of the stories tugged at the heartstrings, I never felt manipulated by the authors. (Note: Part of the reason why I don’t like the ’Chicken Soup for the Soul’ series is that I feel that the authors are just dying to make the reader clutch for the box of tissues.)</a>
Cleaned text:
While many of the stories tugged at the heartstrings, I never felt manipulated by the authors. (Note: Part of the reason why I don’t like the ’Chicken Soup for the Soul’ series is that I feel that the authors are just dying to make the reader clutch for the box of tissues.)


## Trình bày văn bản

Ví dụ về biểu diễn BoW cho hai văn bản

In [123]:
mydoclist = [
    'Mireia loves me more than Hector loves me',
    'Sergio likes me more than Mireia loves me',
    'He likes basketball more than football',
]
from collections import Counter
for doc in mydoclist:
    tf = Counter()
    for word in doc.split():
        tf[word] += 1
    print(tf.items())

dict_items([('Mireia', 1), ('loves', 2), ('me', 2), ('more', 1), ('than', 1), ('Hector', 1)])
dict_items([('Sergio', 1), ('likes', 1), ('me', 2), ('more', 1), ('than', 1), ('Mireia', 1), ('loves', 1)])
dict_items([('He', 1), ('likes', 1), ('basketball', 1), ('more', 1), ('than', 1), ('football', 1)])


In [124]:
c = Counter() # a new, empty counter
c = Counter('gallahad') # a new counter from an iterable

Trả về số không cho các mục bị thiếu thay vì tăng Key Error (lỗi chính).

In [125]:
c = Counter(['eggs', 'ham'])
c['bacon']

0

Một ví dụ để tính toán trình đánh giá tính năng dựa trên tần số từ

In [126]:
def build_lexicon(corpus):
    # define a set with all possible words included in all the sentences or "corpus"
    # xác định một tập hợp với tất cả các từ có thể có trong tất cả các câu hoặc "ngữ liệu"
    lexicon = set()
    for doc in corpus:
        lexicon.update([word for word in doc.split ()])
    return lexicon
def tf(term, document):
    return freq(term, document)
def freq(term, document):
    return document.split().count(term)
vocabulary = build_lexicon(mydoclist)
doc_term_matrix = []
print ('Our vocabulary vector is [' + ', '.join(list(vocabulary)) + ']')
for doc in mydoclist:
    print('The doc is "' + doc + '"')
    tf_vector = [tf(word, doc) for word in vocabulary]
    tf_vector_string = ', '.join(format(freq, 'd') for freq in tf_vector)
    print('The tf vector for Document %d is [%s]' % ((mydoclist.index(doc)+1), tf_vector_string))
    doc_term_matrix.append(tf_vector)
print("All combined, here is our master document term matrix: ")
print(doc_term_matrix)

Our vocabulary vector is [Mireia, than, Sergio, football, likes, loves, more, Hector, He, basketball, me]
The doc is "Mireia loves me more than Hector loves me"
The tf vector for Document 1 is [1, 1, 0, 0, 0, 2, 1, 1, 0, 0, 2]
The doc is "Sergio likes me more than Mireia loves me"
The tf vector for Document 2 is [1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 2]
The doc is "He likes basketball more than football"
The tf vector for Document 3 is [0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0]
All combined, here is our master document term matrix: 
[[1, 1, 0, 0, 0, 2, 1, 1, 0, 0, 2], [1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 2], [0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0]]


Cần thực hiện một số chuẩn hóa vectơ

In [127]:
import math
import numpy as np
def l2_normalizer(vec):
    denom = np.sum([el**2 for el in vec])
    return [(el / math.sqrt(denom)) for el in vec]
doc_term_matrix_l2 = []
for vec in doc_term_matrix:
    doc_term_matrix_l2.append(l2_normalizer(vec))
print('A regular old document term matrix:')
print(np.matrix(doc_term_matrix))
print('\nA document term matrix with row-wise L2 norm:')
print(np.matrix(doc_term_matrix_l2))

A regular old document term matrix:
[[1 1 0 0 0 2 1 1 0 0 2]
 [1 1 1 0 1 1 1 0 0 0 2]
 [0 1 0 1 1 0 1 0 1 1 0]]

A document term matrix with row-wise L2 norm:
[[0.28867513 0.28867513 0.         0.         0.         0.57735027
  0.28867513 0.28867513 0.         0.         0.57735027]
 [0.31622777 0.31622777 0.31622777 0.         0.31622777 0.31622777
  0.31622777 0.         0.         0.         0.63245553]
 [0.         0.40824829 0.         0.40824829 0.40824829 0.
  0.40824829 0.         0.40824829 0.40824829 0.        ]]


Thử trọng số mỗi từ bằng tần số tài liệu nghịch đảo của nó

In [128]:
def numDocsContaining(word, doclist):
    doccount = 0
    for doc in doclist:
        if freq(word, doc) > 0:
            doccount += 1
        return doccount
def idf(word, doclist):
    n_samples = len(doclist)
    df = numDocsContaining(word, doclist)
    return np.log(n_samples / df) if df != 0 else 0
my_idf_vector = [idf(word, mydoclist) for word in vocabulary]
print('Our vocabulary vector is [' + ', '.join(list (vocabulary)) + ']')
print('The inverse document frequency vector is [' + ', '.join(format(freq, 'f') for freq in my_idf_vector) + ']')

Our vocabulary vector is [Mireia, than, Sergio, football, likes, loves, more, Hector, He, basketball, me]
The inverse document frequency vector is [1.098612, 1.098612, 0.000000, 0.000000, 0.000000, 1.098612, 1.098612, 1.098612, 0.000000, 0.000000, 1.098612]


Chuyển đổi vectơ IDF của chúng tôi thành một ma trận trong đó đường chéo là vectơ IDF

In [129]:
def build_idf_matrix(idf_vector):
    idf_mat = np.zeros((len(idf_vector), len(idf_vector)))
    np.fill_diagonal(idf_mat, idf_vector)
    return idf_mat
my_idf_matrix = build_idf_matrix(my_idf_vector)
print(my_idf_matrix)

[[1.09861229 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         1.09861229 0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         1.09861229
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  1.09861229 0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         1.09861229 0.         0.         0.        ]
 [0.         0.         0.         0.         0.

Chuẩn hóa từng tài liệu bằng cách sử dụng định mức L2


In [130]:
doc_term_matrix_tfidf = []
#performing tf-idf matrix multiplication
for tf_vector in doc_term_matrix:
    doc_term_matrix_tfidf.append(np.dot(tf_vector, my_idf_matrix))
#normalizing
doc_term_matrix_tfidf_l2 = []
for tf_vector in doc_term_matrix_tfidf:
    doc_term_matrix_tfidf_l2.append(l2_normalizer(tf_vector))
print(vocabulary)
# np.matrix() just to make it easier to look at
print(np.matrix(doc_term_matrix_tfidf_l2))

{'Mireia', 'than', 'Sergio', 'football', 'likes', 'loves', 'more', 'Hector', 'He', 'basketball', 'me'}
[[0.28867513 0.28867513 0.         0.         0.         0.57735027
  0.28867513 0.28867513 0.         0.         0.57735027]
 [0.35355339 0.35355339 0.         0.         0.         0.35355339
  0.35355339 0.         0.         0.         0.70710678]
 [0.         0.70710678 0.         0.         0.         0.
  0.70710678 0.         0.         0.         0.        ]]


## Các trường hợp thực tế

### Chuẩn bị dữ liệu và cài đặt

Cài đặt TextBlog

In [131]:
# Chạy lệnh dưới trên terminal
# pip install -U textblob
# python -m textblob.download_corpora

Code ví dụ

In [132]:
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.classify import NaiveBayesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
# from unidecode import unidecode
def BoW(text):
    # Tokenizing text
    text_tokenized = [word_tokenize(doc) for doc in text]
    # Removing punctuation
    regex = re.compile('[%s]' % re.escape(string. punctuation))
    tokenized_docs_no_punctuation = []
    for review in text_tokenized:
        new_review = []
        for token in review:
            new_token = regex.sub(u'', token)
            if not new_token == u'':
                new_review.append(new_token)
        tokenized_docs_no_punctuation.append(new_review)
    # Stemming and Lemmatizing
    porter = PorterStemmer()
    preprocessed_docs = []
    for doc in tokenized_docs_no_punctuation:
        final_doc = ''
        for word in doc:
            final_doc = final_doc + ' ' + porter. stem(word)
        preprocessed_docs.append(final_doc)
    return preprocessed_docs

# read your train text data here
# textTrain = ReadTrainDataText()
# preprocessed_docs = BoW(textTrain) # for train data # Computing TIDF word space
# tfidf_vectorizer = TfidfVectorizer(min_df = 1)
# trainData = tfidf_vectorizer.fit_transform(preprocessed_docs)
# textTest = ReadTestDataText() #read your test text data here
# prepro_docs_test = BoW(textTest) # for test data testData = tfidf_vectorizer.transform(prepro_docs_test)

### Phân tích tình cảm khi đánh giá phim

#### Import module

In [133]:
import numpy as np
import pandas as pd
import os
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

#### Loading data

In [134]:
reviews_train = []
for line in open('./aclImdb/movie_data/full_train.txt', 'r'):
    
    reviews_train.append(line.strip())
    
reviews_test = []
for line in open('./aclImdb/movie_data/full_test.txt', 'r'):
    
    reviews_test.append(line.strip())
    
target = [1 if i < 12500 else 0 for i in range(25000)]

Reviews data

In [135]:
reviews_train[10]

'When I first read Armistead Maupins story I was taken in by the human drama displayed by Gabriel No one and those he cares about and loves. That being said, we have now been given the film version of an excellent story and are expected to see past the gloss of Hollywood...<br /><br />Writer Armistead Maupin and director Patrick Stettner have truly succeeded! <br /><br />With just the right amount of restraint Robin Williams captures the fragile essence of Gabriel and lets us see his struggle with issues of trust both in his personnel life(Jess) and the world around him(Donna).<br /><br />As we are introduced to the players in this drama we are reminded that nothing is ever as it seems and that the smallest event can change our lives irrevocably. The request to review a book written by a young man turns into a life changing event that helps Gabriel find the strength within himself to carry on and move forward.<br /><br />It\'s to bad that most people will avoid this film. I only say th

#### Làm sạch và tiền xử lý

In [136]:
REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    
    reviews = [REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in reviews]
    
    return reviews

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

Reviews data

In [137]:
reviews_train_clean[10]

'when i first read armistead maupins story i was taken in by the human drama displayed by gabriel no one and those he cares about and loves that being said we have now been given the film version of an excellent story and are expected to see past the gloss of hollywood writer armistead maupin and director patrick stettner have truly succeeded  with just the right amount of restraint robin williams captures the fragile essence of gabriel and lets us see his struggle with issues of trust both in his personnel lifejess and the world around himdonna as we are introduced to the players in this drama we are reminded that nothing is ever as it seems and that the smallest event can change our lives irrevocably the request to review a book written by a young man turns into a life changing event that helps gabriel find the strength within himself to carry on and move forward its to bad that most people will avoid this film i only say that because the average american will probably think robin wi

Vectơ hóa

In [138]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=True)
cv.fit(reviews_train_clean)
X = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)

#### Xây dựng bộ phân loại

In [139]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

target = [1 if i < 12500 else 0 for i in range(25000)]

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s: %s" % (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.87024
Accuracy for C=0.05: 0.88272


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for C=0.25: 0.87808


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for C=0.5: 0.87392
Accuracy for C=1: 0.87088


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Mô hình cuối cùng

In [140]:
final_model = LogisticRegression(C = 0.05)
final_model.fit(X, target)
print("Final Accuracy: %s" % accuracy_score(target, final_model.predict(X_test)))

Final Accuracy: 0.88144


Xem xét các hệ số lớn nhất và nhỏ nhất, tương ứng

In [141]:
feature_to_coef = {
    word: coef for word, coef in zip(
        cv.get_feature_names(), final_model.coef_[0]
    )
}
for best_positive in sorted(
    feature_to_coef.items(), 
    key = lambda x: x[1], 
    reverse = True)[:5]:
    print (best_positive)
    
#     ('excellent', 0.9288812418118644)
#     ('perfect', 0.7934641227980576)
#     ('great', 0.675040909917553)
#     ('amazing', 0.6160398142631545)
#     ('superb', 0.6063967799425831)
    
for best_negative in sorted(
    feature_to_coef.items(), 
    key = lambda x: x[1])[:5]:
    print (best_negative)
    
#     ('worst', -1.367978497228895)
#     ('waste', -1.1684451288279047)
#     ('awful', -1.0277001734353677)
#     ('poorly', -0.8748317895742782)
#     ('boring', -0.8587249740682945)

('excellent', 0.9283544427618067)
('perfect', 0.7944277472574596)
('great', 0.6745553291414698)
('amazing', 0.6164834476214972)
('superb', 0.6055919831751899)
('worst', -1.3679897533477545)
('waste', -1.1688808995878297)
('awful', -1.0273337384921135)
('poorly', -0.8748022417025753)
('boring', -0.8591221031199628)
