***Keras***


In [3]:
import pandas as pd
import numpy as np
import re
import nltk
from keras.preprocessing.text import Tokenizer
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from keras.models import Input, Model
from keras.layers import LSTM, Dense, Embedding, concatenate, Dropout, concatenate
from keras.layers import Bidirectional
from sklearn.metrics import accuracy_score, f1_score

Đọc và Xử Lý Dữ Liệu

In [4]:
dataset=pd.read_csv('/content/mynew (9).csv',delimiter=',', error_bad_lines=False)
dataset=dataset.replace('\0','')

In [5]:
dataset2=pd.read_excel('/content/Constraint_English_Val.xlsx')
dataset2=dataset2.replace('\0','')

In [6]:
stop_words = []
stop_words = pd.read_csv('/content/stop_words.txt', sep='\n', header=None)[0].tolist()

In [7]:
def clean_text(string: str, punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_~''',stop_words=stop_words) -> str:
    """
    Làm sạch dữ liệu  
    """
    # Xóa urls
    string = re.sub(r'https?://\S+|www\.\S+', '', string)

    # Xóa thẻ html
    string = re.sub(r'<.*?>', '', string)

    # Xóa dấu câu
    for x in string.lower(): 
        if x in punctuations: 
            string = string.replace(x, "") 

    # Chuyển về kiểu chữ thường
    string = string.lower()

    # Xóa từ trong stop word
    string = ' '.join([word for word in string.split() if word not in stop_words])

    # Xóa khoảng trắng
    string = re.sub(r'\s+', ' ', string).strip()

    return string

In [8]:
corpus = []
for review in dataset.values[:, 1]:
    review=clean_text(review)
    corpus.append(review)
X_train=corpus
Y_train = dataset.iloc[:,2:3]
labelencoder_z = LabelEncoder()
Y_train.iloc[:,0] = labelencoder_z.fit_transform(Y_train.iloc[:,0])

In [9]:
corpus1 = []
for review in dataset2.values[:, 1]:
    review=clean_text(review)
    corpus1.append(review)
X_test=corpus1
Y_test = dataset2.iloc[:,2:3]
labelencoder_z = LabelEncoder()
Y_test.iloc[:,0] = labelencoder_z.fit_transform(Y_test.iloc[:,0])

In [10]:
class Embeddings():
    """
    Lớp đọc file word embedding và tạo ma trận dựa trên file đó
    """

    def __init__(self, path, vector_dimension):
        self.path = path 
        self.vector_dimension = vector_dimension
    
    @staticmethod
    # Chuyển đầu vào thành một mảng
    def get_coefs(word, *arr): 
        return word, np.asarray(arr, dtype='float32')
    # Sử dụng dụng file word embedding có sẵn tạo thành từ điển
    def get_embedding_index(self):
        embeddings_index = dict(self.get_coefs(*o.split(" ")) for o in open(self.path, errors='ignore'))
        return embeddings_index
    # Tạo ma trận
    def create_embedding_matrix(self, tokenizer, max_features):
        """
        Hàm tạo ma trận embedding
        """
        model_embed = self.get_embedding_index()

        embedding_matrix = np.zeros((max_features + 1, self.vector_dimension))
        for word, index in tokenizer.word_index.items():
            if index > max_features:
                break
            else:
                try:
                    embedding_matrix[index] = model_embed[word]
                except:
                    continue
        return embedding_matrix

In [11]:

class TextToTensor():
    """ 
    Lớp chuyển đổi từ text sang số và tạo thành ma trận
    """
    def __init__(self, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len

    def string_to_tensor(self, string_list: list) -> list:
        """
        Hàm chuyển đổi từ text sang vecto
        """    
        string_list = self.tokenizer.texts_to_sequences(string_list)
        string_list = pad_sequences(string_list, maxlen=self.max_len)
        
        return string_list

In [12]:
class RnnModel():
    """
    RNN để phân tích ngữ nghĩa
    """

    def __init__(self, embedding_matrix, embedding_dim, max_len, X_additional=None):
        
        inp1 = Input(shape=(max_len,))
        x = Embedding(embedding_matrix.shape[0], embedding_dim, weights=[embedding_matrix])(inp1)
        x = Bidirectional(LSTM(256, return_sequences=True))(x)
        x = Bidirectional(LSTM(150))(x)
        x = Dense(128, activation="relu")(x)
        x = Dropout(0.1)(x)
        x = Dense(64, activation="relu")(x)
        x = Dense(1, activation="sigmoid")(x)    
        model = Model(inputs=inp1, outputs=x)

        model.compile(loss = 'binary_crossentropy', optimizer = 'adam')
        self.model = model

In [13]:

class Pipeline:
    """
    Class Pipeline
    """
    def __init__(
        self, 
        X_train: list, 
        Y_train: list, 
        embed_path: str, 
        embed_dim: int,
        stop_words=[],
        X_test=[], 
        Y_test=[],
        max_len=None,
        epochs=3,
        batch_size=256
        ):

        # Tiền Xử Lý Văn Bản
        X_train = [clean_text(text, stop_words=stop_words) for text in X_train]
        Y_train = np.asarray(Y_train)
        
        # Mã hóa văn bản
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(X_train)

        # Lưu tokenizer
        self.tokenizer = tokenizer

        # Tạo Ma Trận Embeddings
        embedding = Embeddings(embed_path, embed_dim)
        embedding_matrix = embedding.create_embedding_matrix(tokenizer, len(tokenizer.word_counts))

        # Tạo đầu vào cho mô hình
        if max_len is None:
            max_len = np.max([len(text.split()) for text in X_train])
        TextToTensor_instance = TextToTensor(
            tokenizer=tokenizer, 
            max_len=max_len
            )
        X_train = TextToTensor_instance.string_to_tensor(X_train)

        # Tạo Model
        rnn = RnnModel(
            embedding_matrix=embedding_matrix, 
            embedding_dim=embed_dim, 
            max_len=max_len
        )
        rnn.model.fit(
            X_train,
            Y_train, 
            batch_size=batch_size, 
            epochs=epochs
        )

        self.model = rnn.model

        # Nếu có X_test sẽ đưa ra dự đoán
        if len(X_test)>0:
            X_test = [clean_text(text) for text in X_test]
            X_test = TextToTensor_instance.string_to_tensor(X_test)
            yhat = [x[0] for x in rnn.model.predict(X_test).tolist()]
            
            self.yhat = yhat

            # So sánh kết quả dự đoán và thực tế
            if len(Y_test)>0:
                self.acc = accuracy_score(Y_test, [1 if x > 0.5 else 0 for x in yhat])
                self.f1 = f1_score(Y_test, [1 if x > 0.5 else 0 for x in yhat])

In [14]:
import yaml
with open("conf.yml", 'r') as file:
    conf = yaml.safe_load(file).get('pipeline')

In [15]:
results = Pipeline(
    X_train=X_train,
    Y_train=Y_train, 
    embed_path='/content/glove.42B.300d.txt',
    embed_dim=300,
    stop_words=stop_words,
    X_test=X_test,
    Y_test=Y_test,
    max_len=conf.get('max_len'),
    epochs=conf.get('epochs'),
    batch_size=conf.get('batch_size')
)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [22]:
print(results.acc)

0.922429906542056


In [17]:
Y_pred=[1 if x > 0.5 else 0 for x in results.yhat]

In [23]:
from sklearn import metrics
print(metrics.classification_report(Y_test,Y_pred,digits=5))

              precision    recall  f1-score   support

           0    0.91618   0.92157   0.91887      1020
           1    0.92819   0.92321   0.92569      1120

    accuracy                        0.92243      2140
   macro avg    0.92218   0.92239   0.92228      2140
weighted avg    0.92246   0.92243   0.92244      2140



In [24]:

test1  = [ "ngồi một mình trong đêm" ]
test2  = [ "what is covid" ]

In [25]:
TextToTensor_instance = TextToTensor(tokenizer=results.tokenizer,max_len=20)

test1_nn = TextToTensor_instance.string_to_tensor(test1)
test2_nn = TextToTensor_instance.string_to_tensor(test2)

p_test1 = results.model.predict(test1_nn)[0][0]
p_test2 = results.model.predict(test2_nn)[0][0]

In [26]:
print(f'Sentence: {p_test1} Score: {p_test1}')
print(f'Sentence: {p_test2} Score: {p_test2}')

Sentence: 0.026009470224380493 Score: 0.026009470224380493
Sentence: 0.539668619632721 Score: 0.539668619632721


In [None]:
# Saving the predictions
test['prob_is_genuine'] = results.yhat
test['target'] = [1 if x > 0.5 else 0 for x in results.yhat]
 
# Saving the predictions to a csv file
if conf.get('save_results'):
    if not os.path.isdir('output'):
        os.mkdir('output')    
    test[['id', 'target']].to_csv(f'output/submission_{date.today()}.csv', index=False)

In [None]:
!wget http://nlp.stanford.edu/data/glove.42B.300d.zip

--2020-12-14 17:08:37--  http://nlp.stanford.edu/data/glove.42B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.42B.300d.zip [following]
--2020-12-14 17:08:37--  https://nlp.stanford.edu/data/glove.42B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip [following]
--2020-12-14 17:08:38--  http://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1877800501 (1.7G) [application/zip]
Sav

In [2]:
!unzip glove*.zip

Archive:  glove.42B.300d.zip
  inflating: glove.42B.300d.txt      
