<a href="https://colab.research.google.com/github/hasiburrahman1/NLP/blob/master/factroid_type.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np 
import pandas as pd 
import os
import re
import matplotlib.pyplot as plt
import xml.etree.ElementTree as et 
import tensorflow as tf 
import tensorflow_addons as tfa
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, SimpleRNN, GRU, LSTM, Bidirectional, Dropout, Input, Conv2D, MaxPool2D
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.optimizers import Adam
from keras.models import Model
from keras.layers.convolutional import Conv1D
from keras.layers import Conv1D, Dense, MaxPool1D, Flatten, Input, GlobalMaxPooling1D

#### Convert XML into DataFrame


In [None]:
# Parse joined.xml and collect one (qtype, question, answer) triple per
# <QAPair> element into three parallel lists.
xtree = et.parse("/content/drive/My Drive/Colab Notebooks/MedQuAD_3_GHR_QA/joined.xml")
xroot = xtree.getroot()
lst = xroot.findall('QAPairs/QAPair')
data1 = []  # question types (the `qtype` attribute)
data2 = []  # question texts
data3 = []  # answer texts
print(len(lst))
print("\n")

for item in lst:
    # The qtype is read from the first child element that carries the attribute.
    qtype = next(
        (child.get('qtype') for child in item if child.get('qtype') is not None),
        None,
    )
    if qtype is None:
        # Skip pairs without a qtype: appending only their question/answer
        # would shift every later row against data1 once the lists are zipped.
        continue
    data1.append(qtype)
    data2.append(item.find('Question').text)
    data3.append(item.find('Answer').text)

In [None]:
# Assemble the parsed columns into a frame, then discard the column at
# position 2 (the answers) so only the label and the question text remain.
rows = list(zip(data1, data2, data3))
df = pd.DataFrame(rows, columns=['qtype', 'Question', 'Answer'])

answer_column = df.columns[[2]]
df.drop(answer_column, axis=1, inplace=True)
print(len(df))
df

### Basic data cleaning

In [None]:
# Write the current frame to data.csv in the working directory.
df.to_csv('data.csv')
# IPython shell escape: copy the CSV into "drive/My Drive/"
# (presumably the mounted Google Drive folder -- confirm the mount point).
!cp data.csv "drive/My Drive/"

In [None]:
# Show how many questions fall under each qtype label.
df["qtype"].value_counts()

In [None]:
## categorical to numerical
# Each question type is mapped to its position in this tuple.
_QTYPE_ORDER = (
    "genetic changes",
    "inheritance",
    "frequency",
    "information",
    "treatment",
)
encode = {label: code for code, label in enumerate(_QTYPE_ORDER)}

In [None]:
def _label_to_code(label):
    """Return the integer code registered for *label* in ``encode``."""
    return encode[label]


# Swap every textual qtype for its integer code; a label that is missing from
# ``encode`` raises KeyError.
df["qtype"] = df["qtype"].apply(_label_to_code)
df.head()

### Data cleaning for NLP

In [None]:
import nltk
# Download the NLTK resources this notebook relies on: 'punkt' and 'stopwords'
# (word_tokenize and stopwords.words are used further down).
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# URL matcher. Written as a raw string so that "\(" and "\)" reach the regex
# engine unchanged instead of being invalid string escapes.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)

# Contraction expansions, applied top to bottom.
_CONTRACTIONS = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"'ll", " will"),
    (r"'ve", " have"),
    (r"'re", " are"),
    (r"'d", " would"),
    (r"won't", "will not"),
    (r"don't", "do not"),
    (r"didn't", "did not"),    # was "did't", which never matched "didn't"
    (r"can't", "can not"),
    (r"it's", "it is"),
    (r"couldn't", "could not"),
    (r"haven't", "have not"),  # was "have't", which never matched "haven't"
)

# Punctuation removed once the contractions have been expanded.
_PUNCTUATION_PATTERN = re.compile(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-]")


def clean_text(text):
    """Normalise one piece of raw text for tokenization.

    Steps, in order: lower-case, delete URLs, drop whitespace-separated tokens
    that start with ``@``, expand common English contractions, and delete the
    punctuation characters listed in ``_PUNCTUATION_PATTERN``.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lower-cased string. Whitespace is collapsed to single
        spaces by the split/join step; a space left behind by removed
        punctuation is not stripped.
    """
    text = text.lower()
    text = _URL_PATTERN.sub('', text)
    # split() never yields empty tokens, so indexing token[0] is safe.
    text = " ".join(token for token in text.split() if token[0] != '@')

    for pattern, replacement in _CONTRACTIONS:
        text = re.sub(pattern, replacement, text)

    return _PUNCTUATION_PATTERN.sub("", text)

In [None]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def CleanTokenize(df):
    """Clean and tokenize every entry of ``df["Question"]``.

    Each question is normalised with ``clean_text``, split with NLTK's
    ``word_tokenize``, stripped of ``string.punctuation`` characters, and
    reduced to purely alphabetic tokens that are not English stop words.

    Args:
        df: DataFrame with a ``Question`` column of strings.

    Returns:
        A list holding one list of tokens per question, in row order.
    """
    # Loop-invariant helpers: built once instead of once per question.
    table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words("english"))

    ques_lines = []
    for line in df["Question"].values.tolist():
        tokens = word_tokenize(clean_text(line))
        # remove punctuation, then keep only alphabetic non-stop-word tokens
        stripped = (w.translate(table) for w in tokens)
        words = [w for w in stripped if w.isalpha() and w not in stop_words]
        ques_lines.append(words)

    return ques_lines

# Tokenize the whole corpus, report its size and preview the first ten rows.
ques_lines = CleanTokenize(df)
total_lines = len(ques_lines)
print("total line are ", total_lines)
ques_lines[:10]

In [None]:
# Token count of the longest cleaned question.
max(map(len, ques_lines))

### Train-test split

In [None]:
validation_split = 0.2
max_length = 15  # length passed to pad_sequences for every question


# Build the vocabulary from the tokenized questions and encode them as ids.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(ques_lines)
sequences = tokenizer_obj.texts_to_sequences(ques_lines)

word_index = tokenizer_obj.word_index
print("unique tokens - ",len(word_index))
vocab_size = len(tokenizer_obj.word_index) + 1
print('vocab size -', vocab_size)

ques_lines_pad = pad_sequences(sequences, maxlen=max_length, padding='post')
qtype =  df['qtype'].values

# Shuffle features and labels with one shared permutation so they stay paired.
indices = np.arange(ques_lines_pad.shape[0])
np.random.shuffle(indices)
ques_lines_pad = ques_lines_pad[indices]
qtype = qtype[indices]

# One-hot encode the integer labels: row i of the identity matrix is class i.
n_values = np.max(qtype) + 1

Y = np.eye(n_values)[qtype]


num_validation_samples = int(validation_split * ques_lines_pad.shape[0])
# Split on a non-negative index. With negative-index slicing, a validation
# count of 0 turns into [:-0] / [-0:], which empties the training set and
# sends every row to the test set.
split_at = ques_lines_pad.shape[0] - num_validation_samples

X_train_pad = ques_lines_pad[:split_at]
y_train = Y[:split_at]
X_test_pad = ques_lines_pad[split_at:]
y_test = Y[split_at:]

In [None]:
# Report the dimensions of each split.
for caption, array in (
    ('Shape of X_train_pad:', X_train_pad),
    ('Shape of y_train:', y_train),
    ('Shape of X_test_pad:', X_test_pad),
    ('Shape of y_test:', y_test),
):
    print(caption, array.shape)

### Custom word2vec model

In [None]:
import gensim
# Train skip-gram word2vec embeddings (100 dimensions, window 5, every token
# kept via min_count=1) on the tokenized questions.
# NOTE(review): `size=` and `model.wv.vocab` look like gensim 3.x names
# (`vector_size` / `key_to_index` in gensim 4) -- confirm the pinned version.
model = gensim.models.Word2Vec(sentences=ques_lines, size=100, window=5, workers=4, min_count=1, sg=1) #sg= 1:skip-gram 0:cbow
vocab_words = list(model.wv.vocab)   

# Print the vocabulary size followed by the vocabulary itself.
print(len(vocab_words))
print(vocab_words)

In [None]:
# Export the trained vectors in word2vec text (non-binary) format; the relative
# filename resolves against the current working directory.
filename = "questype_word2vec_full.txt"
model.wv.save_word2vec_format(filename, binary=False)

In [None]:
# Read the saved word2vec text file back into a {word: vector} lookup.
embedding_index = {}
# `with` guarantees the handle is closed even if a line fails to parse.
with open('/content/questype_word2vec_full.txt', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if len(values) < 3:
            # Skips blank lines and the "<vocab_size> <vector_size>" header the
            # word2vec text format starts with; neither is an embedding.
            continue
        word = values[0]
        coeff = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coeff

In [None]:
# Build the weight table for the embedding layer: row `row` holds the vector of
# the token whose tokenizer id is `row`; tokens without a vector stay all-zero.
embedding_dim = 100
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for token, row in word_index.items():
    vector = embedding_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [None]:
# Sanity check: expected to be (len(word_index) + 1, embedding_dim).
embedding_matrix.shape

In [None]:
# Non-trainable lookup layer initialised from the word2vec matrix. The same
# layer object is added to every model built below.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_length,
    trainable=False,
)

## Create LSTM model

In [None]:
# https://rdrr.io/cran/tfaddons/man/metrics_f1score.html
# https://github.com/tensorflow/addons/issues/825
# LSTM classifier: shared embedding layer -> LSTM -> dense -> 5-way softmax.
model1 = Sequential()
model1.add(embedding_layer)
# Recurrent layer
model1.add(LSTM(100, return_sequences=False,
               dropout=0.1, recurrent_dropout=0.1))

# Fully connected layer
model1.add(Dense(100, activation='relu'))

# Dropout for regularization
model1.add(Dropout(0.5))

# Output layer: one unit per question type
model1.add(Dense(5, activation='softmax'))

model1.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tfa.metrics.FBetaScore(num_classes=5, average="macro", threshold=0.5),
    ],
)

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line.
model1.summary()



### LSTM model training

In [None]:
# Train the LSTM classifier, evaluating on the held-out split after each epoch.
history = model1.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_data=(X_test_pad, y_test),
)

In [None]:

from matplotlib import pyplot

# Overlay the per-epoch training and validation loss recorded in `history`.
for series in ('loss', 'val_loss'):
    pyplot.plot(history.history[series])

pyplot.title('model train vs validation loss')
pyplot.xlabel('epoch')
pyplot.ylabel('loss')
pyplot.legend(['train', 'validation'], loc='upper right')
pyplot.show()


#### Bidirectional LSTM

In [None]:

# Bidirectional LSTM classifier: shared embedding layer -> BiLSTM -> 5-way softmax.
model2 = Sequential()
model2.add(embedding_layer)

model2.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.25)))
model2.add(Dense(5, activation='softmax'))

model2.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tfa.metrics.FBetaScore(num_classes=5, average="macro", threshold=0.5),
    ],
)

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line.
model2.summary()


In [None]:
# Train the bidirectional LSTM; `history` is overwritten with this run's curves.
history = model2.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_data=(X_test_pad, y_test),
)

### GRU

In [None]:

#https://www.programcreek.com/python/example/97114/keras.layers.recurrent.GRU

# GRU classifier: dropout is applied both before and after the recurrent layer.
model3 = Sequential([
    embedding_layer,
    Dropout(0.5),
    GRU(100, return_sequences=False),
    Dropout(0.5),
    Dense(5),
    Activation('softmax'),
])

model3.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tfa.metrics.FBetaScore(num_classes=5, average="macro", threshold=.5),
    ],
)
model3.summary()


In [None]:
# Train the GRU classifier; `history` is overwritten with this run's curves.
history = model3.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=60,
    epochs=10,
    verbose=1,
    validation_data=(X_test_pad, y_test),
)

### Simple RNN

In [None]:

#https://medium.com/@hemantranvir/spam-detection-using-rnn-simplernn-lstm-with-step-by-step-explanation-530367608071
# Plain recurrent classifier: shared embedding layer -> SimpleRNN -> 5-way softmax.
model4 = Sequential([
    embedding_layer,
    SimpleRNN(units=100, dropout=0.2, recurrent_dropout=0.25),
    Dense(5, activation='softmax'),
])
model4.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tfa.metrics.FBetaScore(num_classes=5, average="macro", threshold=0.5),
    ],
)
model4.summary()


In [None]:
# Train the SimpleRNN classifier; `history` is overwritten with this run's curves.
history = model4.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=60,
    epochs=10,
    verbose=1,
    validation_data=(X_test_pad, y_test),
)

#### CNN

In [None]:
#https://github.com/keras-team/keras/blob/master/examples/imdb_cnn.py----------
#https://github.com/bhaveshoswal/CNN-text-classification-keras/blob/master/model.py
# Hyper-parameters of the convolutional classifier.
filters = 250
kernel_size = 3
hidden_dims = 100

model5 = Sequential([
    # Shared pre-trained embedding layer, followed by light dropout.
    embedding_layer,
    Dropout(0.2),
    # 1-D convolution over the token axis: `filters` filters, each spanning
    # `kernel_size` consecutive tokens.
    Conv1D(filters=filters,
           kernel_size=kernel_size,
           padding='valid',
           activation='relu',
           strides=1),
    # Global max pooling over the sequence axis.
    GlobalMaxPooling1D(),
    # Hidden dense layer; dropout is applied before the ReLU activation.
    Dense(hidden_dims),
    Dropout(0.2),
    Activation('relu'),
    # Output layer: five units (one per question type) with softmax.
    Dense(5),
    Activation('softmax'),
])

model5.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tfa.metrics.FBetaScore(num_classes=5, average="macro", threshold=0.5),
    ],
)

model5.summary()

In [None]:
# Train the CNN classifier; `history` is overwritten with this run's curves.
history = model5.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=60,
    epochs=10,
    verbose=1,
    validation_data=(X_test_pad, y_test),
)


### Can the LSTM model predict the qtype of a question?



In [None]:
from pandas import *
import operator

def predict_(s, model=None):
    """Predict the question type of a single question string.

    The text goes through the same pipeline as the training data
    (``CleanTokenize`` -> ``tokenizer_obj`` -> ``pad_sequences``) before it is
    handed to the classifier.

    Args:
        s: Raw question text.
        model: Object exposing ``predict``; defaults to the notebook's
            ``model1`` (looked up at call time), so existing ``predict_(s)``
            calls behave as before. Pass ``model2`` ... ``model5`` to query
            another trained classifier.

    Returns:
        The qtype label with the highest predicted score.
    """
    if model is None:
        model = model1

    x_final = pd.DataFrame({"Question": [s]})
    test_lines = CleanTokenize(x_final)
    test_sequences = tokenizer_obj.texts_to_sequences(test_lines)
    test_review_pad = pad_sequences(test_sequences, maxlen=max_length, padding='post')

    # Flatten the prediction output into a plain list of per-class scores.
    scores = model.predict(test_review_pad).ravel().tolist()

    # Label order must match the integer codes assigned in `encode`.
    labels = ("genetic changes", "inheritance", "frequency", "information", "treatment")
    best_label, _ = max(zip(labels, scores), key=operator.itemgetter(1))
    return best_label

def qtype_encode(actual_qtype):
    """Translate an integer qtype code (0-4) into its textual label.

    Returns ``None`` when the value equals none of the known codes.
    """
    labels = (
        'genetic changes',
        'inheritance',
        'frequency',
        'information',
        'treatment',
    )
    # Compare with == (rather than a dict lookup) so unhashable or otherwise
    # unusual inputs fall through to None.
    for code, label in enumerate(labels):
        if actual_qtype == code:
            return label
    return None

In [None]:
# Spot-check up to 50 consecutive questions, starting at row `i`, against their
# stored labels.
# NOTE(review): rows are taken from the full frame, which also contains the
# rows the model was trained on, so this is not a held-out evaluation.
i = 4000
# Clamp the window so a frame with fewer than i + 50 rows does not raise
# IndexError from .iloc.
for k in range(i, min(i + 50, len(df))):
    ques = df['Question'].iloc[k]
    qtype = df['qtype'].iloc[k]
    predict_qtype = predict_(ques)
    actual_qtype = qtype_encode(qtype)
    print("Question : ",ques ,"\nActual type :",actual_qtype,"\nPredict type : ",predict_qtype,"\n\n" )

In [None]:
# Ad-hoc query with a question that is not taken from the frame.
predict_("What is (are) Coronavirous syndrome ?")    

In [None]:
# Ad-hoc query phrased as a "how many people" question.
predict_("How many people are affected by Coronavirous syndrome ?")

In [None]:
# Ad-hoc query phrased as an "is it inherited" question.
predict_("Is Coronavirous inherited ?")