In [None]:
from gensim.models import word2vec
from keras.models import Sequential
from keras.layers import Conv2D,MaxPool2D,Dense,Flatten
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, AveragePooling2D
from keras import backend as K
import numpy as np

In [None]:
# Load two previously saved gensim Word2Vec models from disk. Further below,
# model_DOC_train is used to embed the training tokens and model_DOC_test the
# test tokens.
model_DOC_test = word2vec.Word2Vec.load('DOC-test_model')
model_DOC_train = word2vec.Word2Vec.load('DOC-train_model')

In [None]:
# Read the training corpus as a list of lines (line endings stripped). The
# context manager closes the file even if reading raises.
with open('train.txt', 'r') as train_file:
    train_data = train_file.read().splitlines()

In [None]:
# Read the test corpus as a list of lines (line endings stripped). The
# context manager closes the file even if reading raises.
with open('test.txt', 'r') as test_file:
    test_data = test_file.read().splitlines()

In [None]:
# Number of lines read from train.txt.
len(train_data)

In [None]:
def restructure_X(data):
    """Group "token label" lines into per-sentence token and label arrays.

    Every line is split on whitespace; the first field is taken as the token
    and the second as its label. A line equal to ". 0" terminates the current
    sentence and is itself not part of the output.

    Parameters
    ----------
    data : sequence of str
        Corpus lines, one "token label" pair per line.

    Returns
    -------
    X : list of numpy.ndarray
        One 1-D array of token strings per non-empty sentence.
    Y : list of numpy.ndarray
        One 1-D array of label strings per sentence, aligned with ``X``.
    """
    X = []
    Y = []
    sentence = []
    # One extra terminator at the end flushes a final sentence that is not
    # followed by ". 0" in the input (it used to be silently dropped).
    for line in list(data) + [". 0"]:
        if line != ". 0":
            fields = line.split()
            # Blank or label-less lines cannot form a (token, label) pair and
            # would otherwise make the sentence array ragged.
            if len(fields) >= 2:
                sentence.append(fields[:2])
        elif sentence:
            pairs = np.array(sentence)
            X.append(pairs[:, 0])
            Y.append(pairs[:, 1])
            sentence = []

    return X, Y

In [None]:
# Split the training lines into per-sentence token arrays and label arrays.
X_train, y_train= restructure_X(train_data)

In [None]:
# Split the test lines into per-sentence token arrays and label arrays.
X_test,y_test = restructure_X(test_data)

In [None]:
def get_vector(sentences,model):
    """Look up the embedding vector of every token in every sentence.

    Parameters
    ----------
    sentences : iterable of sequences of str
        One sequence of tokens per sentence.
    model : object
        Either an object exposing the token-to-vector mapping as ``model.wv``
        (as gensim Word2Vec models do) or the mapping itself; it is indexed
        with each token.

    Returns
    -------
    numpy.ndarray
        When every sentence yields a matrix of the same shape, a regular
        array stacking them. Otherwise a 1-D object array whose i-th element
        is the (n_tokens, dim) matrix of sentence i.
    """
    # Subscripting the Word2Vec model itself is deprecated/removed in newer
    # gensim releases; the vectors live on `.wv`. Fall back to `model` so a
    # plain mapping or KeyedVectors instance keeps working.
    vectors = getattr(model, 'wv', model)
    data_vec = [
        np.array([vectors[word] for word in sentence])
        for sentence in sentences
    ]

    if len({vec.shape for vec in data_vec}) <= 1:
        return np.array(data_vec)

    # Sentences differ in length: np.array() on such a ragged list raises on
    # current NumPy, so fill an object array element by element.
    ragged = np.empty(len(data_vec), dtype=object)
    for idx, vec in enumerate(data_vec):
        ragged[idx] = vec
    return ragged

In [None]:
# Embed every training token with the model loaded from 'DOC-train_model'.
X_train_vec = get_vector(X_train,model_DOC_train)

In [None]:
# Embed every test token with the model loaded from 'DOC-test_model'.
# NOTE(review): test tokens use a different Word2Vec model than the training
# tokens; vectors from separately trained models are generally not in a
# shared space — confirm this is intended.
X_test_vec = get_vector(X_test,model_DOC_test)

In [None]:
# add contextual information: one word before and one word after
# Thank Marshall Wice for teaching me this method
def pad_words(dataset_vec):
    """Build a (previous, current, next) context window for every word.

    For each word of each sentence a 3-row matrix is produced: row 0 is the
    vector of the preceding word, row 1 the word itself and row 2 the
    following word. At a sentence boundary the missing neighbour is a row of
    zeros. Windows of all sentences are returned in one flat array, in
    corpus order.

    Parameters
    ----------
    dataset_vec : sequence of array-like
        One (n_words, dim) matrix of word vectors per sentence. Sentences of
        length zero contribute nothing.

    Returns
    -------
    numpy.ndarray
        Float array of shape (total_words, 3, dim); an empty 1-D array when
        there are no words at all.
    """
    windows = []
    for sentence in dataset_vec:
        n_words = len(sentence)
        if n_words == 0:
            continue
        sentence = np.asarray(sentence)
        # Surround the sentence with one zero row on each side so every word,
        # including the first and last, has both neighbours available. The
        # embedding size is read from the data instead of being fixed at 50.
        padded = np.zeros((n_words + 2, sentence.shape[1]))
        padded[1:-1] = sentence
        # Three shifted views stacked on axis 1 -> (n_words, 3, dim).
        windows.append(
            np.stack((padded[:-2], padded[1:-1], padded[2:]), axis=1)
        )

    if not windows:
        return np.array([])
    # Concatenate directly; building an array of the ragged per-sentence
    # blocks first fails on current NumPy when sentence lengths differ.
    return np.concatenate(windows, axis=0)


In [None]:
# Build the per-word context windows for the training set. Note that this
# rebinds X_train, which previously held the per-sentence token arrays.
X_train = pad_words(X_train_vec)

In [None]:
# Build the per-word context windows for the test set. Note that this
# rebinds X_test, which previously held the per-sentence token arrays.
X_test = pad_words(X_test_vec)

In [None]:
def pad_labels(labels):
    """Flatten per-sentence label sequences into one 1-D array.

    Labels keep their corpus order (sentence by sentence, word by word), so
    the result lines up with a word-level feature array built in that order.
    """
    flat = [tag for sentence_tags in labels for tag in sentence_tags]
    return np.array(flat)

In [None]:
# Flatten the per-sentence training labels into one label per word.
pad_y_train = pad_labels(y_train)

In [None]:
# Flatten the per-sentence test labels into one label per word.
pad_y_test = pad_labels(y_test)

In [None]:
# Peek at the first flattened training label.
pad_y_train[0]

In [None]:
# Reshape each 3x50 context window (150 values) into a 10x15 single-channel
# grid, the input_shape declared for the first Conv2D layer in the CNN section.
# NOTE(review): rows of the 10x15 grid no longer correspond to individual
# words — confirm this layout is intended.
X_train = X_train.reshape(X_train.shape[0], 10, 15, 1)

In [None]:
# Same 10x15x1 reshape for the test windows, so they match the CNN input.
X_test = X_test.reshape(X_test.shape[0], 10, 15, 1)

In [None]:
# Cast the training labels to float32 and make them a single column (n, 1),
# the 2-D layout handed to OneHotEncoder below.
pad_y_train = pad_y_train.astype(np.float32).reshape(-1,1)

In [None]:
# Cast the test labels to float32 and make them a single column (n, 1).
pad_y_test = pad_y_test.astype(np.float32).reshape(-1,1)

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the training labels. toarray() yields a plain ndarray,
# whereas todense() returns the discouraged numpy.matrix type.
Onehot_y_train = OneHotEncoder().fit_transform(pad_y_train).toarray()

In [None]:
# Encode the test labels with an encoder fitted on the TRAINING labels so
# that each one-hot column stands for the same class in both matrices. An
# encoder fitted on the test labels alone yields a different column layout
# whenever the two label sets differ. Labels never seen in training are
# encoded as all-zero rows instead of raising.
Onehot_y_test = (
    OneHotEncoder(handle_unknown='ignore')
    .fit(pad_y_train)
    .transform(pad_y_test)
    .toarray()
)

In [None]:
# Sanity check: (number of test words, number of one-hot columns).
Onehot_y_test.shape

### CNN

In [None]:
# Small CNN over the 10x15x1 inputs: two 2x2 convolutions with ReLU, then a
# flattened dense head ending in an 8-way softmax.
model = Sequential([
    Conv2D(6, kernel_size=2, strides=1, padding='same', input_shape=(10, 15, 1)),
    Activation('relu'),
    Conv2D(16, kernel_size=2, strides=1, padding='valid'),
    Activation('relu'),
    Flatten(),
    # NOTE(review): no activation argument is passed to this layer — confirm
    # that a purely linear layer before the softmax is intended.
    Dense(104),
    Dense(8, activation='softmax'),
])

In [None]:
# Compile with the 'sgd' optimizer and categorical cross-entropy (the targets
# are one-hot rows); accuracy is tracked as a metric.
model.compile('sgd',loss='categorical_crossentropy',metrics=['accuracy'])

In [None]:
# Train for 40 epochs in batches of 64, evaluating on the test set after each
# epoch. validation_data is passed as the (inputs, targets) tuple that the
# Keras API documents; a list is not reliably unpacked the same way.
model.fit(X_train,Onehot_y_train,batch_size=64,epochs=40,validation_data=(X_test,Onehot_y_test))

In [None]:
# Softmax outputs of the trained model for every test window.
predictions_last_epoch = model.predict(X_test, batch_size=64,verbose=1)

In [None]:
# Display names for the 8 classes, passed to classification_report below.
# NOTE(review): the order presumably follows the sorted numeric label ids in
# the data files — verify against the dataset's label mapping.
target_names = ['O', 'I-ORG', 'I-MISC', 'I-PER', 'I-LOC', 
                    'B-MISC', 'B-LOC', 'B-ORG']

In [None]:
# Index of the highest-scoring output unit for each test window.
# NOTE(review): this is a column index of the softmax output; it equals the
# numeric label only if the labels are exactly 0..7 — confirm.
predicted_classes = np.argmax(predictions_last_epoch, axis=1)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision/recall/F1 of the CNN on the test set.
# NOTE(review): target_names has 8 entries; this presumably requires all 8
# classes to occur in the test labels or predictions — confirm, or pass
# `labels=` explicitly.
print(classification_report(pad_y_test, predicted_classes, target_names=target_names))

### Logistic regression

In [None]:
# Rebuild the training context windows from the embeddings for the logistic
# regression (X_train was reshaped for the CNN above).
X_for_lg_train = pad_words(X_train_vec)

In [None]:
# Flatten each 3x50 context window into one 150-feature row. The row count
# comes from the array itself, as in the matching test-set cell; the name
# `y_for_lg_train` is never defined in this notebook and raised NameError on
# a fresh kernel.
X_for_lg_train = X_for_lg_train.reshape(X_for_lg_train.shape[0],150)

In [None]:
# Sanity check: (number of training words, 150).
X_for_lg_train.shape

In [None]:
# Rebuild the test context windows from the embeddings for the logistic
# regression.
X_for_lg_test = pad_words(X_test_vec)

In [None]:
# Flatten each 3x50 context window into one 150-feature row.
X_for_lg_test = X_for_lg_test.reshape(X_for_lg_test.shape[0],150)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Multinomial logistic regression with L2 penalty, fitted with the 'sag'
# solver and a generous iteration cap.
# NOTE(review): `multi_class` is reportedly deprecated in recent scikit-learn
# releases — confirm against the installed version.
lg = LogisticRegression(max_iter=10000, verbose=1, penalty='l2', multi_class='multinomial',solver='sag')

In [None]:
# Fit on the flattened training windows against the flat per-word labels
# (the original string labels, not the float/one-hot versions used for the CNN).
lg.fit(X= X_for_lg_train,y=pad_labels(y_train))

In [None]:
# Predicted label for every test window.
pred = lg.predict(X_for_lg_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision/recall/F1 of the logistic regression on the test set,
# keyed by the raw label strings (no target_names are passed here).
print(classification_report(pad_labels(y_test),pred))