### Import Functions

In [None]:
import sys
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install keras
!{sys.executable} -m pip install bs4
!{sys.executable} -m pip install sklearn
#to install Tensorflow: https://www.tensorflow.org/tutorials

In [None]:
#General Load and Save Functions
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
import glob
import time
import csv
import os

#Functions for loading and cleaning the Wikipedia Data
import bs4 as bs
import urllib.request
import re

#SKlearn packages for pre-processing the train and test data
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from keras.utils import np_utils

#Keras Packages for creating the deep learning model
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
    
print("Import Successful")

### Create Model Saving Directories

In [None]:
#Git cannot track empty folders, so the model-saving directories are created here.
#exist_ok=True makes re-running the cell harmless while still surfacing real
#failures (e.g. permission errors) that a bare "except: pass" would hide.
for model_dir in ('Baseline_Models', 'EntityPlus_Models'):
    os.makedirs(model_dir, exist_ok=True)

### Load Train and Test Data from CSV

In [None]:
def Load_Data(file):
    """Read the first two columns of a CSV file, skipping its header row.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A list of ``[first_column, second_column]`` string pairs, one per
        data row. Rows with fewer than two fields (for example blank lines)
        are skipped, and an empty or header-only file yields ``[]``.
    """
    # newline='' is what the csv module expects so that quoted fields
    # containing line breaks are parsed correctly.
    with open(file, newline='') as f:
        reader = csv.reader(f)
        # Discard the header; the None default keeps an empty file from raising.
        next(reader, None)
        return [[row[0], row[1]] for row in reader if len(row) >= 2]

def Load_Baseline(f1,f2):
    """Load the training CSV (f1) and the test CSV (f2) with Load_Data.

    Returns:
        A ``(train, test)`` tuple of the row lists produced by Load_Data.
    """
    # Tuple elements are evaluated left to right, so f1 is still read before f2.
    return Load_Data(f1), Load_Data(f2)

### Load in Wikipedia Data by Downloading and Scraping Article HTML

In [None]:
def Load_Wiki(wik):
    """Download a web page and return the lower-cased text of its paragraphs.

    Args:
        wik: URL of the page to fetch.

    Returns:
        The text of every ``<p>`` element concatenated in document order
        (no separator is inserted between paragraphs), converted to lower case.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for an unreachable or
        invalid URL; callers decide how to handle it.
    """
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(wik) as response:
        raw_html = response.read()

    article_html = bs.BeautifulSoup(raw_html, 'lxml')
    # join builds the text in one pass instead of repeated string concatenation.
    article_text = ''.join(para.text for para in article_html.find_all('p'))
    return article_text.lower()

def Load_Wiki_Train():
    """Download every article listed in Curated/TRAIN_WIKI.csv and pickle them.

    Each CSV row supplies a label (first column) and an article URL (second
    column). Successfully downloaded articles are stored as
    ``[label, article_text]`` pairs in the pickle file ``Curated/wiki_train``.
    A URL that cannot be loaded is reported and skipped so that one bad link
    does not abort the whole run.
    """
    w = Load_Data('Curated/TRAIN_WIKI.csv')
    wiki_train = []
    for line in w:
        cik = line[0]
        # Resolve the link outside the try block so the error report below
        # always refers to the row that actually failed.
        link = line[1].replace('"','')
        try:
            article = Load_Wiki(link)
        except Exception as exc:
            # "except Exception" lets Ctrl-C (KeyboardInterrupt) stop a long
            # scrape, which a bare "except:" would swallow.
            print(link, exc)
            continue
        wiki_train.append([cik,article])
    with open('Curated/wiki_train','wb') as out:
        pickle.dump(wiki_train, out)
    print(len(wiki_train), "Articles Loaded")
    
def Split_Wiki():
    """Split each cached article into period-delimited fragments.

    Reads the ``[label, article_text]`` pairs pickled in ``Curated/wiki_train``,
    splits every article on ``"."`` and writes the resulting
    ``[label, fragment]`` pairs to ``Curated/wiki_train_split``.

    Fragments that are empty or contain only whitespace (there is always one
    after an article's final period) are dropped: they carry no words, so they
    would only add label-only noise to the training set.
    """
    # NOTE: pickle must only be used on trusted files; this one is produced
    # locally by Load_Wiki_Train.
    with open('Curated/wiki_train','rb') as src:
        wiki_train = pickle.load(src)

    wiki_train_split = []
    for line in wiki_train:
        cik = line[0]
        for fragment in line[1].split("."):
            if fragment.strip():
                wiki_train_split.append([cik, fragment])

    with open('Curated/wiki_train_split','wb') as out:
        pickle.dump(wiki_train_split,out)
        
def Load_Wiki_Train_Test(f1,f2,f3):
    """Build the augmented training set and load the test set.

    Args:
        f1: Training CSV, read with Load_Data.
        f2: Test CSV, read with Load_Data.
        f3: Pickle file holding extra training rows.

    Returns:
        ``(train, test)`` where train is the rows of f1 followed by the
        rows unpickled from f3, and test is the rows of f2.
    """
    name_rows = Load_Data(f1)
    test_rows = Load_Data(f2)
    with open(f3,'rb') as handle:
        wiki_rows = pickle.load(handle)
    return name_rows + wiki_rows, test_rows

#Download every article listed in Curated/TRAIN_WIKI.csv and cache the result in Curated/wiki_train.
#NOTE(review): later cells load Curated/wiki_train_split, which is written by Split_Wiki();
#no call to Split_Wiki() is visible in this file - confirm it is run before those cells.
Load_Wiki_Train()

### Encode and Transform Data into Keras-Readable Vector Format

In [None]:
def Encode_Data(Load):
    """Turn (train, test) row lists into bag-of-words features and one-hot labels.

    Args:
        Load: A ``(train, test)`` pair; each element is a list of
            ``[label, text]`` rows.

    Returns:
        A tuple ``(X_train, tr_y, X_test, te_y, labels)``:
        X_train / X_test are the CountVectorizer outputs for the train / test
        texts; tr_y / te_y are the one-hot encoded labels; labels holds one
        ``[encoded_label, raw_label, feature_row, text]`` entry per test row.

    Raises:
        ValueError: from ``LabelEncoder.transform`` if the test set contains a
            label that never appears in the training set.
    """
    train, test = Load

    full_train = pd.DataFrame(train)
    full_test = pd.DataFrame(test)
    full = pd.DataFrame(train + test)

    sentences_full = full[1].values
    sentences_train = full_train[1].values
    sentences_test = full_test[1].values
    y_train = full_train[0].values
    y_test = full_test[0].values

    # The vocabulary is built from train and test text together, so both
    # feature matrices share the same columns.
    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_full)

    X_train = vectorizer.transform(sentences_train)
    X_test = vectorizer.transform(sentences_test)

    # Labels are learned from the training set only.
    encoder = LabelEncoder()
    encoder.fit(y_train)
    train_encoded_Y = encoder.transform(y_train)
    test_encoded_Y = encoder.transform(y_test)

    # Fix the one-hot width explicitly. Without num_classes, to_categorical
    # sizes the matrix from the largest label present, so a test set missing
    # the highest-numbered class would come out narrower than the model's
    # output layer and fail during evaluation.
    n_classes = len(encoder.classes_)
    tr_y = np_utils.to_categorical(train_encoded_Y, num_classes=n_classes)
    te_y = np_utils.to_categorical(test_encoded_Y, num_classes=n_classes)
    labels = [[w,x,y,z] for w,x,y,z in zip(test_encoded_Y,y_test,X_test,sentences_test)]

    return X_train, tr_y, X_test, te_y, labels

### Simple Model - has only one input layer and one softmax layer for predictions

In [None]:
def Run_Simple_Model(n,b,m,name):
    """Train, evaluate and save the simple two-layer classifier.

    Args:
        n: Number of training epochs.
        b: Training batch size.
        m: The ``(X_train, tr_y, X_test, te_y, labeled)`` tuple returned by
            Encode_Data.
        name: Prefix used for both the ``<name>_Models`` directory and the
            saved model's file name.
    """
    print("Starting Model with ",str(n),"epochs")
    start = time.time()
    X_train, tr_y, X_test, te_y, _unused = m

    n_features = X_train.get_shape()[1]
    n_classes = tr_y.shape[1]

    # One small ReLU layer followed by a softmax over the classes.
    model = Sequential([
        Dense(8, input_dim=n_features, activation='relu'),
        Dense(n_classes, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, tr_y, batch_size=b, epochs=n)
    score = model.evaluate(X_test, te_y, batch_size=1)

    # The start timestamp keeps repeated runs from overwriting each other.
    save_path = ''.join([name, '_Models//', name, '_Simple_n', str(n), '_b', str(b), '_t', str(start)])
    model.save(save_path)
    print("Score:",score)
    end = time.time()
    print("Model with ",str(n),"epochs took",(end-start),"seconds")

### Baseline - Simple Model

In [None]:
#Epoch counts to try; one simple model is trained per entry on the TRAIN_NAME/TEST_NAME CSVs.
nlist = [1000]
for n in nlist:
    Run_Simple_Model(
        n,    #epochs
        1,    #batch size
        Encode_Data(Load_Baseline('Curated/TRAIN_NAME.csv', 'Curated/TEST_NAME.csv')),
        'Baseline',
    )

### Entity Plus - Simple Model

In [None]:
#Train the simple model on the name data plus the split wiki fragments (160 epochs, batch size 16).
for n in range(160, 161):
    Run_Simple_Model(
        n,     #epochs
        16,    #batch size
        Encode_Data(
            Load_Wiki_Train_Test(
                'Curated/TRAIN_NAME.csv',
                'Curated/TEST_NAME.csv',
                'Curated/wiki_train_split',
            )
        ),
        'EntityPlus',
    )

### Advanced Model - includes input layer, dropout (randomization) layer, dense layer, and softmax layer for predictions

In [None]:
def Run_Advanced_Model(n,b,m,name):
    """Train, evaluate and save the classifier with one dropout layer.

    Args:
        n: Number of training epochs.
        b: Training batch size.
        m: The ``(X_train, tr_y, X_test, te_y, labeled)`` tuple returned by
            Encode_Data.
        name: Prefix used for both the ``<name>_Models`` directory and the
            saved model's file name.
    """
    print("Starting Model with ",str(n),"epochs")
    start = time.time()
    X_train, tr_y, X_test, te_y, _unused = m

    # Every layer is as wide as the number of classes.
    width = tr_y.shape[1]
    model = Sequential([
        Dense(width, input_dim=X_train.get_shape()[1], activation='relu'),
        Dropout(0.25),
        Dense(width, activation='relu'),
        Dense(width, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, tr_y, batch_size=b, epochs=n)
    score = model.evaluate(X_test, te_y, batch_size=1)

    # The start timestamp keeps repeated runs from overwriting each other.
    save_path = ''.join([name, '_Models//', name, '_Advanced_n', str(n), '_b', str(b), '_t', str(start)])
    model.save(save_path)
    print("Score:",score)
    end = time.time()
    print("Model with ",str(n),"epochs took",(end-start),"seconds")

### Baseline - Advanced Model

In [None]:
#Epoch counts to try; one advanced model is trained per entry on the TRAIN_NAME/TEST_NAME CSVs.
nlist = [30]
for n in nlist:
    Run_Advanced_Model(
        n,    #epochs
        1,    #batch size
        Encode_Data(Load_Baseline('Curated/TRAIN_NAME.csv', 'Curated/TEST_NAME.csv')),
        'Baseline',
    )

### Entity Plus - Advanced Model

In [None]:
#Train the advanced model on the name data plus the split wiki fragments (11 epochs, batch size 10).
for n in range(11, 12):
    Run_Advanced_Model(
        n,     #epochs
        10,    #batch size
        Encode_Data(
            Load_Wiki_Train_Test(
                'Curated/TRAIN_NAME.csv',
                'Curated/TEST_NAME.csv',
                'Curated/wiki_train_split',
            )
        ),
        'EntityPlus',
    )

### Many Layers Model - includes input layer, four dropout (randomization) + dense layer pairs, and softmax layer for predictions

In [None]:
def Run_Many_Layer_Model(n,b,m,name):
    """Train, evaluate and save the deep classifier with four dropout stages.

    Args:
        n: Number of training epochs.
        b: Training batch size.
        m: The ``(X_train, tr_y, X_test, te_y, labeled)`` tuple returned by
            Encode_Data.
        name: Prefix used for both the ``<name>_Models`` directory and the
            saved model's file name.
    """
    print("Starting Model with ",str(n),"epochs")
    start = time.time()
    X_train, tr_y, X_test, te_y, _unused = m

    # Every layer is as wide as the number of classes.
    width = tr_y.shape[1]
    model = Sequential()
    model.add(Dense(width, input_dim=X_train.get_shape()[1], activation='relu'))
    # Four identical Dropout -> Dense(relu) stages.
    for _stage in range(4):
        model.add(Dropout(0.25))
        model.add(Dense(width, activation='relu'))
    model.add(Dense(width, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, tr_y, batch_size=b, epochs=n)
    score = model.evaluate(X_test, te_y, batch_size=1)

    # The start timestamp keeps repeated runs from overwriting each other.
    save_path = ''.join([name, '_Models//', name, '_Many_Layers_n', str(n), '_b', str(b), '_t', str(start)])
    model.save(save_path)
    print("Score:",score)
    end = time.time()
    print("Model with ",str(n),"epochs took",(end-start),"seconds")

### Baseline - Many Layer Model

In [None]:
#Epoch counts to try; one many-layer model is trained per entry on the TRAIN_NAME/TEST_NAME CSVs.
nlist = [30]
for n in nlist:
    Run_Many_Layer_Model(
        n,    #epochs
        1,    #batch size
        Encode_Data(Load_Baseline('Curated/TRAIN_NAME.csv', 'Curated/TEST_NAME.csv')),
        'Baseline',
    )

### Entity Plus - Many Layer Model

In [None]:
#Train the many-layer model on the name data plus the split wiki fragments (11 epochs, batch size 10).
for n in range(11, 12):
    Run_Many_Layer_Model(
        n,     #epochs
        10,    #batch size
        Encode_Data(
            Load_Wiki_Train_Test(
                'Curated/TRAIN_NAME.csv',
                'Curated/TEST_NAME.csv',
                'Curated/wiki_train_split',
            )
        ),
        'EntityPlus',
    )

### Function to Examine the Models and Analyze what the model got wrong. For the project the analysis was done in Excel (Confusion_Analysis.xlsx)

In [None]:
def ExamineModel(f,m):
    """Evaluate a saved model and report every test row it misclassifies.

    Args:
        f: Path of the saved Keras model to load.
        m: The ``(X_train, tr_y, X_test, te_y, labeled)`` tuple returned by
            Encode_Data; ``labeled`` supplies, per test row, the encoded
            label at index 0 and the raw label at index 1.

    Returns:
        A list with one entry per misclassified test row:
        ``[raw_label, true_class, true_class_probability,
        predicted_class, predicted_class_probability]``.
        The same entries are printed as they are found.
    """
    print(f)
    model = keras.models.load_model(f)
    X_train, tr_y, X_test, te_y, labeled = m
    score = model.evaluate(X_test, te_y, batch_size=1)
    print(score)
    preds = model.predict(X_test,batch_size=1)

    mistakes = []
    for row,l in zip(preds,labeled):
        correct = l[0]
        pred = row.argmax()
        if correct == pred:
            continue
        # Record the probability given to the true class next to the winning
        # class, which shows how far off the model was.
        record = [l[1], correct, row[correct], pred, row.max()]
        print(record)
        mistakes.append(record)
    return mistakes
            
#Examine every saved model. A model that cannot be loaded or evaluated is
#reported and skipped so the remaining models are still examined.
for file in glob.glob('Baseline_Models/*'):
    try:
        ExamineModel(file,Encode_Data(Load_Baseline('Curated/TRAIN_NAME.csv','Curated/TEST_NAME.csv')))
    except Exception as exc:
        print("Could not examine", file, "-", exc)

for file in glob.glob('EntityPlus_Models/*'):
    try:
        ExamineModel(file,Encode_Data(Load_Wiki_Train_Test('Curated/TRAIN_NAME.csv','Curated/TEST_NAME.csv','Curated/wiki_train_split')))
    except Exception as exc:
        print("Could not examine", file, "-", exc)