In [0]:
# Dissertation: a syntax highlighter for ordinary English text
# Last updated: 01/09/2019

# Before running, upload BBCdataset.zip and the corresponding weight file(s)
# (e.g. residual-GRU.h5).

In [0]:
!unzip BBCdataset.zip

In [0]:
from keras.models import Model, Input
from keras.layers import LSTM, Dense, Embedding, GRU, Concatenate
import numpy as np
import pandas as pd
import io

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import nltk
# Download the NLTK 'stopwords' corpus that stopwords.words() reads below.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words, kept as a set; used later to filter highlighted words.
stop_words = set(stopwords.words('english')) 

from os import listdir
from os.path import isfile, join
# Names of the regular files directly inside the dataset folder.
# NOTE(review): listdir() order is platform dependent, and the train/test split
# further down takes the *last* 24 loaded articles -- confirm the order matches
# the one used when the weights were trained.
onlyfiles = [f for f in listdir('BBCdataset') if isfile(join('BBCdataset', f))]

# Raw text of every article that decodes as UTF-8.
total_data = []
for i in onlyfiles:
    with io.open(join('BBCdataset', i), encoding='utf-8') as f:
        try:
            data = f.read()
        except UnicodeDecodeError:
            # Only decoding problems are skipped; any other error (e.g. an
            # interrupted run) is allowed to surface instead of being hidden.
            continue
    total_data.append(data)

In [0]:
# The last 24 loaded articles are held out for testing, the rest is training data.
train_data, test_data = total_data[:-24], total_data[-24:]

# Character vocabulary: every distinct character of the whole corpus, sorted.
chars = sorted(set(''.join(total_data)))

# Lookup tables in both directions: character -> index and index -> character.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

In [0]:
# Two models are provided: a 3-layer GRU with residual connections, and the
# same 3-layer residual GRU preceded by an embedding layer.
# Implementation of Appendix A
def residual_GRU():
    '''
    Build the 3-layer GRU character model that takes one-hot input.

    The input is a sequence of one-hot vectors of width len(chars). Three GRU
    layers (512, 256 and 128 units, all returning full sequences) are stacked;
    each layer is fed the concatenation of the previous layer's input-side
    tensor and its output. The last GRU is named 'target' so its activations
    can be retrieved by name. A softmax Dense layer over len(chars) produces
    the output.

    Returns:
        the (uncompiled) keras Model.
    '''
    one_hot_in = Input(shape=(None, len(chars)))

    gru_a = GRU(512, return_sequences=True)(one_hot_in)
    skip_a = Concatenate(axis=-1)([one_hot_in, gru_a])

    gru_b = GRU(256, return_sequences=True)(skip_a)
    skip_b = Concatenate(axis=-1)([gru_a, gru_b])

    gru_c = GRU(128, return_sequences=True, name='target')(skip_b)
    skip_c = Concatenate(axis=-1)([gru_b, gru_c])

    char_probs = Dense(len(chars), activation='softmax')(skip_c)

    return Model(one_hot_in, char_probs)

def embedding_residual_GRU():
    '''
    Build the 3-layer GRU character model that takes integer indices as input.

    The input is a sequence of character indices, mapped to 64-dimensional
    vectors by an Embedding layer. Three GRU layers (512, 256 and 128 units,
    all returning full sequences) follow, each fed the concatenation of the
    previous layer's input-side tensor and its output. The last GRU is named
    'target'. A softmax Dense layer over len(chars) produces the output.

    Returns:
        the (uncompiled) keras Model.
    '''
    input_char = Input(shape=(None,))
    # The embedding table is sized from the vocabulary instead of a fixed 88,
    # so it always agrees with char_indices and with the output layer below.
    embed = Embedding(len(chars), 64)(input_char)
    hidden1 = GRU(512, return_sequences=True)(embed)
    concat1 = Concatenate(axis=-1)([embed, hidden1])
    hidden2 = GRU(256, return_sequences=True)(concat1)
    concat2 = Concatenate(axis=-1)([hidden1, hidden2])
    hidden3 = GRU(128, return_sequences=True, name='target')(concat2)
    concat3 = Concatenate(axis=-1)([hidden2, hidden3])
    output = Dense(len(chars), activation='softmax')(concat3)
    model = Model(input_char, output)

    return model

def vectorize_1(test_para):
    '''
    One-hot encode a text.

    Returns an array of shape (1, len(test_para), len(chars)) in which, for
    every position, the entry at that character's index is 1 and all others
    are 0. A character missing from char_indices raises KeyError.
    '''
    n_steps = len(test_para)
    one_hot = np.zeros((1, n_steps, len(chars)))

    # Column to switch on at each time step.
    hot_columns = np.fromiter((char_indices[ch] for ch in test_para),
                              dtype=int, count=n_steps)
    one_hot[0, np.arange(n_steps), hot_columns] = 1.

    return one_hot

def vectorize_2(test_para):
    '''
    Index encode a text.

    Returns a plain list holding the char_indices value of every character,
    in order. A character missing from char_indices raises KeyError.
    '''
    encoded = []
    for symbol in test_para:
        encoded.append(char_indices[symbol])
    return encoded

def _hidden_state_frame(test_para, encoding):
    '''
    Run inter_model over one text and tabulate its outputs.

    Args:
        test_para: the text to feed through the model.
        encoding: 'non-embedding' (one-hot input via vectorize_1) or
            'embedding' (index input via vectorize_2).

    Returns:
        A DataFrame with one column per character ('ts0', 'ts1', ...). Row 0
        holds the characters; each following row holds one output unit of
        inter_model across all time steps.

    Raises:
        ValueError: if encoding is not one of the two supported values.
    '''
    if encoding == 'non-embedding':
        x_pred = vectorize_1(test_para)
    elif encoding == 'embedding':
        # Feed the indices as ONE sequence of shape (1, time). The original
        # passed the bare list and then had to reshape the prediction to
        # (1, -1, 128), which suggests every character was being run as its
        # own one-step sequence, i.e. without any recurrent context.
        x_pred = np.asarray(vectorize_2(test_para)).reshape((1, -1))
    else:
        raise ValueError(
            "encoding must be 'non-embedding' or 'embedding', got %r" % (encoding,))

    pred = inter_model.predict(x_pred)
    states = pred[0]                      # (time, units)

    # One row per unit; the unit count is read from the prediction rather than
    # assumed to be 128.
    result = [list(states[:, unit]) for unit in range(states.shape[1])]
    result.insert(0, list(test_para))

    df = pd.DataFrame(result)
    df.columns = ['ts' + str(i) for i in range(len(test_para))]
    return df


def choose_one_article(test_idx, encoding='non-embedding'):
    '''
    Compute the inter_model outputs for a single test article.

    Args:
        test_idx: position of the article inside test_data.
        encoding: 'non-embedding' or 'embedding', matching the loaded model.

    Returns:
        (test_para, df): the article text and the frame described in
        _hidden_state_frame.
    '''
    test_para = test_data[test_idx]
    return test_para, _hidden_state_frame(test_para, encoding)


def choose_all_article(encoding='non-embedding'):
    '''
    Compute the inter_model outputs for all test articles at once.

    Every article of test_data is followed by a newline and the pieces are
    concatenated into one text, which is then processed as a single sequence.

    Args:
        encoding: 'non-embedding' or 'embedding', matching the loaded model.

    Returns:
        (test_para, df): the concatenated text and the frame described in
        _hidden_state_frame.
    '''
    test_para = ''.join(each + '\n' for each in test_data)
    return test_para, _hidden_state_frame(test_para, encoding)
        
def pca_transform(df, proportion):
    '''
    Reduce the per-character feature rows of df with PCA.

    Args:
        df: frame whose row 0 holds the characters and whose remaining rows
            hold one feature each (one column per character).
        proportion: forwarded to PCA as n_components.

    Returns:
        (frame, weights): frame has the characters in row 0 followed by one
        row per retained principal component; weights is the PCA's
        explained_variance_ratio_.
    '''
    # Take the characters from df itself. The original read the global
    # `test_para`, which only works if that global happens to belong to df.
    symbols = list(df.iloc[0, :])

    X = df.T.iloc[:,1:]
    pca = PCA(n_components=proportion)
    X_reduced = pca.fit_transform(X)
    weights = pca.explained_variance_ratio_

    PCAoutputs = [symbols]
    for i in range(X_reduced.shape[1]):
        PCAoutputs.append(X_reduced[:,i])

    return pd.DataFrame(PCAoutputs), weights


In [0]:
# Example setup: the 3-layer residual GRU without embedding.
model = residual_GRU()
model.load_weights('residual-GRU.h5')

# Second model sharing the same input, but ending at the layer named 'target',
# so predict() returns that layer's activations.
layer_name = 'target'
target_layer = model.get_layer(layer_name)
inter_model = Model(inputs=model.input, outputs=target_layer.output)

In [0]:
# Work on a single test article: collect its 'target' activations, then keep
# as many principal components as PCA(n_components=0.6) retains.
test_idx = 6
test_para, df = choose_one_article(test_idx=test_idx, encoding='non-embedding')
df2, weights = pca_transform(df=df, proportion=0.6)

# To test model performance on hidden state units across all test articles,
# use the two lines below instead:
#test_para, df = choose_all_article(encoding='non-embedding')
#df2, weights =  pca_transform(df, 0.6)

In [0]:
# Characters counted as part of a word: the sorted vocabulary minus its first
# two entries (presumably newline and space, which sort lowest -- TODO confirm
# against the actual contents of `chars`).
alpha = chars[2:]

def calculate_mean(df, letters=None):
    '''
    Replace every character's feature vector by the mean over its word.

    A word is a maximal run of consecutive columns whose character (row 0) is
    in `letters`. For each word, all of its columns receive the average of the
    word's feature vectors (rows 1 onwards). Columns whose character is not in
    `letters` are left unchanged.

    Unlike the first version, a word that runs up to the last column is
    averaged too, and the feature count is taken from df instead of the global
    `weights`.

    Args:
        df: frame with the characters in row 0 and numeric features below.
            It is modified in place.
        letters: characters that belong to words; defaults to the global
            `alpha`.

    Returns:
        The same df object, after modification.
    '''
    if letters is None:
        letters = alpha
    letters = set(letters)

    symbols = list(df.iloc[0, :])
    # Explicit float copy: the columns are object dtype because row 0 holds
    # strings, and arithmetic on object Series is fragile.
    values = np.array(df.iloc[1:, :], dtype=float)

    def _average_word(start, stop):
        # Overwrite columns start..stop-1 with their common mean vector.
        values[:, start:stop] = values[:, start:stop].mean(axis=1, keepdims=True)

    word_start = None
    for pos, symbol in enumerate(symbols):
        if symbol in letters:
            if word_start is None:
                word_start = pos
        elif word_start is not None:
            _average_word(word_start, pos)
            word_start = None
    # A word that reaches the end of the text has no separator after it.
    if word_start is not None:
        _average_word(word_start, len(symbols))

    df.iloc[1:, :] = values
    return df

In [0]:
# kmeans
# cluster number is derived using the Algorithm One in the dissertation
# NOTE: calculate_mean modifies df2 in place, so df3 and df2 are the same object.
df3 = calculate_mean(df2)
# One row per character, one column per PCA feature (the character column is dropped).
X = df3.T.iloc[:,1:]
clu_num = 6
# A fixed random_state makes the cluster assignment reproducible across runs.
kmeans = KMeans(n_clusters=clu_num, n_init=500, random_state=0).fit(X)

In [0]:
# Row 0: the characters of the article; row 1: the cluster label of each one.
df4 = pd.DataFrame([list(df3.iloc[0,:]),
                    kmeans.labels_])

symbols = [str(s) for s in df4.iloc[0, :]]
labels = [int(l) for l in df4.iloc[1, :]]

# Cut the text into maximal runs of consecutive characters that share a label
# and store every run under its label.
kmeans_res = [[] for i in range(clu_num)]
char = ''
last = len(symbols) - 1
for i in range(len(symbols)):
    char += symbols[i]
    # A run ends where the label changes -- and at the final character, which
    # the original loop never reached, silently dropping the last run.
    if i == last or labels[i] != labels[i + 1]:
        kmeans_res[labels[i]].append(char)
        char = ''

# visualize words in all clusters
for i in range(clu_num):
    print(sorted(set(kmeans_res[i])))

In [0]:
# Distinct runs per cluster.
kmeans_res_set = []
for i in range(len(kmeans_res)):
    res = list(set(kmeans_res[i]))
    kmeans_res_set.append(res)

# do not visualize the cluster with the most words (common words)
# (every cluster that ties for the largest size is blanked)
maxlen = max([len(i) for i in kmeans_res_set])

new_one = []
for i in kmeans_res_set:
    if len(i) == maxlen:
        new_one.append([''])
    else:
        new_one.append(i)

# remove stop words
# The surrounding whitespace is stripped BEFORE the stop-word lookup; checking
# the raw run let entries such as ' the' slip through. Runs that are empty
# after stripping are dropped, since they can never match a word.
new_k_res = [[] for i in range(clu_num)]
for i in range(len(new_one)):
    for word in new_one[i]:
        stripped = word.strip()
        if stripped and stripped.lower() not in stop_words:
            new_k_res[i].append(stripped)

In [0]:
import html

# visualize the test article
# Paragraphs are separated by blank lines; each paragraph is split on whitespace.
paras = test_para.strip().split('\n\n')
word_list = [para.split() for para in paras]

# One colour per cluster, for however many clusters there are. All clusters
# are currently drawn in red; for distinct colours use e.g.
# ['Tomato', 'Orange', 'DodgerBlue', 'MediumSeaGreen', 'SlateBlue', 'SaddleBrown'].
cluster_colors = ['red'] * len(new_k_res)

# word -> colour; when a word occurs in several clusters the first one wins.
word_color = {}
for cluster_color, cluster_words in zip(cluster_colors, new_k_res):
    for cluster_word in cluster_words:
        word_color.setdefault(cluster_word, cluster_color)

total = []
for words in word_list:
    formats = ''
    for word in words:
        color = word_color.get(word, 'black')
        # Escape the word so that '<', '>' or '&' in the article cannot break
        # the generated markup.
        style = ('<span style="word-break:break-all; color:' + color + '">'
                 + html.escape(word) + '</span>&nbsp;')
        formats = formats + style
    total.append(formats)

# The articles were read as UTF-8, so write them back the same way instead of
# relying on the platform's default encoding.
with open('test.txt', 'w', encoding='utf-8') as f:
    for i in total:
        style = '<p>'+i+'</p>'
        f.write(style)

In [0]:
#####################################
# PCA and t-SNE visualization
#####################################

In [0]:
# Visualization on PCA features
# Row 0: cluster label of every character; rows 1..n: the n PCA feature rows of df3.
list1 = [list(kmeans.labels_)]
list1.extend(list(df3.iloc[row, :]) for row in range(1, len(weights) + 1))

# Transpose so that every character becomes one row.
df_vis = pd.DataFrame(list1).T

# Column 0 becomes 'target', column k becomes 'd<k>'.
dict1 = {0: 'target'}
dict1.update({k: 'd' + str(k) for k in range(1, len(weights) + 1)})
df_vis = df_vis.rename(columns=dict1)

# Labels as strings so they are treated as categories when plotting.
df_vis['target'] = df_vis['target'].apply(str)

# Feature columns (everything except 'target') and their raw values.
feat_cols = list(df_vis.columns[1:])
data_subset = df_vis[feat_cols].values

# Working copy that receives the projection columns.
df_copy = df_vis.copy()

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Project the feature columns onto two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df_vis[feat_cols].values)
df_copy['pca-one'], df_copy['pca-two'] = pca_result[:, 0], pca_result[:, 1]

# Scatter plot of the projection, coloured by cluster label.
plt.figure(figsize=(8, 8))
pca_plot_options = dict(
    x="pca-one",
    y="pca-two",
    hue="target",
    palette=sns.color_palette("hls", clu_num),
    data=df_copy,
    legend="full",
    alpha=0.3,
)
sns.scatterplot(**pca_plot_options)

plt.savefig('PCA_visualization')

In [0]:
import time
from sklearn.manifold import TSNE

# Two-dimensional t-SNE of the feature values, timed.
time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(data_subset)
elapsed = time.time() - time_start
print('t-SNE done! Time elapsed: {} seconds'.format(elapsed))

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Attach the two t-SNE coordinates to the plotting frame.
df_copy['tsne-2d-one'], df_copy['tsne-2d-two'] = tsne_results[:, 0], tsne_results[:, 1]

# Scatter plot of the embedding, coloured by cluster label.
# NOTE(review): `size` is seaborn's grouping variable; a constant marker size
# is normally given as `s=` -- confirm what was intended here.
plt.figure(figsize=(8, 8))
tsne_plot_options = dict(
    x="tsne-2d-one",
    y="tsne-2d-two",
    hue="target",
    palette=sns.color_palette("hls", clu_num),
    data=df_copy,
    size=0.5,
    legend="full",
    alpha=1,
)
sns.scatterplot(**tsne_plot_options)

plt.savefig('tsne_visualization')

In [0]:
#############################################
# Visualization on hidden state units
#############################################

In [0]:
# Characters counted as part of a word: the sorted vocabulary minus its first
# two entries (presumably newline and space, which sort lowest -- TODO confirm
# against the actual contents of `chars`).
alpha = chars[2:]

def calculate_mean1(df, letters=None):
    '''
    Replace every character's hidden-state vector by the mean over its word.

    A word is a maximal run of consecutive columns whose character (row 0) is
    in `letters`. For each word, all of its columns receive the average of the
    word's vectors (rows 1 onwards). Columns whose character is not in
    `letters` are left unchanged.

    Unlike the first version, a word that runs up to the last column is
    averaged too, and the number of rows is taken from df instead of being
    fixed at 128.

    Args:
        df: frame with the characters in row 0 and numeric values below.
            It is modified in place.
        letters: characters that belong to words; defaults to the global
            `alpha`.

    Returns:
        The same df object, after modification.
    '''
    if letters is None:
        letters = alpha
    letters = set(letters)

    symbols = list(df.iloc[0, :])
    # Explicit float copy: the columns are object dtype because row 0 holds
    # strings, and arithmetic on object Series is fragile.
    values = np.array(df.iloc[1:, :], dtype=float)

    def _average_word(start, stop):
        # Overwrite columns start..stop-1 with their common mean vector.
        values[:, start:stop] = values[:, start:stop].mean(axis=1, keepdims=True)

    word_start = None
    for pos, symbol in enumerate(symbols):
        if symbol in letters:
            if word_start is None:
                word_start = pos
        elif word_start is not None:
            _average_word(word_start, pos)
            word_start = None
    # A word that reaches the end of the text has no separator after it.
    if word_start is not None:
        _average_word(word_start, len(symbols))

    df.iloc[1:, :] = values
    return df

In [0]:
# pca and tsne on the raw hidden-state units
# NOTE: calculate_mean1 modifies df in place and df3 is rebound to that object.
df3 = calculate_mean1(df)

# Row 0: cluster label of every character (from the earlier k-means fit);
# rows 1..128: the 128 hidden-unit rows of df3.
list1 = [list(kmeans.labels_)]
list1.extend(list(df3.iloc[row, :]) for row in range(1, 129))

# Transpose so that every character becomes one row.
df_vis = pd.DataFrame(list1).T

# Column 0 becomes 'target', column k becomes 'd<k>'.
dict1 = {0: 'target'}
dict1.update({k: 'd' + str(k) for k in range(1, 129)})
df_vis = df_vis.rename(columns=dict1)

# Labels as strings so they are treated as categories when plotting.
df_vis['target'] = df_vis['target'].apply(str)

# Feature columns (everything except 'target') and their raw values.
feat_cols = list(df_vis.columns[1:])
data_subset = df_vis[feat_cols].values

# Working copy that receives the projection columns.
df_copy = df_vis.copy()

In [0]:
# Project the hidden-unit columns onto two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df_copy[feat_cols].values)
df_copy['pca-one'], df_copy['pca-two'] = pca_result[:, 0], pca_result[:, 1]

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter plot of the two principal components, coloured by cluster label.
plt.figure(figsize=(8, 8))
pca_plot_options = dict(
    x="pca-one",
    y="pca-two",
    hue="target",
    palette=sns.color_palette("hls", clu_num),
    data=df_copy,
    legend="full",
    alpha=0.3,
)
sns.scatterplot(**pca_plot_options)
plt.savefig('PCA_hidden unit')

In [0]:
import time
from sklearn.manifold import TSNE

# Two-dimensional t-SNE of the hidden-unit values, timed.
time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(data_subset)
elapsed = time.time() - time_start
print('t-SNE done! Time elapsed: {} seconds'.format(elapsed))

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Attach the two t-SNE coordinates to the plotting frame.
df_copy['tsne-2d-one'], df_copy['tsne-2d-two'] = tsne_results[:, 0], tsne_results[:, 1]

# Scatter plot of the embedding, coloured by cluster label.
# NOTE(review): `size` is seaborn's grouping variable; a constant marker size
# is normally given as `s=` -- confirm what was intended here.
plt.figure(figsize=(8, 8))
tsne_plot_options = dict(
    x="tsne-2d-one",
    y="tsne-2d-two",
    hue="target",
    palette=sns.color_palette("hls", clu_num),
    data=df_copy,
    legend="full",
    alpha=0.3,
    size=0.3,
)
sns.scatterplot(**tsne_plot_options)

plt.savefig('t-SNE_hidden unit')