In [1]:
import os
# Move the working directory up one level.
# NOTE(review): '..' is resolved against the *current* working directory, so
# re-running this cell climbs one more level each time it is executed.
os.chdir('..')
os.getcwd()

'/Users/signapoop/Desktop/fyp-graph-clustering'

In [6]:
import pandas as pd
import numpy as np
import pickle
import re

In [7]:
# Absolute path of the directory one level above the current working
# directory; the data paths below are built by concatenating onto it.
parent_dir = os.path.abspath('..')
# NOTE(review): despite its name this holds the TRAINING file path here; the
# same variable is re-bound to other paths in later cells.
test_file = '/data/trec/train_5500.txt'

In [8]:
def get_dataframe(filename):
    """Parse a labelled question file into a DataFrame.

    Every line is split at its first space: the leading token is the label
    field and the remainder is the question text. Only the part of the label
    field before the first ':' is kept as the label. Characters outside a
    fixed whitelist are stripped from the text.

    Args:
        filename: Path of the text file to read.

    Returns:
        A DataFrame with one row per line of the file and two columns:
        ``label`` (categorical dtype) and ``text`` (cleaned string).
    """
    # Runs of characters outside this whitelist are deleted from the text.
    # NOTE: inside the class, `"-.` is a character RANGE (ASCII 0x22-0x2E), so
    # " # $ % & ' ( ) * + , - . are all kept. This matches the original
    # pattern exactly; it is now a raw string so no invalid escapes remain.
    disallowed = re.compile(r"""[^A-Za-z0-9 ,?'"-._+!/`@=;:]+""")

    # The context manager closes the file handle even if reading fails.
    with open(filename, 'r') as handle:
        lines = handle.read().splitlines()

    data = []
    for line in lines:
        # partition(' ') yields the first token and everything after the
        # first space, which equals ' '.join(line.split(' ')[1:]).
        label_field, _, text = line.partition(' ')
        label = label_field.split(':')[0]
        data.append([label, disallowed.sub('', text)])

    df = pd.DataFrame(data, columns=['label', 'text'])
    df.label = df.label.astype('category')
    return df

# Plain string concatenation: test_file begins with '/', so it is appended
# directly onto parent_dir. At this point test_file holds the training path.
df_train = get_dataframe(parent_dir+test_file)

In [10]:
df_train.head()

Unnamed: 0,label,text
0,DESC,How did serfdom develop in and then leave Russ...
1,ENTY,What films featured the character Popeye Doyle ?
2,DESC,How can I find a list of celebrities ' real na...
3,ENTY,What fowl grabs the spotlight after the Chines...
4,ABBR,What is the full form of .com ?


In [11]:
# Class name -> integer class index, used to turn the `label` column into
# numeric targets.
tgt2idx = {
    'ABBR': 0,
    'DESC': 1,
    'ENTY': 2,
    'HUM': 3,
    'LOC': 4,
    'NUM': 5,
}

In [12]:
# Integer class index for every row's label, in the frame's row order.
int_labels = [tgt2idx[name] for name in df_train['label']]

In [13]:
# Adds the numeric target column to df_train in place (positional assignment
# from the list, so it must have one entry per row).
df_train['int_label'] = int_labels

In [14]:
df_train.head()

Unnamed: 0,label,text,int_label
0,DESC,How did serfdom develop in and then leave Russ...,1
1,ENTY,What films featured the character Popeye Doyle ?,2
2,DESC,How can I find a list of celebrities ' real na...,1
3,ENTY,What fowl grabs the spotlight after the Chines...,2
4,ABBR,What is the full form of .com ?,0


In [15]:
# Write the prepared training frame to CSV; index=False leaves out the row index.
df_train.to_csv(parent_dir+'/data/trec/train.csv', index=False)

In [16]:
# Same preparation as for the training split, applied to the test file:
# parse, attach integer targets, export to CSV.
test_file = '/data/trec/test_data.txt'
df_test = get_dataframe(parent_dir + test_file)

# Note: this re-binds the `int_labels` name previously used for the train split.
int_labels = [tgt2idx[name] for name in df_test['label']]
df_test['int_label'] = int_labels

df_test.to_csv(parent_dir + '/data/trec/test.csv', index=False)

## Train on embeddings

In [21]:
# Load the pickled training data; the payload unpacks into exactly two items.
# NOTE(review): pickle.load can execute arbitrary code — only load files that
# were produced by a trusted pipeline.
test_file = '/data/trec/train.pkl'  # name re-used: now the training pickle
with open(parent_dir + test_file, 'rb') as f:
    text_train, X_train = pickle.load(f)

In [22]:
X_train.shape

(5452, 512)

In [24]:
# Load the pickled test data; the payload unpacks into exactly two items.
# NOTE(review): pickle.load can execute arbitrary code — only load files that
# were produced by a trusted pipeline.
test_file = '/data/trec/test.pkl'
with open(parent_dir + test_file, 'rb') as f:
    text_test, X_test = pickle.load(f)

In [25]:
X_test.shape

(500, 512)

In [26]:
# Integer class targets as plain Python lists.
# NOTE(review): assumes the rows of df_train / df_test are in the same order
# as the rows of X_train / X_test loaded from the pickles — confirm.
y_train = list(df_train['int_label'])
y_test = list(df_test['int_label'])

In [27]:
import torch.nn as nn


class LogisticRegression(nn.Module):
    """Two stacked linear layers mapping input features to class scores.

    No activation is applied between the layers, so the model as a whole is
    still a linear map from inputs to scores.

    Args:
        input_size: Number of input features per sample.
        num_classes: Number of output scores per sample.
        hidden_size: Width of the intermediate ``fc1`` output. Defaults to
            50, the value that was previously hard-coded.
    """

    def __init__(self, input_size, num_classes, hidden_size=50):
        super(LogisticRegression, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return raw (unnormalised) class scores for ``x``."""
        out = self.fc1(x)
        out = self.fc2(out)
        return out

In [28]:
import torch

# Full-batch training: each epoch is one gradient step on all of X_train.
# Seed the generator so weight initialisation (and therefore the printed
# numbers) is the same on every run.
torch.manual_seed(0)

inputs = torch.from_numpy(X_train)
targets = torch.from_numpy(np.array(y_train))

# Layer sizes follow the data instead of repeating literals.
model = LogisticRegression(X_train.shape[1], len(tgt2idx))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

for epoch in range(1000):
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()

    # Metrics describe the forward pass made before this epoch's update.
    # .item() turns a 0-dim tensor into a Python number; indexing it
    # (loss.data[0]) raises IndexError on current PyTorch.
    train_loss = loss.item()
    _, predicted = torch.max(outputs.detach(), 1)
    total = targets.size(0)
    correct = predicted.eq(targets).sum().item()

    if epoch % 100 == 0:
        print("Epoch {}, Loss {:.3f}, Acc {}/{}".format(epoch, train_loss, correct, total))



Epoch 0, Loss 1.809, Acc 164/5452
Epoch 100, Loss 0.625, Acc 4320/5452
Epoch 200, Loss 0.471, Acc 4545/5452
Epoch 300, Loss 0.420, Acc 4603/5452
Epoch 400, Loss 0.389, Acc 4682/5452
Epoch 500, Loss 0.370, Acc 4716/5452
Epoch 600, Loss 0.357, Acc 4738/5452
Epoch 700, Loss 0.346, Acc 4770/5452
Epoch 800, Loss 0.337, Acc 4793/5452
Epoch 900, Loss 0.329, Acc 4805/5452


In [29]:
# Score the test embeddings and keep, per row, the index of the highest score.
test_inputs = torch.from_numpy(X_test)
# torch.max over dim 1 returns (values, indices); element [1] is the indices.
y_pred = torch.max(model(test_inputs).data, 1)[1]

In [31]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted class index equals the entry in y_test.
acc = accuracy_score(y_test, y_pred.numpy())
acc

0.9

In [34]:
import torch
# Layer whose output is captured as the learned embedding.
layer = model.fc1

# Buffer that receives a copy of the layer's output: one row per training
# sample, one column per output feature of the selected layer.
my_embedding = torch.zeros([X_train.shape[0], layer.out_features])

def copy_data(m, i, o):
    """Forward hook: copy the layer's output ``o`` into ``my_embedding``."""
    my_embedding.copy_(o.detach())

# Attach the hook to the selected layer. It fires on every forward pass
# through the model until ``h.remove()`` is called, and the buffer is sized
# for X_train, so a batch with a different number of rows will not fit.
h = layer.register_forward_hook(copy_data)

In [35]:
# One full-batch forward pass over X_train; the forward hook on fc1 fills
# my_embedding as a side effect.
# NOTE: `test_inputs` and `y_pred` are re-bound here to TRAINING inputs and
# training-set scores.
test_inputs = torch.from_numpy(X_train)
try:
    y_pred = model(test_inputs)
finally:
    # Detach the hook so later forward passes (e.g. on a batch of a different
    # size) do not try to copy into the X_train-sized buffer. Re-run the
    # hook-registration cell before re-running this one.
    h.remove()

In [36]:
# NumPy view of the captured embedding (shares memory with the tensor).
X_emb_50 = my_embedding.numpy()
from sklearn.manifold import TSNE
# t-SNE is stochastic; fixing random_state makes the 2-D layout repeatable.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version.
X_emb = TSNE(n_components=2, n_iter=2000, verbose=True, perplexity=30,
             random_state=0).fit_transform(X_emb_50)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5452 samples in 0.009s...
[t-SNE] Computed neighbors for 5452 samples in 1.301s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5452
[t-SNE] Computed conditional probabilities for sample 2000 / 5452
[t-SNE] Computed conditional probabilities for sample 3000 / 5452
[t-SNE] Computed conditional probabilities for sample 4000 / 5452
[t-SNE] Computed conditional probabilities for sample 5000 / 5452
[t-SNE] Computed conditional probabilities for sample 5452 / 5452
[t-SNE] Mean sigma: 0.226178
[t-SNE] KL divergence after 250 iterations with early exaggeration: 79.991257
[t-SNE] KL divergence after 2000 iterations: 1.572140


In [37]:
import bokeh.plotting as bp
from bokeh.plotting import save, show
from bokeh.models import HoverTool
from bokeh.io import output_notebook

# One hex colour per class; the array is indexed with the integer labels
# when the scatter plot is built.
colormap = np.array(
    ["#1f77b4", "#ffbb78", "#d62728", "#c5b0d5", "#e377c2", "#9edae5"]
)

# Route bokeh output to the notebook.
output_notebook()

In [40]:
# Class names per training row, used for the legend and hover text.
named_labels = df_train['label'].tolist()

In [44]:
# Interactive scatter plot of the 2-D t-SNE coordinates: one circle per
# training row, coloured by class, with the question text shown on hover.
# NOTE(review): `plot_width`/`plot_height`, the `previewsave` tool name and the
# `legend=` glyph keyword are older bokeh spellings; newer releases changed
# them — confirm against the installed bokeh version before re-running.
plot_fig = bp.figure(plot_width=700, plot_height=500,
                     tools="pan, wheel_zoom, box_zoom, reset, hover, previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

# Column data for the glyphs: every entry needs one value per plotted point.
data_dict = {'content': text_train, 
             'x': X_emb[:, 0],
             'y': X_emb[:, 1],
             'color': colormap[y_train],  # integer labels index into the colour array
             'label': named_labels}

mySource = bp.ColumnDataSource(data_dict)

plot_fig.circle(x='x', y='y', color='color', legend='label', source=mySource)
# Move the auto-created legend out of the plot area: grab it, detach it from
# the figure, then add it back as a layout on the right-hand side.
# The order of these statements matters.
plot_fig.legend.location = (0, 70)
new_legend = plot_fig.legend[0]
plot_fig.legend[0].plot = None
plot_fig.add_layout(new_legend, 'right')
plot_fig.legend.label_text_font_size = '7pt'

# hover tools
# "@content" and "@label" refer to the columns of the data source above.
hover = plot_fig.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @label"}

show(plot_fig)

In [46]:
df_train.groupby('label').count()

Unnamed: 0_level_0,text,int_label
label,Unnamed: 1_level_1,Unnamed: 2_level_1
ABBR,86,86
DESC,1162,1162
ENTY,1250,1250
HUM,1223,1223
LOC,835,835
NUM,896,896


In [47]:
from bokeh.embed import components
# Split the figure into a <script> string and a <div> string (shown in the
# following cells) for embedding in an HTML page.
plot_script, plot_div = components(plot_fig)


In [48]:
plot_div

'\n<div class="bk-root" id="d25e8f9a-33ca-4129-95b6-ac491e05c2c7"></div>'

In [49]:
plot_script

'\n<script type="text/javascript">\n  (function() {\n    var fn = function() {\n      Bokeh.safely(function() {\n        (function(root) {\n          function embed_document(root) {\n            \n          var docs_json = \'{"c5d6deef-e178-4c61-9947-df006c4e4a3a":{"roots":{"references":[{"attributes":{"data_source":{"id":"1a48fc95-751f-4297-ba8b-760a8bdcbec1","type":"ColumnDataSource"},"glyph":{"id":"4e8b2ec9-2eac-4a59-84e8-deca774e0278","type":"Circle"},"hover_glyph":null,"muted_glyph":null,"nonselection_glyph":{"id":"492c0495-062a-4825-9dc2-de49c7d8d055","type":"Circle"},"selection_glyph":null,"view":{"id":"2854995f-52c7-4d60-811d-1d08ca61b210","type":"CDSView"}},"id":"1ea8780b-1805-4f91-83c6-528687666576","type":"GlyphRenderer"},{"attributes":{},"id":"38e7589a-054a-4ebc-b600-df2603d53732","type":"LinearScale"},{"attributes":{},"id":"62a1cd26-5f64-4ff6-9429-afbcecd951d9","type":"LinearScale"},{"attributes":{"bottom_units":"screen","fill_alpha":{"value":0.5},"fill_color":{"value":"li