In [1]:
import os
# Move the working directory up one level and display the result.
# NOTE(review): not idempotent — every re-run of this cell climbs one more directory.
os.chdir('..')
os.getcwd()

In [2]:
import pandas as pd
import numpy as np
import pickle
import re

In [3]:
# Absolute path of the directory above the current working directory; the
# dataset paths below are appended to it by plain string concatenation.
parent_dir = os.path.abspath('..')
# NOTE(review): despite its name, `test_file` holds the *training* split here;
# the variable is reassigned to several other files further down.
test_file = '/data/trec/train_5500.txt'

In [4]:
def get_dataframe(filename):
    """Parse a TREC-style question file into a DataFrame.

    Each non-blank line is split at its first space into a tag and the
    question text. Only the part of the tag before the first ``:`` is kept as
    the label, and every character outside a fixed whitelist is removed from
    the text.

    Args:
        filename: Path of the text file to read.

    Returns:
        pandas.DataFrame with a categorical ``label`` column and a string
        ``text`` column, one row per non-blank input line.
    """
    # Raw string so the regex escapes are not treated as (invalid) Python
    # string escapes.
    # NOTE(review): inside the character class the sequence  "-.  is a *range*
    # from '"' to '.', so # $ % & ( ) * are kept as well. Preserved unchanged
    # so the cleaned text stays the same; confirm whether that was intended.
    disallowed = re.compile(r'[^A-Za-z0-9 ,\?\'\"-._\+\!/\`@=;:]+')

    # Context manager so the file handle is closed even if reading fails.
    with open(filename, 'r') as handle:
        lines = handle.read().splitlines()

    data = []
    for line in lines:
        if not line.strip():
            # Blank lines carry no label; skipping them avoids an empty-string
            # category that later label-to-id lookups cannot map.
            continue
        # Everything before the first space is the tag, the rest is the question.
        tag, _, text = line.partition(' ')
        label = tag.split(":")[0]
        data.append([label, disallowed.sub('', text)])

    df = pd.DataFrame(data, columns=['label', 'text'])
    df['label'] = df['label'].astype('category')
    return df

# Parse the raw training split (`test_file` is '/data/trec/train_5500.txt' at this point).
df_train = get_dataframe(parent_dir+test_file)

In [5]:
# Preview the first parsed rows.
df_train.head()

Unnamed: 0,label,text
0,DESC,How did serfdom develop in and then leave Russ...
1,ENTY,What films featured the character Popeye Doyle ?
2,DESC,How can I find a list of celebrities ' real na...
3,ENTY,What fowl grabs the spotlight after the Chines...
4,ABBR,What is the full form of .com ?


In [6]:
# Integer id for each of the six label strings, assigned in alphabetical order.
tgt2idx = {'ABBR': 0, 'DESC': 1, 'ENTY': 2, 'HUM': 3, 'LOC': 4, 'NUM': 5}

In [7]:
# Translate every label string of the training frame into its integer id.
int_labels = [tgt2idx[name] for name in df_train['label']]

In [8]:
# Attach the integer ids as a new column (mutates df_train in place).
df_train['int_label'] = int_labels

In [9]:
# Preview again, now including the int_label column.
df_train.head()

Unnamed: 0,label,text,int_label
0,DESC,How did serfdom develop in and then leave Russ...,1
1,ENTY,What films featured the character Popeye Doyle ?,2
2,DESC,How can I find a list of celebrities ' real na...,1
3,ENTY,What fowl grabs the spotlight after the Chines...,2
4,ABBR,What is the full form of .com ?,0


In [10]:
# Write the labelled training frame to CSV, without the index column.
df_train.to_csv(parent_dir+'/data/trec/train.csv', index=False)

In [11]:
# Same pipeline for the test split: parse, map labels to ids, export to CSV.
test_file = '/data/trec/test_data.txt'
df_test = get_dataframe(parent_dir + test_file)

int_labels = [tgt2idx[name] for name in df_test['label']]
df_test['int_label'] = int_labels

df_test.to_csv(parent_dir + '/data/trec/test.csv', index=False)

## Train on embeddings

In [12]:
# Load the pickled pair (texts, feature matrix) for the training split.
# NOTE: pickle.load can execute arbitrary code — only use on trusted files.
test_file = '/data/trec/train.pkl'
with open(parent_dir + test_file, 'rb') as f:
    text_train, X_train = pickle.load(f)

In [13]:
# Sanity check: (rows, feature dimension) of the training matrix.
X_train.shape

(5452, 512)

In [14]:
# Load the pickled pair (texts, feature matrix) for the test split.
# NOTE: pickle.load can execute arbitrary code — only use on trusted files.
test_file = '/data/trec/test.pkl'
with open(parent_dir + test_file, 'rb') as f:
    text_test, X_test = pickle.load(f)

In [15]:
# Sanity check: (rows, feature dimension) of the test matrix.
X_test.shape

(500, 512)

In [16]:
# Integer targets as plain lists, taken from the frames built above.
# NOTE(review): assumes the rows of df_train/df_test are in the same order as
# the rows of X_train/X_test loaded from the pickles — confirm.
y_train = list(df_train['int_label'])
y_test = list(df_test['int_label'])

In [17]:
import torch.nn as nn


class LogisticRegression(nn.Module):
    """Two stacked linear layers mapping input features to class scores.

    No activation is applied between the layers, so the network as a whole is
    still an affine map of its input; ``fc1`` simply exposes an intermediate
    representation of width ``hidden_size``.

    Args:
        input_size: Number of input features per example.
        num_classes: Number of output scores per example.
        hidden_size: Width of the intermediate ``fc1`` output (default 50,
            the value that used to be hard-coded).
    """

    def __init__(self, input_size, num_classes, hidden_size=50):
        super(LogisticRegression, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return unnormalised class scores for ``x`` (no softmax applied)."""
        hidden = self.fc1(x)
        return self.fc2(hidden)

In [18]:
import torch

# Fixed seed so the weight initialisation (and the printed curve) is repeatable.
torch.manual_seed(0)

inputs = torch.from_numpy(X_train)
targets = torch.from_numpy(np.array(y_train))

# Sizes are derived from the data instead of repeating the literals 512 and 6.
model = LogisticRegression(X_train.shape[1], len(tgt2idx))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

# Full-batch training: every epoch is one optimiser step over all rows.
total = targets.size(0)
for epoch in range(1000):
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()

    # .item() replaces loss.data[0]: indexing a 0-dim tensor warns on
    # PyTorch 0.4 and raises on later releases.
    train_loss = loss.item()
    _, predicted = torch.max(outputs.data, 1)
    correct = predicted.eq(targets.data).sum().item()

    if epoch % 100 == 0:
        print("Epoch {}, Loss {:.3f}, Acc {}/{}".format(epoch, train_loss, correct, total))



Epoch 0, Loss 1.782, Acc 1447/5452
Epoch 100, Loss 0.625, Acc 4315/5452
Epoch 200, Loss 0.469, Acc 4535/5452
Epoch 300, Loss 0.417, Acc 4614/5452
Epoch 400, Loss 0.387, Acc 4691/5452
Epoch 500, Loss 0.370, Acc 4722/5452
Epoch 600, Loss 0.357, Acc 4739/5452
Epoch 700, Loss 0.346, Acc 4768/5452
Epoch 800, Loss 0.337, Acc 4786/5452
Epoch 900, Loss 0.329, Acc 4802/5452


In [19]:
# Score the test matrix and keep the index of the highest score per row.
test_inputs = torch.from_numpy(X_test)
test_logits = model(test_inputs)
_, y_pred = torch.max(test_logits.data, 1)

In [20]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted class id equals the reference id.
acc = accuracy_score(y_test, y_pred.numpy())
acc

0.9

In [21]:
# Layer whose activations should be captured.
layer = model.fc1

# Pre-allocated buffer the hook writes into: one 50-wide row per training example.
my_embedding = torch.zeros(X_train.shape[0], 50)


def copy_data(module, hook_inputs, hook_output):
    """Forward hook: overwrite `my_embedding` with the hooked layer's output."""
    my_embedding.copy_(hook_output.data)


# Register the hook; `h` is the handle needed to detach it again.
h = layer.register_forward_hook(copy_data)

In [22]:
# Run the training matrix through the model once so the fc1 hook fills `my_embedding`.
test_inputs = torch.from_numpy(X_train)
y_pred = model(test_inputs)

# Detach the hook right away: left attached, any later forward pass with a
# different number of rows (e.g. the test set) fails inside copy_() because
# the buffer shape no longer matches.
h.remove()

In [23]:
from sklearn.manifold import TSNE

# Take a copy: .numpy() on its own shares memory with `my_embedding`, so a
# later write into that buffer would silently change this array too.
X_emb_50 = my_embedding.numpy().copy()

# random_state pins the otherwise random initialisation so the 2-d map is repeatable.
X_emb = TSNE(n_components=2, n_iter=2000, verbose=True, perplexity=30,
             random_state=0).fit_transform(X_emb_50)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5452 samples in 0.008s...
[t-SNE] Computed neighbors for 5452 samples in 1.180s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5452
[t-SNE] Computed conditional probabilities for sample 2000 / 5452
[t-SNE] Computed conditional probabilities for sample 3000 / 5452
[t-SNE] Computed conditional probabilities for sample 4000 / 5452
[t-SNE] Computed conditional probabilities for sample 5000 / 5452
[t-SNE] Computed conditional probabilities for sample 5452 / 5452
[t-SNE] Mean sigma: 0.226624
[t-SNE] KL divergence after 250 iterations with early exaggeration: 80.071953
[t-SNE] KL divergence after 2000 iterations: 1.574402


In [24]:
import bokeh.plotting as bp
from bokeh.plotting import save, show
from bokeh.models import HoverTool
from bokeh.io import output_notebook

# Six hex colours held in an array so they can be indexed by integer class id
# (the plotting cells use `colormap[y_train]`).
colormap = np.array([
        "#1f77b4", "#ffbb78", "#d62728", "#c5b0d5",
        "#e377c2", "#9edae5"
    ])
# Send Bokeh output to the notebook.
output_notebook()

In [25]:
# Label strings of the training rows, used for the plot legend and hover text.
named_labels = list(df_train['label'])

In [26]:
# Interactive scatter of X_emb (at this point the 2-d t-SNE of the captured
# fc1 activations), coloured by class id.
# NOTE(review): the keyword names and legend handling below depend on the
# installed Bokeh version — verify before upgrading Bokeh.
title = 'Visualisation of the TREC Questions dataset'
plot_fig = bp.figure(plot_width=700, plot_height=500, title=title,
                     tools="pan, wheel_zoom, box_zoom, reset, hover, previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

# One entry per training row: hover text, coordinates, colour and legend label.
data_dict = {'content': text_train, 
             'x': X_emb[:, 0],
             'y': X_emb[:, 1],
             'color': colormap[y_train],
             'label': named_labels}

mySource = bp.ColumnDataSource(data_dict)

plot_fig.circle(x='x', y='y', color='color', legend='label', source=mySource)
# Detach the automatically created legend from the plot and re-add it as a
# layout element on the right-hand side.
plot_fig.legend.location = (0, 70)
new_legend = plot_fig.legend[0]
plot_fig.legend[0].plot = None
plot_fig.add_layout(new_legend, 'right')
plot_fig.legend.label_text_font_size = '7pt'

# hover tools: show the question text and its label for the point under the cursor
hover = plot_fig.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @label"}

show(plot_fig)

In [27]:
# Class balance: number of training rows per label.
df_train.groupby('label').count()

Unnamed: 0_level_0,text,int_label
label,Unnamed: 1_level_1,Unnamed: 2_level_1
ABBR,86,86
DESC,1162,1162
ENTY,1250,1250
HUM,1223,1223
LOC,835,835
NUM,896,896


In [28]:
# from bokeh.plotting import output_file, save
# output_file("trec.html")
# save(plot_fig)

## Bottleneck layer

In [31]:
class LogisticRegression1(nn.Module):
    """Three stacked linear layers with a narrow middle (bottleneck) layer.

    No activation is applied between the layers. The ``fc2`` output has
    ``bottleneck_size`` columns, which with the default of 2 can be plotted
    directly as x/y coordinates.

    Args:
        input_size: Number of input features per example.
        num_classes: Number of output scores per example.
        hidden_size: Width of the ``fc1`` output (default 50, the value that
            used to be hard-coded).
        bottleneck_size: Width of the ``fc2`` output (default 2, the value
            that used to be hard-coded).
    """

    def __init__(self, input_size, num_classes, hidden_size=50, bottleneck_size=2):
        super(LogisticRegression1, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, bottleneck_size)
        self.fc3 = nn.Linear(bottleneck_size, num_classes)

    def forward(self, x):
        """Return unnormalised class scores for ``x`` (no softmax applied)."""
        hidden = self.fc1(x)
        bottleneck = self.fc2(hidden)
        return self.fc3(bottleneck)

In [32]:
# Fixed seed so the weight initialisation (and the printed curve) is repeatable.
torch.manual_seed(0)

inputs = torch.from_numpy(X_train)
targets = torch.from_numpy(np.array(y_train))

# Sizes are derived from the data instead of repeating the literals 512 and 6.
model = LogisticRegression1(X_train.shape[1], len(tgt2idx))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

# Full-batch training: every epoch is one optimiser step over all rows.
total = targets.size(0)
for epoch in range(1000):
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()

    # .item() replaces loss.data[0]: indexing a 0-dim tensor warns on
    # PyTorch 0.4 and raises on later releases.
    train_loss = loss.item()
    _, predicted = torch.max(outputs.data, 1)
    correct = predicted.eq(targets.data).sum().item()

    if epoch % 100 == 0:
        print("Epoch {}, Loss {:.3f}, Acc {}/{}".format(epoch, train_loss, correct, total))

  from ipykernel import kernelapp as app


Epoch 0, Loss 1.947, Acc 896/5452
Epoch 100, Loss 0.816, Acc 3822/5452
Epoch 200, Loss 0.660, Acc 4193/5452
Epoch 300, Loss 0.617, Acc 4269/5452
Epoch 400, Loss 0.587, Acc 4336/5452
Epoch 500, Loss 0.562, Acc 4386/5452
Epoch 600, Loss 0.540, Acc 4430/5452
Epoch 700, Loss 0.571, Acc 4315/5452
Epoch 800, Loss 0.534, Acc 4400/5452
Epoch 900, Loss 0.512, Acc 4459/5452


In [33]:
# Score the test matrix, keep the index of the highest score per row, and
# report the fraction of rows predicted correctly.
test_inputs = torch.from_numpy(X_test)
test_logits = model(test_inputs)
_, y_pred = torch.max(test_logits.data, 1)

acc = accuracy_score(y_test, y_pred.numpy())
acc

0.842

In [34]:
# Layer whose activations should be captured: the 2-wide bottleneck.
layer = model.fc2

# Buffer the hook writes into: one 2-wide row per training example.
my_embedding = torch.zeros([X_train.shape[0], 2])


def copy_data(m, i, o):
    """Forward hook: overwrite `my_embedding` with the hooked layer's output `o`."""
    my_embedding.copy_(o.data)


# Attach the hook; `h` is the handle needed to detach it again.
h = layer.register_forward_hook(copy_data)

# One forward pass over the training matrix fills the buffer.
test_inputs = torch.from_numpy(X_train)
y_pred = model(test_inputs)

# Detach the hook once the buffer is filled. Otherwise re-running this cell
# stacks another hook on the same layer, and any forward pass with a different
# number of rows fails inside copy_() on a shape mismatch.
h.remove()

In [35]:
# NumPy view of the captured bottleneck activations, used directly as plot coordinates.
# NOTE(review): .numpy() shares memory with `my_embedding`; writing to that
# tensor again would change this array too.
X_emb_2 = my_embedding.numpy()

In [37]:
# Interactive scatter of X_emb_2 (the captured 2-d bottleneck activations),
# coloured by class id.
# NOTE(review): the keyword names and legend handling below depend on the
# installed Bokeh version — verify before upgrading Bokeh.
title = 'Visualisation of the TREC Questions dataset'
plot_fig = bp.figure(plot_width=700, plot_height=500, title=title,
                     tools="pan, wheel_zoom, box_zoom, reset, hover, previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

# One entry per training row: hover text, coordinates, colour and legend label.
data_dict = {'content': text_train, 
             'x': X_emb_2[:, 0],
             'y': X_emb_2[:, 1],
             'color': colormap[y_train],
             'label': named_labels}

mySource = bp.ColumnDataSource(data_dict)

plot_fig.circle(x='x', y='y', color='color', legend='label', source=mySource)
# Detach the automatically created legend from the plot and re-add it as a
# layout element on the right-hand side.
plot_fig.legend.location = (0, 70)
new_legend = plot_fig.legend[0]
plot_fig.legend[0].plot = None
plot_fig.add_layout(new_legend, 'right')
plot_fig.legend.label_text_font_size = '7pt'

# hover tools: show the question text and its label for the point under the cursor
hover = plot_fig.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @label"}

show(plot_fig)

## Directly from embeddings

In [38]:
class LogisticRegression2(nn.Module):
    """A single linear layer from input features straight to class scores."""

    def __init__(self, input_size, num_classes):
        super(LogisticRegression2, self).__init__()
        # The only trainable layer; registered under the attribute name `fc1`.
        self.fc1 = nn.Linear(in_features=input_size, out_features=num_classes)

    def forward(self, x):
        """Return unnormalised class scores for ``x`` (no softmax applied)."""
        return self.fc1(x)

In [39]:
# Fixed seed so the weight initialisation (and the printed curve) is repeatable.
torch.manual_seed(0)

inputs = torch.from_numpy(X_train)
targets = torch.from_numpy(np.array(y_train))

# Sizes are derived from the data instead of repeating the literals 512 and 6.
model = LogisticRegression2(X_train.shape[1], len(tgt2idx))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

# Full-batch training: every epoch is one optimiser step over all rows.
total = targets.size(0)
for epoch in range(1000):
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()

    # .item() replaces loss.data[0]: indexing a 0-dim tensor warns on
    # PyTorch 0.4 and raises on later releases.
    train_loss = loss.item()
    _, predicted = torch.max(outputs.data, 1)
    correct = predicted.eq(targets.data).sum().item()

    if epoch % 100 == 0:
        print("Epoch {}, Loss {:.3f}, Acc {}/{}".format(epoch, train_loss, correct, total))

  from ipykernel import kernelapp as app


Epoch 0, Loss 1.795, Acc 881/5452
Epoch 100, Loss 1.042, Acc 4091/5452
Epoch 200, Loss 0.832, Acc 4200/5452
Epoch 300, Loss 0.734, Acc 4268/5452
Epoch 400, Loss 0.675, Acc 4328/5452
Epoch 500, Loss 0.634, Acc 4361/5452
Epoch 600, Loss 0.604, Acc 4385/5452
Epoch 700, Loss 0.580, Acc 4413/5452
Epoch 800, Loss 0.561, Acc 4442/5452
Epoch 900, Loss 0.545, Acc 4456/5452


In [40]:
# Score the test matrix, keep the index of the highest score per row, and
# report the fraction of rows predicted correctly.
test_inputs = torch.from_numpy(X_test)
test_logits = model(test_inputs)
_, y_pred = torch.max(test_logits.data, 1)

acc = accuracy_score(y_test, y_pred.numpy())
acc

0.876

In [41]:
# t-SNE directly on the full-width training matrix (overwrites the earlier X_emb).
# random_state pins the otherwise random initialisation so the 2-d map is repeatable.
X_emb = TSNE(n_components=2, n_iter=2000, verbose=True, perplexity=30,
             random_state=0).fit_transform(X_train)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5452 samples in 0.145s...
[t-SNE] Computed neighbors for 5452 samples in 27.266s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5452
[t-SNE] Computed conditional probabilities for sample 2000 / 5452
[t-SNE] Computed conditional probabilities for sample 3000 / 5452
[t-SNE] Computed conditional probabilities for sample 4000 / 5452
[t-SNE] Computed conditional probabilities for sample 5000 / 5452
[t-SNE] Computed conditional probabilities for sample 5452 / 5452
[t-SNE] Mean sigma: 0.294704
[t-SNE] KL divergence after 250 iterations with early exaggeration: 88.892410
[t-SNE] KL divergence after 2000 iterations: 1.998787


In [42]:
# Interactive scatter of X_emb (at this point the 2-d t-SNE of X_train itself),
# coloured by class id.
# NOTE(review): the keyword names and legend handling below depend on the
# installed Bokeh version — verify before upgrading Bokeh.
title = 'Visualisation of the TREC Questions dataset'
plot_fig = bp.figure(plot_width=700, plot_height=500, title=title,
                     tools="pan, wheel_zoom, box_zoom, reset, hover, previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

# One entry per training row: hover text, coordinates, colour and legend label.
data_dict = {'content': text_train, 
             'x': X_emb[:, 0],
             'y': X_emb[:, 1],
             'color': colormap[y_train],
             'label': named_labels}

mySource = bp.ColumnDataSource(data_dict)

plot_fig.circle(x='x', y='y', color='color', legend='label', source=mySource)
# Detach the automatically created legend from the plot and re-add it as a
# layout element on the right-hand side.
plot_fig.legend.location = (0, 70)
new_legend = plot_fig.legend[0]
plot_fig.legend[0].plot = None
plot_fig.add_layout(new_legend, 'right')
plot_fig.legend.label_text_font_size = '7pt'

# hover tools: show the question text and its label for the point under the cursor
hover = plot_fig.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @label"}

show(plot_fig)

In [43]:
from bokeh.plotting import output_file, save
# Write the most recent figure to a standalone HTML file; the relative name is
# resolved against the current working directory.
output_file("trec_general.html")
save(plot_fig)

'/Users/signapoop/Desktop/fyp-graph-clustering/trec_general.html'