In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from cis700 import tokenizer

import torch
from torch.utils import data

In [None]:
# Code referenced from https://gist.github.com/gyglim/1f8dfb1b5c82627ae3efcfbbadb9f514
import tensorflow as tf
import numpy as np
import scipy.misc 
try:
    from StringIO import StringIO  # Python 2.7
except ImportError:
    from io import BytesIO         # Python 3.x


class Logger(object):
    """Small wrapper that writes TensorBoard summaries through a TF1-style FileWriter.

    Every ``*_summary`` method builds a ``tf.Summary`` proto, adds it to the
    writer and flushes, so events reach disk even if the writer is never closed.
    """

    def __init__(self, log_dir):
        """Create a summary writer logging to log_dir."""
        self.writer = tf.summary.FileWriter(log_dir)

    def scalar_summary(self, tag, value, step):
        """Log a scalar variable under ``tag`` at global step ``step``."""
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
        self.writer.add_summary(summary, step)
        self.writer.flush()

    def image_summary(self, tag, images, step):
        """Log a list of images; image ``i`` is tagged ``'<tag>/<i>'``.

        Each image is expected to expose ``shape[0]`` (height) and ``shape[1]`` (width).
        """

        img_summaries = []
        for i, img in enumerate(images):
            # Write the image to an in-memory buffer. On Python 3 the name
            # StringIO was never imported, so only that NameError is caught
            # (a bare except would also hide unrelated failures).
            try:
                s = StringIO()
            except NameError:
                s = BytesIO()
            # NOTE(review): scipy.misc.toimage is absent from recent SciPy releases;
            # this call needs an older SciPy (or a replacement encoder) - confirm.
            scipy.misc.toimage(img).save(s, format="png")

            # Create an Image object
            img_sum = tf.Summary.Image(encoded_image_string=s.getvalue(),
                                       height=img.shape[0],
                                       width=img.shape[1])
            # Create a Summary value
            img_summaries.append(tf.Summary.Value(tag='%s/%d' % (tag, i), image=img_sum))

        # Create and write Summary
        summary = tf.Summary(value=img_summaries)
        self.writer.add_summary(summary, step)
        self.writer.flush()

    def histo_summary(self, tag, values, step, bins=1000):
        """Log a histogram of ``values`` (any array-like) using ``bins`` buckets."""

        # Accept lists/tuples as well as ndarrays: .shape and ** below need an array.
        values = np.asarray(values)

        # Create a histogram using numpy
        counts, bin_edges = np.histogram(values, bins=bins)

        # Fill the fields of the histogram proto
        hist = tf.HistogramProto()
        hist.min = float(np.min(values))
        hist.max = float(np.max(values))
        hist.num = int(np.prod(values.shape))
        hist.sum = float(np.sum(values))
        hist.sum_squares = float(np.sum(values**2))

        # Drop the start of the first bin: bucket_limit holds only upper edges,
        # so it ends up the same length as the counts.
        bin_edges = bin_edges[1:]

        # Add bin edges and counts
        for edge in bin_edges:
            hist.bucket_limit.append(edge)
        for c in counts:
            hist.bucket.append(c)

        # Create and write Summary
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)])
        self.writer.add_summary(summary, step)
        self.writer.flush()

In [None]:

# define dataset
class ArticleDataset(data.Dataset):
    """Map-style PyTorch dataset that pairs each article with its label."""

    def __init__(self, articles, labels):
        """Keep references to two parallel, index-aligned sequences."""
        self.articles = articles
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the article sequence."""
        return len(self.articles)

    def __getitem__(self, index):
        """Return the ``(article, label)`` pair stored at position ``index``."""
        return self.articles[index], self.labels[index]

In [None]:
# define classifier
# Module-level tokenizer from the cis700 package; make_bow_vector reads it as a global.
tok = tokenizer.build_tokenizer()

class BoWClassifier(nn.Module):
    """Bag-of-words classifier: one linear layer followed by log-softmax."""

    def __init__(self,num_labels,vocab_size):
        """Build the single affine map from ``vocab_size`` counts to ``num_labels`` scores."""
        super(BoWClassifier, self).__init__()
        self.linear = nn.Linear(in_features=vocab_size, out_features=num_labels)

    def forward(self, bow_vec):
        """Return log-probabilities over labels, normalised along dim 1."""
        scores = self.linear(bow_vec)
        log_probs = F.log_softmax(scores, dim=1)
        return log_probs

def make_bow_vector(article, word_to_ix):
    """Convert raw article text into a bag-of-words count vector.

    The text is tokenized with the module-level ``tok``, mapped to ids and back
    to tokens, and every resulting token found in ``word_to_ix`` increments the
    matching slot. Tokens missing from ``word_to_ix`` are ignored.

    Returns a 1-D float tensor of length ``len(word_to_ix)``.
    """
    # The id round-trip presumably normalises tokens to the tokenizer's own
    # vocabulary - confirm against cis700.tokenizer.
    token_ids = tok.convert_tokens_to_ids(tok.tokenize(article))

    bow = torch.zeros(len(word_to_ix))
    for token in tok.convert_ids_to_tokens(token_ids):
        if token not in word_to_ix:
            continue
        bow[word_to_ix[token]] += 1
    return bow

def make_target(label, label_to_ix):
    """Look up ``label`` in ``label_to_ix`` and wrap its index in a 1-element LongTensor."""
    class_index = label_to_ix[label]
    target = torch.LongTensor([class_index])
    return target.view(-1)

In [None]:
def make_bow_vector(article, word_to_ix):
    """Build the bag-of-words count vector for one article.

    NOTE(review): this cell re-defines make_bow_vector, which the classifier
    cell above already defines with the same behaviour; this later definition
    is the one in effect after a top-to-bottom run, and one copy can be dropped.

    Returns a 1-D float tensor with one slot per entry of ``word_to_ix``; each
    slot counts how often that word occurs among the tokenizer's output tokens.
    """
    pieces = tok.tokenize(article)
    piece_ids = tok.convert_tokens_to_ids(pieces)
    round_tripped = tok.convert_ids_to_tokens(piece_ids)

    counts_vec = torch.zeros(len(word_to_ix))
    # Only tokens present in the vocabulary contribute a slot index.
    known_slots = (word_to_ix[w] for w in round_tripped if w in word_to_ix)
    for slot in known_slots:
        counts_vec[slot] += 1
    return counts_vec

In [None]:
############################# READ DATA ################################
# Read one coarse label and one article per line. The context managers close
# each file as soon as it has been read instead of leaking the handle.
with open("coarse_labels.txt", "r") as labels_file:
    coarse_labels = labels_file.readlines()
with open("content.txt", "r") as content_file:
    content = content_file.readlines()

In [None]:
######################## CREATE COARSE CATEGORY LABEL DICT ################################
# Read the comma-separated category listing; the context manager closes the file.
with open("supercatstats.txt", "r") as cats_file:
    possible_cats = cats_file.read().split(',')

# Map each distinct category name to a consecutive integer index, in order of
# first appearance. The name is the text between the first pair of single
# quotes in each piece; a piece without a quote raises IndexError.
# NOTE: the fine-label cell further down rebinds label_to_ix, so the coarse
# conversion cell must be run before it.
label_to_ix={}
for line in possible_cats:
    cat = line.split("'")[1]
    label_to_ix.setdefault(cat, len(label_to_ix))

In [None]:
############################## CREATE VOCAB DICT ###################################
# Build the vocabulary from the first 2000 lines of the counts file. The word
# is the second whitespace-separated field on each line; each distinct word
# gets the next free index.
word_to_ix = {}
with open("filtered_counts.txt", "r") as counts_file:
    # Context manager closes the file once the lines have been read.
    counts = counts_file.readlines()[:2000]
for line in counts:
    word = line.split()[1]
    word_to_ix.setdefault(word, len(word_to_ix))

In [None]:
############################## CONVERT CONTENT TO VECTORS ###################################
print("converting content..")
# One bag-of-words tensor per article line, with the trailing newline removed first.
content_vec = [make_bow_vector(article_line.strip('\n'), word_to_ix)
               for article_line in content]
print("done converting content..")

In [None]:
############################## CONVERT LABELS TO VECTORS ###################################
print("converting labels..")
# The dataset pairs content[i] with coarse_labels[i], so the two files must have
# the same number of lines. Fail early with a clear message instead of hitting
# an IndexError or leaving integer placeholders among the label tensors.
if len(coarse_labels) != len(content):
    raise ValueError("coarse_labels.txt has %d lines but content.txt has %d"
                     % (len(coarse_labels), len(content)))
# One 1-element LongTensor per label line, with the trailing newline removed first.
coarse_labels_vec = [make_target(label_line.strip('\n'), label_to_ix)
                     for label_line in coarse_labels]
print("done converting labels..")

In [None]:
################################ BUILD COARSE DATASETS ####################################

# Seed the global RNG so the random split below is repeatable.
torch.manual_seed(0)
dataset = ArticleDataset(content_vec, coarse_labels_vec)

# 80% train / 10% validation / remainder test.
n_samples = len(dataset)
train_size = int(0.8 * n_samples)
val_size = int(0.1 * n_samples)
test_size = n_samples - train_size - val_size

split_sizes = [train_size, val_size, test_size]
train_dataset, val_dataset, test_dataset = data.random_split(dataset, split_sizes)

# Train and validation batches are shuffled; test batches keep dataset order.
train_loader_coarse = data.DataLoader(train_dataset, shuffle=True, batch_size=100)
val_loader_coarse = data.DataLoader(val_dataset, shuffle=True, batch_size=100)
test_loader_coarse = data.DataLoader(test_dataset, batch_size=100)

In [None]:
############################## INITIALIZE MODEL ####################################
# Globals read by train_model at call time (it does not take them as arguments).
# Input width of the classifier: one feature per vocabulary entry.
VOCAB_SIZE = len(word_to_ix)
# Output width for the coarse task.
# NOTE(review): hard-coded; presumably must be larger than every index in the
# coarse label_to_ix - confirm against supercatstats.txt.
NUM_LABELS = 180

# Negative log-likelihood loss, applied to the log_softmax output of BoWClassifier.
loss_function = nn.NLLLoss()

In [None]:
import time, datetime
import torch.nn as nn

def _evaluate(model, loader):
    """Score ``model`` on every batch of ``loader`` without tracking gradients.

    Uses the module-level ``loss_function``. Returns ``(mean_loss, accuracy)``
    as Python floats computed over all samples, with accuracy in [0, 1]. Both
    are NaN when the loader yields no samples. The model's train/eval mode is
    restored before returning.
    """
    was_training = model.training
    model.eval()
    loss_sum = 0.0
    num_correct = 0
    total = 0
    with torch.no_grad():
        for articles, labels in loader:
            labels = labels.view(-1)
            outputs = model(articles)
            _, predicted = torch.max(outputs, 1)
            batch_size = articles.shape[0]
            # Assumes loss_function returns the batch mean (the nn.NLLLoss
            # default); re-weight by batch size so the result is per-sample.
            loss_sum += loss_function(outputs, labels).item() * batch_size
            num_correct += (predicted == labels).sum().item()
            total += batch_size
    model.train(was_training)
    if total == 0:
        return float('nan'), float('nan')
    return loss_sum / total, float(num_correct) / total


def train_model(ver="coarse",num_epochs=10, lr=0.1,train_loader=train_loader_coarse,val_loader=val_loader_coarse, test_loader=test_loader_coarse):
    """Train a BoWClassifier with SGD and log progress for TensorBoard.

    The model size comes from the module-level ``NUM_LABELS`` and ``VOCAB_SIZE``
    and the loss from the module-level ``loss_function``, all read at call time.
    The default loaders are the coarse loaders that exist when this cell is run.

    Args:
        ver: Tag embedded in the log directory names.
        num_epochs: Number of passes over ``train_loader``.
        lr: SGD learning rate.
        train_loader: Batches of ``(articles, labels)`` used for optimisation.
        val_loader: Batches evaluated on every 200th training step.
        test_loader: Batches evaluated at the end of every epoch.

    Returns:
        The trained model, left in training mode.
    """
    now = time.mktime(datetime.datetime.now().timetuple())
    logger = Logger('./logs/logreg_'+ver+'_'+str(now)+'/')
    logger_val = Logger('./logs/logreg_val_'+ver+'_'+str(now)+'/')

    model = BoWClassifier(NUM_LABELS, VOCAB_SIZE)
    optimizer = optim.SGD(model.parameters(), lr=lr)
    total_steps = len(train_loader) * num_epochs
    step = 0
    model.train()
    for epoch in range(num_epochs):
        print("********EPOCH "+str(epoch)+"********")
        for articles, labels in train_loader:
            labels = labels.view(-1)
            optimizer.zero_grad()
            outputs = model(articles)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()

            if (step + 1) % 200 == 0:
                # Training metrics are for the current batch only.
                _, argmax = torch.max(outputs, 1)
                accuracy = (labels == argmax).float().mean()
                print('Epoch: [% d/% d], Step: [% d/% d], Loss: %.4f, Accuracy: %4f'
                      % (epoch + 1, num_epochs, step, total_steps, loss.item(), accuracy.item()))

                to_log = {'loss': loss.item(), 'accuracy': accuracy.item()}
                for handle, val in to_log.items():
                    logger.scalar_summary(handle, val, step+1)

                # Validation metrics cover the whole validation set, not just
                # whichever batch happened to be evaluated last.
                val_loss, val_accuracy = _evaluate(model, val_loader)
                to_log = {'loss': val_loss, 'accuracy': val_accuracy}
                for handle, val in to_log.items():
                    print(handle, val)
                    logger_val.scalar_summary(handle, val, step+1)
                print(step)
                print('The validation accuracy is: %s%% [%s]' % (val_accuracy * 100,100))

            step +=1

        # NOTE(review): the test set is scored after every epoch; this is only
        # reported, nothing is selected on it in this function.
        _, test_accuracy = _evaluate(model, test_loader)
        print(step)
        print('The test accuracy is: %s%% [%s]' % (test_accuracy * 100,100))

    # The writers are never closed, so push any buffered events to disk.
    logger.writer.flush()
    logger_val.writer.flush()

    return model

In [None]:
# coarse - final model
# Uses train_model's default ver="coarse" and its default (coarse) loaders;
# NUM_LABELS, VOCAB_SIZE and loss_function are read from the globals as they
# stand when this cell runs.
model_c = train_model(num_epochs=30,lr=0.05)

In [None]:
############################# READ FINE LABELS DATA ################################
# Read one fine label per line; the context manager closes the file after reading.
with open("fine_labels.txt", "r") as fine_labels_file:
    fine_labels = fine_labels_file.readlines()

In [None]:
######################## CREATE FINE CATEGORY LABEL DICT ################################
# Read the comma-separated fine category listing; the context manager closes the file.
with open("catstats.txt", "r") as cats_file:
    possible_cats = cats_file.read().split(',')

# Rebinds label_to_ix (previously the coarse mapping) to the fine categories.
# The name is the text between the first pair of single quotes in each piece;
# each distinct name gets the next free index.
label_to_ix={}
for line in possible_cats:
    cat = line.split("'")[1]
    label_to_ix.setdefault(cat, len(label_to_ix))

############################## CONVERT LABELS TO VECTORS ###################################
print("converting labels..")
n_fine = len(fine_labels)
fine_labels_vec = []
for position, raw_label in enumerate(fine_labels):
    # One 1-element LongTensor per label line, trailing newline removed first.
    fine_labels_vec.append(make_target(raw_label.strip('\n'), label_to_ix))
    # Progress report every 5000 labels.
    if position % 5000 == 0:
        print(str(position)+"/"+str(n_fine))
print("done converting labels..")

In [None]:
################################ BUILD FINE DATASETS ####################################
# Re-seed before splitting, as the coarse dataset cell does, so the fine split
# does not depend on how much randomness the coarse training run consumed.
torch.manual_seed(0)
# Rebinds dataset and the *_dataset / *_size names used by the coarse cell.
dataset = ArticleDataset(content_vec, fine_labels_vec)

# 80% train / 10% validation / remainder test.
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset,val_dataset,test_dataset = data.random_split(dataset,[train_size, val_size, test_size])

# Train and validation batches are shuffled; test batches keep dataset order.
train_loader_fine= data.DataLoader(train_dataset, batch_size=100, shuffle=True)
val_loader_fine = data.DataLoader(val_dataset, batch_size=100, shuffle=True)
test_loader_fine = data.DataLoader(test_dataset, batch_size=100)

In [None]:
# Rebind the globals that train_model reads at call time, now for the fine task.
VOCAB_SIZE = len(word_to_ix)
# NOTE(review): hard-coded; presumably must be larger than every index in the
# fine label_to_ix - confirm against catstats.txt.
NUM_LABELS = 370

# Negative log-likelihood loss, applied to the log_softmax output of BoWClassifier.
loss_function = nn.NLLLoss()

In [None]:
# fine final model
# The fine loaders are passed explicitly because train_model's defaults are the
# coarse loaders; NUM_LABELS must already hold the fine value when this runs.
model_f = train_model(ver="fine",num_epochs=20,lr=0.05,train_loader=train_loader_fine,val_loader=val_loader_fine,test_loader=test_loader_fine)