# Language Classification

## Overview

My aim here is to build a language classifier for EU languages.

Proposed Approach:
1. Inspect test set
1. Create dataset for training / validation
1. Train / valid split
1. Numericalize
1. Create embeddings
1. Build language classification model

## Setup

In [None]:
#from os import path
#from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
#platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

#accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

#!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.0-{platform}-linux_x86_64.whl torchvision

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm

import dill

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from collections import Counter, defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

import utils

In [None]:
np.random.seed(1)

## Settings

In [None]:
# --- Filesystem layout ---
PATH = Path('data')  # Root directory for all data and temporary files
TRAIN = PATH/'train'  # Directory holding the training text (one entry per language)
TEST_FN = PATH/'test'  # Filename of the test text
PATH_TMP = PATH/'tmp'  # Temporary directory used to save progress

# --- Vocabulary ---
MIN_FREQ = 30  # Words with a lower frequency are replaced by the unknown token

# --- Batch sizes ---
BS = 64  # Batch size for the RNN classifier
SKIP_BS = 512  # Batch size for skip-gram training. Can be high.

# --- Model dimensions ---
EMB_SZ = 200  # Dimension of the word embeddings
HIDDEN_SZ = 100  # Size of the GRU hidden state in the classifier

# List of languages: the names of the entries found in TRAIN.
# Path.iterdir() yields entries in arbitrary order, so the names are sorted to
# keep the language -> index mapping identical from run to run. Hidden entries
# (e.g. '.ipynb_checkpoints', '.DS_Store') are not languages and are skipped.
LANGS = sorted(p.name for p in TRAIN.iterdir() if not p.name.startswith('.'))

assert torch.cuda.is_available()  # Notebook is written for GPU computations.

In [None]:
PATH_TMP.mkdir(parents=True, exist_ok=True)

## Clarify Goal

Let's first have a look at the test set we are trying to predict. It looks like a simple text classification task.

In [None]:
# Read the tab-separated test file (it has no header row) ...
test = pd.read_csv(TEST_FN, header=None, sep='\t', lineterminator='\n')
# ... and give the first two columns meaningful names.
test = test.rename(columns={0: 'label', 1: 'text'})
# Show the first few rows labelled as English.
test.loc[test['label'] == 'en'].head()

Before going any further, let's apply some preprocessing. In particular, I apply the following steps:
1. Remove uninformative meta-comments (such as who is speaking).
1. Replace numbers with a generic `<num>` token. After all, the specific number shouldn't affect the classification results.
1. Create a special end-of-sentence (`<eos>`) token.
1. Replace all punctuation with a special `<punc>` token.
1. Collapse adjacent white space. In other words, '   ' becomes ' '.

In [None]:
test['text'] = test['text'].apply(utils.preprocess)

Let's check a random English and German sentence after pre-processing.

In [None]:
print(test[test['label']=='en'].iloc[0]["text"])
print('---')
print(test[test['label']=='de'].iloc[0]["text"])

The target for our classification model has the following characteristics:
1. The vast majority of examples are a single sentence.
1. Most of the time, we have a decent number of words (15-33) to predict a language.
1. However, we can have as few as 3 words. This might pose a challenge if those words are not language-specific.

In [None]:
def word_count(x):
    """Return the number of whitespace-separated tokens in the string `x`."""
    tokens = x.split()
    return len(tokens)


def sentence_count(x):
    """Return how many `<eos>` markers the string `x` contains."""
    pieces = x.split('<eos>')
    # Splitting on a marker yields one more piece than there are markers.
    return len(pieces) - 1
test['text'].apply([sentence_count, word_count, len]).describe()

## Preprocess Training Dataset

In [None]:
exampl = utils.concat_docs('en', TRAIN)
exampl[:200]

In [None]:
exampl = utils.txt2list(exampl[:1000])
exampl[:2]

In [None]:
# Build one labelled DataFrame per language, then stack them into `df`.
# The names `dfs`, `temp_df` and `txt` are kept because a later cell deletes them.
dfs = []
for lang in LANGS:
    print(f' {lang} ', end="")  # lightweight progress indicator
    txt = utils.txt2list(utils.concat_docs(lang, TRAIN))
    txt = utils.concat_random_sent(txt)
    temp_df = pd.DataFrame({'text': txt}).assign(label=lang)
    dfs.append(temp_df)
# Put the label first and replace the per-language indices with a fresh 0..N-1 index.
df = pd.concat(dfs)[['label', 'text']].reset_index(drop=True)

In [None]:
df.head()

In [None]:
df['text'].apply([sentence_count, word_count, len]).describe()

In [None]:
# Drop the intermediate objects, then checkpoint the assembled dataset.
del dfs, temp_df, txt, exampl
# A context manager guarantees the file is flushed and closed once the dump
# finishes (the bare open() call previously left the handle dangling).
with open(PATH_TMP/'df.pickle', mode='wb') as f:
    dill.dump(df, f)

In [None]:
#df = dill.load(open(PATH_TMP/'df.pickle', mode = 'rb'))

## Train Test Split

In [None]:
len(df.index)

In [None]:
# Hold out 1% of the rows for validation; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    np.array(df['text']),
    np.array(df['label']),
    random_state=42,
    test_size=0.01,
)
# Display the training labels.
y_train

In [None]:
del(df)

## Numericalize

In [None]:
# Tally how often every whitespace-separated token occurs in the training text.
words = Counter()
for row in tqdm(X_train, position=0, leave=False):
    words.update(row.split())
# Show the ten most frequent tokens.
words.most_common(10)

In [None]:
# Keep only tokens seen at least MIN_FREQ times ...
words = {
    token: count
    for token, count in tqdm(words.items(), leave=False)
    if count >= MIN_FREQ
}
# ... order them from most to least frequent, and reserve the first two
# vocabulary slots for the special tokens (index 0: '<unk>', index 1: '<pad>').
words = ['<unk>', '<pad>'] + sorted(words, key=words.get, reverse=True)

In [None]:
vocab_size = len(words)
vocab_size

In [None]:
# Token -> index lookup; tokens outside the vocabulary fall back to 0, the index of '<unk>'.
word2idx = defaultdict(lambda: 0)
word2idx.update((token, idx) for idx, token in enumerate(words))

# Index -> token lookup; unknown indices fall back to '<unk>'.
idx2word = defaultdict(lambda: '<unk>')
idx2word.update(enumerate(words))

In [None]:
print([word2idx[w] for w in X_train[0].split()])

In [None]:
X_train = utils.numericalize(X_train, word2idx)
X_val = utils.numericalize(X_val, word2idx)

In [None]:
utils.de_numericalize(X_train[:2], idx2word)

In [None]:
# Language name -> class index.
# NOTE(review): a name that is not in LANGS silently maps to 0, i.e. to LANGS[0].
lang2idx = defaultdict(lambda: 0)
lang2idx.update((name, idx) for idx, name in enumerate(LANGS))

# Class index -> language name; unknown indices fall back to '<unk>'.
idx2lang = defaultdict(lambda: '<unk>')
idx2lang.update(enumerate(LANGS))

In [None]:
def _encode_labels(labels):
    """Map every language name in `labels` to its integer index via `lang2idx`."""
    return np.array([lang2idx[name] for name in labels])

# Replace the string labels of both splits with integer class indices.
y_train = _encode_labels(y_train)
y_val = _encode_labels(y_val)

In [None]:
# Checkpoint the vocabulary, its lookup tables and the numericalized splits
# (the commented-out cell below reloads them in the same order).
with (PATH_TMP/'numericalized.pickle').open(mode='wb') as f:
    dill.dump(
        [words, vocab_size, word2idx, idx2word, X_train, X_val, y_train, y_val],
        f,
    )

In [None]:
#with open(PATH_TMP/'numericalized.pickle', mode = 'rb') as f:
#    (words, vocab_size, word2idx, idx2word, X_train, X_val, y_train, y_val) = dill.load(f)

## Create Embeddings

In [None]:
m, n = X_train.shape

In [None]:
# Count how often each vocabulary index occurs across all training rows.
token_counts = Counter()
for row in tqdm(X_train, leave=False):
    token_counts.update(row)

# Lay the counts out as an int32 array addressed by vocabulary index ...
idx_freq = np.array(
    [token_counts[i] for i in tqdm(range(vocab_size), leave=False)]
).astype(np.int32)
# ... and clamp them to at least 1 so that no index has a zero frequency.
idx_freq = np.maximum(idx_freq, 1)

In [None]:
[(x, utils.subsamp_disc_prob(idx_freq)[word2idx[x]]) for x in 
 ['the', 'in', 'of', 'president', 'approval', 'origin']]

In [None]:
# Generate the skip-gram inputs/targets and wrap them in a shuffling DataLoader.
X_skip, y_skip = utils.skipgram_data(X_train, idx_freq)
skip_ds = TensorDataset(X_skip, y_skip)
skip_dl = DataLoader(skip_ds, shuffle=True, batch_size=SKIP_BS)

In [None]:
class skip_model(nn.Module):
    """Skip-gram scorer: sigmoid of the dot product of a context and a target embedding.

    Args:
        emb_sz: dimension of both embedding tables.
        vocab_size: number of rows in both embedding tables.
    """

    def __init__(self, emb_sz = EMB_SZ, vocab_size = vocab_size):
        super().__init__()
        # Separate tables for the word in context position and in target position.
        self.emb = nn.Embedding(vocab_size, emb_sz)
        self.target_emb = nn.Embedding(vocab_size, emb_sz)
        # Start both tables from small uniform weights.
        self.emb.weight.data.uniform_(-0.05, 0.05)
        self.target_emb.weight.data.uniform_(-0.05, 0.05)

    def forward(self, x):
        """Score each (context, target) index pair.

        Args:
            x: integer tensor read as x[:, 0] (context index) and x[:, 1]
               (target index) -- assumes shape (batch, 2); TODO confirm
               against utils.skipgram_data.

        Returns:
            Tensor with one sigmoid score per row of `x`.
        """
        context, target = x[:, 0], x[:, 1]
        context, target = self.emb(context), self.target_emb(target)
        # Row-wise dot product; sum(1) already removes the embedding dimension.
        res = (context * target).sum(1)
        # No .squeeze() here: it would collapse a batch of one to a 0-d tensor,
        # which no longer matches the yb.view(-1) target used for the BCE loss.
        return torch.sigmoid(res)

In [None]:
model = skip_model().cuda()

In [None]:
loss_func = nn.BCELoss().cuda()  # Binary cross entropy loss

In [None]:
def loss_batch(xb, yb, model, loss_func, opt=None):
    '''Compute the loss on one batch and, when an optimizer is given, take one training step.

    Adapted from https://github.com/fastai/fastai_v1/blob/master/dev_nb/001a_nn_basics.ipynb

    Args:
        xb: batch of inputs; moved to the GPU before being fed to `model`.
        yb: batch of targets; moved to the GPU and flattened to 1-D.
        model: callable producing predictions from `xb`.
        loss_func: callable taking (predictions, targets) and returning the loss.
        opt: optimizer to step. Defaults to None, which only evaluates the
            loss (no backward pass, no parameter update).

    Returns:
        Tuple (loss as a Python number, number of examples in the batch).
    '''
    # yb.view(-1) flattens the targets so their dimensions match the model output.
    loss = loss_func(model(xb.cuda()), yb.cuda().view(-1))

    if opt is not None:
        loss.backward()
        opt.step()
        # Clear the gradients so they do not accumulate into the next batch.
        opt.zero_grad()

    return loss.item(), len(xb)

In [None]:
class Learner(object):
    """Minimal training harness: learning-rate range test, fit loop and loss plot.

    Args:
        model: the network to train.
        loss_func: callable taking (predictions, targets) and returning the loss.
        train_dl: iterable of (xb, yb) training batches.
        valid_dl: optional iterable of (xb, yb) validation batches.
    """

    def __init__(self, model, loss_func, train_dl = None, valid_dl = None):
        self.model = model
        self.loss_func = loss_func
        self.train_dl = train_dl
        self.valid_dl = valid_dl
        self.losses = []  # losses recorded by the last lr_find() / fit() call

    def lr_find(self, start = 1e-6, end = 1e1,
                exp_smooth = True,
                skip_first = 10, skip_last = 10):
        """Learning-rate range test.

        Trains on one batch per step while multiplying the learning rate by
        1.01 after every step, starting at `start`. Stops when the rate exceeds
        `end`, when (after more than 10 steps) the loss exceeds three times the
        mean of the previously recorded losses, or when `train_dl` is exhausted.
        The recorded losses are plotted against the learning rate on log-log
        axes and stored in `self.losses`.

        Args:
            start: initial learning rate.
            end: learning rate above which the test stops.
            exp_smooth: if True, record an exponential moving average of the
                loss (weights 0.9 / 0.1) instead of the raw batch loss.
            skip_first: number of leading points left out of the plot.
            skip_last: number of trailing points left out of the plot.

        Side effects:
            Re-initializes `model.emb` and `model.target_emb` uniformly in
            [-0.05, 0.05] afterwards, so the model must expose both attributes.
        """
        lr = start; lrs = []; losses = []
        self.model.train()
        i = 0
        for xb,yb in self.train_dl:
            # A fresh optimizer per step, created with the current learning rate.
            opt = optim.Adam(self.model.parameters(), lr=lr)
            l, _ = loss_batch(xb, yb, self.model, self.loss_func, opt)
            if (not exp_smooth) or i == 0:
                loss = l
            else:
                loss = 0.9*loss + 0.1*l
            if (i+1)%100 == 0:
                print(f'iteration {i}, lr = {lr}, loss = {loss}')
            lrs.append(lr)
            losses.append(loss)
            if (lr > end) or (i > 10 and loss > 3*np.mean(losses[:i])):
                break
            lr *= 1.01; i += 1

        # Explicit end index: a plain [a:-skip_last] slice is empty when
        # skip_last == 0, which would hide every point.
        stop = max(len(losses) - skip_last, 0)
        f, ax = plt.subplots(figsize=(5, 5))
        ax.set(yscale = 'log', xscale = 'log')
        ax.plot(lrs[skip_first:stop], losses[skip_first:stop])
        self.losses = losses

        # Re-initialize embeddings: the range test has pushed the weights to
        # a diverged state that real training should not start from.
        self.model.emb.weight.data.uniform_(-0.05, 0.05)
        self.model.target_emb.weight.data.uniform_(-0.05, 0.05)

    def plot_loss(self):
        """Plot `self.losses` on a logarithmic y axis."""
        f, ax = plt.subplots(figsize=(5, 5))
        ax.set(yscale = 'log')
        ax.plot(self.losses)

    def fit(self, lr, epochs, callOn_epoch_start = None):
        """Train with Adam for `epochs` passes over `self.train_dl`.

        Args:
            lr: learning rate passed to Adam.
            epochs: number of passes over the training data.
            callOn_epoch_start: optional zero-argument callable run at the
                start of every epoch.

        Prints the example-weighted mean training loss per epoch (and the
        validation loss when `self.valid_dl` is set) and stores every
        per-batch training loss in `self.losses`.
        """
        opt = optim.Adam(self.model.parameters(), lr=lr)
        loss_list = []

        for epoch in range(epochs):

            if callOn_epoch_start:
                callOn_epoch_start()

            # Fit model to training data
            self.model.train()
            losses, nums = zip(*[loss_batch(xb, yb, self.model, self.loss_func, opt)
                                 for xb,yb in tqdm(self.train_dl, position=0, leave = False)])
            # Weight each batch loss by its size so a short last batch is not over-counted.
            train_loss = np.sum(np.multiply(losses,nums)) / np.sum(nums)
            loss_list.extend(losses)

            # Calculate loss on validation set
            if self.valid_dl is not None:
                self.model.eval()
                with torch.no_grad():
                    # opt=None: evaluate only. Uses this learner's own model,
                    # loss and validation loader rather than notebook globals.
                    losses,nums = zip(*[loss_batch(xb, yb, self.model, self.loss_func, None)
                                        for xb,yb in self.valid_dl])
                val_loss = np.sum(np.multiply(losses,nums)) / np.sum(nums)
                print(f'Epoch {epoch}. Training loss: {train_loss}. Validation loss: {val_loss}.')
            else:
                print(f'Epoch {epoch}. Training loss: {train_loss}.')

        self.losses = loss_list

In [None]:
class skipgram_Learner(Learner):
    """Learner that rebuilds its skip-gram training data before every epoch."""

    def update_train(self):
        """Replace `self.train_dl` with a loader over a new `utils.skipgram_data` call."""
        inputs, targets = utils.skipgram_data(X_train, idx_freq)
        dataset = TensorDataset(inputs, targets)
        self.train_dl = DataLoader(dataset, shuffle=True, batch_size=SKIP_BS)

    def fit(self, lr, epochs):
        """Run the base fit loop with `update_train` as the epoch-start hook."""
        refresh = self.update_train
        super().fit(lr, epochs, callOn_epoch_start=refresh)

In [None]:
learn = skipgram_Learner(model, loss_func, skip_dl)

In [None]:
learn.lr_find()

In [None]:
lr = 1e-3

In [None]:
learn.fit(lr, 3)

In [None]:
learn.plot_loss()

In [None]:
torch.save(learn.model, PATH_TMP/'embeddings0.pt')

In [None]:
embs = learn.model.emb.weight.data.cpu().numpy()

In [None]:
def cos_dist(u, v):
    """Return the cosine of the angle between vectors `u` and `v`.

    Computed as the dot product divided by the product of the two norms, so
    despite the name this is a similarity: 1 for parallel vectors, -1 for
    opposite ones.
    """
    norm_product = np.sqrt(np.sum(u**2) * np.sum(v**2))
    return np.dot(u, v) / norm_product


def emb_pair_dist(a, b, c, d):
    """Compare the embedding offsets (a - b) and (c - d) with `cos_dist`.

    The four arguments are words; each is looked up in `word2idx` and then in
    the `embs` matrix.
    """
    first_offset = embs[word2idx[a]] - embs[word2idx[b]]
    second_offset = embs[word2idx[c]] - embs[word2idx[d]]
    return cos_dist(first_offset, second_offset)

In [None]:
emb_pair_dist('man', 'woman', 'he', 'she')

In [None]:
emb_pair_dist('good', 'better', 'bad', 'worse')

## Create Classifier

In [None]:
def _to_long_tensor(arr):
    """Wrap a NumPy array as a torch tensor and cast it to int64."""
    return torch.from_numpy(arr).type(torch.int64)

# Convert both splits from NumPy arrays to int64 tensors.
X_train, y_train = _to_long_tensor(X_train), _to_long_tensor(y_train)
X_val, y_val = _to_long_tensor(X_val), _to_long_tensor(y_val)

# Shuffle only the training batches; validation order stays fixed.
train_dl = DataLoader(TensorDataset(X_train, y_train), shuffle=True, batch_size=BS)
valid_dl = DataLoader(TensorDataset(X_val, y_val), shuffle=False, batch_size=BS)

In [None]:
class Lang_Detect(nn.Module):
    """GRU language classifier.

    Pipeline: embedding -> dropout -> GRU -> dropout -> linear layer applied
    to the GRU output of the last time step.

    Args:
        emb_sz: dimension of the word embeddings.
        vocab_size: number of rows in the embedding table.
        hidden_sz: size of the GRU hidden state.
        out_sz: number of output classes (one per language).
    """

    def __init__(self, emb_sz = EMB_SZ, vocab_size = vocab_size,
                hidden_sz = HIDDEN_SZ, out_sz = len(LANGS)):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_sz)
        self.emb_drop = nn.Dropout(0.25)
        # Start the embeddings from small uniform weights.
        self.emb.weight.data.uniform_(-0.05, 0.05)
        self.gru = nn.GRU(emb_sz, hidden_sz)
        self.drop = nn.Dropout(0.25)
        self.lout = nn.Linear(hidden_sz, out_sz)
        self.hidden_sz = hidden_sz

    def forward(self, seq):
        """Return one row of unnormalized class scores per sequence.

        Args:
            seq: 2-D integer tensor of word indices with the batch as the
                first dimension.
        """
        bs, _ = seq.shape
        # Zero initial hidden state, created on the same device as the input
        # instead of a hard-coded .cuda(), so the model also runs on CPU.
        h = torch.zeros(1, bs, self.hidden_sz, device=seq.device)
        # nn.GRU is built with its default layout (time step first), hence the transpose.
        embedded = self.emb(seq).transpose(0, 1)
        outputs, _ = self.gru(self.emb_drop(embedded), h)
        # Classify from the GRU output at the final time step.
        output = self.lout(self.drop(outputs[-1]))
        return output

In [None]:
model = Lang_Detect().cuda()

In [None]:
loss_func = nn.CrossEntropyLoss().cuda()

In [None]:
def loss_batch(xb, yb, model, loss_func, opt=None):
    '''Compute the loss on one batch and, when an optimizer is given, take one training step.

    Adapted from https://github.com/fastai/fastai_v1/blob/master/dev_nb/001a_nn_basics.ipynb

    Args:
        xb: batch of inputs; moved to the GPU before being fed to `model`.
        yb: batch of targets; moved to the GPU and passed to the loss as-is.
        model: callable producing predictions from `xb`.
        loss_func: callable taking (predictions, targets) and returning the loss.
        opt: optimizer to step. Defaults to None, which only evaluates the
            loss (no backward pass, no parameter update).

    Returns:
        Tuple (loss as a Python number, number of examples in the batch).
    '''
    loss = loss_func(model(xb.cuda()), yb.cuda())

    if opt is not None:
        loss.backward()
        opt.step()
        # Clear the gradients so they do not accumulate into the next batch.
        opt.zero_grad()

    return loss.item(), len(xb)

In [None]:
class Learner(object):
    """Minimal training harness: learning-rate range test, fit loop, loss plot and prediction.

    Args:
        model: the network to train.
        loss_func: callable taking (predictions, targets) and returning the loss.
        train_dl: iterable of (xb, yb) training batches.
        valid_dl: optional iterable of (xb, yb) validation batches.
    """

    def __init__(self, model, loss_func, train_dl = None, valid_dl = None):
        self.model = model
        self.loss_func = loss_func
        self.train_dl = train_dl
        self.valid_dl = valid_dl
        self.losses = []  # losses recorded by the last lr_find() / fit() call

    def lr_find(self, start = 1e-6, end = 1e1, exp_smooth_param = 0.9):
        """Learning-rate range test.

        Trains on one batch per step while multiplying the learning rate by
        1.01 after every step, starting at `start`. Stops when the rate exceeds
        `end`, when (after more than 10 steps) the loss exceeds three times the
        mean of the previously recorded losses, or when `train_dl` is exhausted.
        The losses are stored in `self.losses` and plotted against the learning
        rate on log-log axes, smoothed with `exp_smooth_param`.

        The model weights are restored to their pre-test values afterwards, so
        a following fit() does not start from the diverged state the test ends in.
        """
        # Snapshot the weights; clone because state_dict() returns tensors that
        # share storage with the live parameters.
        initial_state = {name: tensor.detach().clone()
                         for name, tensor in self.model.state_dict().items()}

        self.model.train()
        lr = start; lrs = []; losses = []; i = 0
        try:
            for xb,yb in tqdm(self.train_dl, leave = False,
                             position = 0):
                # A fresh optimizer per step, created with the current learning rate.
                opt = optim.Adam(self.model.parameters(), lr=lr)
                loss, _ = loss_batch(xb, yb, self.model, self.loss_func, opt)
                lrs.append(lr)
                losses.append(loss)
                if (lr > end) or (i > 10 and loss > 3*np.mean(losses[:i])):
                    break
                lr *= 1.01; i += 1
        finally:
            # Undo the test's parameter updates even if it was interrupted.
            self.model.load_state_dict(initial_state)
        self.losses = losses
        self.plot_loss(x = lrs, xlog=True, exp_smooth_param = exp_smooth_param)

    def plot_loss(self, x = None, xlog = False, exp_smooth_param = 0.95,
                 skip_edges = False):
        """Plot the smoothed `self.losses` on a logarithmic y axis.

        Args:
            x: optional x values (e.g. learning rates); defaults to the step index.
            xlog: if True, the x axis is logarithmic as well.
            exp_smooth_param: smoothing parameter forwarded to `utils.exp_smooth`.
            skip_edges: if True, drop the first and last 10 points.
        """
        y_smooth = utils.exp_smooth(np.array(self.losses), exp_smooth_param)
        if skip_edges:
            y_smooth = y_smooth[10:-10]
        f, ax = plt.subplots(figsize=(5, 5))
        if xlog:
            ax.set(yscale = 'log', xscale = 'log')
        else:
            ax.set(yscale = 'log')
        if x is not None:
            if skip_edges:
                x = x[10:-10]
            ax.plot(x, y_smooth)
        else:
            ax.plot(y_smooth)

    def fit(self, lr, epochs, callOn_epoch_start = None):
        """Train with Adam for `epochs` passes over `self.train_dl`.

        Args:
            lr: learning rate passed to Adam.
            epochs: number of passes over the training data.
            callOn_epoch_start: optional zero-argument callable run at the
                start of every epoch.

        Prints the example-weighted mean training loss per epoch (and the
        validation loss when `self.valid_dl` is set) and stores every
        per-batch training loss in `self.losses`.
        """
        opt = optim.Adam(self.model.parameters(), lr=lr)
        loss_list = []

        for epoch in range(epochs):

            if callOn_epoch_start:
                callOn_epoch_start()

            # Fit model to training data
            self.model.train()
            losses, nums = zip(*[loss_batch(xb, yb, self.model, self.loss_func, opt)
                                 for xb,yb in tqdm(self.train_dl, leave = False,
                                                  position = 0)])
            # Weight each batch loss by its size so a short last batch is not over-counted.
            train_loss = np.sum(np.multiply(losses,nums)) / np.sum(nums)
            loss_list.extend(losses)

            # Calculate loss on validation set
            if self.valid_dl is not None:
                self.model.eval()
                with torch.no_grad():
                    # opt=None: evaluate only. Arguments follow loss_batch's
                    # (xb, yb, model, loss_func, opt) order and use this
                    # learner's own model, loss and validation loader.
                    losses,nums = zip(*[loss_batch(xb, yb, self.model, self.loss_func, None)
                                        for xb,yb in tqdm(self.valid_dl, leave = False,
                                                         position = 0)])
                val_loss = np.sum(np.multiply(losses,nums)) / np.sum(nums)
                print(f'Epoch {epoch}. Training loss: {train_loss}. Validation loss: {val_loss}.')
            else:
                print(f'Epoch {epoch}. Training loss: {train_loss}.')

        self.losses = loss_list

    def predict(self, dl):
        """Return the arg-max class index for every example yielded by `dl`.

        Args:
            dl: iterable of (xb, yb) batches; the targets are ignored.

        Returns:
            NumPy array with the per-batch predictions concatenated.
        """
        self.model.eval()
        with torch.no_grad():
            res = [self.model(xb.cuda()).detach().cpu().numpy().argmax(axis = -1) for
                   xb, _ in tqdm(dl, leave = False, position = 0)]
        return np.concatenate(res)

In [None]:
learn = Learner(model, loss_func, train_dl, valid_dl)

In [None]:
learn.lr_find()

In [None]:
lr = 3e-3

In [None]:
learn.fit(lr, 1)

In [None]:
preds = learn.predict(valid_dl)

In [None]:
def accuracy(pred, y):
    """Return the fraction of entries of `pred` that equal the matching entry of `y`.

    `pred` must be 1-D with as many entries as `y` has along its first axis;
    this is enforced with an assertion.
    """
    n_examples = y.shape[0]
    assert pred.shape == (n_examples,)
    n_correct = np.sum(pred == y)
    return n_correct / n_examples

In [None]:
accuracy(preds, y_val.detach().cpu().numpy())

In [None]:
1/21