Description and code can be found at:
https://github.com/jannaescur/language_identification_slpdl

In [1]:


# This Python 3 environment comes with many helpful analytics libraries installed
import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch # Deep learning framework
import torch.nn.functional as F
from nltk import ngrams
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.
#Init random seed to get reproducible results
seed = 1111
random.seed(seed)
np.random.RandomState(seed)
torch.manual_seed(seed)

# Any results you write to the current directory are saved as output.
x_train_full = open("../input/x_train.txt").read().splitlines()
y_train_full = open("../input/y_train.txt").read().splitlines()
print('Example:')
print('LANG =', y_train_full[0])
print('TEXT =', x_train_full[0])

['labels.csv', 'x_train.txt', 'y_train.txt', 'x_test.txt']
Example:
LANG = est
TEXT = Klement Gottwaldi surnukeha palsameeriti ning paigutati mausoleumi. Surnukeha oli aga liiga hilja ja oskamatult palsameeritud ning hakkas ilmutama lagunemise tundemärke. 1962. aastal viidi ta surnukeha mausoleumist ära ja kremeeriti. Zlíni linn kandis aastatel 1949–1989 nime Gottwaldov. Ukrainas Harkivi oblastis kandis Zmiivi linn aastatel 1976–1990 nime Gotvald.


In [2]:
import re
from collections import Counter
import itertools
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

class Dictionary(object):
    def __init__(self):
        self.token2idx = {}
        self.idx2token = []

    def add_token(self, token):
        if token not in self.token2idx:
            self.idx2token.append(token)
            self.token2idx[token] = len(self.idx2token) - 1
        return self.token2idx[token]

    def __len__(self):
        return len(self.idx2token)

def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]

def pad_sentences(sentences, labels, padding_word="<PAD/>", max_len = 300):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.
    Returns padded sentences.
    """
    sequence_length = max_len
    padded_sentences = []
    fianal_sent = []
    padded_labels = []
    for i, sentence in enumerate(sentences):
        n_trams = 0
        new_sentence = []
        for j,word in enumerate(sentence):
            new_sentence.append(word)
            if (j+1)%sequence_length == 0:
                padded_sentences.append(new_sentence)
                padded_labels.append(labels[i])
                new_sentence = []
                
        num_padding = sequence_length - len(new_sentence)
        if 0 < num_padding < sequence_length:
            new_sentence = new_sentence + [padding_word] * num_padding
            padded_sentences.append(new_sentence)
            padded_labels.append(labels[i])
        
    return padded_sentences,padded_labels

# Prepare X as clean word list without articles, and non interesting words:
x_text = [clean_str(s).split(" ") for s in x_train_full]
lens = [len(x) for x in x_text]
print("Max words: {}\nMin words:{}".format(max(lens),min(lens)))

# Prepare padding
x_train, y_train = pad_sentences(x_text, y_train_full)
# Vocab dict
vocabulary, vocabulary_inv = build_vocab(x_train)

lang_vocab = Dictionary()
# use python set to obtain the list of languages without repetitions
languages = set(y_train_full)
for lang in sorted(languages):
    lang_vocab.add_token(lang)
print("Labels:", len(lang_vocab), "languages")

Max words: 1585
Min words:1
Labels: 235 languages


In [3]:

def build_input_data(sentences, labels, vocabulary, languages):
    """
    Maps sentencs and labels to vectors based on a vocabulary.
    """
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    print(x.shape)
    y = np.array([languages.token2idx[label] for label in labels])
    return x, y

X_train, Y_train = build_input_data(x_train, y_train, vocabulary, lang_vocab)

vocab_size = len(vocabulary)
sentence_len = X_train.shape[1]
num_classes = int(max(Y_train)) +1 # added int() to convert np.int64 to int

print('vocab size       = {}'.format(vocab_size))
print('max sentence len = {}'.format(sentence_len))
print('num of classes   = {}'.format(num_classes))



(118079, 300)
vocab size       = 678055
max sentence len = 300
num of classes   = 235


In [4]:
from torch import nn
class CNN(nn.Module):
    def __init__(self, kernel_sizes=[3,4,5], num_filters=512, embedding_dim=256, pretrained_embeddings=None):
        super(CNN, self).__init__()
        self.kernel_sizes = kernel_sizes
        ConvMethod = "in_channel__is_embedding_dim"
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        #self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
        #self.embedding.weight.requires_grad = mode=="nonstatic"

        if use_cuda:
            self.embedding = self.embedding.cuda()

        conv_blocks = []
        for kernel_size in kernel_sizes:
            # maxpool kernel_size must <= sentence_len - kernel_size+1, otherwise, it could output empty
            maxpool_kernel_size = sentence_len - kernel_size +1

            
            conv1d = nn.Conv1d(in_channels = 1, out_channels = num_filters, kernel_size = kernel_size*embedding_dim, stride = embedding_dim)

            component = nn.Sequential(
                conv1d,
                nn.ReLU(),
                nn.MaxPool1d(kernel_size = maxpool_kernel_size)
            )
            if use_cuda:
                component = component.cuda()

            conv_blocks.append(component)

            if 0:
                conv_blocks.append(
                nn.Sequential(
                    conv1d,
                    nn.ReLU(),
                    nn.MaxPool1d(kernel_size = maxpool_kernel_size)
                ).cuda()
                )

        self.conv_blocks = nn.ModuleList(conv_blocks)   # ModuleList is needed for registering parameters in conv_blocks
        self.fc = nn.Linear(num_filters*len(kernel_sizes), num_classes)

    def forward(self, x):       # x: (batch, sentence_len)
        x = self.embedding(x)   # embedded x: (batch, sentence_len, embedding_dim)
        #    input:  (batch, in_channel=1, in_length=sentence_len*embedding_dim),
        #    output: (batch, out_channel=num_filters, out_length=sentence_len-...)
        # needs to convert x to (batch, embedding_dim, sentence_len)
        x = x.reshape(x.size(0), -1)
        x = x.unsqueeze(1)
        x_list= [conv_block(x) for conv_block in self.conv_blocks]
        out = torch.cat(x_list, 2)
        out = out.view(out.size(0), 1,-1)
        feature_extracted = out
        out = F.dropout(out, p=0.5, training=self.training)
        return self.fc(out), feature_extracted


def evaluate(model, x_test, y_test):
    inputs = Variable(x_test)
    preds, vector = model(inputs)
    preds = torch.max(preds, 1)[1]
    if use_cuda:
        preds = preds.cuda()
    #eval_acc = sum(preds.data == y_test) / len(y_test)          # pytorch 0.3
    eval_acc = (preds.data == y_test).sum().item() / len(y_test) # pytorch 0.4
    return eval_acc, vector.cpu().data.numpy()


In [5]:
import sys
embedding_dim = 300
num_filters = 100
kernel_sizes = [3,4,5]
batch_size = 50

def load_pretrained_embeddings():
    pretrained_fpath_saved = os.path.expanduser("models/googlenews_extracted-python{}.pl".format(sys.version_info.major))
    if os.path.exists(pretrained_fpath_saved):
        with open(pretrained_fpath_saved, 'rb') as f:
            embedding_weights = pickle.load(f)
    else:
        print('- Error: file not found : {}\n'.format(pretrained_fpath_saved))
        print('- Please run the code "python utils.py" to generate the file first\n\n')
        sys.exit()

    # embedding_weights is a dictionary {word_index:numpy_array_of_300_dim}
    out = np.array(list(embedding_weights.values())) # added list() to convert dict_values to a list for use in python 3
    #np.random.shuffle(out)

    print('embedding_weights shape:', out.shape)
    # pretrained embeddings is a numpy matrix of shape (num_embeddings, embedding_dim)
    return out



In [6]:
use_pretrained_embeddings = False

if use_pretrained_embeddings:
    pretrained_embeddings = load_pretrained_embeddings()
else:
    pretrained_embeddingseddings = np.random.uniform(-0.01, -0.01, size=(vocab_size, embedding_dim))


In [7]:
use_cuda =torch.cuda.is_available

model = CNN().cuda()
print(model)

CNN(
  (embedding): Embedding(678055, 256)
  (conv_blocks): ModuleList(
    (0): Sequential(
      (0): Conv1d(1, 512, kernel_size=(768,), stride=(256,))
      (1): ReLU()
      (2): MaxPool1d(kernel_size=298, stride=298, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (0): Conv1d(1, 512, kernel_size=(1024,), stride=(256,))
      (1): ReLU()
      (2): MaxPool1d(kernel_size=297, stride=297, padding=0, dilation=1, ceil_mode=False)
    )
    (2): Sequential(
      (0): Conv1d(1, 512, kernel_size=(1280,), stride=(256,))
      (1): ReLU()
      (2): MaxPool1d(kernel_size=296, stride=296, padding=0, dilation=1, ceil_mode=False)
    )
  )
  (fc): Linear(in_features=1536, out_features=235, bias=True)
)


In [8]:
from torch.utils.data import TensorDataset, DataLoader
x_train = torch.from_numpy(X_train).long()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-08, weight_decay=4e-5)

criterion = torch.nn.CrossEntropyLoss(reduction='sum')
y_train = torch.from_numpy(Y_train).long()
dataset_train = TensorDataset(x_train, y_train)
#train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
train_loader = DataLoader(dataset_train, batch_size=2000, shuffle=True, num_workers=4, pin_memory=False)
model.train()
total_loss = 0
ncorrect = 0
nsentences = 0
ntokens = 0
niterations = 0
for epoch in range(50):
    for _i,(inputs, labels) in enumerate(train_loader):
        # Get input and target sequences from batch

        optimizer.zero_grad()
        inputs, labels = inputs.cuda(), labels.cuda()
        #labels = torch.tensor([d for d in labels], dtype=torch.long, device='cuda')
        preds, _ = model(inputs)
        preds = preds.squeeze(1)
        loss = criterion(preds, labels)
        loss.backward()
        optimizer.step()
        '''
        for param in model.parameters():
            print(param.grad.data.sum())
        '''
        # Training statistics
        total_loss += loss.item()
        ncorrect += (torch.max(preds, 1)[1] == labels).sum().item()
        nsentences += labels.numel()
        niterations += 1
        if _i%500:
            print("Loss:", loss.item()/labels.numel())
            print("Accuracy: ",(torch.max(preds, 1)[1] == labels).sum().item()/labels.numel())
    total_loss = total_loss / nsentences
    accuracy = 100 * ncorrect / nsentences
    print(accuracy)
    print(total_loss)
    print(f'Train: wpb={ntokens//niterations}, bsz={nsentences//niterations}, num_updates={niterations}')


Loss: 5.7916298828125
Accuracy:  0.009
Loss: 5.67406298828125
Accuracy:  0.019
Loss: 5.55397705078125
Accuracy:  0.0185
Loss: 5.38517724609375
Accuracy:  0.0245
Loss: 5.26264404296875
Accuracy:  0.029
Loss: 5.19072265625
Accuracy:  0.0295
Loss: 5.066814453125
Accuracy:  0.0485
Loss: 5.01407666015625
Accuracy:  0.063
Loss: 4.90265478515625
Accuracy:  0.0665
Loss: 4.8226259765625
Accuracy:  0.0775
Loss: 4.75219482421875
Accuracy:  0.081
Loss: 4.6866455078125
Accuracy:  0.0875
Loss: 4.6469111328125
Accuracy:  0.0805
Loss: 4.57485498046875
Accuracy:  0.096
Loss: 4.48790380859375
Accuracy:  0.1
Loss: 4.44159814453125
Accuracy:  0.111
Loss: 4.3303076171875
Accuracy:  0.1165
Loss: 4.2509013671875
Accuracy:  0.137
Loss: 4.26089892578125
Accuracy:  0.127
Loss: 4.2242333984375
Accuracy:  0.131
Loss: 4.15829052734375
Accuracy:  0.147
Loss: 4.037376708984375
Accuracy:  0.169
Loss: 4.039865478515625
Accuracy:  0.1605
Loss: 3.931399169921875
Accuracy:  0.184
Loss: 3.89369677734375
Accuracy:  0.181
L