In [1]:
import os
import shutil
import requests


# Base URL of the GitHub repository; download_from_github appends the
# release-asset path ("/releases/download/<version>/<file>") to it.
REPOSITORY_PATH = "https://github.com/hse-aml/natural-language-processing"


def download_file(url, file_path):
    """Stream `url` into `file_path`, removing the file if the download is incomplete.

    Args:
        url: Address of the resource to fetch.
        file_path: Destination path on disk; it is overwritten if it exists.

    Failures while checking the HTTP status or writing the body are reported
    with a printed message rather than raised. Errors raised by
    `requests.get` itself (e.g. connection failures) still propagate.
    """
    r = requests.get(url, stream=True)
    # The server may omit Content-Length (e.g. chunked transfer); in that case
    # the byte-count comparison is skipped instead of crashing on int(None).
    content_length = r.headers.get('content-length')
    total_size = int(content_length) if content_length is not None else None
    completed = False
    try:
        # Treat 4xx/5xx responses as failures so an error page is never saved as data.
        r.raise_for_status()
        with open(file_path, 'wb', buffering=16*1024*1024) as f:
            for chunk in r.iter_content(32 * 1024):
                f.write(chunk)
        completed = True
    except Exception:
        print("Download failed")
    finally:
        r.close()
        # The file may not exist at all if the failure happened before open().
        if os.path.exists(file_path):
            size_mismatch = (
                total_size is not None
                and os.path.getsize(file_path) != total_size
            )
            if not completed or size_mismatch:
                os.remove(file_path)
                print("Removed incomplete download")


def download_from_github(version, fn, target_dir, force=False):
    """Download release asset `fn` of release `version` into `target_dir`.

    When the destination file already exists and `force` is false, a notice
    is printed and nothing is downloaded.
    """
    destination = os.path.join(target_dir, fn)
    source_url = REPOSITORY_PATH + "/releases/download/{0}/{1}".format(version, fn)

    if force or not os.path.exists(destination):
        download_file(source_url, destination)
    else:
        print("File {} is already downloaded.".format(destination))


def sequential_downloader(version, fns, target_dir, force=False):
    """Download each file name in `fns`, one after another, into `target_dir`.

    The target directory is created first if it does not exist yet.
    """
    os.makedirs(target_dir, exist_ok=True)
    for file_name in fns:
        download_from_github(version, file_name, target_dir, force=force)


def download_resources(force=False):
    """Fetch the four files of the "week1" release into the "data" directory."""
    week1_files = [
        "train.tsv",
        "validation.tsv",
        "test.tsv",
        "text_prepare_tests.tsv",
    ]
    sequential_downloader("week1", week1_files, "data", force=force)


def download_project_resources(force=False):
    """Fetch the two files of the "project" release into the "data" directory."""
    project_files = [
        "dialogues.tsv",
        "tagged_posts.tsv",
    ]
    sequential_downloader("project", project_files, "data", force=force)

In [2]:
# Fetch the week-1 files into ./data; files already present are skipped (force defaults to False).
download_resources()

File data\train.tsv is already downloaded.
File data\validation.tsv is already downloaded.
File data\test.tsv is already downloaded.
File data\text_prepare_tests.tsv is already downloaded.


In [3]:
from ast import literal_eval
import pandas as pd
import numpy as np


In [4]:
import re
from sklearn.preprocessing import MultiLabelBinarizer
# Compiled pattern matching a single-quote character.
# NOTE(review): not referenced by any other visible cell — confirm whether it is still needed.
regex_pat = re.compile(r'\'')

def read_data(filename):
    """Read a tab-separated file and parse its `tags` column into Python objects.

    Each `tags` cell is expected to hold a Python literal (e.g. "['a', 'b']"),
    which is evaluated with `ast.literal_eval`.
    """
    raw = pd.read_csv(filename, sep='\t', encoding='utf-8')
    parsed_tags = raw['tags'].map(literal_eval)
    return raw.assign(tags=parsed_tags)

In [5]:
# Load the labelled splits; read_data parses the `tags` column into Python lists.
traindf = read_data("./data/train.tsv")
validdf = read_data("./data/validation.tsv")
# The test split is read directly, without any tag parsing.
test = pd.read_csv('./data/test.tsv', sep='\t')
# Display the last rows of the training frame.
traindf.tail()

Unnamed: 0,title,tags
99995,"Obj-c, incorrect checksum for freed object - o...","[iphone, objective-c, ios, cocoa-touch]"
99996,How to connect via HTTPS using Jsoup?,"[java, android]"
99997,Python Pandas Series of Datetimes to Seconds S...,"[python, datetime, pandas]"
99998,jqGrid issue grouping - Duplicate rows get app...,"[javascript, jquery]"
99999,Create a List of primitive int?,"[java, list, generics]"


In [6]:

from collections import Counter

# Tally how many training rows carry each tag.
tags_counts = Counter(
    tag
    for row_tags in traindf['tags'].values
    for tag in row_tags
)

In [7]:
# Print the full tag -> occurrence-count mapping.
print(tags_counts)

Counter({'javascript': 19078, 'c#': 19077, 'java': 18661, 'php': 13907, 'python': 8940, 'jquery': 7510, 'c++': 6469, 'html': 4668, 'objective-c': 4338, 'asp.net': 3939, '.net': 3872, 'ruby-on-rails': 3344, 'ios': 3256, 'c': 3119, 'mysql': 3092, 'android': 2818, 'ruby': 2326, 'arrays': 2277, 'json': 2026, 'vb.net': 1918, 'iphone': 1909, 'django': 1835, 'css': 1769, 'ajax': 1767, 'r': 1727, 'string': 1573, 'winforms': 1468, 'swift': 1465, 'regex': 1442, 'angularjs': 1353, 'xml': 1347, 'spring': 1346, 'wpf': 1289, 'sql': 1272, 'asp.net-mvc': 1244, 'multithreading': 1118, 'eclipse': 992, 'linq': 964, 'xcode': 900, 'forms': 872, 'html5': 842, 'windows': 838, 'hibernate': 807, 'linux': 793, 'codeigniter': 786, 'node.js': 771, 'swing': 759, 'database': 740, 'list': 693, 'ruby-on-rails-3': 692, 'jsp': 680, 'image': 672, 'entity-framework': 649, 'web-services': 633, 'spring-mvc': 618, 'visual-studio-2010': 588, 'sql-server': 585, 'file': 582, 'sockets': 579, 'visual-studio': 574, 'date': 560, '

In [8]:
# Turn the list-valued `tags` column into one 0/1 indicator column per tag.
# The binarizer is fitted on the training tags ONLY; the validation split is
# encoded with `transform`, so both frames share the same columns in the same
# order. Re-fitting on the validation tags would produce a label space that
# matches the training-derived column names only if both splits happen to
# contain the same tag set.
mlb = MultiLabelBinarizer()

train_tag_matrix = mlb.fit_transform(traindf['tags'].values)
traindf = traindf.join(pd.DataFrame(train_tag_matrix,
                                    columns=mlb.classes_,
                                    index=traindf.index))
traindf = traindf.drop('tags', axis=1)

# Tags unseen during fitting are ignored by `transform` (scikit-learn emits a warning).
valid_tag_matrix = mlb.transform(validdf['tags'].values)
validdf = validdf.join(pd.DataFrame(valid_tag_matrix,
                                    columns=mlb.classes_,
                                    index=validdf.index))
validdf = validdf.drop('tags', axis=1)

In [9]:
# Persist the binarized frames as tab-separated files with a header row and no index column.
traindf.to_csv('./data/train_prepped.tsv',index=False, header=True, sep="\t")
validdf.to_csv('./data/valid_prepped.tsv',index=False, header=True, sep="\t")

In [10]:
import torchtext
from torchtext import data
import torch

In [11]:
# Text field: titles are tokenized with spaCy.
TEXT = data.Field(tokenize = 'spacy')
# Label field shared by every tag column: non-sequential, no vocabulary, no
# pad/unk tokens — presumably so the 0/1 indicators pass through as numbers (TODO confirm).
LABEL = data.Field(sequential=False, 
                         use_vocab=False, 
                         pad_token=None, 
                            unk_token=None)

# Use the first CUDA device when available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [12]:

# Alphabetically ordered tag names; iterating the Counter yields its keys.
sorted_tag_lst = sorted(tags_counts)


In [13]:
class TagsMultiLabelDataset(torchtext.data.Dataset):
    """torchtext dataset built from a DataFrame with one text column and one column per label.

    Every row becomes an Example whose `text` attribute comes from `txt_col`
    and which has one attribute per entry of `lbl_cols`.
    """

    def __init__(self, df, tt_text_field, tt_label_field, txt_col, lbl_cols, **kwargs):
        """Create the examples from `df`.

        Args:
            df: Source DataFrame.
            tt_text_field: torchtext Field applied to the text column.
            tt_label_field: torchtext Field shared by all label columns.
            txt_col: Name of the text column in `df`.
            lbl_cols: Names of the label columns.
            **kwargs: Forwarded to the base Dataset constructor.
        """
        # ('text', field) first, then one (label name, field) pair per label column.
        field_pairs = [('text', tt_text_field)]
        field_pairs.extend((label, tt_label_field) for label in lbl_cols)

        # A frame without the first label column is treated as unlabelled;
        # its examples receive 0.0 for every label.
        has_labels = lbl_cols[0] in df.columns
        label_count = len(lbl_cols)

        examples = []
        for _, record in df.iterrows():
            if has_labels:
                label_values = [record[label] for label in lbl_cols]
            else:
                label_values = [0.0] * label_count

            title_text = str(record[txt_col])
            examples.append(
                data.Example.fromlist([title_text] + label_values, field_pairs)
            )

        super().__init__(examples, dict(field_pairs), **kwargs)

    @staticmethod
    def sort_key(example):
        """Sort key for examples: the length of the example's `text`."""
        return len(example.text)

    @classmethod
    def splits(cls, text_field, label_field, train_df, txt_col, lbl_cols, val_df=None, test_df=None, **kwargs):
        """Build datasets for whichever of the train/val/test frames are given.

        Returns a tuple with one dataset per non-None frame, in
        train, val, test order. Each frame is copied before use.
        """
        return tuple(
            cls(frame.copy(), text_field, label_field, txt_col, lbl_cols, **kwargs)
            for frame in (train_df, val_df, test_df)
            if frame is not None
        )
    
    
class TextMultiLabelDataLoader:
    """Wrap a batch source and yield `(input, target)` pairs.

    The input is the batch attribute named `x_fld`. The target is built from
    the attributes named in `y_flds`: stacked along dim 1 when there are
    several, taken as-is when there is one, and cast to `y_dtype` either way.
    """

    def __init__(self, src, x_fld, y_flds, y_dtype='torch.cuda.FloatTensor'):
        self.src = src
        self.x_fld = x_fld
        self.y_flds = y_flds
        self.y_dtype = y_dtype

    def __len__(self):
        """Number of batches, as reported by the wrapped source."""
        return len(self.src)

    def __iter__(self):
        batches = iter(self.src)
        # Pull exactly len(self) batches from the source.
        for _ in range(len(self)):
            batch = next(batches)

            if len(self.y_flds) > 1:
                label_tensors = [getattr(batch, name) for name in self.y_flds]
                target = torch.stack(label_tensors, dim=1)
            else:
                target = getattr(batch, self.y_flds[0])
            target = target.type(self.y_dtype)

            yield getattr(batch, self.x_fld), target


In [14]:
# Sanity check: confirm traindf is still a pandas DataFrame after the preprocessing cells.
type(traindf)

pandas.core.frame.DataFrame

In [39]:
# Build torchtext datasets from the 'title' column and the per-tag label columns.
# Training uses the first half of traindf.
train_data= TagsMultiLabelDataset(traindf[:int(len(traindf)*0.5)],TEXT,LABEL,'title',sorted_tag_lst)

# NOTE(review): the validation slice length is derived from len(traindf), not
# len(validdf) — confirm this is intended and not a copy-paste slip.
valid_data= TagsMultiLabelDataset(validdf[:int(len(traindf)*0.3)],TEXT,LABEL,'title',sorted_tag_lst)

In [40]:
# Report the number of examples in each dataset.
print(len(train_data))
print(len(valid_data))

50000
30000


In [41]:
# Upper bound passed to build_vocab as max_size.
MAX_VOCAB_SIZE = 10_000

# Build the vocabulary from the training dataset only.
# NOTE(review): 100-dimensional GloVe vectors are requested here, but the model
# below is created with EMBEDDING_DIM = 300 and the cells that would copy
# TEXT.vocab.vectors into the model are commented out — confirm whether the
# vectors are still needed.
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d")


In [42]:
# Batch iterators over both datasets, placing tensors on `device`.
# NOTE(review): shuffle=False is also set for the training iterator, so presumably
# batches are visited in the same order every epoch — confirm this is intended.
train_iterator = data.Iterator(train_data, batch_size=128, device=device, shuffle=False)
valid_iterator = data.Iterator(valid_data, batch_size=1024, device=device, shuffle=False)

In [43]:
# Show the tokenized text of the first training example.
print(train_iterator.dataset.examples[0].text)

['How', 'to', 'draw', 'a', 'stacked', 'dotplot', 'in', 'R', '?']


In [44]:
#import re

#re.sub(r'[\[,\s\]\']', '', train_iterator.dataset.examples[0].label)

In [45]:
# Vocabulary size after build_vocab (the recorded output is MAX_VOCAB_SIZE + 2).
len(TEXT.vocab)

10002

In [46]:
# Materialise the batches, then inspect one label of the first item in the first batch.
train_iterator.create_batches()
for batch in train_iterator.batches:
    # ['.net','java'][0] evaluates to '.net'; batch[0] is presumably a single
    # Example (the recorded output is a scalar 0) — TODO confirm.
    print(getattr(batch[0],['.net','java'][0]))
    break
    

0


In [47]:
# Iterate the iterator itself and print the '.net' label tensor of the first batch only.
train_iterator.create_batches()
for batch in train_iterator:
    print(getattr(batch,'.net'))
    break

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')


In [48]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    """Convolutional text classifier producing one logit per output class.

    Token ids are embedded, passed through one 2-D convolution per filter
    size (each kernel spans the full embedding width), max-pooled over the
    sequence dimension, concatenated, regularised with dropout and projected
    by a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        n_filters: Output channels of every convolution.
        filter_sizes: Kernel heights (in tokens), one convolution per entry.
        output_dim: Number of output logits.
        dropout: Dropout probability applied to the pooled features.
        pad_idx: Index of the padding token; its embedding row is initialised
            to zeros and receives no gradient updates.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        # pad_idx was previously accepted but ignored; forwarding it keeps the
        # padding embedding at zero so padding does not contribute learned signal.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        """Return logits of shape [batch size, output_dim].

        `text` holds token ids with shape [sent len, batch size]; sent len
        must be at least the largest filter size, otherwise the corresponding
        convolution has no valid position.
        """
        #text = [sent len, batch size]
        text = text.permute(1, 0)
        #text = [batch size, sent len]
        embedded = self.embedding(text)
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conv_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [49]:
# Vocabulary size; it is used as the model's INPUT_DIM in the next cell.
len(TEXT.vocab)

10002

In [50]:
# Model hyperparameters.
INPUT_DIM = len(TEXT.vocab)            # embedding-table rows
EMBEDDING_DIM = 300 # embedding width; NOTE(review): differs from the 100-d GloVe vectors requested in build_vocab
N_FILTERS = 100                        # output channels per convolution
FILTER_SIZES = [2,3,4]                 # kernel heights in tokens
OUTPUT_DIM = len(sorted_tag_lst)       # one logit per tag
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]  # vocabulary index of the padding token

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [51]:
def count_parameters(model):
    """Return the total element count of the model's parameters that require gradients."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,301,000 trainable parameters


In [52]:
#pretrained_embeddings = TEXT.vocab.vectors

#model.embedding.weight.data.copy_(pretrained_embeddings)

In [53]:
#UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

#model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
#model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [54]:
import torch.optim as optim

# Adam with its default settings over all model parameters.
optimizer = optim.Adam(model.parameters())

# Binary cross-entropy on raw logits: the model's forward returns the linear
# layer's output without a sigmoid, and each tag is an independent 0/1 target.
criterion = nn.BCEWithLogitsLoss() 

# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [56]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: Module called as `model(batch.text)` to obtain logits.
        iterator: Batch iterator; every batch exposes `text` and one
            attribute per tag name.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as `criterion(predictions, targets)`.

    Relies on the notebook-level names `sorted_tag_lst` (order of the tag
    columns in the target) and `device`.
    """
    epoch_loss = 0

    model.train()

    # Kept from the original cell; presumably redundant because iterating a
    # torchtext Iterator rebuilds its batches — TODO confirm before removing.
    iterator.create_batches()
    for batch in iterator:

        optimizer.zero_grad()

        predictions = model(batch.text)

        # Build the [batch size, n tags] target with a single stack and an
        # explicit float cast, instead of growing a tensor with repeated
        # torch.cat calls and relying on implicit int -> float promotion.
        targets = torch.stack(
            [getattr(batch, tag) for tag in sorted_tag_lst], dim=1
        ).to(device=device, dtype=torch.float)

        loss = criterion(predictions, targets)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [57]:

def evaluate(model, iterator, criterion):
    """Compute the mean loss per batch over `iterator` without updating the model.

    Args:
        model: Module called as `model(batch.text)` to obtain logits.
        iterator: Batch iterator; every batch exposes `text` and one
            attribute per tag name.
        criterion: Loss called as `criterion(predictions, targets)`.

    Relies on the notebook-level names `sorted_tag_lst` (order of the tag
    columns in the target) and `device`.
    """
    epoch_loss = 0

    model.eval()

    with torch.no_grad():

        for batch in iterator:

            predictions = model(batch.text)

            # Build the [batch size, n tags] target with a single stack and an
            # explicit float cast, instead of growing a tensor with repeated
            # torch.cat calls and relying on implicit int -> float promotion.
            targets = torch.stack(
                [getattr(batch, tag) for tag in sorted_tag_lst], dim=1
            ).to(device=device, dtype=torch.float)

            loss = criterion(predictions, targets)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [58]:
import time

def epoch_time(start_time, end_time):
    """Split the duration between two timestamps (in seconds) into whole minutes and seconds.

    Returns a `(minutes, seconds)` tuple of ints; fractional seconds are truncated.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [59]:
# Number of passes over the training iterator.
N_EPOCHS = 5

# Lowest validation loss seen so far; starts at infinity so the first epoch always checkpoints.
best_valid_loss = float('inf')

# NOTE(review): the saved output of this cell shows "Train Acc" / "Val. Acc"
# figures that the code below does not print — the output appears to predate
# the current version of the cell; re-run to refresh it.
for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion)
    valid_loss = evaluate(model,valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint the weights whenever the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'cnn-stackoverflow-model1.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')

Epoch: 01 | Epoch Time: 0m 45s
	Train Loss: 0.080 | Train Acc: -228.851318359375%
	 Val. Loss: 0.059 |  Val. Acc: -229.1893768310547%
Epoch: 02 | Epoch Time: 0m 47s
	Train Loss: 0.057 | Train Acc: -234.7738800048828%
	 Val. Loss: 0.050 |  Val. Acc: -213.4608917236328%
Epoch: 03 | Epoch Time: 0m 44s
	Train Loss: 0.049 | Train Acc: -232.67343139648438%
	 Val. Loss: 0.046 |  Val. Acc: -215.85243225097656%
Epoch: 04 | Epoch Time: 0m 44s
	Train Loss: 0.045 | Train Acc: -214.67051696777344%
	 Val. Loss: 0.043 |  Val. Acc: -224.436279296875%
Epoch: 05 | Epoch Time: 0m 46s
	Train Loss: 0.042 | Train Acc: -251.72097778320312%
	 Val. Loss: 0.043 |  Val. Acc: -246.05360412597656%


In [60]:
# Restore the weights saved at the epoch with the lowest validation loss.
model.load_state_dict(torch.load('cnn-stackoverflow-model1.pt'))

<All keys matched successfully>

In [61]:
import spacy
# spaCy pipeline whose tokenizer is used by predict_class.
# NOTE(review): the 'en' shortcut name is presumably only accepted by older
# spaCy releases — confirm against the installed version.
nlp = spacy.load('en')

def predict_class(model, sentence, min_len = 4):
    """Return the indices of the three highest-scoring classes for `sentence`.

    Args:
        model: Module mapping a [sent len, 1] tensor of token ids to logits.
        sentence: Raw text; it is tokenized with the notebook-level `nlp` tokenizer.
        min_len: Minimum token count; shorter inputs are right-padded with
            '<pad>' up to this length.

    Returns:
        A numpy array of three class indices ordered from highest to lowest
        score.

    Relies on the notebook-level names `nlp`, `TEXT` and `device`.
    """
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    # Add the batch dimension: [sent len] -> [sent len, 1].
    tensor = tensor.unsqueeze(1)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        preds = model(tensor)
    preds = torch.squeeze(preds)
    # argsort is ascending, so the best classes are the last three entries.
    top = preds.argsort()
    
    return top[-3:].cpu().numpy()[::-1]
    

In [65]:
# Predict the top three tag indices for a sample title and map them back to tag names.
preds = predict_class(model, "Pandas Series of Datetimes to Seconds?")
print(preds)
#print(f'Predicted class is: {pred_class} = {LABEL.vocab.itos[pred_class]}')
for i in preds:
    # Indices follow the alphabetical order of sorted_tag_lst.
    print(sorted_tag_lst[i])

[62 57 63]
python
pandas
python-2.7


Unnamed: 0,title,.net,ajax,algorithm,android,angularjs,apache,arrays,asp.net,asp.net-mvc,...,visual-studio-2010,wcf,web-services,windows,winforms,wordpress,wpf,xaml,xcode,xml
99995,"Obj-c, incorrect checksum for freed object - o...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99996,How to connect via HTTPS using Jsoup?,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99997,Python Pandas Series of Datetimes to Seconds S...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99998,jqGrid issue grouping - Duplicate rows get app...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99999,Create a List of primitive int?,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
