In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as f
import spacy
import sklearn as sl
import re
import string

In [2]:
# Import the prebuilt word embeddings, upon which the incoming dataset will be convolved.
word_embeddings = spacy.load('en')

In [3]:
def sequence_to_data(seq, max_len=None):
    ####Converting sequence to data basically convertig words to vectors
    data = [word_embeddings(ix).vector for ix in seq.split()]
    if max_len == None:
        max_len = len(data)
    data_mat = np.zeros((1, max_len, 96))
    for ix in range(min(max_len, len(data))):
        data_mat[:, ix, :] = np.expand_dims(data[ix],axis=0)

    return data_mat

def seq_data_matrix(seq_data, max_len=None):  ####Now Concating different sentences and converting to a matrix
    data = np.concatenate([sequence_to_data(ix, max_len) for ix in seq_data], axis=0)
    return data

In [6]:
df = pd.read_csv("datasets/V1.4_Training.csv")#loading dataset

In [7]:
df

Unnamed: 0,663_3,"""Please enable removing language code from the Dev Center ""language history"" For example if you ever selected ""ru"" and ""ru-ru"" laguages and you published this xap to the Store then it causes Tile localization to show the en-us(default) tile localization which is bad.""",1
0,663_4,"""Note: in your .csproj file, there is a Suppor...",0
1,664_1,"""Wich means the new version not fully replaced...",0
2,664_2,"""Some of my users will still receive the old x...",0
3,664_3,"""The store randomly gives the old xap or the n...",0
4,664_4,"""My app has a WP7 version and a WP8 version XA...",0
5,664_5,"""The wp7 xap works only on WP7 and the wp8 xap...",0
6,665_1,"""Sometimes the Store gives the wrong wp7 xap v...",0
7,665_2,"""It should be an option to remove the ""ru"" lan...",1
8,665_3,"""Currently if you ever mistakenly selected a ""...",0
9,665_5,"""): the store will randomly deliver the old/wr...",0


In [8]:
texts = np.concatenate((['"Please enable removing language code from the Dev Center "language history" For example if you ever selected "ru" and "ru-ru" laguages and you published this xap to the Store then it causes Tile localization to show the en-us(default) tile localization which is bad."'],df.iloc[:]['"Please enable removing language code from the Dev Center "language history" For example if you ever selected "ru" and "ru-ru" laguages and you published this xap to the Store then it causes Tile localization to show the en-us(default) tile localization which is bad."']),axis=0)

In [9]:
labels = np.concatenate(([1],df.iloc[:]['1'].values),axis=0)

In [10]:
len(labels),len(texts)

(8500, 8500)

In [11]:
texts = [text.replace('"','') for text in texts]

In [12]:
new_dataframe = pd.DataFrame({'texts':[],'labels':[]})

In [13]:
new_dataframe['texts'] = texts
new_dataframe['labels'] = labels

In [14]:
new_dataframe.head()

Unnamed: 0,texts,labels
0,Please enable removing language code from the ...,1
1,"Note: in your .csproj file, there is a Support...",0
2,Wich means the new version not fully replaced ...,0
3,Some of my users will still receive the old xa...,0
4,The store randomly gives the old xap or the ne...,0


In [15]:
new_dataframe['len'] = ' '

In [16]:
list(labels).count(1)

2085

In [17]:
list(labels).count(0)

6415

In [0]:
# Lowercase and remove whitespaces
texts = [text.lower().strip() for text in texts]


# Remove numbers
texts = [re.sub(r'\d+', '', text) for text in texts]


#Remove Punctuation
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
for i in range(len(texts)):
    texts[i] = ' '.join(tokenizer.tokenize(texts[i]))
new_dataframe['texts'] = texts 

In [0]:
# Now bucketing the data, as the texts are of different lengths.
new_dataframe['len'] = [len(text) for text in texts]

In [0]:
min(new_dataframe['len'].values),max(new_dataframe['len'].values)

(0, 1768)

In [0]:
list(new_dataframe['len'].values).count(0)

12

In [0]:
len(new_dataframe)

8500

In [0]:
len(new_dataframe.dropna())

8500

In [0]:
new_dataframe = pd.DataFrame(new_dataframe)

In [0]:
for i in range(len(new_dataframe)):
    if(new_dataframe.loc[i].len == 0):
        new_dataframe = new_dataframe.drop(i)

In [0]:
len(new_dataframe[(new_dataframe.len==0)])

0

In [0]:
len(new_dataframe)

8488

In [0]:
min(new_dataframe['len'].values),max(new_dataframe['len'].values)

(2, 1768)

In [0]:
def make_batch(data, batch_size=200, gpu=True):# making batches to pass in model during training
    for bx in range(len(bucket_sizes)):
        bucket_data = new_dataframe[(new_dataframe.bucket==bx)].reset_index(drop=True)
        start = 0
        stop = start + batch_size
        while start < bucket_data.shape[0]:
            seq_len = bucket_sizes[bx][1]
            section = bucket_data[start:stop]
            xdata = seq_data_matrix(section.texts, max_len=seq_len)
            ydata = np.asarray(section['labels'].values)
            if gpu == True:
                yield Variable(torch.FloatTensor(xdata).cuda(), requires_grad=True), Variable(torch.LongTensor(ydata)).cuda()
            else:
                yield Variable(torch.FloatTensor(xdata), requires_grad=True), Variable(torch.LongTensor(ydata))
            
            start = stop
            stop = start + batch_size
    

In [0]:
unique, counts = np.unique(new_dataframe['len'].values, return_counts=True)

In [0]:
unique_lengths = dict(zip(unique, counts))

In [0]:
bucket_sizes = [[0, 25],[25,50], [50, 100], [100, 150], [150, 200], [200, 250], [250, 350],[350,700],[700,2000]]

def assign_bucket(x):  
    for bucket in bucket_sizes:
        if x>=bucket[0] and x<=bucket[1]:
            return bucket_sizes.index(bucket)
    return len(bucket_sizes)-1

In [0]:
new_dataframe['bucket'] = new_dataframe.len.apply(assign_bucket)
new_dataframe.head()

Unnamed: 0,labels,texts,len,bucket
0,1,please enable removing language code from the ...,258,6
1,0,note in your csproj file there is a supportedc...,266,6
2,0,wich means the new version not fully replaced ...,102,3
3,0,some of my users will still receive the old xa...,65,2
4,0,the store randomly gives the old xap or the ne...,69,2


In [0]:
new_dataframe = new_dataframe.sort_values(by=['bucket'])
new_dataframe.head()

Unnamed: 0,labels,texts,len,bucket
1556,0,select delete,13,0
1699,0,is it a good idea,17,0
6570,0,waiting for reply,17,0
6190,0,thankyou,8,0
4414,0,best regards from germany,25,0


In [0]:
class SentModel(nn.Module):
    def __init__(self, in_shape=None, out_shape=None, hidden_shape=None):
        super(SentModel, self).__init__()
        self.in_shape = in_shape
        self.out_shape = out_shape
        self.hidden_shape = hidden_shape
        self.n_layers = 1
        
        self.rnn = nn.LSTM(
                        input_size = self.in_shape,
                        hidden_size = self.hidden_shape,
                        num_layers = self.n_layers,
                        batch_first = True
        )
        self.lin = nn.Linear(self.hidden_shape, 64)
        self.dropout = nn.Dropout(0.42)
        self.out = nn.Linear(64, self.out_shape)
        
        
    def forward(self, x, h):
        r_out, h_state = self.rnn(x, h)
        last_out = r_out[:, -1, :]
        y = f.tanh(self.lin(last_out))
        y = self.dropout(y)
        y = f.softmax(self.out(y))
        return y
    
    def predict(self, x):
        h_state = self.init_hidden(1, gpu=False)    
        x = sequence_to_data(x)
        pred = self.forward(torch.FloatTensor(x), h_state)
        return pred
    
    def get_embedding(self, x):
        h_state = self.init_hidden(1, gpu=False)
        
        x = sequence_to_data(x)
        r_out, h = self.rnn(torch.FloatTensor(x), h_state)
        last_out = r_out[:, -1, :]
        
        return last_out.data.numpy()
      
    def init_hidden(self, batch_size, gpu=True):
        if gpu:
            return (Variable(torch.zeros(self.n_layers, batch_size, self.hidden_shape).cuda()),
                    Variable(torch.zeros(self.n_layers, batch_size, self.hidden_shape)).cuda())
        return (Variable(torch.zeros(self.n_layers, batch_size, self.hidden_shape)),
                Variable(torch.zeros(self.n_layers, batch_size, self.hidden_shape)))


In [0]:
model = SentModel(in_shape=384, hidden_shape=256, out_shape=2)
model.cuda()
print(model)

SentModel(
  (rnn): LSTM(384, 256, batch_first=True)
  (lin): Linear(in_features=256, out_features=64, bias=True)
  (dropout): Dropout(p=0.42)
  (out): Linear(in_features=64, out_features=2, bias=True)
)


In [0]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)
criterion = nn.CrossEntropyLoss()

In [0]:
new_dataframe = new_dataframe.reset_index(drop=True)

In [0]:
for epoch in range(50):
    total_loss = 0
    N = 0
    for step, (b_x, b_y) in enumerate(make_batch(new_dataframe, batch_size=200)):
        bsize = b_x.size(0)
        
        h_state = model.init_hidden(bsize, gpu = True)

        pred = model(b_x, h_state)
        loss = criterion(pred, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss
        N += 1.0
        if step%20 == 0:
            print('Loss: {} at Epoch: {} | Step: {}'.format(loss, epoch, step))
        
    print("Overall Average Loss: {} at Epoch: {}".format(total_loss / float(N), epoch))
    
 

torch.save(model.state_dict(), "model_256h_epoch_{}.ckpt".format(epoch))



(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)



(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)
(1, 25, 384)

KeyboardInterrupt: ignored

In [0]:
next(model.parameters()).is_cuda

True