In [160]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as f
import spacy
import sklearn as sl

In [161]:
import re
import string

In [162]:
# Import the prebuilt word embeddings, upon which the incoming dataset will be convolved.
word_embeddings = spacy.load('en', vectors='glove.6B.300d.txt')

In [223]:
def sequence_to_data(seq, max_len=None):
    ####Converting sequence to data basically convertig words to vectors
    data = [word_embeddings(ix).vector for ix in seq.split()]
    if max_len == None:
        max_len = len(data)
    data_mat = np.zeros((1, max_len, 96))
    print(data_mat.shape,np.asarray(data).shape)
    for ix in range(min(max_len, len(data))):
        
        data_mat[:, ix, :] = data[ix]

    return data_mat

def seq_data_matrix(seq_data, max_len=None):  ####Now Concating different sentences and converting to a matrix
    data = np.concatenate([sequence_to_data(ix, max_len) for ix in seq_data], axis=0)
    return data

In [164]:
df = pd.read_csv("datasets/V1.4_Training.csv")#loading dataset

### PART 1 : Go through the dataset, and correct basic mistakes.

In [10]:
df

Unnamed: 0,663_3,"""Please enable removing language code from the Dev Center ""language history"" For example if you ever selected ""ru"" and ""ru-ru"" laguages and you published this xap to the Store then it causes Tile localization to show the en-us(default) tile localization which is bad.""",1
0,663_4,"""Note: in your .csproj file, there is a Suppor...",0
1,664_1,"""Wich means the new version not fully replaced...",0
2,664_2,"""Some of my users will still receive the old x...",0
3,664_3,"""The store randomly gives the old xap or the n...",0
4,664_4,"""My app has a WP7 version and a WP8 version XA...",0
5,664_5,"""The wp7 xap works only on WP7 and the wp8 xap...",0
6,665_1,"""Sometimes the Store gives the wrong wp7 xap v...",0
7,665_2,"""It should be an option to remove the ""ru"" lan...",1
8,665_3,"""Currently if you ever mistakenly selected a ""...",0
9,665_5,"""): the store will randomly deliver the old/wr...",0


In [166]:
texts = np.concatenate((['"Please enable removing language code from the Dev Center "language history" For example if you ever selected "ru" and "ru-ru" laguages and you published this xap to the Store then it causes Tile localization to show the en-us(default) tile localization which is bad."'],df.iloc[:]['"Please enable removing language code from the Dev Center "language history" For example if you ever selected "ru" and "ru-ru" laguages and you published this xap to the Store then it causes Tile localization to show the en-us(default) tile localization which is bad."']),axis=0)

In [167]:
labels = np.concatenate(([1],df.iloc[:]['1'].values),axis=0)

In [168]:
len(labels),len(texts)

(8500, 8500)

In [169]:
texts = [text.replace('"','') for text in texts]

In [170]:
new_dataframe = pd.DataFrame({'texts':[],'labels':[]})

In [171]:
new_dataframe['texts'] = texts
new_dataframe['labels'] = labels

In [172]:
new_dataframe.head()

Unnamed: 0,texts,labels
0,Please enable removing language code from the ...,1
1,"Note: in your .csproj file, there is a Support...",0
2,Wich means the new version not fully replaced ...,0
3,Some of my users will still receive the old xa...,0
4,The store randomly gives the old xap or the ne...,0


In [173]:
new_dataframe['len'] = ' '

In [174]:
list(labels).count(1)

2085

In [175]:
list(labels).count(0)

6415

The ratio of classes is 3:1 for classes 0:1, which is neither a good ratio, nor a bad one, however it may pose the problem of class imbalance. So I'll first ignore this problem and proceed, then take care of this problem and proceed thence. I'll then compare the F1 scores of both to see if there's a major boost.

##### So, the dataset had the problem of the first row mismatch, I corrected that and created a new dataframe with texts and their corresponding labels.

## PART 2 : Text preprocessing

1. Lowercase every text
2. Remove all numbers
3. Remove punctuation ([!”#$%&’()*+,-./:;<=>?@[\]^_`{|}~]) and urls
4. Remove whitespaces


In [176]:
# Lowercase and remove whitespaces
texts = [text.lower().strip() for text in texts]


# Remove numbers
texts = [re.sub(r'\d+', '', text) for text in texts]


#Remove Punctuation
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
for i in range(len(texts)):
    texts[i] = ' '.join(tokenizer.tokenize(texts[i]))
new_dataframe['texts'] = texts 

In [178]:
# Now bucketing the data, as the texts are of different lengths.
new_dataframe['len'] = [len(text) for text in texts]

In [180]:
min(new_dataframe['len'].values),max(new_dataframe['len'].values)

(0, 1768)

In [181]:
list(new_dataframe['len'].values).count(0)

12

###### These texts may have been inclusive only of the special characters we removed, so are now outliers in our context

In [182]:
len(new_dataframe)

8500

In [183]:
len(new_dataframe.dropna())

8500

In [55]:
new_dataframe = pd.DataFrame(new_dataframe)

In [184]:
for i in range(len(new_dataframe)):
    if(new_dataframe.loc[i].len == 0):
        new_dataframe = new_dataframe.drop(i)

In [185]:
len(new_dataframe[(new_dataframe.len==0)])

0

In [186]:
len(new_dataframe)

8488

In [188]:
min(new_dataframe['len'].values),max(new_dataframe['len'].values)

(2, 1768)

In [189]:
def make_batch(df, batch_size=10, gpu=False):# making batches to pass in model during training
    for bx in range(len(bucket_sizes)):
        bucket_data = df[(df.bucket==bx)].reset_index(drop=True)
        
        start = 0
        stop = start + batch_size
        while start < bucket_data.shape[0]:
            seq_len = bucket_sizes[bx][1]
            section = bucket_data[start:stop]
            xdata = seq_data_matrix(section.texts, max_len=seq_len)
            ydata = section.labels
            if gpu == True:
                yield Variable(torch.FloatTensor(xdata).cuda(), requires_grad=True), Variable(torch.LongTensor(ydata)).cuda()
            else:
                yield Variable(torch.FloatTensor(xdata), requires_grad=True), Variable(torch.LongTensor(ydata))
            
            start = stop
            stop = start + batch_size
    

In [190]:
unique, counts = np.unique(new_dataframe['len'].values, return_counts=True)

In [191]:
unique_lengths = dict(zip(unique, counts))

In [193]:
bucket_sizes = [[0, 25],[25,50], [50, 100], [100, 150], [150, 200], [200, 250], [250, 350],[350,700],[700,2000]]

def assign_bucket(x):  
    for bucket in bucket_sizes:
        if x>=bucket[0] and x<=bucket[1]:
            return bucket_sizes.index(bucket)
    return len(bucket_sizes)-1

In [194]:
new_dataframe['bucket'] = new_dataframe.len.apply(assign_bucket)
new_dataframe.head()

Unnamed: 0,texts,labels,len,bucket
0,please enable removing language code from the ...,1,258,6
1,note in your csproj file there is a supportedc...,0,266,6
2,wich means the new version not fully replaced ...,0,102,3
3,some of my users will still receive the old xa...,0,65,2
4,the store randomly gives the old xap or the ne...,0,69,2


In [195]:
new_dataframe = new_dataframe.sort_values(by=['bucket'])
new_dataframe.head()

Unnamed: 0,texts,labels,len,bucket
1556,select delete,0,13,0
1699,is it a good idea,0,17,0
6570,waiting for reply,0,17,0
6190,thankyou,0,8,0
4414,best regards from germany,0,25,0


### Model of LSTM

In [196]:
class SentModel(nn.Module):
    def __init__(self, in_shape=None, out_shape=None, hidden_shape=None):
        super(SentModel, self).__init__()
        self.in_shape = in_shape
        self.out_shape = out_shape
        self.hidden_shape = hidden_shape
        self.n_layers = 1
        
        self.rnn = nn.LSTM(
                        input_size = self.in_shape,
                        hidden_size = self.hidden_shape,
                        num_layers = self.n_layers,
                        batch_first = True
        )
        self.lin = nn.Linear(self.hidden_shape, 64)
        self.dropout = nn.Dropout(0.42)
        self.out = nn.Linear(64, self.out_shape)
        
        
    def forward(self, x, h):
        r_out, h_state = self.rnn(x, h)
        last_out = r_out[:, -1, :]
        y = f.tanh(self.lin(last_out))
        y = self.dropout(y)
        y = f.softmax(self.out(y))
        return y
    
    def predict(self, x):
        h_state = self.init_hidden(1)    
        x = sequence_to_data(x)
        pred = self.forward(torch.FloatTensor(x), h_state)
        return pred
    
    def get_embedding(self, x):
        h_state = self.init_hidden(1, gpu=False)
        
        x = sequence_to_data(x)
        r_out, h = self.rnn(torch.FloatTensor(x), h_state)
        last_out = r_out[:, -1, :]
        
        return last_out.data.numpy()
    
    def init_hidden(self, batch_size):
        return (Variable(torch.zeros(self.n_layers, batch_size, self.hidden_shape)),
                Variable(torch.zeros(self.n_layers, batch_size, self.hidden_shape)))


In [197]:
model = SentModel(in_shape=384, hidden_shape=256, out_shape=2)

print(model)

SentModel(
  (rnn): LSTM(384, 256, batch_first=True)
  (lin): Linear(in_features=256, out_features=64, bias=True)
  (dropout): Dropout(p=0.42)
  (out): Linear(in_features=64, out_features=2, bias=True)
)


In [198]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)
criterion = nn.CrossEntropyLoss()

In [199]:
new_dataframe = new_dataframe.reset_index(drop=True)

In [224]:
for epoch in range(50):
    total_loss = 0
    N = 0
    for step, (b_x, b_y) in enumerate(make_batch(new_dataframe, batch_size=200)):
        bsize = b_x.size(0)
        
        h_state = model.init_hidden(bsize)

        pred = model(b_x, h_state)
        loss = criterion(pred, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss
        N += 1.0
        if step%2 == 0:
            print('Loss: {} at Epoch: {} | Step: {}'.format(loss, epoch, step))
        
    print("Overall Average Loss: {} at Epoch: {}".format(total_loss / float(N), epoch))
    
 

torch.save(model.state_dict(), "model_256h_epoch_{}.ckpt".format(epoch))


(1, 25, 96) (2, 96)
(1, 25, 96) (5, 96)
(1, 25, 96) (3, 96)
(1, 25, 96) (1, 96)
(1, 25, 96) (4, 96)
(1, 25, 96) (4, 96)
(1, 25, 96) (1, 96)
(1, 25, 96) (3, 96)
(1, 25, 96) (6, 96)
(1, 25, 96) (2, 96)
(1, 25, 96) (3, 96)
(1, 25, 96) (6, 96)
(1, 25, 96) (1, 96)
(1, 25, 96) (2, 96)
(1, 25, 96) (3, 96)
(1, 25, 96) (5, 96)
(1, 25, 96) (3, 96)
(1, 25, 96) (3, 96)
(1, 25, 96) (5, 96)
(1, 25, 96) (3, 96)
(1, 25, 96) (6, 96)
(1, 25, 96) (3, 96)
(1, 25, 96) (4, 96)
(1, 25, 96) (1, 96)
(1, 25, 96) (3, 96)
(1, 25, 96) (3, 96)
(1, 25, 96) (5, 96)
(1, 25, 96) (6, 96)
(1, 25, 96) (2, 96)
(1, 25, 96) (2, 96)
(1, 25, 96) (4, 96)
(1, 25, 96) (3, 96)
(1, 25, 96) (3, 96)
(1, 25, 96) (3, 96)
(1, 25, 96) (3, 96)
(1, 25, 96) (3, 96)
(1, 25, 96) (3, 96)
(1, 25, 96) (4, 96)
(1, 25, 96) (1, 96)
(1, 25, 96) (4, 96)
(1, 25, 96) (4, 96)
(1, 25, 96) (7, 96)
(1, 25, 96) (3, 96)
(1, 25, 96) (7, 96)
(1, 25, 96) (2, 96)
(1, 25, 96) (3, 96)
(1, 25, 96) (2, 96)
(1, 25, 96) (4, 96)
(1, 25, 96) (3, 96)
(1, 25, 96) (5, 96)


RuntimeError: input.size(-1) must be equal to input_size. Expected 384, got 96

In [159]:
new_dataframe.head()

Unnamed: 0,texts,labels,len,bucket
0,hey i have feedly notifier in firefox on windows,0,48,0
1,broadfilesystemaccess works when min version,0,44,0
2,is there any plan to support this protocol in wp,0,48,0
3,here are some examples,0,22,0
4,imagine having an installed app do___ more,0,42,0
