In [70]:
import os
import pickle
import re
import sys

import nltk
import numpy as np
import torch
import torch.nn as nn
from nltk.corpus import stopwords
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

sys.path.append('.')
# nltk.download('stopwords')
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

In [49]:
# Smoke test: fit a throw-away Word2Vec model on two hand-written, already
# tokenised sentences and print one of the learned vectors.
from gensim.models import word2vec as w2v

# Each inner list is one sentence given as a list of tokens.
sent = [["'s", 'buying', 'stairway', 'heaven'], ['help', 'need', 'somebody', 'help', 'anybody', 'help', 'need', 'someone', 'helppppp']]

# min_count=1 is passed so that tokens occurring only once are presumably kept.
model = w2v.Word2Vec(min_count=1)
# Register the vocabulary from `sent`, then run one epoch over the same sentences.
model.build_vocab(sent)
# NOTE(review): `wv.vocab` looks like the gensim 3.x attribute — confirm the
# installed gensim version before upgrading.
print(model.wv.vocab)
model.train(sent,total_examples=len(sent),epochs=1)
print (model.wv['buying'])

{"'s": <gensim.models.keyedvectors.Vocab object at 0x7f22da5f8d50>, 'buying': <gensim.models.keyedvectors.Vocab object at 0x7f22da5f8d90>, 'stairway': <gensim.models.keyedvectors.Vocab object at 0x7f22da5f8090>, 'heaven': <gensim.models.keyedvectors.Vocab object at 0x7f22da793cd0>, 'help': <gensim.models.keyedvectors.Vocab object at 0x7f22da5c2590>, 'need': <gensim.models.keyedvectors.Vocab object at 0x7f22da621c50>, 'somebody': <gensim.models.keyedvectors.Vocab object at 0x7f22da6215d0>, 'anybody': <gensim.models.keyedvectors.Vocab object at 0x7f22da621dd0>, 'someone': <gensim.models.keyedvectors.Vocab object at 0x7f22da7938d0>, 'helppppp': <gensim.models.keyedvectors.Vocab object at 0x7f22da5c2e90>}
[-2.0143199e-03 -4.4827542e-04 -1.9989237e-03  1.3504165e-03
 -1.4786172e-03 -2.8101471e-03  4.0211435e-03  3.8053938e-03
 -8.4860559e-04  4.4311318e-03  4.4809170e-03  4.3492666e-03
  7.9447532e-04 -5.0487078e-04  4.6522631e-03 -5.8973208e-04
 -3.9836671e-04 -3.1705368e-03  4.8308964e-03

In [2]:
# params = {"batch_size": args.batch_size,
#                        "shuffle": True,
#                        "num_workers": args.workers,}

NameError: name 'args' is not defined

In [3]:
def preprocess(text):
    """Lower-case ``text``, tokenise it with NLTK and drop unwanted tokens.

    A token is dropped when it is an NLTK English stopword or one of the
    single punctuation characters listed below (a space is also listed).

    Args:
        text (str): Raw text to clean.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    # The filter set is built once and cached on the function object so that
    # the stopword list is not reloaded for every document.
    sw = getattr(preprocess, '_filter_set', None)
    if sw is None:
        # `\\` spells the backslash explicitly (the previous `\]` was an
        # invalid escape sequence), and set(str) yields one entry per
        # character without relying on re.split('') semantics.
        punctuation = "!\"#$%&'()*+, -./:;<=>?@[\\]^_`{|}~"
        sw = set(stopwords.words('english')) | set(punctuation)
        preprocess._filter_set = sw
    text = text.lower()
    return [word for word in nltk.word_tokenize(text) if word not in sw]

In [4]:
def load_data(path):
    """Read and preprocess every regular file directly inside ``path``.

    Args:
        path (str): Directory containing one text document per file.

    Returns:
        tuple[list, list]: ``(texts, labels)`` where ``texts[i]`` is the
        result of ``preprocess`` on the i-th file's contents and
        ``labels[i]`` is that file's name without its extension. Entries are
        ordered by file name.
    """
    texts = []
    labels = []
    # os.listdir() gives no ordering guarantee; sorting keeps texts/labels
    # in a reproducible order for callers that pair them positionally.
    for name in sorted(os.listdir(path)):
        full_path = os.path.join(path, name)
        # Skip sub-directories (e.g. notebook checkpoint folders) instead of
        # failing when trying to open them.
        if not os.path.isfile(full_path):
            continue
        with open(full_path, 'r') as file:
            texts.append(preprocess(file.read()))
        # splitext handles extensions of any length, not just 3 characters.
        labels.append(os.path.splitext(name)[0])
    return texts, labels


In [180]:
class EmbeddingModel():
    '''
        A wrapper class based on Gensim Word2Vec model.
        Required Libraries: re, gensim, nltk

        State flags:
            built   -- True once ``self.model`` exists (built or loaded).
            data    -- True once ``load_data`` has populated ``self.texts``.
            trained -- set to True after an explicit extra training pass.
    '''

    def __init__(self,args):
        '''
            args: mapping with the keys 'size', 'min_count', 'workers' and
            'sg'; they are forwarded to Word2Vec in ``build``.
        '''
        self.built = False
        self.args = args
        self.trained = False
        self.data = False
        # `\\` spells the backslash explicitly (the old `\]` was an invalid
        # escape) and set(str) yields one entry per character.
        punctuation = "!\"#$%&'()*+, -./:;<=>?@[\\]^_`{|}~"
        self.sw = set(stopwords.words('english')) | set(punctuation)
        print('embedding model initialized with parameters:\n', args)

    def build(self):
        '''
            Builds a new model with parameters specified in args.
            Requires ``load_data`` to have been called; does nothing except
            print a message if data is missing or a model already exists.
        '''
        if not self.data:
            print('Load the data first.')
            return
        if self.built:
            print('model alreay built')
            return
        # NOTE(review): `size=` looks like the gensim 3.x keyword — confirm
        # the installed gensim version before upgrading.
        self.model = Word2Vec(
                        self.texts,
                        size = self.args['size'],
                        min_count = self.args['min_count'],
                        workers = self.args['workers'],
                        sg = self.args['sg'],
                    )
        # Set the flag; assigning to `self.build` would shadow this method.
        self.built = True

    def load_model(self,path):
        '''
            Loads the existing model from specified path.
            Asks for confirmation only when a model is already present.
        '''
        if self.built:
            print('A model exists. Still want to load?(y or n')
            if str(input()) != 'y':
                return
        self.model = Word2Vec.load(path)
        self.built = True

    def save_model(self,path):
        '''
            Saves the current model to ``path`` via the model's own save().
        '''
        self.model.save(path)

    def load_data(self,path):
        '''
            Reads every regular file in ``path``, preprocesses it and stores
            the token lists in ``self.texts`` and the file-name prefixes
            (text before the first '.') in ``self.labels``.
        '''
        texts = []
        labels = []
        print(f'loading data from {path}...')
        # Sorted for a reproducible order; os.listdir() order is arbitrary.
        for text in sorted(os.listdir(path)):
            full_path = os.path.join(path,text)
            # Skip sub-directories instead of failing on open().
            if not os.path.isfile(full_path):
                continue
            with open(full_path,'r') as file:
                texts.append(self.preprocess(file.read()))
            labels.append(text.split('.')[0])
        self.texts = texts
        self.labels = labels
        self.data = True
        print(f'Total {len(texts)} texts loaded...')

    def __train(self,text,epochs=5):
        '''
            Runs ``epochs`` additional training epochs on ``text`` (an
            iterable of token lists). Needs data loaded and a model built.
        '''
        if not self.data:
            print('Load the data first.')
            return
        if not self.built:
            print("Build the model first.")
            return
        self.text = text
        # total_examples must describe the corpus actually being trained on;
        # fall back to the loaded corpus size for iterables without len().
        if hasattr(text, '__len__'):
            total = len(text)
        else:
            total = len(self.texts)
        self.model.train(
            self.text,
            total_examples=total,
            epochs=epochs
        )
        self.trained = True

    def embedd(self,sentence):
        '''
            Returns the word vectors of the preprocessed ``sentence`` as
            looked up in ``self.model.wv``. Returns an empty (0, vector_size)
            array when no token survives preprocessing, and None (after
            printing a hint) when no model exists yet.
        '''
        if not self.built:
            print("Build the model first.")
            return None
        tokens = self.preprocess(sentence)
        if not tokens:
            # Nothing to look up; avoid stacking an empty list of vectors.
            return np.empty((0, self.model.wv.vector_size), dtype=np.float32)
        return self.model.wv[tokens]

    def save_embeddings(self,path):
        '''
            Embeds every loaded text, pickles the result to ``path`` and
            returns the list of per-text embeddings.
        '''
        embeddings = [self.embedd(' '.join(tokens)) for tokens in self.texts]
        try:
            packed = np.array(embeddings)
        except ValueError:
            # Texts of different lengths give ragged arrays, which recent
            # NumPy refuses to stack; store them in an object array instead.
            packed = np.empty(len(embeddings), dtype=object)
            for i, emb in enumerate(embeddings):
                packed[i] = emb
        print(f'saving embeddings to {path}')
        with open(path,'wb') as file:
            pickle.dump(packed,file)
        return embeddings

    def preprocess(self,text):
        '''
            Lower-cases ``text``, replaces non-letters with spaces, collapses
            whitespace, tokenises with NLTK and drops tokens in ``self.sw``.
        '''
        text = text.lower()
        text = re.sub('[^a-zA-Z]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        word_token = nltk.word_tokenize(text)
        word_token = [word for word in word_token if word not in self.sw]
        return word_token

In [181]:
# Word2Vec hyper-parameters read by EmbeddingModel.build(); `workers` is set
# to the number of CPUs reported by the OS.
args = dict(
    size=100,
    min_count=1,
    workers=os.cpu_count(),
    sg=1,
)

In [182]:
# Create the wrapper, load and preprocess every file under ./training, then
# fit the Word2Vec model on the loaded texts.
em = EmbeddingModel(args)
em.load_data('training')
em.build()

embedding model initialized with parameters:
 {'size': 100, 'min_count': 1, 'workers': 8, 'sg': 1}
loading data from training...
Total 2 texts loaded...


In [31]:
class TextDataset(Dataset):
    """Dataset yielding ``(cleaned_text, label)`` pairs.

    Each raw text is lower-cased, tokenised with NLTK, stripped of English
    stopwords and single punctuation characters, and re-joined with spaces.

    Args:
        texts: Indexable collection of raw strings.
        labels: Indexable collection of labels, aligned with ``texts``.
        args: Object exposing ``vocab`` (a sized collection) and
            ``max_length`` attributes.
    """

    def __init__(self, texts, labels, args):
        self.vocab = args.vocab
        self.max_len = args.max_length

        self.length = len(texts)
        self.texts = texts
        self.labels = labels
        # One row per vocabulary entry; not used inside this class —
        # presumably intended for one-hot encoding, TODO confirm.
        self.identity = np.eye(len(self.vocab))
        # `\\` spells the backslash explicitly (the old `\]` was an invalid
        # escape) and set(str) yields one entry per character.
        punctuation = "!\"#$%&'()*+, -./:;<=>?@[\\]^_`{|}~"
        self.stopwords = set(stopwords.words('english')) | set(punctuation)

    def __len__(self):
        """Return the number of texts given at construction time."""
        return self.length

    def preprocess(self,text):
        """Return the lower-cased NLTK tokens of ``text`` not in ``self.stopwords``."""
        text = text.lower()
        return [word for word in nltk.word_tokenize(text) if word not in self.stopwords]

    def __getitem__(self,index):
        """Return ``(space-joined cleaned tokens, label)`` for item ``index``."""
        processed_text = self.preprocess(self.texts[index])
        label = self.labels[index]
        return ' '.join(processed_text), label

In [32]:
# Wrap the preprocessed training texts in a TextDataset. The labels are
# hard-coded as [1, 2], so this assumes load_data('training') returns exactly
# two texts.
# NOTE(review): TextDataset reads `args.vocab` and `args.max_length`, so `args`
# must be an object with those attributes here, not the plain dict assigned to
# `args` in the Word2Vec cell — confirm which `args` is live in the kernel.
dataset = TextDataset(load_data('training')[0], [1,2],args)

In [33]:
# NOTE(review): `params` is only defined in a commented-out cell above
# (batch_size / shuffle / num_workers); it must already exist in the kernel
# for this line to run.
dataloader = DataLoader(dataset,**params)

In [11]:
# Pull a single batch from the loader for inspection. The built-in next() is
# used because the iterator's `.next()` method is a Python 2 style alias that
# is not part of the iterator protocol.
X = next(iter(dataloader))
X

[('help need somebody help anybody help need someone helppppp',
  "'s buying stairway heaven"),
 tensor([2, 1])]

In [11]:
def weights_init_uniform(m,mean=0.0,var=0.05):
    """Re-initialise a Linear-like module in place; meant for ``Module.apply``.

    Modules whose class name contains ``'Linear'`` get their weights drawn
    from a uniform distribution and their bias (if any) set to zero. All
    other modules are left untouched.

    Args:
        m: The module to (possibly) initialise.
        mean (float): Despite its name, the LOWER bound of the uniform range
            (it is passed as the first argument of ``uniform_``).
        var (float): Despite its name, the UPPER bound of the uniform range.
    """
    classname = m.__class__.__name__
    # for every Linear layer in a model..
    if 'Linear' not in classname:
        return
    # no_grad() lets the parameters be overwritten in place without going
    # through `.data`.
    with torch.no_grad():
        m.weight.uniform_(mean, var)
        # Layers created with bias=False have `bias` set to None.
        if m.bias is not None:
            m.bias.fill_(0)

In [12]:
class CharCNN(nn.Module):
    """1-D convolutional network over ``(batch, vocab_size, length)`` inputs.

    Six Conv1d+ReLU stages (three followed by max-pooling) feed two fully
    connected layers with dropout; the output has 1024 features per sample.
    The first Linear layer is hard-coded to ``256*34`` inputs, so the input
    length must reduce to 34 positions after the conv/pool stack (this holds
    for the 1024-wide input used in this notebook).

    Args:
        vocab_size (int, optional): Number of input channels. When omitted,
            ``len(args.vocab)`` is read from the module-level ``args`` object,
            which is what the constructor always did before.
    """

    def __init__(self, vocab_size=None):
        super().__init__()

        if vocab_size is None:
            # Backward-compatible fallback to the notebook-global config.
            vocab_size = len(args.vocab)

        self.model = nn.Sequential(

                        nn.Conv1d(vocab_size, 256, kernel_size=7, padding=0),
                        nn.ReLU(),
                        nn.MaxPool1d(3),
                        nn.Conv1d(256, 256, kernel_size=7, padding=0),
                        nn.ReLU(),
                        nn.MaxPool1d(3),
                        nn.Conv1d(256, 256, kernel_size=3, padding=0),
                        nn.ReLU(),
                        nn.Conv1d(256, 256, kernel_size=3, padding=0),
                        nn.ReLU(),
                        nn.Conv1d(256, 256, kernel_size=3, padding=0),
                        nn.ReLU(),
                        nn.Conv1d(256, 256, kernel_size=3, padding=0),
                        nn.ReLU(),
                        nn.MaxPool1d(3),
                        nn.Flatten(),
                        # 256 channels x 34 remaining positions.
                        nn.Linear(256*34,1024),
                        nn.ReLU(),
                        nn.Dropout(0.5),
                        nn.Linear(1024, 1024),
                        nn.ReLU(),
                        nn.Dropout(0.5)
        )

    def forward(self,x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.model(x)

In [13]:
# Instantiate the network and run weights_init_uniform over its modules via
# Module.apply; being the last expression, the result is displayed as the
# cell output.
net = CharCNN()
net.apply(weights_init_uniform)

CharCNN(
  (model): Sequential(
    (0): Conv1d(70, 256, kernel_size=(7,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
    (3): Conv1d(256, 256, kernel_size=(7,), stride=(1,))
    (4): ReLU()
    (5): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
    (6): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (7): ReLU()
    (8): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (9): ReLU()
    (10): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (11): ReLU()
    (12): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (13): ReLU()
    (14): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
    (15): Flatten()
    (16): Linear(in_features=8704, out_features=1024, bias=True)
    (17): ReLU()
    (18): Dropout(p=0.5, inplace=False)
    (19): Linear(in_features=1024, out_features=1024, bias=True)
    (20): ReLU()
    (21): Dropout(p=0.5, inplace=False)
  )
)

In [15]:
# Fetch one batch from the loader. The built-in next() is used because the
# iterator's `.next()` method is a Python 2 style alias that is not part of
# the iterator protocol.
X = next(iter(dataloader))

In [16]:
# Shape check: view the batch inputs as (batch, 70, 1024) and run a forward pass.
# NOTE(review): the TextDataset defined above yields strings in X[0], which
# have no `.view`; this presumably ran against a loader that returned encoded
# tensors — confirm.
net(X[0].view(-1,70,1024)).shape

torch.Size([2, 1024])

In [19]:
import gensim