In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
torch.manual_seed(1)
import numpy as np
import time

In [2]:
#load the raw product description export into a DataFrame
product_desc_csv_path = "C:\\Users\\kyle.becker\\Desktop\\Product Descriptions.csv"
product_desc = pd.read_csv(product_desc_csv_path, encoding="latin")

In [None]:
#NOTE(review): no earlier cell defines `product_hierarchy`; it is assumed to be the
#frame loaded into `product_desc` - confirm. Working on a copy keeps the raw frame intact.
product_hierarchy = product_desc.copy()

#remove rows with a missing description (the previous boolean-frame mask,
#df[df.notnull()], left the frame unchanged and removed nothing)
product_hierarchy = product_hierarchy.dropna(subset=['PRODUCT_DESC'])

#keep only descriptions with more than 4 words: the context windows built later
#need two words on each side of the target word, i.e. at least 5 words
product_hierarchy['length'] = product_hierarchy['PRODUCT_DESC'].str.split(" ").str.len()
product_hierarchy = product_hierarchy[product_hierarchy['length']>4]

#count rows
product_hierarchy.shape

In [5]:
#pull the description column out as a plain Python list (one string per product)
description_column = product_hierarchy['PRODUCT_DESC']
product_desc_full_list = description_column.tolist()

In [7]:
def create_words_dict(text):
    """Build integer ids for every distinct word and for every paragraph.

    Each entry of ``text`` is split on single spaces. Words receive ids in
    order of first appearance, starting at 0. Paragraph ``i`` is stored under
    the key ``'paragraph_<i>'`` with id ``len(word_dict) + i``, so paragraph
    ids start right after the last word id.

    Args:
        text: sequence of strings, one per paragraph.

    Returns:
        Tuple ``(word_dict, paragraph_dict)`` of the two id lookups.
    """
    word_dict = {}

    for description in text:
        for word in description.split(" "):
            #known words keep their id; a new word gets the next free id
            word_dict.setdefault(word, len(word_dict))

    #paragraph tokens are numbered after the whole vocabulary
    first_paragraph_id = len(word_dict)
    paragraph_dict = {
        'paragraph_'+str(position): first_paragraph_id+position
        for position in range(len(text))
    }

    return word_dict, paragraph_dict

In [8]:
#build the word -> id and paragraph -> id lookups from every description
words_dict, paragraph_dict = create_words_dict(text=product_desc_full_list)

In [9]:
def tokenize_words(dictionary,text):
    """Replace every word of every paragraph with its id from ``dictionary``.

    Each entry of ``text`` is split on single spaces and each piece is looked
    up with ``dictionary.get``, so a word missing from the dictionary becomes
    ``None`` instead of raising.

    Args:
        dictionary: mapping from word to id.
        text: sequence of strings, one per paragraph.

    Returns:
        List with one list of ids (or ``None``) per paragraph, in input order.
    """
    lookup = dictionary.get
    return [
        [lookup(word) for word in description.split(" ")]
        for description in text
    ]

In [10]:
def create_target_words(binary_list, paragraph_dict, row_offset=0):
    """Build (context, target) training pairs from tokenized paragraphs.

    For every position that has two tokens on each side, the context is the
    two preceding tokens, the two following tokens and the paragraph token;
    the target is the token at that position. Paragraphs with fewer than
    5 tokens contribute nothing.

    Args:
        binary_list: list of token-id lists, one per paragraph.
        paragraph_dict: mapping ``'paragraph_<n>'`` -> paragraph token id.
        row_offset: index of ``binary_list[0]`` in the numbering used by
            ``paragraph_dict``. Leave at 0 when ``binary_list`` covers the
            paragraphs from the start; pass the slice start when it is a
            batch cut from a longer list, so each row maps to its own
            paragraph token rather than to the tokens of rows 0..len-1.

    Returns:
        Tuple ``(preprocessed_data, target_word_list)``: a list of 5-element
        context lists and the parallel list of target tokens.

    Raises:
        KeyError: if a paragraph long enough to produce pairs has no entry
            in ``paragraph_dict``.
    """
    preprocessed_data = []
    target_word_list = []

    for row, tokens in enumerate(binary_list):
        if len(tokens) < 5:
            #no position has two neighbours on both sides
            continue

        #the paragraph token is the same for every window of this row
        paragraph_token = paragraph_dict['paragraph_'+str(row_offset+row)]

        for position in range(2, len(tokens)-2):
            preprocessed_data.append([
                tokens[position-2],
                tokens[position-1],
                tokens[position+1],
                tokens[position+2],
                paragraph_token,
            ])
            target_word_list.append(tokens[position])

    return preprocessed_data, target_word_list

In [None]:
#start the wall-clock timer for the whole training run
model_start = time.time()

#training schedule
epoch = 1000
batch_size = 12000

#model dimensions
embedding_dim = 1000
context_words_size = 5
vocab_size = len(words_dict)
paragraph_size = len(paragraph_dict)

class Word_Embeddings(nn.Module):
    """Feed-forward model that predicts a target word from its context tokens.

    A single embedding table of ``vocab_size + paragraph_size`` rows holds
    both word and paragraph vectors. The embeddings of the context tokens are
    concatenated, passed through a 128-unit ReLU hidden layer and projected
    to ``vocab_size + 1`` log-probabilities.
    """

    def __init__(self, vocab_size, paragraph_size, embedding_dim, context_size):
        """Create the embedding table and the two linear layers.

        Args:
            vocab_size: number of distinct word ids.
            paragraph_size: number of distinct paragraph ids.
            embedding_dim: width of each embedding vector.
            context_size: number of token ids in each input row.
        """
        super().__init__()
        total_tokens = vocab_size+paragraph_size
        #one shared table for word and paragraph tokens
        self.embedding = nn.Embedding(total_tokens, embedding_dim)
        #hidden layer over the concatenated context embeddings
        self.linear1 = nn.Linear(context_size*embedding_dim, 128)
        #output layer scoring every candidate target word
        self.linear2 = nn.Linear(128, vocab_size+1)

    def forward(self, inputs):
        """Return log-probabilities over target words for each input row.

        Args:
            inputs: integer tensor whose first dimension indexes the rows
                of the batch; everything after it is flattened per row.

        Returns:
            Tensor with one row of log-probabilities (``log_softmax`` over
            dim 1) per input row.
        """
        embedded = self.embedding(inputs)
        rows_in_batch = embedded.size(0)
        hidden = F.relu(self.linear1(embedded.view(rows_in_batch, -1)))
        return F.log_softmax(self.linear2(hidden), dim=1)
    
#one entry per epoch: the summed batch losses of that epoch
losses = []
#the model already returns log-probabilities, so NLLLoss is the matching criterion
#(CrossEntropyLoss would apply log_softmax a second time; the loss value is the same)
loss_function = torch.nn.NLLLoss()
model = Word_Embeddings(vocab_size, paragraph_size,  embedding_dim, context_words_size)
optimizer = optim.SGD(model.parameters(), lr=0.1)

#NOTE(review): this loop used to read `product_desc_sample`, which no cell defines.
#The full description list is used instead because `paragraph_dict` is numbered
#by position in that list - confirm this is the intended training set.
training_descriptions = product_desc_full_list

#a separate loop variable keeps the `epoch` count intact when the cell is re-run
for epoch_idx in range(epoch):
    print(epoch_idx)

    total_loss=0

    #start at 0 and let slicing shorten the last batch, so no description is skipped
    for batch_start in range(0,len(training_descriptions),batch_size):

        #slice out the descriptions of this batch
        product_desc_batch = training_descriptions[batch_start:batch_start+batch_size]

        #tokenize words
        tokenized_words = tokenize_words(words_dict,product_desc_batch)

        #create_target_words numbers rows from 0 within the list it receives, so
        #give it a lookup where row i of this batch points at its real paragraph token
        batch_paragraph_dict = {
            'paragraph_'+str(i): paragraph_dict['paragraph_'+str(batch_start+i)]
            for i in range(len(product_desc_batch))
        }

        #create target words
        preprocessed_data, target_word = create_target_words(tokenized_words,batch_paragraph_dict)

        #nothing to learn from a batch without any full context window
        if not preprocessed_data:
            continue

        context = torch.tensor(preprocessed_data, dtype=torch.long)
        targets = torch.tensor(target_word, dtype=torch.long)

        #clear the gradients left over from the previous batch
        optimizer.zero_grad()

        #calculate log-probability of every candidate target word
        log_probs = model(context)

        #calculate loss using probability compared to actual
        loss = loss_function(log_probs, targets)

        #run backward propagation
        loss.backward()

        #take step in gradient descent
        optimizer.step()

        #total loss for epoch
        total_loss += loss.item()

    #record the total once per epoch
    losses.append(total_loss)

print(losses)

model_end = time.time()

#total training time in seconds
print(model_end-model_start)

In [12]:
#restore previously saved weights into the model
model_weights_path = "C:\\Users\\kyle.becker\\Desktop\\Word Embedding Results\\model V3"
saved_state = torch.load(model_weights_path)
model.load_state_dict(saved_state)

In [14]:
#embedding table of the trained model
embeds = model.embedding

#token id of every paragraph, in paragraph order
paragraph_token_ids = torch.tensor(
    [paragraph_dict['paragraph_'+str(i)] for i in range(len(paragraph_dict))],
    dtype=torch.long,
)

#look all paragraphs up in one call; the result is one list of floats per paragraph
paragraph_embeddings = embeds(paragraph_token_ids).detach().numpy().tolist()

In [15]:
#save embeddings of paragraphs
import pickle

#the path literal was missing its closing quote, which made this cell a SyntaxError
#NOTE: the file holds a binary pickle even though its name ends in .txt
with open('C:\\Users\\kyle.becker\\Desktop\\Word Embedding Results.txt', "wb") as fp:
    pickle.dump(paragraph_embeddings, fp)