In [17]:
# data handling and computation
import numpy as np
import pandas as pd
import re
import operator
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords

# Make sure the NLTK 'stopwords' corpus is available locally; it is read
# later through `stopwords.words(...)`.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\julie.a.nguyen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
def clean_stopwords(sentence, stop_words=None):
    """Remove stop words from a whitespace-tokenised sentence.

    Leading apostrophes are stripped from the sentence first, then every
    whitespace-separated token that appears in the stop-word collection is
    dropped and the remaining tokens are re-joined with single spaces.

    Parameters
    ----------
    sentence : str or any
        Text to clean. Non-string values (e.g. NaN from a DataFrame column)
        are returned unchanged.
    stop_words : iterable of str, optional
        Words to remove. When omitted, the module-level ``stop`` collection is
        used, which must already be defined at call time.

    Returns
    -------
    str or any
        The cleaned sentence, or ``sentence`` itself when it is not a string.
    """
    if not isinstance(sentence, str):
        return sentence

    if stop_words is None:
        # Backward-compatible fallback to the notebook-level stop-word list.
        stop_words = stop

    # A set gives O(1) membership tests instead of scanning a list per token.
    excluded = frozenset(stop_words)
    sentence = sentence.lstrip("\'")
    return ' '.join(word for word in sentence.split() if word not in excluded)

In [19]:
def clean_regex(pattern, sentence):
    """Delete every match of ``pattern`` from ``sentence``.

    Parameters
    ----------
    pattern : str or compiled pattern
        Regular expression whose matches are removed.
    sentence : str or any
        Text to clean. Non-string values are returned unchanged, mirroring
        ``clean_stopwords``.

    Returns
    -------
    str or any
        ``sentence`` with all matches removed, or ``sentence`` itself when it
        is not a string.
    """
    if isinstance(sentence, str):
        # Use the caller-supplied pattern rather than a hard-coded one.
        return re.sub(pattern, '', sentence)
    return sentence

In [20]:
# Path to the pre-trained GloVe embedding file, downloaded from
# https://nlp.stanford.edu/projects/glove/ (Twitter set, 25-dimensional
# vectors according to the file name).
glove_filename = 'glove.twitter.27B/glove.twitter.27B.25d.txt' 

def loadGloVe(filename):
    """Read a GloVe text file into parallel vocabulary / vector lists.

    Each line is expected to be ``word v1 v2 ... vN`` separated by single
    spaces. Rows whose vector length differs from that of the first row are
    reported on stdout and left out of the result.

    Parameters
    ----------
    filename : str
        Path to the UTF-8 encoded GloVe text file.

    Returns
    -------
    tuple of (list of str, list of list of str)
        ``vocab`` and ``embd``, aligned by index. Vector components are kept
        as the raw strings read from the file.
    """
    vocab = []
    embd = []

    # Stream the file line by line; the context manager closes the handle
    # even if parsing raises.
    with open(filename, 'r', encoding="utf8") as file:
        for line in file:
            row = line.strip().split(' ')
            vocab.append(row[0])
            embd.append(row[1:])
    print('GloVe Loaded.')

    if not embd:
        return vocab, embd

    # Keep only well-formed rows. Building new lists avoids the index shift
    # that repeated pop(index) causes when several rows are malformed.
    expected_length = len(embd[0])
    kept_vocab = []
    kept_embd = []
    for i, (word, vector) in enumerate(zip(vocab, embd)):
        if len(vector) != expected_length:
            print("error on line ", i, ": length of vector is ", len(vector), "\n")
            continue
        kept_vocab.append(word)
        kept_embd.append(vector)

    return kept_vocab, kept_embd

# Read the GloVe file and build the word -> vector lookup table
vocab, embd = loadGloVe(glove_filename)
word2vec_dictionary = {token: vector for token, vector in zip(vocab, embd)}

# Number of components in each word vector
word_vec_dim = len(embd[0])

print('Done')

GloVe Loaded.
error on line  38522 : length of vector is  24 

Done


In [21]:
def word_embbeded_sentence(array_sentence, word_embedding, word_vector_dimension):
    """Embed each sentence as the mean of its word vectors.

    Parameters
    ----------
    array_sentence : iterable
        Sentences to embed. Each string is split on whitespace. Non-string
        entries (e.g. None / NaN left by the cleaning step) are treated as
        empty sentences.
    word_embedding : mapping
        Maps a word to a sequence of ``word_vector_dimension`` values that
        ``float()`` can convert (numbers or numeric strings).
    word_vector_dimension : int
        Length of every word vector and of every returned sentence vector.

    Returns
    -------
    list of list of float
        One vector per input sentence, in input order. Words missing from
        ``word_embedding`` contribute a zero vector but still count towards
        the averaging denominator. Sentences without any word yield an
        all-zero vector instead of NaN.
    """
    sentence_vectors = []

    for sentence in array_sentence:
        words = sentence.split() if isinstance(sentence, str) else []
        sentence_vector = np.zeros((word_vector_dimension,))

        for word in words:
            raw_vector = word_embedding.get(word)
            if raw_vector is None:
                # Unknown word: equivalent to adding a zero vector of the
                # requested dimension (not a hard-coded 25).
                continue
            sentence_vector = sentence_vector + np.array([float(value) for value in raw_vector])

        if words:
            # Guard against 0/0 for empty sentences.
            sentence_vector = sentence_vector / len(words)
        sentence_vectors.append(sentence_vector.tolist())

    return sentence_vectors

### Using functions 

In [22]:
# Load the semicolon-separated posts file into a DataFrame
datapath = "mbti_concatened.csv"
data = pd.read_csv(filepath_or_buffer=datapath, sep=';')


In [24]:
# Regex matching digits and punctuation, and the English stop-word list
pattern = r'[0-9]|[^\w\s]'
stop = stopwords.words('english')

# Lower-case the documents, drop stop words, then strip digits / punctuation
data['doc'] = data['doc'].str.lower()
data['posts_unstopword'] = (
    data['doc']
    .apply(clean_stopwords)
    .apply(lambda post: clean_regex(pattern, post))
)

In [25]:
# Cleaned posts as a plain Python list
posts = data['posts_unstopword'].tolist()

# Restrict to the first 500 users
sentences = posts[:500]

# Average the word vectors of every selected post
sentence_embedded = word_embbeded_sentence(
    array_sentence=sentences,
    word_embedding=word2vec_dictionary,
    word_vector_dimension=word_vec_dim,
)
sentence_embedded

[[-0.11763548217391297,
  0.2951929548494982,
  0.012335941471571967,
  -0.05786506849498329,
  0.03643067190635452,
  0.020936237123745805,
  0.898050826755853,
  -0.5809618130434785,
  -0.07121023655518391,
  -0.08751489033444822,
  -0.2853003461538463,
  0.31409156180602,
  -3.122601003344483,
  0.056368521739130435,
  0.12983341772575246,
  0.2775217321070233,
  0.25421751270903,
  -0.20578131404682268,
  0.09988507023411372,
  -0.2213881383277591,
  -0.10790747023411372,
  0.06387372441471571,
  -0.07841712571906359,
  -0.04096042140468228,
  -0.27604750033444797],
 [0.10153164326923078,
  0.22013619711538457,
  -0.12940318846153845,
  -0.026475691346153865,
  0.08689851153846151,
  0.05422920192307697,
  0.7989162980769231,
  -0.4733248019230766,
  -0.07703823653846156,
  -0.10983042788461536,
  -0.2503290480769231,
  0.3574835307692309,
  -3.0782699038461536,
  -0.04671323653846153,
  0.06432722980769232,
  0.22335821923076923,
  0.21117569711538456,
  -0.21413394855769236,
  0.