# Word2Vec
This notebook aims to train word embeddings using the Word2Vec model.

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

ModuleNotFoundError: No module named 'google'

In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from time import time
from collections import Counter

In [None]:
SEED = 4222
EPOCHS = 5

In [None]:
# Change to your own directory
try: 
    os.chdir("/content/drive/MyDrive/MindfulAIProject")
    print("Directory changed")
except OSError:
    print("Error: Can't change the Current Working Directory")

Directory changed


In [None]:
# Load dataset and reset index
suicide_detection_df = pd.read_csv('Data/suicide_detection_final_cleaned.csv', header=0)
suicide_detection_df.reset_index(drop=True, inplace=True)
suicide_detection_df.replace({"class": {"suicide": 1, "non-suicide": 0}}, inplace=True)
suicide_detection_df.drop(columns=['text'], inplace=True)
suicide_detection_df = suicide_detection_df.rename(columns={"cleaned_text": "text"})
suicide_detection_df

Unnamed: 0,class,text
0,1,sex wife threaten suicide recently leave wife ...
1,0,weird not affect compliment come know real lif...
2,0,finally hear bad year swear fucking god annoying
3,1,need help just help cry hard
4,1,end tonight not anymore quit
...,...,...
174431,0,today go sled friend not like of pretty big mi...
174432,0,not like rock not go
174433,0,tell friend not lonely deprive are bought litt...
174434,0,pee probably taste like salty tea drink pee co...


In [None]:
# Split dataset into train, validation and test sets
train_text, test_text, train_labels, test_labels = train_test_split(suicide_detection_df['text'], suicide_detection_df['class'],
                                                                    random_state=SEED,
                                                                    test_size=0.2,
                                                                    stratify=suicide_detection_df['class'])

# word2vec

### Building a vocab

In [None]:
# define vocab 
vocab = Counter()
# tokenise each sentence
tokens_list = [(s.split()) for s in train_text]
# add each sentence to vocab
for i in tokens_list:
  vocab.update(i)
# removing words with a low occurance
min_occurance = 2
tokens = [k for k,c in vocab.items() if c >= min_occurance]
print(len(tokens))

20398


In [None]:
# save list to file
def save_list(lines, filename):
	# convert lines to a single blob of text
	data = '\n'.join(lines)
	# open file
	file = open(filename, 'w')
	# write text
	file.write(data)
	# close file
	file.close()
 
# save tokens to a vocabulary file
save_list(vocab, 'Data/vocab.txt')

In [None]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# load the vocabulary
vocab_filename = 'Data/vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

### Removing out-of-vocab words

In [None]:
# clean each line
def clean_line(line, vocab):
  tokens = line.split()
  # filter out tokens not in vocab
  tokens_clean = [w for w in tokens if w in vocab]
  return [tokens_clean]

# clean entire dataset
def process_lines(data, vocab):
  lines = list()
  for i in data:
    line = clean_line(i, vocab)
    # add lines to list
    lines += line
  return lines

In [None]:
train_clean = process_lines(train_text, vocab)
test_clean = process_lines(test_text, vocab)

### Training the model

In [None]:
# set up the parameters of the model
model = Word2Vec(size=300, window=10, min_count=1, iter=EPOCHS, seed=SEED)

# it builds the vocabulary from a sequence of sentences and thus initialized the model.
t = time()
model.build_vocab(train_clean, progress_per=1000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

# training the model
t = time()
model.train(train_clean, total_examples=model.corpus_count, epochs=EPOCHS, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.15 mins
Time to train the model: 0.61 mins


In [None]:
# save model in ASCII (word2vec) format
filename = 'Data/embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [None]:
model.most_similar('suicide')

  """Entry point for launching an IPython kernel.


[('ethically', 0.7455946207046509),
 ('huhheheheuheheuhe', 0.5838792324066162),
 ('involuntarily', 0.5762062072753906),
 ('suicidal', 0.5316386222839355),
 ('unsuccessful', 0.5260963439941406),
 ('arson', 0.518305778503418),
 ('impulsive', 0.502522349357605),
 ('shambles', 0.4928218722343445),
 ('suicides', 0.49047738313674927),
 ('immolation', 0.4868507385253906)]