# Word2Vec

In [1]:
import pickle
import re
import numpy as np
from scipy.sparse import csr_matrix
from collections import defaultdict
import json
from utils import *
import gensim
from sklearn.cluster import KMeans
from gensim.models import KeyedVectors
import nltk
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import time
# Course records; later cells read each record's 'name' and 'description' fields.
courses = load_json('data/courses.txt')
# Stopword collection used for (lower-cased) membership tests during pre-processing.
stopwords = load_pkl('data/stopwords.pkl')

https://radimrehurek.com/gensim_3.8.3/models/keyedvectors.html


In [2]:
# Load the word vectors from a word2vec file in text format (binary=False).
# NOTE(review): '/ix/model.txt' is an absolute path — adjust it to your environment.
model = KeyedVectors.load_word2vec_format('/ix/model.txt', binary=False)

In [3]:
# `KeyedVectors.load_word2vec_format` already returns a KeyedVectors instance, so it is
# used directly. Going through the deprecated `.wv` attribute only triggers a
# DeprecationWarning on gensim 3.x and fails on gensim 4, where the attribute was removed.
word_vectors = model

  """Entry point for launching an IPython kernel.


## Pre-processing/utility methods

In [4]:
def process_string(s):
    """Pre-process a raw text into a list of unigrams followed by bigrams.

    Non-word characters act as separators, stopwords are dropped (compared in
    lower case), and every pair of adjacent remaining tokens is appended as a
    space-joined bigram after the unigrams.
    """
    # Punctuation is replaced by whitespace so that e.g. "data.the" yields two tokens.
    raw_tokens = re.sub(r'\W',' ',s).split()
    # Stopwords are not relevant to the content of the description, so remove them.
    tokens = [token for token in raw_tokens if token.lower() not in stopwords]
    # Bigrams are built from the stopword-free token sequence.
    pairs = [' '.join((first, second)) for first, second in nltk.bigrams(tokens)]
    # TODO: filter out the bigrams that occur too rarely.
    return tokens + pairs

In [5]:
def normalize_vec(vec):
    """Return `vec` scaled to unit Euclidean length, as a NumPy array.

    A vector whose norm is zero (including an empty vector) has no direction;
    it is returned as zeros instead of being divided by zero, which would
    fill the result with NaNs.
    """
    arr = np.asarray(vec)
    norm = np.linalg.norm(arr)
    if norm == 0:
        # Same floating dtype the division below would have produced.
        return np.zeros(arr.shape, dtype=np.result_type(arr.dtype, norm.dtype))
    # One vectorised division replaces the element-by-element Python loop.
    return arr / norm

In [6]:
def word_list_to_counts(wl):
    """Return a defaultdict(int) mapping each word of `wl` to its number of occurrences."""
    counts = defaultdict(int)
    for token in wl:
        counts[token] = counts[token] + 1
    return counts

In [7]:
def number_of_occurences_in_list(l, d):
    """Add the occurrences of the items of `l` to the counts in `d`.

    `d` is updated in place and also returned. It may be a defaultdict or a
    plain dict: items that are not yet present start from a count of zero.
    """
    for item in l:
        # `.get` avoids a KeyError when `d` is a plain dict without the key.
        d[item] = d.get(item, 0) + 1
    return d

In [8]:
def cosine_sim(a, b):
    """Return the cosine similarity between vectors `a` and `b`.

    If either vector has zero norm the similarity is undefined; 0.0 is returned
    in that case rather than NaN, because a NaN score would be placed first by
    the descending `np.flip(np.argsort(...))` ranking used by the search cells.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

In [9]:
def euclidean_dist(a, b):
    """Return the Euclidean distance between vectors `a` and `b`."""
    difference = np.subtract(np.array(a), np.array(b))
    return np.linalg.norm(difference)

In [10]:
# Corpus-wide occurrence count of every token (unigram or bigram) of the course descriptions.
total_word_counts = defaultdict(int)
for course in courses:
    words = process_string(course['description'])
    # The helper updates the dict in place, so its return value need not be re-bound.
    number_of_occurences_in_list(words, total_word_counts)

In [11]:
#with this filter: faster kmeans, but without slower search
# Keep only the tokens that occur at least twice in the corpus. These filtered counts select
# the k-means dataset (fewer points, faster clustering); the search corpus keeps every token.
total_word_counts = {key:val for key, val in total_word_counts.items() if val >= 2}

In [12]:
# The dataset for k-means (4.12): token -> vector, restricted to tokens kept in total_word_counts
dataset = {}

# The corpus for the search function (4.13): one record per course
doc_corpus = []

# Every unknown token shares the same vector: all zeros except a final one.
# It does not depend on the token, so it is built once instead of once per token.
unknown_vec = np.concatenate((np.zeros(word_vectors.vector_size - 1), np.ones(1)))

for course in courses:
    doc_name = course['name']
    description = course['description']
    pre_processed_data = process_string(description)

    counts = word_list_to_counts(pre_processed_data)  # word counts for this document
    # maps each distinct word of this document to its vector
    word2vecs = {}
    # Iterating over the distinct words (in order of first occurrence) gives the same
    # mappings as iterating over every token, without re-normalising repeated words.
    for word in counts:
        if word in word_vectors.vocab:
            v = normalize_vec(word_vectors.get_vector(word))
        else:
            v = unknown_vec
        if word in total_word_counts:
            dataset[word] = v
        word2vecs[word] = v
    # number of tokens in this document (repetitions included)
    tot_words = len(pre_processed_data)
    doc_corpus.append({'doc_name': doc_name, 'word2vecs': word2vecs, 'counts': counts, 'total': tot_words})

#### Comments:

- For all _unknown_ words, we choose the same default vector. Having all zeros except a 1 in one place allows us to have all _unknown_ words clustered in one place and very different from most other vectors, so they don't have too much impact on the search function.

- For the dataset used for exercise 4.12, it was decided to filter out words that appear only once for performance reasons. But we kept these words for parts 4.13 and 4.14 as they can be important for search.

In [13]:
# Input of KMeans: the vectors of `dataset`, in the dict's iteration order
X = list(dataset.values())

## 1. Clustering word vectors

We chose k = 15, the same value of k as in exercise 4.10.

In [19]:
# Number of k-means clusters
k = 15

In [20]:
# Fit k-means on the word vectors with a fixed random_state, then keep the fitted
# cluster centres and the cluster index assigned to each row of X.
kmeans = KMeans(n_clusters=k, random_state=1).fit(X)
labels = kmeans.labels_
centers = kmeans.cluster_centers_

In [21]:
def print_top_words_cluster_k(k, centers, labels, top_n=10):
    """Print the words of cluster `k` that are closest to its centre.

    Parameters:
        k: index of the cluster to display.
        centers: sequence of cluster centres, indexed by cluster.
        labels: cluster index of each word, aligned with the iteration order of
            the module-level `dataset` dict.
        top_n: maximum number of words to print (10 by default). Fewer are
            printed when the cluster has fewer members.

    Closeness is measured by cosine similarity to the centre, highest first.
    """
    center = centers[k]
    # Build the (word, vector) list once rather than once per cluster member.
    entries = list(dataset.items())
    members = [entries[i] for i, label in enumerate(labels) if label == k]

    cosine_similarities = [cosine_sim(center, vec) for _, vec in members]
    # argsort is ascending, so flip it to get the most similar members first.
    ranking = np.flip(np.argsort(cosine_similarities))

    print("---Cluster " + str(k) + "---")
    # Slicing keeps small clusters from raising an IndexError.
    for j in ranking[:top_n]:
        print(members[j][0])
        

In [23]:
# Iterate over the fitted centres rather than a hard-coded 15, so this cell
# stays correct if the number of clusters is changed.
for i in range(len(centers)):
    print_top_words_cluster_k(i, centers, labels)

---Cluster 0---
localization bacteria
professionals operate
approach Plan
disciplinary approach
multi disciplinary
involve multi
learning involve
research understanding
sociological research
draws sociological
---Cluster 1---
photoluminescence
nanostructure
dielectric
pyroelectric
dopants
piezoelectricity
conductivity
photodetectors
ionization
adsorbate
---Cluster 2---
Context
Perspective
Transformation
Evaluate
Practice
Knowledge
Theory
Influence
Mind
Concepts
---Cluster 3---
Optimization
Measurement
Applications
Multiscale
Automation
Processes
Systems
Imaging
Resistive
Analysis
---Cluster 4---
proteins
enzymatic
metabolism
inhibition
metabolic
biochemical
intracellular
neuronal
vitro
bacterial
---Cluster 5---
Mathematics
Psychology
Physics
Biochemistry
Applied
Economics
Microbiology
Interdisciplinary
Philosophy
Chemistry
---Cluster 6---
enable
ensure
desired
interfere
needed
demonstrate
necessity
affect
fail
helpful
---Cluster 7---
thin
vertical
thick
circular
horizontal
cylindrical


#### Comments: 

Cluster **0** contains all the 'unknown' words.
In general, we observe that k-means clusters the words by type of word rather than by subject, but some clusters, like cluster **4** and **11**, still represent a specific topic.
We observe several types of clusters. Some contain names of scientific fields, like cluster **5**. Cluster **8** contains quite a lot of first names. Cluster **9** only contains numbers. Clusters **6** and **14** contain quite random words.

The labels for 10 clusters (excluding cluster 0):

1. Cluster 1: Advanced scientific terms
2. Cluster 2: General academic terms
3. Cluster 3: General scientific terms
4. Cluster 4: Biology
5. Cluster 5: Taught subjects
6. Cluster 7: Adjectives that describe shapes
7. Cluster 8: First names
8. Cluster 11: Computer science
9. Cluster 12: Finance
10. Cluster 13: Dates

Compared to LSI and LDA, k-means on Word2Vec vectors generates quite a good clustering of words, but it does not separate topics (in the topic-model sense of the word) as well as LDA and LSI do.

## 2. Document similarity search

Some more pre-processing and TF-IDF computation before we can implement a search function:

In [26]:
def compute_tf(word_counts, total_doc):
    """Return the term frequency of each word: its count divided by `total_doc`.

    `word_counts` maps words to their count in one document and `total_doc` is
    the total number of words of that document.
    """
    denominator = float(total_doc)
    return {term: occurrences / denominator for term, occurrences in word_counts.items()}

In [27]:
# Document frequency: number of documents in which each word occurs
N = len(doc_corpus)
doc_freq = defaultdict(int)
for doc in doc_corpus:
    for word, count in doc['counts'].items():
        if count > 0:
            doc_freq[word] += 1

# Inverse document frequency: log(N / document frequency)
idf = defaultdict(int)
for word, freq in doc_freq.items():
    idf[word] = np.log(N/float(freq))


In [28]:
def compute_tf_idf(word_counts, total_doc, idf):
    """Return the TF-IDF weight of each word of one document.

    The term frequency comes from `compute_tf(word_counts, total_doc)` and is
    multiplied by the word's entry in `idf`.
    """
    frequencies = compute_tf(word_counts, total_doc)
    return {term: tf_value*idf[term] for term, tf_value in frequencies.items()}

In [29]:
# One record per course: its name, its word vectors and the TF-IDF weight of each of its words.
processed_corpus = [
    {
        'doc_name': doc['doc_name'],
        'word2vecs': doc['word2vecs'],
        'tf_idf': compute_tf_idf(doc['counts'], doc['total'], idf),
    }
    for doc in doc_corpus
]

In [30]:
# Dict mapping each course name to the TF-IDF-weighted sum of its word vectors.
# The sum is not divided by the total weight.
doc_averages = {}
for doc in processed_corpus:
    weights = doc['tf_idf']
    weighted_sum = 0
    for word, vec in doc['word2vecs'].items():
        # Accumulate with += as the original did: the first addition creates the
        # array and later additions update it in place.
        weighted_sum += weights[word]*np.array(vec)
    doc_averages[doc['doc_name']] = weighted_sum

In [38]:
def search(string):
    """Print the five courses most similar to `string`; nothing is returned.

    The query has to be a key of the module-level `dataset`; otherwise a
    "did not match" message is printed. A course's score is the cosine
    similarity between the query's vector and the course's TF-IDF-weighted
    vector stored in `doc_averages`.
    """
    if string not in dataset:
        print("Your search - " + string + " - did not match any documents.")
        return
    vec = dataset[string]
    avgs = list(doc_averages.items())
    sims = [cosine_sim(vec, w[1]) for w in avgs]
    # argsort is ascending, so flip it to rank the highest scores first.
    r = np.flip(np.argsort(sims))
    print(string + ':')
    # Slicing (instead of indexing positions 0..4) avoids an IndexError when the
    # corpus holds fewer than five documents.
    for i in r[:5]:
        print(avgs[i][0] + ' (score: ' + str(sims[i]) + ')')

In [40]:
# Bigram query; `search` only accepts terms that are keys of `dataset`.
search('Markov chains')

Markov chains:
Project in Biotechnology (score: 0.9926106825631218)
Optics I (score: 0.9925580402560084)
Mathematical modelling of behavior (score: 0.9885238787107079)
Optics II (score: 0.9881659074790364)
Concurrent algorithms (score: 0.9879251764226877)


In [39]:
# Single-word query; the lookup in `dataset` is case-sensitive.
search('Facebook')

Facebook:
Computational Social Media (score: 0.12890162)
Social media (score: 0.067226306)
Privacy Protection (score: 0.06489868)
Fundamentals of Biometrics (score: 0.064659774)
Computer networks (score: 0.064403765)


In [41]:
# Generic, non-topical query, used as a contrast to the more precise terms.
search('exam')

exam:
Training Rotation (EDNE) (score: 0.18543461)
Project 2 (EDIC) (score: 0.17522454)
Project 1 (EDIC) (score: 0.17522454)
Field Research Project A (score: 0.11598104)
Field Research Project B (score: 0.11155017)


#### Comments: 

The search function gives a relatively relevant result most of the time, giving courses that seem to have something to do with the given word (especially for 'Facebook'). Of course, the result is relevant as long as we ask for quite precise words. For example, searching for 'exam' does not make much sense. In some ways it is inconsistent, for example when giving the same word with the first letter uppercase and lowercase, the result can be quite different.


The results found for 'Markov chains' seem less relevant than the results using Vector Space Models (which had spot-on results of courses that are explicitly about Markov chains) or LSI, which generated more implicitly relevant results. 

VSM had only 1 result for 'Facebook' which was the same as our first result. And compared to LSI, the results found here seem more relevant and linked to the searched word.

## 3. Document similarity search with outside terms

In [118]:
def search_general(string):
    """Print the five courses most similar to `string`, also for out-of-dataset terms.

    If `string` is a key of the module-level `dataset`, the query is delegated
    to `search`. Otherwise its vector is looked up in `word_vectors` and
    normalised, and the courses are ranked by cosine similarity against
    `doc_averages`. A "did not match" message is printed when the term is in
    neither. Nothing is returned.
    """
    if string in dataset:
        print("The query was found in our dataset")
        search(string)
        return
    if string not in word_vectors:
        print("Your search - " + string + " - did not match any documents.")
        return
    vec = normalize_vec(word_vectors[string])
    avgs = list(doc_averages.items())
    sims = [cosine_sim(vec, w[1]) for w in avgs]
    # argsort is ascending, so flip it to rank the highest scores first.
    r = np.flip(np.argsort(sims))
    print(string + ':')
    # Slicing avoids an IndexError when the corpus holds fewer than five documents.
    for i in r[:5]:
        print(avgs[i][0] + " (score: " + str(sims[i]) + ')')

_Note: from a coding perspective, we can easily combine the two searches to have more elegant code, but for clarity of the exercise we keep them separate._

In [119]:
# Term queried through the general search, which also accepts words found only in `word_vectors`.
search_general('MySpace')

MySpace:
Computational Social Media (score: 0.10054466)
Training Rotation (EDNE) (score: 0.09367811)
Computer networks (score: 0.052398767)
Satellite communications  systems and networks (score: 0.050787453)
Social media (score: 0.048894677)


In [120]:
# Second social-network term, for comparison with 'MySpace' and 'Facebook'.
search_general('Orkut')

Orkut:
Computational Social Media (score: 0.16581577)
Computer networks (score: 0.1276489)
Social media (score: 0.124385275)
Satellite communications  systems and networks (score: 0.122522555)
Privacy Protection (score: 0.11618064)


#### Comments: 

The results for 'MySpace' and 'Orkut', which are both old social networking services, are pretty relevant, although for MySpace _Social Media_ only comes in fifth place, and _Satellite communications systems and networks_ tends to come pretty high. Compared to 'Facebook', both come satisfyingly close.

In [122]:
# Biology-related query sent through the general search.
search_general('coronavirus')

coronavirus:
Training Rotation (EDNE) (score: 0.12073704)
Chemosensory receptors: Applications for biosensors and medical therapies (score: 0.11710419)
Infection biology (score: 0.108508505)
Chemistry of small biological molecules (score: 0.104032196)
Practical - Brisken Lab (score: 0.10148764)


#### Comments: 

Again, the results are good for 'coronavirus', as we obtain biology-related results (_Brisken Lab_ is also in the BIO faculty).

Remark: For some reason, _Training Rotation (EDNE)_ seems to appear in the top 5 results quite often. This may be explained by the fact that its course description only contains 2 words.