# Importing libraries

In [None]:
import sys
!{sys.executable} -m pip -q install  numpy pandas keras tensorflow-gpu matplotlib gensim nltk sklearn --user

In [None]:
import numpy as np
import pandas as pd
import nltk
import string
import matplotlib.pyplot as plt
import keras

from keras.models import Sequential
from keras.layers import Dense
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import brown
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D

# Fetch the NLTK resources read further down: the Brown corpus (source of the
# documents) and the English stopword list.
nltk.download('brown')
nltk.download('stopwords')

# Read data

This is how we would read our data if the files were stored in a `data` folder:

In [None]:
# Illustrative sketch only (kept commented out): loading every file of a
# directory into the `all_texts` list.
#
#import os
#
#path = '/path/to/data'
#all_texts = []
#
#for filename in os.listdir(path):
#  # os.listdir returns bare file names, so join them with the directory first
#  with open(os.path.join(path, filename), 'r') as f:
#    all_texts.append(f.read())

Then we would work on / analyze elements of the "all_texts" Python list.

In [None]:
# English stopwords as a set. No later cell visible in this notebook reads it;
# stopword removal is only mentioned as an optional cleaning step.
stop_words = set(stopwords.words('english'))

Let's consider only the first two categories of text in the Brown corpus. Let's also pretend that "ca" files represent CVs and "cb" files represent job offers.

In [None]:
# Hold out one document per category for testing; every other file among the
# first 71 Brown file ids is used for training.
test_ids = ['ca01', 'cb01']
train_ids = list(brown.fileids()[:71])
for held_out_id in test_ids:
  train_ids.remove(held_out_id)

In [None]:
def _file_text(file_id):
  """Return all words of one Brown corpus file joined by single spaces."""
  return " ".join(brown.words(fileids=[file_id]))

train_texts = [_file_text(file_id) for file_id in train_ids]
test_texts = [_file_text(file_id) for file_id in test_ids]

In [None]:
def _relabel(file_id):
  """Replace every 'ca' with 'CV', then every 'cb' with 'Job_offer'."""
  return file_id.replace('ca','CV').replace('cb','Job_offer')

# Slice assignment rewrites the existing list objects in place.
train_ids[:] = [_relabel(file_id) for file_id in train_ids]
test_ids[:] = [_relabel(file_id) for file_id in test_ids]

We aren't really working with CVs and job offers, but we would handle the problem in the same way.

In [None]:
# Display the two held-out texts in full.
test_texts

# Cleaning data

In [None]:
# Translation table that deletes every character of string.punctuation.
_strip_punctuation = str.maketrans('', '', string.punctuation)

def _tokenize(text):
  """Lowercase `text`, remove punctuation characters and split on whitespace."""
  return text.lower().translate(_strip_punctuation).split()

clean_train = [_tokenize(text) for text in train_texts]
clean_test = [_tokenize(text) for text in test_texts]

We can also remove stopwords to make the data cleaner, if we want.

In [None]:
# One TaggedDocument per training text, tagged with its (relabelled) file id.
tagged_data = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for tokens, doc_id in zip(clean_train, train_ids)
]

# Training Doc2Vec model

In [None]:
max_epochs = 100
vec_size = 30
alpha = 0.025

# `epochs` is set on the model so that one train() call performs every pass
# and gensim itself decays the learning rate from `alpha` down to `min_alpha`.
model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm=1,
                epochs=max_epochs)

model.build_vocab(tagged_data)

# Train exactly once. The previous version called train() inside a
# `for epoch in range(max_epochs)` loop with epochs=model.epochs while
# hand-editing model.alpha / model.min_alpha, which ran
# max_epochs * model.epochs passes and overrode the built-in schedule.
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

#model.save("d2v.model")
#print("Model Saved")

# Testing Doc2Vec model

## Infer vectors for new texts

In [None]:
# Infer a vector for the first held-out text (test_ids[0], originally 'ca01').
CV_test_vector = model.infer_vector(clean_test[0])
CV_test_vector

In [None]:
# Infer a vector for the second held-out text (test_ids[1], originally 'cb01').
offer_test_vector = model.infer_vector(clean_test[1])
offer_test_vector

## Get list of most similar texts used during training

In [None]:
# Training documents most similar to the inferred CV vector.
# NOTE(review): `docvecs` looks like the pre-4.0 gensim attribute name (later `dv`) — confirm against the installed version.
model.docvecs.most_similar([CV_test_vector])

In [None]:
# Training documents most similar to the inferred job-offer vector.
# NOTE(review): `docvecs` looks like the pre-4.0 gensim attribute name (later `dv`) — confirm against the installed version.
model.docvecs.most_similar([offer_test_vector])

Results seem good: the vectors most similar to the CV vector are CV vectors, and vice versa.


Then, we can use vectors to build any kind of machine learning model, for example to predict a text's category.

# Neural Network example

Using Doc2Vec vectors to build a model that tries to predict a text's category.

## Building training and test sets

In [None]:
# Derive each training document's label from its id and collect its learned vector.
cat_list = ["CV" if "CV" in doc_id else "Job_offer" for doc_id in train_ids]
vector_list = np.array([list(model.docvecs[doc_id]) for doc_id in train_ids])

In [None]:
# One row per training document: its vector components plus a "category" label column.
train_df = pd.DataFrame(data=vector_list).assign(category=cat_list)
train_df.sample(5)

In [None]:
# Label each held-out document from its id and infer its vector with the trained model.
test_cats = []
test_vectors = []
for doc_id, tokens in zip(test_ids, clean_test):
  test_cats.append("CV" if "CV" in doc_id else "Job_offer")
  test_vectors.append(list(model.infer_vector(tokens)))

test_vectors = np.array(test_vectors)

In [None]:
# Same layout as the training frame: vector components plus a "category" label column.
test_df = pd.DataFrame(data=test_vectors).assign(category=test_cats)
test_df

Encoding categories.

In [None]:
# Binary target for the classifier: "CV" -> 0, "Job_offer" -> 1.
train_df["target"] = train_df["category"].map({"CV":0,"Job_offer":1})
# Disabled alternative: one-hot columns built with pd.get_dummies.
#train_df = pd.concat([train_df,pd.get_dummies(train_df["category"])], axis=1)

train_df.sample(10)

In [None]:
# Same encoding as the training frame: "CV" -> 0, "Job_offer" -> 1.
test_df["target"] = test_df["category"].map({"CV":0,"Job_offer":1})
test_df

In [None]:
# Remove the textual label now that it is encoded in "target".
# Reassigning (instead of inplace=True) and tolerating an already-missing
# column keeps this cell safe to re-run: the in-place version raised a
# KeyError on a second execution.
train_df = train_df.drop(columns=["category"], errors="ignore")
test_df = test_df.drop(columns=["category"], errors="ignore")

In [None]:
def _is_vector_column(label):
  """Return True for the integer column labels that hold vector components."""
  # isinstance also accepts NumPy integer labels, which `type(label) == int`
  # rejected; bool is excluded explicitly because it is a subclass of int.
  return isinstance(label, (int, np.integer)) and not isinstance(label, bool)

# Integer-labelled columns are the model inputs; every other column is a target.
feature_columns = [col for col in train_df.columns if _is_vector_column(col)]
cat_columns = [col for col in train_df.columns if not _is_vector_column(col)]

## Building Neural Network architecture

Very simple model: 4 fully connected layers, with 30, 20, 20 and 1 neurons respectively.

Last layer specifically has 1 neuron because we want to predict 0 if the input vector corresponds to a CV and 1 if it is a job offer. 

In [None]:
# Binary classifier over the document vectors: three ReLU layers followed by
# a single sigmoid output unit.
nn_model = Sequential([
    Dense(30, input_dim=vec_size, activation='relu'),
    Dense(20, activation='relu'),
    Dense(20, activation='relu'),
    Dense(1, activation='sigmoid'),
])

nn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the network built above.
nn_model.summary()

## Training Neural Network

In [None]:
def _split_xy(frame):
    """Return (features, targets) of `frame` as NumPy arrays."""
    return frame[feature_columns].values, frame[cat_columns].values

x, y = _split_xy(train_df)
test_x, test_y = _split_xy(test_df)

# Report the shape of every array handed to the network.
for label, array in (("X", x), ("Y", y), ("Test X", test_x), ("Test Y", test_y)):
    print("{} shape : {}".format(label, array.shape))

In [None]:
# Train the classifier and evaluate on the held-out vectors after each epoch.
# `validation_data` is passed as the documented (x_val, y_val) tuple rather
# than a list.
nn_model.fit(x, y, batch_size=8, epochs=100, validation_data=(test_x, test_y))

# T-SNE / PCA visualisations

## 2D PCA

In [None]:
# Fit a 2-component PCA on the training document vectors and project them.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(vector_list)

In [None]:
# Plot-ready table: one row per training document with its id, label and 2D PCA coordinates.
principalDf = pd.DataFrame({
    "file_id": train_ids,
    "category": cat_list,
    "Principal Component 1": principalComponents[:, 0],
    "Principal Component 2": principalComponents[:, 1],
})

In [None]:
# Preview the first rows of the 2D PCA table.
principalDf.head()

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)

targets = principalDf["category"].unique()
colors = ['r', 'b']

# One scatter series per category, paired with a colour in order of first appearance.
for category, colour in zip(targets, colors):
    subset = principalDf[principalDf['category'] == category]
    ax.scatter(subset['Principal Component 1'],
               subset['Principal Component 2'],
               c=colour,
               s=50)
ax.legend(targets)
ax.grid()

## 3D PCA

In [None]:
# Fit a 3-component PCA on the training document vectors.
pca3d = PCA(n_components=3)
PC3d = pca3d.fit_transform(vector_list)

# Plot-ready table: id, label and the three PCA coordinates of every document.
principalDf3d = pd.DataFrame({
    "file_id": train_ids,
    "category": cat_list,
    "Principal Component 1": PC3d[:, 0],
    "Principal Component 2": PC3d[:, 1],
    "Principal Component 3": PC3d[:, 2],
})

In [None]:
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw={'projection': '3d'})
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_zlabel('Principal Component 3', fontsize=15)
ax.set_title('3 component PCA', fontsize=20)

targets = principalDf3d["category"].unique()
colors = ['r', 'b']

# One 3D scatter series per category, paired with a colour in order of first appearance.
for category, colour in zip(targets, colors):
    subset = principalDf3d[principalDf3d['category'] == category]
    ax.scatter(subset['Principal Component 1'],
               subset['Principal Component 2'],
               subset['Principal Component 3'],
               c=colour,
               s=50)
ax.legend(targets)
ax.grid()

## 2D T-SNE

In [None]:
# Embed the training document vectors in two dimensions with t-SNE.
tsne = TSNE(n_components=2)
tsne_values = tsne.fit_transform(vector_list)

# Plot-ready table: id, label and the two t-SNE coordinates of every document.
TSNEdf = pd.DataFrame({
    "file_id": train_ids,
    "category": cat_list,
    "TSNE Component 1": tsne_values[:, 0],
    "TSNE Component 2": tsne_values[:, 1],
})

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel('TSNE Component 1', fontsize=15)
ax.set_ylabel('TSNE Component 2', fontsize=15)
ax.set_title('2 component TSNE', fontsize=20)

# Read the categories from the frame being plotted (TSNEdf), so this cell no
# longer depends on the PCA frame having been built first.
targets = TSNEdf["category"].unique()
colors = ['r', 'b']

# One scatter series per category, paired with a colour in order of first appearance.
for category, colour in zip(targets, colors):
    subset = TSNEdf[TSNEdf['category'] == category]
    ax.scatter(subset['TSNE Component 1'],
               subset['TSNE Component 2'],
               c=colour,
               s=50)

ax.legend(targets)
ax.grid()

## 3D TSNE

In [None]:
# Embed the training document vectors in three dimensions with t-SNE.
tsne3d = TSNE(n_components=3)
tsne_values3d = tsne3d.fit_transform(vector_list)

# Plot-ready table: id, label and the three t-SNE coordinates of every document.
TSNEdf3d = pd.DataFrame({
    "file_id": train_ids,
    "category": cat_list,
    "TSNE Component 1": tsne_values3d[:, 0],
    "TSNE Component 2": tsne_values3d[:, 1],
    "TSNE Component 3": tsne_values3d[:, 2],
})

In [None]:
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw={'projection': '3d'})
ax.set_xlabel('TSNE Component 1', fontsize=15)
ax.set_ylabel('TSNE Component 2', fontsize=15)
ax.set_zlabel('TSNE Component 3', fontsize=15)
ax.set_title('3 component TSNE', fontsize=20)

# Read the categories from the frame being plotted (TSNEdf3d), so this cell no
# longer depends on the 3D PCA frame having been built first.
targets = TSNEdf3d["category"].unique()
colors = ['r', 'b']

# Cap on the number of categories drawn; slicing past the end of the
# two-element colour list leaves it unchanged.
DISPLAY_NB = 5
targets = targets[0:DISPLAY_NB]
colors = colors[0:DISPLAY_NB]

# One 3D scatter series per category, paired with a colour in order of first appearance.
for category, colour in zip(targets, colors):
    subset = TSNEdf3d[TSNEdf3d['category'] == category]
    ax.scatter(subset['TSNE Component 1'],
               subset['TSNE Component 2'],
               subset['TSNE Component 3'],
               c=colour,
               s=50)
ax.legend(targets)
ax.grid()