In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import gensim 
import os
from gensim.models import FastText
from gensim.models.callbacks import CallbackAny2Vec
from gensim.test.utils import get_tmpfile

from sklearn.manifold import TSNE

In [2]:
def load_model(strategy, base_dir="../../DATA/input_train/"):
    """Load the word vectors stored for one preprocessing strategy.

    Reads ``fr_posos.bin`` (word2vec binary format) together with the
    vocabulary file ``fr_vocab_posos.txt`` from ``<base_dir>/<strategy>/``.

    Parameters
    ----------
    strategy : str
        Name of the sub-directory holding the embedding files, e.g.
        ``'soft'``, ``'no_accent'``, ``'stemming'`` or
        ``'stemming_no_accent'``.
    base_dir : str, optional
        Directory containing one sub-directory per strategy. Defaults to
        the location this notebook originally hard-coded, so existing
        ``load_model(strategy)`` calls behave as before.

    Returns
    -------
    The object returned by
    ``gensim.models.KeyedVectors.load_word2vec_format``.
    """
    # os.path.join copes with a base_dir given with or without a trailing
    # separator, unlike plain string concatenation.
    embedding_dir = os.path.join(base_dir, strategy)
    return gensim.models.KeyedVectors.load_word2vec_format(
        os.path.join(embedding_dir, "fr_posos.bin"),
        fvocab=os.path.join(embedding_dir, "fr_vocab_posos.txt"),
        binary=True,
    )

## Explore soft model

In [3]:
# Load the word vectors stored under the 'soft' strategy directory.
model_soft = load_model(strategy='soft')

In [4]:
# Ten vocabulary entries most similar to "femme" in the soft model.
model_soft.most_similar(positive=["femme"], topn=10)

[('femmes', 0.8564898371696472),
 ('tombée', 0.4782788157463074),
 ('homme', 0.4496311843395233),
 ('tombé', 0.4441327452659607),
 ('tombe', 0.4399041533470154),
 ('sg', 0.4276544153690338),
 ('tomber', 0.39237430691719055),
 ('patiente', 0.37483054399490356),
 ('appris', 0.36021730303764343),
 ('corticoïdes', 0.35344362258911133)]

## Explore model with no accent

In [5]:
# Load the word vectors stored under the 'no_accent' strategy directory.
model_no_accent = load_model(strategy='no_accent')

In [6]:
# Same "femme" query as for the soft model, for side-by-side comparison.
model_no_accent.most_similar(positive=["femme"], topn=10)

[('femmes', 0.8792619705200195),
 ('sg', 0.5141574740409851),
 ('homme', 0.5127766132354736),
 ('tombee', 0.4655846357345581),
 ('hommes', 0.4606422781944275),
 ('patiente', 0.45002931356430054),
 ('rdv', 0.4499751329421997),
 ('patients', 0.44738325476646423),
 ('patient', 0.4365091323852539),
 ('tombe', 0.4360780715942383)]

In [7]:
# Offset vector: embedding of 'femme' minus embedding of 'homme'.
# NOTE(review): f_moins_h is not referenced by any later cell visible in this
# notebook section; confirm whether it is still needed.
f_moins_h = model_no_accent['femme'] - model_no_accent['homme']

In [8]:
# Analogy-style query: 'homme' and 'enceinte' as positive terms,
# 'femme' as the negative term.
model_no_accent.most_similar(
    positive=['homme', 'enceinte'],
    negative=['femme'],
)

[('ejaculation', 0.3233213722705841),
 ('environ', 0.32219427824020386),
 ('relation', 0.3171359598636627),
 ('membron', 0.31242355704307556),
 ('atteinte', 0.305403470993042),
 ('maintenant', 0.29485321044921875),
 ('intestinale', 0.2927056550979614),
 ('ovulation', 0.273392915725708),
 ('substitution', 0.26973411440849304),
 ('limite', 0.26726263761520386)]

In [9]:
# Analogy-style query: 'adulte' as positive term, 'age' as negative term.
model_no_accent.most_similar(
    positive=['adulte'],
    negative=['age'],
)

[('adultes', 0.48268142342567444),
 ('patients', 0.42689263820648193),
 ('rdv', 0.4010530114173889),
 ('parents', 0.38615673780441284),
 ('patiente', 0.36329445242881775),
 ('patient', 0.3384075462818146),
 ('apparue', 0.3171440064907074),
 ('apparemment', 0.31018611788749695),
 ('voltarene', 0.30255329608917236),
 ('toilette', 0.2930900752544403)]

In [10]:
# Analogy-style query: 'enfant' and 'age' as positive terms,
# 'adulte' as the negative term.
model_no_accent.most_similar(
    positive=['enfant', 'age'],
    negative=['adulte'],
)

[('voyage', 0.4466325044631958),
 ('usage', 0.4210672974586487),
 ('enfants', 0.3890521228313446),
 ('cortisone', 0.34604424238204956),
 ('fils', 0.32477647066116333),
 ('trois', 0.3237701952457428),
 ('ge', 0.3227575719356537),
 ('b', 0.3087555468082428),
 ('pc', 0.3064334988594055),
 ('rappel', 0.29131683707237244)]

## Explore model with stemming

In [11]:
# Load the word vectors stored under the 'stemming' strategy directory.
model_stemming = load_model(strategy='stemming')

In [12]:
# Query terms are stems here ('pédiatr', 'femm'), matching this model's vocabulary.
# Analogy-style query: 'pédiatr' and 'femm' positive, 'enfant' negative.
model_stemming.most_similar(
    positive=['pédiatr', 'femm'],
    negative=['enfant'],
)

[('gynéco', 0.4379604756832123),
 ('médecin', 0.41605496406555176),
 ('ancien', 0.36724790930747986),
 ('doc', 0.3515803813934326),
 ('cicatric', 0.34740161895751953),
 ('sg', 0.33731627464294434),
 ('mm', 0.33683276176452637),
 ('calcibronat', 0.33278754353523254),
 ('m', 0.3319445252418518),
 ('gym', 0.32532599568367004)]

In [13]:
# Analogy-style query: 'âgé' and 'adult' positive, 'jeun' negative.
model_stemming.most_similar(
    positive=['âgé', 'adult'],
    negative=['jeun'],
)

[('enfant', 0.41694557666778564),
 ('supprim', 0.4018644094467163),
 ('fill', 0.3977648615837097),
 ('homm', 0.3682008981704712),
 ('suppos', 0.3660631775856018),
 ('nourrisson', 0.3452257513999939),
 ('adné', 0.31737369298934937),
 ('allerg', 0.3131595253944397),
 ('support', 0.3030516505241394),
 ('fich', 0.2962445020675659)]

In [14]:
# NOTE(review): this cell runs the same query as the preceding cell (In [13])
# and returns the same neighbours; consider removing it or replacing it with
# the query that was actually intended.
model_stemming.most_similar(
    positive=['âgé', 'adult'],
    negative=['jeun'],
)

[('enfant', 0.41694557666778564),
 ('supprim', 0.4018644094467163),
 ('fill', 0.3977648615837097),
 ('homm', 0.3682008981704712),
 ('suppos', 0.3660631775856018),
 ('nourrisson', 0.3452257513999939),
 ('adné', 0.31737369298934937),
 ('allerg', 0.3131595253944397),
 ('support', 0.3030516505241394),
 ('fich', 0.2962445020675659)]

In [15]:
# Entries most similar to the combination of the two time units 'mois' and 'jour'.
model_stemming.most_similar(
    positive=['mois', 'jour'],
)

[('semain', 0.6013951301574707),
 ('an', 0.48737025260925293),
 ('journali', 0.44771409034729004),
 ('anné', 0.4185478687286377),
 ('jou', 0.39527076482772827),
 ('journ', 0.3508337140083313),
 ('juin', 0.3495370149612427),
 ('minut', 0.33221542835235596),
 ('ok', 0.3177497982978821),
 ('valérian', 0.31620359420776367)]

In [16]:
# Analogy-style query: 'ok' and 'antidépresseur' positive, 'non' negative.
model_stemming.most_similar(
    positive=['ok', 'antidépresseur'],
    negative=['non'],
)

[('dépresseur', 0.41024959087371826),
 ('antibiot', 0.3995637893676758),
 ('antalg', 0.38374194502830505),
 ('sépi', 0.3726158142089844),
 ('crister', 0.35172343254089355),
 ('progress', 0.3308964669704437),
 ('éjacul', 0.3285560607910156),
 ('cess', 0.3237244188785553),
 ('ancien', 0.32158076763153076),
 ('anti', 0.3178263306617737)]

## Explore model with stemming and without accent

In [17]:
# Load the word vectors stored under the 'stemming_no_accent' strategy directory.
model_stemming_no_accent = load_model(strategy='stemming_no_accent')

In [20]:
# Same query as on the stemming model, with the accent stripped from
# 'antidepresseur' to match this model's vocabulary.
model_stemming_no_accent.most_similar(
    positive=['ok', 'antidepresseur'],
    negative=['non'],
)

[('depresseur', 0.4693041443824768),
 ('annul', 0.37050050497055054),
 ('ad', 0.3627379238605499),
 ('antibiot', 0.3547939658164978),
 ('moiti', 0.33712780475616455),
 ('valerian', 0.3283240795135498),
 ('molecul', 0.3194998800754547),
 ('statin', 0.31861573457717896),
 ('anti', 0.3174428343772888),
 ('granul', 0.30949005484580994)]

In [26]:
# NOTE(review): this queries model_no_accent, not the model_stemming_no_accent
# loaded in this section — presumably intentional, since 'granules' is an
# unstemmed form; confirm.
model_no_accent.most_similar(positive=['granules'])

[('ampoules', 0.5794285535812378),
 ('grand', 0.38648971915245056),
 ('grande', 0.3852323293685913),
 ('ovules', 0.3642845153808594),
 ('gras', 0.3426024317741394),
 ('gelules', 0.3408167064189911),
 ('prenez', 0.3349892497062683),
 ('homeopathiques', 0.33269166946411133),
 ('sepia', 0.313345342874527),
 ('dilue', 0.31069672107696533)]