# APHASIC PATIENTS' LINGUISTIC PRODUCTION
# BY COMPUTATIONAL MEANS WORD2VEC

Aim of this work is to automatize the diagnosis of aphasic patients productions using the word2vec algorithm.
Word2vec uses a neural network model to learn word associations from a large corpus of text.

We collected our input data into 2 lists of word pairs from the Aphasia Bank (https://aphasia.talkbank.org/).
One list is made of 170 EN target/response word pairs.
One list is made of 290 IT target/response word pairs.

We want to compare the cosine similarity between the target/response pairs in EN.
We used the pre-trained 'word2vec-google-news-300' vectors to run the cosine similarity task.

We compute the cosine similarity between target and response words using the built-in wv.similarity task of Word2Vec, taking as input our word pairs
and using their vectorized form as vectorized in the Google model.

For the cosine similarity task for the target/response pairs in IT we trained a model with the SkipGram algorithm of Word2Vec basing on 10 million word from Wikipedia using plainstream.

In [None]:
# upload the excel file with pair target/response
# pandas***

from csv import reader
import csv

with open("/Users/silviafabbi/Desktop/Pairs_EN.csv", "r") as pairs:
    
    csv_reader = reader(pairs)

In [55]:
import pandas as pd
import numpy

# Create a dataframe from csv
df = pd.read_csv("/Users/silviafabbi/Desktop/Pairs_EN.csv", "r", delimiter=',', engine='python')

# User list comprehension to create a list of lists from Dataframe rows
list_of_rows = [list(row) for row in df.values]

print(df)
type(df)

       ball   bound
0      ball    bask
1    window    womb
2    broken  bottom
3      word   money
4      rain     run
..      ...     ...
97      age    days
98   to say  to see
99     foot    head
100   woman     man
101     put    foot

[102 rows x 2 columns]


pandas.core.frame.DataFrame

In [30]:
# MODEL 1 - EN - vectors from GoogleNews

# We use gensim to import a word2vec model pretrained on google news 
# We load the pretrained model of the type #gensim.models.keyedvectors.Word2VecKeyedVectors
# using the gensim standard method .load()
# This type of pretrained model cannot be refined with additional data
# but has the advantage of saving RAM by dealing with huge quantity of data
# The 'word2vec-google-news-300' are pre-trained vectors trained on Google News dataset (about 100 billion words)
# The model contains 300-dimensional vectors for 3 million words and phrases

from gensim.models import Word2Vec
import gensim.downloader as api

In [2]:
wv = api.load('word2vec-google-news-300')
type(wv) # gensim.models.keyedvectors.Word2VecKeyedVectors

gensim.models.keyedvectors.Word2VecKeyedVectors

In [7]:
# Now we compare the similarity of the target/response word pairs
# using wv = api.load('word2vec-google-news-300') as a pretrained model
# pairs - is a list of the tuples made by target/response word
# type(pairs) = _io.TextIOWrapper

pairs = [
    ('ball', 'bound'),
    ('ball', 'bask'),
    ('window', 'womb'),
    ('broken', 'bottom'),
    ('word', 'money'),
    ('rain', 'run'),
    ('dog', 'guy'),
    ('barking', 'biting'),
    ('cat', 'god'),
    ('girl', 'guy'),
    ('tree', 'train'),
    ('ball', 'bald'),
    ('ball', 'barn'),
    ('ball', 'banks'),
    ('glass', 'grass'),
    ('woman', 'man'),
    ('somewhere', 'someone'),
    ('wear', 'words'),
    ('shoe', 'scene'),
    ('boy', 'man'),
    ('mother', 'wife'),
    ('school', 'cool'),
    ('girl', 'earl'),
    ('slipper', 'sipper'),
    ('lamp', 'lap'),
    ('umbrella', 'ball'),
    ('short', 'sort'),
    ('boy', 'boil'),
    ('woman', 'man'),
    ('fourth', 'force'),
    ('door', 'window'),
    ('fireman', 'policeman'),
    ('glass', 'gas'),
    ('soaking', 'stoking'),
    ('ladder', 'window'),
    ('kick', 'cook'),
    ('window', 'door'),
    ('umbrella', 'comb'),
    ('umbrella', 'bread'),
    ('umbrella', 'read'),
    ('catch', 'kitchen'),
    ('ball', 'bell'),
    ('soccer', 'sock'),
    ('mother', 'daughter'),
    ('lamp', 'lights'),
    ('give', 'gay'),
    ('rescue', 'like'),
    ('dog', 'door'),
    ('bark', 'talk'),
    ('kick', 'cook'),
    ('show', 'go'),
    ('kick', 'hit'),
    ('both', 'bow'),
    ('cat', 'hat'),
    ('dog', 'boy'),
    ('cat', 'girl'),
    ('reach', 'crawl'),
    ('there', 'hair'),
    ('ride', 'drive'),
    ('quite', 'white'),
    ('girl', 'boy'),
    ('grow', 'goes'),
    ('page', 'bar'),
    ('umbrella', 'black'),
    ('naughty', 'nasty'),
    ('walk', 'speak'),
    ('girl', 'woman'),
    ('ride', 'run'),
    ('cat', 'hat'),
    ('branch', 'window'),
    ('father', 'mother'),
    ('tree', 'try'),
    ('tree', 'far'),
    ('dog', 'fog'),
    ('ladder', 'water'),
    ('truck', 'bottle'),
    ('truck', 'home'),
    ('fire', 'ocean'),
    ('tree', 'hospital'),
    ('bird', 'song'),
    ('maid', 'main'),
    ('woman', 'man'),
    ('that', 'dat'),
    ('foot', 'pit'),
    ('slipper', 'pitcher'),
    ('put', 'pit'),
    ('sisters', 'brothers'),
    ('boy', 'man'),
    ('hands', 'heads'),
    ('cry', 'try'),
    ('wheels', 'feels'),
    ('ladder', 'letter'),
    ('age', 'days'),
    ('say', 'see'),
    ('foot', 'head'),
    ('woman', 'man'),
    ('put', 'foot'),   
]

for w1, w2 in pairs:
    print('%r\t%r\t%.4f' % (w1, w2, wv.similarity(w1, w2)))
    
# meaning last line of code:    
# "%r\t%r\t" sono caratteri "jolly" che vengono sostituiti col contenuto nella n-upla passata per mezzo di "%"
# In questo caso hai una stringa "%r" separata da una tabulazione "\t" seguita da un altro "%r" e relativa tabulazione 
# "% .2f" chiude con un float a 4 decimali
# Python sa che deve sostituire, nell'ordine specificato,
# quelle sequenze con il contenuto delle variabili passate per mezzo di "%" prima di stamparle

'ball'	'bound'	0.0339
'ball'	'bask'	0.0549
'window'	'womb'	0.1947
'broken'	'bottom'	0.1166
'word'	'money'	0.2119
'rain'	'run'	0.1114
'dog'	'guy'	0.2742
'barking'	'biting'	0.3858
'cat'	'god'	0.1099
'girl'	'guy'	0.3644
'tree'	'train'	0.1557
'ball'	'bald'	0.1118
'ball'	'barn'	0.0713
'ball'	'banks'	0.0107
'glass'	'grass'	0.1117
'woman'	'man'	0.7664
'somewhere'	'someone'	0.4419
'wear'	'words'	0.0561
'shoe'	'scene'	0.0908
'boy'	'man'	0.6825
'mother'	'wife'	0.7551
'school'	'cool'	0.1263
'girl'	'earl'	0.2010
'slipper'	'sipper'	0.1738
'lamp'	'lap'	0.1483
'umbrella'	'ball'	0.0570
'short'	'sort'	0.1629
'boy'	'boil'	0.1139
'woman'	'man'	0.7664
'fourth'	'force'	0.1147
'door'	'window'	0.6213
'fireman'	'policeman'	0.5428
'glass'	'gas'	0.0585
'soaking'	'stoking'	0.2246
'ladder'	'window'	0.3029
'kick'	'cook'	0.0898
'window'	'door'	0.6213
'umbrella'	'comb'	0.0946
'umbrella'	'bread'	0.0462
'umbrella'	'read'	-0.0125
'catch'	'kitchen'	0.0604
'ball'	'bell'	0.0863
'soccer'	'sock'	0.1546
'mother'	'daughter'	0

In [None]:
wv.most_similar("ball", topn=20)

[('balls', 0.6992625594139099),
 ('upfield', 0.6896207928657532),
 ('downfield', 0.6390728950500488),
 ('dribbler', 0.6218727827072144),
 ('balll', 0.6199932098388672),
 ('dribble', 0.616877555847168),
 ('ball_squirted', 0.6110137701034546),
 ('leftfooted', 0.6020259857177734),
 ('puck', 0.5981724262237549),
 ('mishit', 0.5948782563209534),
 ('lofted', 0.5933606028556824),
 ('theball', 0.5924203395843506),
 ('bobbling', 0.5848650336265564),
 ('dinked', 0.5820186138153076),
 ('dribbles', 0.5811805725097656),
 ('beautifully_flighted', 0.5757741928100586),
 ('mistimes', 0.5747321844100952),
 ('onsides', 0.5730898380279541),
 ('perfectly_flighted', 0.5724466443061829),
 ('deadball', 0.5708563923835754)]

In [None]:
result = wv.most_similar(positive=['ball', 'bask'], negative=['barn'])

most_similar_key, similarity = result[0]  # look at the first match
print(f"{most_similar_key}: {similarity:.4f}")

revel: 0.4761


In [None]:
# MOST SIMILAR FOR ENGLISH

result = wv.most_similar(positive=['tiger', 'puma'], negative=['lion'])

most_similar_key, similarity = result[0]  # look at the first match
print(f"{most_similar_key}: {similarity:.4f}")

panther: 0.5573


In [None]:
result = wv.most_similar(positive=['tiger', 'puma'], negative=['lion'])

most_similar_key, similarity = result[1]  # look at the second match
print(f"{most_similar_key}: {similarity:.4f}")

tigers: 0.5206


In [None]:
result = wv.most_similar(positive=['tiger', 'puma'], negative=['lion'])

most_similar_key, similarity = result[2]  # look at the third match
print(f"{most_similar_key}: {similarity:.4f}")

jaguars: 0.4805


In [None]:
# MODEL 2 - based on wikipedia extracted with GloVe (an alternative to Word2Vec)

In [5]:
model = api.load('glove-wiki-gigaword-100')

# type(model) = gensim.models.keyedvectors.Word2VecKeyedVectors

In [8]:
# let's compare the similarity of the word pairs using
# model = api.load('glove-wiki-gigaword-100')

for w1, w2 in pairs:
    print('%r\t%r\t%.4f' % (w1, w2, model.similarity(w1, w2)))

'ball'	'bound'	0.3754
'ball'	'bask'	0.0131
'window'	'womb'	0.2678
'broken'	'bottom'	0.4862
'word'	'money'	0.4260
'rain'	'run'	0.3650
'dog'	'guy'	0.5169
'barking'	'biting'	0.3643
'cat'	'god'	0.3051
'girl'	'guy'	0.5142
'tree'	'train'	0.2779
'ball'	'bald'	0.2374
'ball'	'barn'	0.2118
'ball'	'banks'	0.2028
'glass'	'grass'	0.3207
'woman'	'man'	0.8323
'somewhere'	'someone'	0.6049
'wear'	'words'	0.2899
'shoe'	'scene'	0.2811
'boy'	'man'	0.7915
'mother'	'wife'	0.9026
'school'	'cool'	0.2501
'girl'	'earl'	0.1803
'slipper'	'sipper'	0.1260
'lamp'	'lap'	0.2265
'umbrella'	'ball'	0.1566
'short'	'sort'	0.5315
'boy'	'boil'	0.0712
'woman'	'man'	0.8323
'fourth'	'force'	0.4121
'door'	'window'	0.8181
'fireman'	'policeman'	0.4406
'glass'	'gas'	0.3752
'soaking'	'stoking'	0.2127
'ladder'	'window'	0.4182
'kick'	'cook'	0.2695
'window'	'door'	0.8181
'umbrella'	'comb'	0.1076
'umbrella'	'bread'	0.1757
'umbrella'	'read'	0.1107
'catch'	'kitchen'	0.2419
'ball'	'bell'	0.4173
'soccer'	'sock'	0.0379
'mother'	'daughter'	0.

In [None]:
# MOST similar for Glove Model

In [9]:
model.most_similar("ball")

[('kick', 0.7723649740219116),
 ('throw', 0.7400174140930176),
 ('balls', 0.7349199056625366),
 ('off', 0.7285565137863159),
 ('pitch', 0.7162729501724243),
 ('bounced', 0.7043442130088806),
 ('catch', 0.6956070065498352),
 ('missed', 0.6835888624191284),
 ('pass', 0.677986741065979),
 ('deflected', 0.6777893900871277)]

In [None]:
# MODEL 3 - IT - vectors from a model from plainstream/wikipedia
# model ITA from Wikipedia - 10 mln words
# 
# After training a w2v model in ITA, I saved it as a .model file
# The model was trained using the Skip Gram algorithm of Word2Vec


In [24]:
import nltk
import plainstream
import gensim
from gensim.models import KeyedVectors

# We are introducing the time module: it has many uses, but here we are just using the .time() method
# to measure the execution time of a process
import time

In [25]:
s = time.time()
# Here we are asking plainstream to give us a certain amount of words (10 milion in this case).
# NB: a plainstream.get_text() obejct is a generator, which is empty after one use
# Generator functions allow you to declare a function that behaves like an iterator
# i.e. it can be used in a for loop
some_wiki = plainstream.get_text("it", max_words=10000000, tokenize=True)
some_text = []

# we want to make sure that every word is lower case. Because some_wiki generates lists
# of lists of tokens (i.e: tokenized sentences) we need to nest a couple of for loops in order to 
# reach the strings that we want to manipulate

for tokens_list in some_wiki:
    temp = []
    for word in tokens_list:
        temp.append(word.lower())
    some_text.append(temp)
e = time.time()
print(e-s)

113.66088199615479


In [27]:
some_wiki # <generator object get_text at 0x7fce21e0ac10>

<generator object get_text at 0x7fdafab93740>

In [28]:
s = time.time()
# this is where we train the model. We are using a couple of parameters here, but the most
# relevant is "sg", which means that we are using the skipgram algorithm
model_ita = gensim.models.word2vec.Word2Vec(sentences=some_text, size=300, min_count=4, sg=1)
e = time.time()
print(e-s)

258.0889081954956


In [29]:
model_ita.save("/Users/silviafabbi/Desktop/ord2vec_10mil_wiki.model")

In [37]:
# reload the trained model

model_ita = KeyedVectors.load("/Users/silviafabbi/Desktop/ord2vec_10mil_wiki.model")

In [38]:
model_ita.wv.most_similar("gamba")

[('nuca', 0.7554254531860352),
 ('caviglia', 0.7333213686943054),
 ('fratturata', 0.728863537311554),
 ('mento', 0.7269062995910645),
 ('quadriga', 0.7184330224990845),
 ('schiena', 0.7161000370979309),
 ('pistola', 0.7155369520187378),
 ('guancia', 0.7101519107818604),
 ('tridente', 0.709312379360199),
 ('porge', 0.7087308168411255)]

In [10]:
def tupleize(file):
    output = []
    with open(file, "r") as input:
        line = input.readline()
        while line:
            nuple = tuple(line.rstrip().split(", "))
            output.append(nuple)
            line = input.readline()
    return output

In [39]:
# I apply my tupleize function to my .csv file

coppie = tupleize("/Users/silviafabbi/Desktop/pairs_IT.csv")[1:]

coppie

[('abbaiare;saluta',),
 ('scala;piede',),
 ('calciare;lanciare',),
 ('prendere;chiedere',),
 ('ombrello;acqua',),
 ('bagnato;dormito',),
 ('rincorrere;prendere',),
 ('arrampicarsi;girare',),
 ('abbaiare;giocare',),
 ('vetro;vetro',),
 ('ombrello;pioggia',),
 ('palla;finestra',),
 ('rompere;venire',),
 ('prendere;mettere',),
 ('salvare;trascinare',),
 ('sorellastre;donne',),
 ('topolini;scoiattoli',),
 ('fata;amante',),
 ('fata;aspetta',),
 ('fata;la',),
 ('piovere;pioggia',),
 ('bloccarsi;restare',),
 ('figlie;sorelle',),
 ('scomparire;spegnersi',),
 ('lanciare;fare',),
 ('uscire;partire',),
 ('palla;palo',)]

In [40]:
# I need to normalize my function using regex to get normalized tuples:
# I substitute every ";" with " " (empty space) and ","

import re

def my_csv_tokenizer(s):
    for line in coppie:
        pattern = r"""([a-z]+)(;)([a-z]+)([,]+)"""
        if pattern in coppie:
    
    # I define a function that takes s = /data/ (-> list of stings) as input
    # and for every string in s = /data/
    # every time my for loop meets one or more /;/ substituites it with empty space
      
            return re.sub(pattern, r"\1 \3 ", s)

In [52]:
my_csv_tokenizer(coppie)

coppie

[('scala', 'piede'),
 ('calciare', 'lanciare'),
 ('prendere', 'chiedere'),
 ('ombrello', 'acqua'),
 ('bagnato', 'dormire'),
 ('rincorrere', 'prendere'),
 ('scalare', 'girare'),
 ('vetro', 'vetro'),
 ('ombrello', 'pioggia'),
 ('palla', 'finestra'),
 ('rompere', 'venire'),
 ('prendere', 'mettere'),
 ('salvare', 'trascinare'),
 ('sorellastre', 'donne'),
 ('topi', 'scoiattoli'),
 ('fata', 'amante'),
 ('fata', 'aspetta'),
 ('fata', 'la'),
 ('pioggia', 'pioggia'),
 ('bloccare', 'restare'),
 ('figlie', 'sorelle'),
 ('scomparire', 'spegnersi'),
 ('lanciare', 'fare'),
 ('uscire', 'partire'),
 ('palla', 'palo')]

In [53]:
for w1, w2 in coppie:
    print('%r\t%r\t%.4f' % (w1, w2, model_ita.wv.similarity(w1, w2)))
    
# in case the response word matches the target word (like "pioggia" vs "pioggia") the similarity is 1 (maximum)

'scala'	'piede'	0.1823
'calciare'	'lanciare'	0.5785
'prendere'	'chiedere'	0.5782
'ombrello'	'acqua'	0.4904
'bagnato'	'dormire'	0.4128
'rincorrere'	'prendere'	0.4610
'scalare'	'girare'	0.2930
'vetro'	'vetro'	1.0000
'ombrello'	'pioggia'	0.4927
'palla'	'finestra'	0.4999
'rompere'	'venire'	0.4567
'prendere'	'mettere'	0.5360
'salvare'	'trascinare'	0.6874
'sorellastre'	'donne'	0.4786
'topi'	'scoiattoli'	0.6643
'fata'	'amante'	0.5440
'fata'	'aspetta'	0.5140
'fata'	'la'	0.2890
'pioggia'	'pioggia'	1.0000
'bloccare'	'restare'	0.5101
'figlie'	'sorelle'	0.8361
'scomparire'	'spegnersi'	0.6333
'lanciare'	'fare'	0.4572
'uscire'	'partire'	0.3452
'palla'	'palo'	0.4779


In [None]:
# let's now compare the cosine similarity and Spearman correlation

In [None]:
import numpy
import json
from sklearn.metrics.pairwise import cosine_similarity
from scipy import stats

with open("/Users/silviafabbi/Desktop/Lab_2_files/word2idx_v2.json") as in_file:   # open the file that maps words to their own index in the matrix
    word2idx = json.load(in_file)
idx2word = {v: k for k, v in word2idx.items()}  # build the reversed dictionary: from indeces to words

matrix = numpy.load("/Users/silviafabbi/Desktop/Lab_2_files/word2vec_vectors_lab_v2.npz")["arr_0"]
human_relatedness = []  # Prepare a list for relatedness values assigned by human annotators for a word pair
word2vec_relatedness = []  # Prepare a list for cosine similarities between word pairs using word embeddings 

men = open("/Users/silviafabbi/Desktop/Lab_2_files/MEN_lab_2.txt")  # Open the test set file (MEN dataset)
for line in men.readlines():  # iterate over the dataset file line by line 
    word1 = line.split()[0]  # split (no argument=whitespace) and take the first word
    word2 = line.split()[1]  # split and take the second word
    h_r = line.split()[2]  # split and take the third value, which is the relatedness assigned by humans to the word pair [word1, word2]
    human_relatedness.append(float(h_r))  # add relatedness assigned by humans to the human_relatedness list. float() converts a string into a number. N.B.: when you read a file, everything is considered to be a string
    word_embedding_1 = matrix[word2idx[word1]].reshape(1, -1)  # take the word embedding of word1. reshape(1,-1) turns a vector into a matrix of one single row. This is required to run the cosine_similarity function
    word_embedding_2 = matrix[word2idx[word2]].reshape(1, -1)  # take the word embedding of word2. reshape(1,-1) turns a vector into a matrix of one single row. This is required to run the cosine_similarity function
    cos_sim = cosine_similarity(word_embedding_1, word_embedding_2)[0]  # compute the cosine similarity between the two words. [0] is extract the value of the cosine similarity out of a matrix with a single element
    word2vec_relatedness.append(float(cos_sim))  # add this cosime similarity value to the word2vec_relatedness list

    
print(stats.spearmanr(human_relatedness, word2vec_relatedness))  # compute the Spearman's r coefficient between the relatedness values assigned by human annotators and the cosine similarity between word embeddings
# The first value corresponds to the correlation coefficient and the second value to the p-value

SpearmanrResult(correlation=0.7706852377098591, pvalue=1.5603042276140894e-197)


In [None]:
# how can I cycle through the whole 

men_raw = open("/Users/silviafabbi/Desktop/MEN_2/agreement/elias-men-ratings.txt")

for line in men_raw.readlines():
    word1s = line.split()[0]
    word2s = line.split()[1]
    h_r = line.split()[2]

    
print("Word 1 is", line.split()[0])
print("Word 2 is", line.split()[1])
print("Human relatedness is", h_r)
    

Word 1 is shade
Word 2 is whiskers
Human relatedness is 1


In [None]:
type(men_raw)

_io.TextIOWrapper

In [None]:
# qui non capisco come andare oltre la riga 1...

word_embedding_1 = matrix[word2idx[word1]].reshape(1, -1)
word_embedding_2 = matrix[word2idx[word2]].reshape(1, -1)
cos_sim = cosine_similarity(word_embedding_1, word_embedding_2)[0]

print(word1, word2, cos_sim)

ice snow [0.53915557]


In [None]:
coppie = [   
    
    ('scala', 'piede'),
    ('calciare', 'lanciare'),
    ('prendere', 'chiedere'),
    ('ombrello', 'acqua'),
    ('bagnato', 'dormire'),
    ('rincorrere', 'prendere'),
    ('scalare', 'girare'),
    ('vetro', 'vetro'),
    ('ombrello', 'pioggia'),
    ('palla', 'finestra'),
    ('rompere', 'venire'),
    ('prendere', 'mettere'),
    ('salvare', 'trascinare'),
    ('sorellastre', 'donne'),
    ('topi', 'scoiattoli'),
    ('fata', 'amante'),
    ('fata', 'aspetta'),
    ('fata', 'la'),
    ('pioggia', 'pioggia'),
    ('bloccare', 'restare'),
    ('figlie', 'sorelle'),
    ('scomparire', 'spegnersi'),
    ('lanciare', 'fare'),
    ('uscire', 'partire'),
    ('palla', 'palo')

]