<h1 style="text-align:center;"><strong>Evaluación del Modelo</strong></h1>

### 1) Importar librerías

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from py2neo import Graph
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from unidecode import unidecode
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nicolas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from string import punctuation 
from nltk.corpus import stopwords

### 2) Cargar el modelo

#### Definir el path del modelo

In [4]:
# Location of the serialized model: <basePath>model-<version>.pkl
basePath = 'resources/models/tf-idf/'
version = 'v9.0'
modelPath = f"{basePath}model-{version}.pkl"

#### Leer el modelo

In [5]:
# Load the serialized model. Later cells read the keys 'matrix', 'vocabulary'
# and 'indexes' from the resulting object.
# SECURITY NOTE: pickle.load can execute arbitrary code while deserializing;
# only load model files that come from a trusted source.
with open(modelPath, "rb") as fp: 
    model = pickle.load(fp)

In [6]:
print(model['matrix'])

  (0, 103362)	0.09923212487214342
  (0, 43175)	0.06886584546383225
  (0, 8238)	0.07078582159576213
  (0, 8205)	0.08644800745109515
  (0, 92948)	0.10016422068273652
  (0, 43095)	0.12047859376933612
  (0, 75983)	0.07326409734135113
  (0, 41850)	0.1039336392953721
  (0, 10251)	0.06819450408499904
  (0, 8382)	0.09963160024033874
  (0, 45490)	0.10901159827994265
  (0, 57294)	0.09884113157844591
  (0, 109205)	0.11846335948205136
  (0, 30058)	0.1263272961937833
  (0, 32946)	0.0938851023709985
  (0, 19333)	0.12035374520750669
  (0, 110901)	0.0986487046202866
  (0, 40726)	0.0683563637092741
  (0, 46200)	0.08156953898252392
  (0, 16854)	0.07070169585633913
  (0, 27971)	0.10092808978243235
  (0, 56755)	0.06400635424331756
  (0, 35438)	0.08156953898252392
  (0, 57841)	0.05302019314937439
  (0, 115869)	0.06709275194070413
  :	:
  (37525, 88293)	0.16497810008348493
  (37525, 71710)	0.15840487494991684
  (37525, 102363)	0.1691939800736844
  (37525, 90112)	0.14938705482506517
  (37525, 24578)	0.359863

### 3) Evaluación manual del modelo

#### Construir una función para tokenizar

In [7]:
# Build a tokenizer callable from a default-configured TfidfVectorizer.
# NOTE(review): presumably the same tokenization used when the model was
# fitted; confirm against the training notebook.
tokenizer = TfidfVectorizer().build_tokenizer()

In [8]:
input_data = "covid ecuador"

In [9]:
tokenizer(input_data)

['covid', 'ecuador']

In [10]:
# Print the value stored in the vocabulary for the token 'artificial'
# (used later as a column id of model['matrix']); prints nothing if absent.
if 'artificial' in model['vocabulary']:
    print(model['vocabulary']['artificial'])

13790


In [11]:
# Print the value stored in the vocabulary for the token 'intelligence'
# (used later as a column id of model['matrix']); prints nothing if absent.
if 'intelligence' in model['vocabulary']:
    print(model['vocabulary']['intelligence'])

58610


In [12]:
# Show the sparse column of the token 'artificial'. The column id is looked up
# in the vocabulary instead of being hard-coded, so the cell stays correct if a
# different model version assigns the token another id.
print(model['matrix'].getcol(model['vocabulary']['artificial']))

  (46, 0)	0.07056208123690834
  (74, 0)	0.128190797693362
  (117, 0)	0.06608254888628867
  (144, 0)	0.060198355374348006
  (148, 0)	0.12982967544910354
  (186, 0)	0.05137822896663687
  (196, 0)	0.055797951158238046
  (217, 0)	0.058484880966530026
  (369, 0)	0.09406448580119775
  (370, 0)	0.1458931499702944
  (442, 0)	0.038158989547799
  (469, 0)	0.14100045656856397
  (577, 0)	0.056277063191191315
  (578, 0)	0.11676893141840292
  (658, 0)	0.05858455091781073
  (713, 0)	0.0825124961373678
  (748, 0)	0.05693955167925489
  (816, 0)	0.1518949035008579
  (823, 0)	0.05705502933403545
  (869, 0)	0.11206216613027258
  (874, 0)	0.0801269354833185
  (933, 0)	0.1443871118961135
  (997, 0)	0.06620055391974425
  (1201, 0)	0.09132330616848218
  (1233, 0)	0.05292176897335271
  :	:
  (34352, 0)	0.0760548311421886
  (34898, 0)	0.09154728529601053
  (34938, 0)	0.122622498189258
  (34978, 0)	0.042146875000870804
  (35028, 0)	0.08070127711315
  (35278, 0)	0.06369165860002116
  (35285, 0)	0.0431870139763367

In [13]:
# Dense (n_rows x 1) view of the 'artificial' column. The column id comes from
# the vocabulary rather than a hard-coded number tied to one model version.
print(model['matrix'].getcol(model['vocabulary']['artificial']).sorted_indices().toarray())

[[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]


In [14]:
# Dense (n_rows x 1) array with the weight of the token 'artificial' in every
# row of the matrix. The column id is resolved through the vocabulary so it
# cannot silently point at another token when the model version changes.
artificial = model['matrix'].getcol(model['vocabulary']['artificial']).sorted_indices().toarray()

In [15]:
print(artificial)

[[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]


In [16]:
# Dense (n_rows x 1) array with the weight of the token 'intelligence' in every
# row of the matrix. The column id is resolved through the vocabulary so it
# cannot silently point at another token when the model version changes.
intelligence = model['matrix'].getcol(model['vocabulary']['intelligence']).sorted_indices().toarray()

In [17]:
# Flatten the (n_rows x 1) array into a plain list with one weight per row.
recast_artificial = list(artificial[:, 0])

In [18]:
# Flatten the (n_rows x 1) array into a plain list with one weight per row.
recast_intelligence = list(intelligence[:, 0])

In [19]:
# One row per entry of model['indexes'], one column per token id
# (column labels are the token ids as strings).
dfTest = pd.DataFrame(
    data={
        '13790': recast_artificial,
        '58610': recast_intelligence,
    },
    index=model['indexes'],
)

In [20]:
dfTest

Unnamed: 0,13790,58610
85133492759,0.0,0.0
85133293730,0.0,0.0
85132518705,0.0,0.0
85112575431,0.0,0.0
85109263966,0.0,0.0
...,...,...
0012371997,0.0,0.0
84918742422,0.0,0.0
34347185703,0.0,0.0
33947340215,0.0,0.0


In [21]:
# Rank the documents in which BOTH tokens have a non-zero weight by the sum of
# their weights (highest first, at most 2500 rows).
# Rows are selected with a positional boolean mask instead of dropping index
# labels: drop() removes every row that shares a label with a zero-weight row,
# which would discard valid rows if a document id were repeated in the index.
# This is also the same filter mostRelevantDocsByTopic applies.
dfTest[(dfTest != 0).all(axis=1)].sum(axis=1).sort_values(ascending=False).head(2500)

85108692604    0.551039
85105437353    0.537960
85120658066    0.495037
85074789913    0.466055
85126221397    0.446731
                 ...   
85094159534    0.111390
85127056253    0.108430
85119413504    0.108337
85027222541    0.106231
85120833191    0.100875
Length: 317, dtype: float64

In [22]:
model['indexes'][6064]

'85096046296'

### 4) Evaluación del modelo

#### Definición de las stop_words

In [23]:
# Punctuation characters plus a few extra symbols that must never count as tokens.
non_words = [*punctuation, '¿', '¡', '...', '..']
# English stop words (transliterated with unidecode) followed by the non-word symbols.
stop_words = [unidecode(word) for word in stopwords.words('english')] + non_words

#### Definir método para preprocesar y tokenizar los topics

In [24]:
def preprocessing(topic):
    """Turn a topic name into a list of lower-cased tokens without stop words.

    The text is passed through ``unidecode``, split with the module-level
    ``tokenizer`` and every token whose lower-cased form appears in
    ``stop_words`` is discarded.

    Args:
        topic: Topic name as a string.

    Returns:
        List of lower-cased tokens, in their original order.
    """
    kept_tokens = []
    for raw_token in tokenizer(unidecode(topic)):
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        kept_tokens.append(lowered)
    return kept_tokens

#### Configurar la conexión a la base de datos de Neo4j

In [25]:
# Connect to Neo4j. Credentials are read from the environment so that no
# password is stored in the notebook (or in its version history).
#   NEO4J_URI      optional, defaults to the local bolt endpoint
#   NEO4J_USER     optional, defaults to "neo4j"
#   NEO4J_PASSWORD required; a KeyError naming the variable is raised if unset
graph = Graph(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=(os.environ.get("NEO4J_USER", "neo4j"), os.environ["NEO4J_PASSWORD"]),
)

#### Obtener 10 topics aleatorios (con al menos 10 artículos asociados)

In [26]:
# Cypher query: for every Topic node count its USES relationships with Article
# nodes, shuffle the topics (ORDER BY RAND()), keep those with a count of at
# least 10 and return the name and count of at most 10 of them (LIMIT 10).
# NOTE: because of RAND() the selected topics differ on every execution, so the
# outputs below are not reproducible run to run.
query = """
match (topics:Topic )
optional match (topics)-[r:USES]-(ar:Article)
with topics, count(r) as frequency ORDER BY RAND()
where frequency >= 10
return topics.name, frequency
LIMIT 10
"""
# Run the query against the Neo4j connection; `res` is iterated in the next cell.
res = graph.run(query)

In [27]:
# One dict per returned record, holding only the topic name for now;
# later cells add more keys to each dict.
topics = [{'name': record.data()['topics.name']} for record in res]
topics

[{'name': 'Inflammation'},
 {'name': 'Isoflavones'},
 {'name': 'Chatbot'},
 {'name': 'Cryptic diversity'},
 {'name': 'WCAG'},
 {'name': 'Sensors'},
 {'name': 'Brucellosis'},
 {'name': 'Competitiveness'},
 {'name': 'Galapagos'},
 {'name': 'Dispersal'}]

In [28]:
# Add a hand-written two-word topic to the list so that a multi-token query is
# always part of the evaluation.
# NOTE: append mutates `topics` in place, so running this cell more than once
# adds the same entry again.
topics.append({
    'name':'covid ecuador'
})

#### Preprocesar y tokenizar los topics

In [29]:
# Store the token list of each topic name under the 'preprocessed' key.
for topic in topics:
    topic.update(preprocessed=preprocessing(topic['name']))
topics

[{'name': 'Inflammation', 'preprocessed': ['inflammation']},
 {'name': 'Isoflavones', 'preprocessed': ['isoflavones']},
 {'name': 'Chatbot', 'preprocessed': ['chatbot']},
 {'name': 'Cryptic diversity', 'preprocessed': ['cryptic', 'diversity']},
 {'name': 'WCAG', 'preprocessed': ['wcag']},
 {'name': 'Sensors', 'preprocessed': ['sensors']},
 {'name': 'Brucellosis', 'preprocessed': ['brucellosis']},
 {'name': 'Competitiveness', 'preprocessed': ['competitiveness']},
 {'name': 'Galapagos', 'preprocessed': ['galapagos']},
 {'name': 'Dispersal', 'preprocessed': ['dispersal']},
 {'name': 'covid ecuador', 'preprocessed': ['covid', 'ecuador']}]

#### Verificar si todos los tokens están en el vocabulario

In [30]:
# Keep only the topics whose tokens are ALL present in the model vocabulary.
# The survivors are collected in a new list and written back with slice
# assignment. Calling topics.pop(index) while enumerating the same list shifts
# the remaining items one position to the left, so the topic right after a
# removed one would be skipped (neither checked nor printed).
topics_in_vocabulary = []
for topic in topics:
    topicInVocabulary = all(token in model['vocabulary'] for token in topic['preprocessed'])
    print(topic['preprocessed'], topicInVocabulary)
    if topicInVocabulary:
        topics_in_vocabulary.append(topic)
# In-place replacement keeps any other reference to the `topics` list valid.
topics[:] = topics_in_vocabulary

['inflammation'] True
['isoflavones'] True
['chatbot'] True
['cryptic', 'diversity'] True
['wcag'] True
['sensors'] True
['brucellosis'] True
['competitiveness'] True
['galapagos'] True
['dispersal'] True
['covid', 'ecuador'] True


#### Obtener los ids de cada token

In [31]:
# Translate every preprocessed token into its vocabulary id and store the
# resulting list under the 'tokens_id' key of the topic.
for topic in topics:
    ids_for_topic = []
    for token in topic['preprocessed']:
        ids_for_topic.append(model['vocabulary'][token])
    topic['tokens_id'] = ids_for_topic
topics

[{'name': 'Inflammation',
  'preprocessed': ['inflammation'],
  'tokens_id': [57754]},
 {'name': 'Isoflavones',
  'preprocessed': ['isoflavones'],
  'tokens_id': [60208]},
 {'name': 'Chatbot', 'preprocessed': ['chatbot'], 'tokens_id': [24061]},
 {'name': 'Cryptic diversity',
  'preprocessed': ['cryptic', 'diversity'],
  'tokens_id': [30568, 36193]},
 {'name': 'WCAG', 'preprocessed': ['wcag'], 'tokens_id': [118193]},
 {'name': 'Sensors', 'preprocessed': ['sensors'], 'tokens_id': [100296]},
 {'name': 'Brucellosis',
  'preprocessed': ['brucellosis'],
  'tokens_id': [20022]},
 {'name': 'Competitiveness',
  'preprocessed': ['competitiveness'],
  'tokens_id': [27423]},
 {'name': 'Galapagos', 'preprocessed': ['galapagos'], 'tokens_id': [47591]},
 {'name': 'Dispersal', 'preprocessed': ['dispersal'], 'tokens_id': [35855]},
 {'name': 'covid ecuador',
  'preprocessed': ['covid', 'ecuador'],
  'tokens_id': [29731, 38364]}]

#### Definir método para obtener los artículos más relevantes por topic

In [32]:
def mostRelevantDocsByTopic(tokens_id, top_n=50):
    """Return the ids of the documents that best match a topic.

    A document matches only if every token of the topic has a non-zero weight
    in it; matching documents are ranked by the sum of those weights.

    Args:
        tokens_id: Iterable of column ids of ``model['matrix']``, one per topic
            token (e.g. ``[73720, 99235]``). Repeated ids are counted once.
        top_n: Maximum number of document ids to return. Defaults to 50, the
            value that was previously hard-coded.

    Returns:
        List of entries of ``model['indexes']``, best match first. Empty when
        ``tokens_id`` is empty or when no document contains all the tokens.
    """
    # One dense 1-D weight vector per token; slicing the (n_rows x 1) array
    # avoids building a Python list element by element.
    data = {
        token_id: model['matrix'].getcol(token_id).toarray()[:, 0]
        for token_id in tokens_id
    }
    # Without tokens the "all columns non-zero" filter below is vacuously true
    # for every row and would return top_n arbitrary documents.
    if not data:
        return []
    dfResult = pd.DataFrame(data=data, index=model['indexes'])
    contains_all_tokens = (dfResult != 0).all(axis=1)
    ranking = dfResult[contains_all_tokens].sum(axis=1).sort_values(ascending=False)
    return ranking.head(top_n).index.to_list()

In [33]:
# Attach the ranked document ids of each topic under the 'top_docs' key.
for topic in topics:
    topic.update(top_docs=mostRelevantDocsByTopic(topic['tokens_id']))

In [34]:
topics

[{'name': 'Inflammation',
  'preprocessed': ['inflammation'],
  'tokens_id': [57754],
  'top_docs': ['85073292079',
   '85123673947',
   '85101805554',
   '85054410994',
   '85117153578',
   '85128000215',
   '84876785654',
   '85100869710',
   '85029532763',
   '85098006871',
   '84952333801',
   '85133441912',
   '33846217312',
   '84952932536',
   '85079404426',
   '67649268360',
   '85131810451',
   '84865610078',
   '85115699250',
   '85102441230',
   '33645857665',
   '84896396182',
   '85066777908',
   '85089122701',
   '84918526111',
   '85102328073',
   '85051023355',
   '85085921128',
   '85083785645',
   '85073267272',
   '85121579333',
   '70349170306',
   '85044470406',
   '85085754876',
   '85114909981',
   '85123325418',
   '84882641636',
   '85079753228',
   '85054905783',
   '85111632472',
   '84869186783',
   '85044453078',
   '0035208903',
   '85091356808',
   '85021073020',
   '85085952916',
   '85109735174',
   '85077715134',
   '85116151601',
   '85064838133']},
 