<h1 style="text-align:center;"><strong>Generación del Modelo<strong></h1>

### 1) Importar librerías

In [1]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

### 2) Cargar el corpus

#### Definir el path del Corpus

In [2]:
basePath = 'resources/corpus/'
modelo = 'tf-idf'
version = 'v10.0'
corpusPath = basePath+'corpus-'+modelo+"-"+version+".pkl"

#### Leer el corpus

In [3]:
with open(corpusPath, "rb") as fp: 
    corpus = pickle.load(fp)

In [4]:
len(corpus)

37569

In [5]:
corpus[:2]

Unnamed: 0,doc_id,preprocessed_doc
0,57781304700,metamodeling audio signals design process enco...
1,57581708700,facility layout design textile msmes literatur...


In [6]:
len(corpus['preprocessed_doc'].to_list())

37569

### 3) Generación del modelo

#### Instanciar el modelo tf-idf

In [7]:
tfidf = TfidfVectorizer(norm='l2', smooth_idf=True, sublinear_tf=True)

#### Obtener la matrix tf-idf del método fit_transform()

In [8]:
matrix = tfidf.fit_transform(corpus['preprocessed_doc'].to_list())

#### Obtener el vocabulario

In [9]:
len(tfidf.get_feature_names_out())

120632

In [10]:
tfidf.vocabulary_

{'metamodeling': 69826,
 'audio': 14854,
 'signals': 101427,
 'design': 33979,
 'process': 89625,
 'encounter': 40003,
 'sound': 103242,
 'changing': 23912,
 'forms': 46263,
 'context': 28601,
 'following': 45982,
 'work': 119107,
 'arises': 13511,
 'exploratory': 43253,
 'interest': 58811,
 'around': 13616,
 'order': 80034,
 'able': 7411,
 'structure': 105540,
 'fictional': 44854,
 'architectural': 13288,
 'envelopes': 40708,
 'reactive': 93332,
 'nature': 75182,
 'whose': 118667,
 'materialization': 68219,
 'achieved': 7969,
 'digital': 35001,
 'visualization': 117205,
 'techniques': 108525,
 'generative': 48290,
 'software': 102861,
 'art': 13684,
 'research': 95168,
 'implies': 56883,
 'interdisciplinary': 58801,
 'development': 34268,
 'different': 34898,
 'categories': 22574,
 'knowledge': 62279,
 'unconventional': 114105,
 'processes': 89630,
 'built': 20286,
 'activated': 8216,
 'element': 39300,
 'generates': 48283,
 'random': 92927,
 'values': 115869,
 'information': 57841,
 

In [11]:
'covid' in tfidf.vocabulary_

True

In [12]:
tfidf.vocabulary_['intelligence']

58610

In [13]:
print(matrix)

  (0, 103362)	0.09901086259320825
  (0, 43175)	0.06663115536333616
  (0, 8238)	0.06547821775297065
  (0, 8205)	0.08270357649366306
  (0, 92948)	0.09776080408384201
  (0, 43095)	0.12266942781233617
  (0, 75983)	0.07326111334114051
  (0, 41850)	0.09839959075191457
  (0, 10251)	0.06242527840617196
  (0, 8382)	0.10080393291937903
  (0, 45490)	0.10703542853606968
  (0, 57294)	0.09620306620182087
  (0, 109205)	0.12358305820200322
  (0, 30058)	0.12887263277652924
  (0, 32946)	0.09064033758494239
  (0, 19333)	0.12632474266794103
  (0, 110901)	0.09387844285454777
  (0, 40726)	0.06365927967255641
  (0, 46200)	0.07837554497149288
  (0, 16854)	0.06647102555164909
  (0, 27971)	0.09944483590059525
  (0, 56755)	0.05976030951144175
  (0, 35438)	0.07796418515942816
  (0, 57841)	0.04979480373307328
  (0, 115869)	0.06281040494452338
  :	:
  (37568, 13991)	0.3796167120676995
  (37568, 106425)	0.143221841833236
  (37568, 101184)	0.0661446741092075
  (37568, 42020)	0.12522927158507755
  (37568, 108828)	0.08

### 4) Almacenar el modelo en un diccionario

#### Definir el modelo

In [14]:
model = {
    'vocabulary': tfidf.vocabulary_,
    'matrix': matrix,
    'indexes': corpus['doc_id'].to_list()
}

#### Definir la ruta del modelo

In [15]:
basePath = 'resources/models/tf-idf/'
version = 'v10.0'
modelPath = basePath+"model-"+version+".pkl"

#### Almacenar el modelo

In [16]:
with open(modelPath, "wb") as fp:  
    pickle.dump(model, fp)