Text Mining - 5. Vectorización

AFI - Máster en Data Science y Big Data

Juan de Dios Romero Palop

Abril 2022


### Bag of words

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
dataset = ['This is the first document.',
          'This document is the second document.',
          'And this is the third one.',
          'Is this the first document?'
          ]

In [3]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(dataset)
print(vectorizer.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [4]:
print(X.toarray())

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer

In [5]:
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(1, 2))
X2 = vectorizer2.fit_transform(dataset)
print(vectorizer2.get_feature_names())

['and', 'and this', 'document', 'document is', 'first', 'first document', 'is', 'is the', 'is this', 'one', 'second', 'second document', 'the', 'the first', 'the second', 'the third', 'third', 'third one', 'this', 'this document', 'this is', 'this the']


In [6]:
print(X2.toarray())

[[0 0 1 0 1 1 1 1 0 0 0 0 1 1 0 0 0 0 1 0 1 0]
 [0 0 2 1 0 0 1 1 0 0 1 1 1 0 1 0 0 0 1 1 0 0]
 [1 1 0 0 0 0 1 1 0 1 0 0 1 0 0 1 1 1 1 0 1 0]
 [0 0 1 0 1 1 1 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1]]


### TF-IDF

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
dataset = ['This is the first document.',
          'This document is the second document.',
          'And this is the third one.',
          'Is this the first document?'
          ]

In [9]:
vectorizer = TfidfVectorizer(use_idf =True)
X = vectorizer.fit_transform(dataset)

In [10]:
print(vectorizer.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [11]:
vectorizer.idf_

array([1.91629073, 1.22314355, 1.51082562, 1.        , 1.91629073,
       1.91629073, 1.        , 1.91629073, 1.        ])

In [12]:
vectorizer.vocabulary_

{'this': 8,
 'is': 3,
 'the': 6,
 'first': 2,
 'document': 1,
 'second': 5,
 'and': 0,
 'third': 7,
 'one': 4}

In [13]:
print(X.shape)

(4, 9)


In [14]:
print(X.toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

### word2vect

In [15]:
import gensim

In [16]:
dataset = ['This is the first document.',
          'This document is the second document.',
          'And this is the third one.',
          'Is this the first document?'
          ]

In [17]:
word_split=[text.split(' ') for text in dataset]
print(word_split)

[['This', 'is', 'the', 'first', 'document.'], ['This', 'document', 'is', 'the', 'second', 'document.'], ['And', 'this', 'is', 'the', 'third', 'one.'], ['Is', 'this', 'the', 'first', 'document?']]


In [18]:
model = gensim.models.Word2Vec(word_split, min_count=1)

https://radimrehurek.com/gensim/models/word2vec.html

In [19]:
#Una vez entrenado el modelo se puede persistir para no tener que entrenarlo otra vez
#model.save('model.bin')

#Para cargarlo
#model = gensim.models.KeyedVectors.load_word2vec_format('model.bin', binary=True) 

In [20]:
#Palabras más parecidas a una dada
model.wv.most_similar(positive=['first'], topn=2)

[('this', 0.1702040433883667), ('document', 0.15014396607875824)]

In [21]:
#Valor del vector para una palabra
model.wv["Is"]

array([ 8.1673553e-03, -4.4426569e-03,  8.9866212e-03,  8.2577700e-03,
       -4.4339881e-03,  3.0367024e-04,  4.2783460e-03, -3.9272429e-03,
       -5.5631334e-03, -6.5125106e-03, -6.6869758e-04, -2.9750692e-04,
        4.4656140e-03, -2.4733855e-03, -1.7145963e-04,  2.4627838e-03,
        4.8685172e-03, -3.0844465e-05, -6.3409745e-03, -9.2650224e-03,
        2.7163460e-05,  6.6650123e-03,  1.4701253e-03, -8.9689158e-03,
       -7.9374677e-03,  6.5533957e-03, -3.7879522e-03,  6.2532760e-03,
       -6.6817394e-03,  8.4776711e-03, -6.5132668e-03,  3.2889931e-03,
       -1.0552995e-03, -6.7842505e-03, -3.2899419e-03, -1.1597391e-03,
       -5.4733083e-03, -1.2108886e-03, -7.5639915e-03,  2.6432187e-03,
        9.0722553e-03, -2.3770204e-03, -9.7549811e-04,  3.5118519e-03,
        8.6662313e-03, -5.9210621e-03, -6.8897335e-03, -2.9336286e-03,
        9.1469856e-03,  8.6554210e-04, -8.6789466e-03, -1.4480762e-03,
        9.4787329e-03, -7.5504123e-03, -5.3587202e-03,  9.3142521e-03,
      

In [22]:
#parecido entre palabras
model.wv.similarity('second', 'third')

-0.07423881

## Ejemplo más real

In [23]:
import pandas as pd

In [24]:
df = pd.read_csv("./data.csv")
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [25]:
# Nueva columna - marca + modelo
df['Maker_Model']= df['Make']+ " " + df['Model']

In [26]:
# Elegimos algunas columnas que tengan texto y las combinamos en un solo campo
df1 = df[['Engine Fuel Type','Transmission Type','Driven_Wheels','Market Category','Vehicle Size', 'Vehicle Style', 'Maker_Model']]
df2 = df1.apply(lambda x: ','.join(x.astype(str)), axis=1) 
df_clean = pd.DataFrame({'clean': df2})
df_clean.head()

Unnamed: 0,clean
0,"premium unleaded (required),MANUAL,rear wheel ..."
1,"premium unleaded (required),MANUAL,rear wheel ..."
2,"premium unleaded (required),MANUAL,rear wheel ..."
3,"premium unleaded (required),MANUAL,rear wheel ..."
4,"premium unleaded (required),MANUAL,rear wheel ..."


In [27]:
# Creamos la lista de listas de palabras que sera la entrada del word2vec 
sent = [row.split(',') for row in df_clean['clean']] 
sent[:2]

[['premium unleaded (required)',
  'MANUAL',
  'rear wheel drive',
  'Factory Tuner',
  'Luxury',
  'High-Performance',
  'Compact',
  'Coupe',
  'BMW 1 Series M'],
 ['premium unleaded (required)',
  'MANUAL',
  'rear wheel drive',
  'Luxury',
  'Performance',
  'Compact',
  'Convertible',
  'BMW 1 Series']]

In [28]:
# Entrenamos el word2vec
model = gensim.models.Word2Vec(sent, min_count=1, window =3, sg = 1)

In [29]:
model.wv.similarity('BMW 1 Series M', 'BMW 1 Series')

0.93926346

In [30]:
model.wv.most_similar('BMW 1 Series M')

[('Ford Shelby GT350', 0.9400246739387512),
 ('BMW 1 Series', 0.9392634034156799),
 ('Mercedes-Benz SL-Class', 0.9388341307640076),
 ('Scion FR-S', 0.9382849335670471),
 ('Lamborghini Murcielago', 0.9380284547805786),
 ('Audi S5', 0.9379854202270508),
 ('Ford Mustang', 0.9376536011695862),
 ('Lotus Evora', 0.936672568321228),
 ('Aston Martin Vanquish', 0.93629390001297),
 ('BMW 2 Series', 0.9361668229103088)]

In [31]:
df[(df.Make == "BMW") & (df.Model == "1 Series M")]

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP,Maker_Model
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135,BMW 1 Series M


In [32]:
df[(df.Make == "Mercedes-Benz") & (df.Model == "SLS AMG GT")]

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP,Maker_Model
9687,Mercedes-Benz,SLS AMG GT,2013,premium unleaded (required),583.0,8.0,AUTOMATED_MANUAL,rear wheel drive,2.0,"Exotic,Factory Tuner,Luxury,High-Performance",Compact,Coupe,19,13,617,199500,Mercedes-Benz SLS AMG GT
9688,Mercedes-Benz,SLS AMG GT,2013,premium unleaded (required),583.0,8.0,AUTOMATED_MANUAL,rear wheel drive,2.0,"Exotic,Factory Tuner,Luxury,High-Performance",Compact,Convertible,19,13,617,206000,Mercedes-Benz SLS AMG GT
9689,Mercedes-Benz,SLS AMG GT,2014,premium unleaded (required),622.0,8.0,AUTOMATED_MANUAL,rear wheel drive,2.0,"Exotic,Factory Tuner,Luxury,High-Performance",Compact,Coupe,17,13,617,275000,Mercedes-Benz SLS AMG GT
9690,Mercedes-Benz,SLS AMG GT,2014,premium unleaded (required),583.0,8.0,AUTOMATED_MANUAL,rear wheel drive,2.0,"Exotic,Factory Tuner,Luxury,High-Performance",Compact,Coupe,19,13,617,201500,Mercedes-Benz SLS AMG GT
9691,Mercedes-Benz,SLS AMG GT,2014,premium unleaded (required),583.0,8.0,AUTOMATED_MANUAL,rear wheel drive,2.0,"Exotic,Factory Tuner,Luxury,High-Performance",Compact,Convertible,19,13,617,208000,Mercedes-Benz SLS AMG GT
