In [1]:
import re
import pandas as pd
import numpy as np

 

corpus = {'D1': "in the new york times in",
          'D2': "the new york post",
          'D3': "the los angeles times"}

 

corpus = pd.DataFrame.from_dict(corpus, orient='index', columns=['texto'])

corpus

Unnamed: 0,texto
D1,in the new york times in
D2,the new york post
D3,the los angeles times


In [2]:
corpus['d'] = corpus['texto'].apply(lambda fila: len(fila.split()))

corpus

Unnamed: 0,texto,d
D1,in the new york times in,6
D2,the new york post,4
D3,the los angeles times,4


In [3]:
from sklearn.feature_extraction.text import CountVectorizer

 

count_vect = CountVectorizer()
bow_rep = count_vect.fit_transform(corpus['texto'].values)

In [5]:
tf = pd.DataFrame(bow_rep.toarray())
tf.columns = count_vect.get_feature_names()
tf.index = corpus.index
tf

Unnamed: 0,angeles,in,los,new,post,the,times,york
D1,0,2,0,1,0,1,1,1
D2,0,0,0,1,1,1,0,1
D3,1,0,1,0,0,1,1,0


In [6]:
tf.loc['D1'].sort_values(ascending=False)

in         2
york       1
times      1
the        1
new        1
post       0
los        0
angeles    0
Name: D1, dtype: int64

In [7]:
tf = tf.div(corpus['d'], axis=0).round(3)
tf

Unnamed: 0,angeles,in,los,new,post,the,times,york
D1,0.0,0.333,0.0,0.167,0.0,0.167,0.167,0.167
D2,0.0,0.0,0.0,0.25,0.25,0.25,0.0,0.25
D3,0.25,0.0,0.25,0.0,0.0,0.25,0.25,0.0


In [8]:
tf.loc['D1'].sort_values(ascending=False)

in         0.333
york       0.167
times      0.167
the        0.167
new        0.167
post       0.000
los        0.000
angeles    0.000
Name: D1, dtype: float64

In [10]:
df = {}
for palabra in count_vect.get_feature_names():
    cnt = corpus['texto'].apply(lambda fila: palabra in fila).sum()
    df[palabra] = cnt
df = pd.DataFrame.from_dict(df, orient='index', columns=['doc_count'])
N = corpus.shape[0]
df['df'] = df['doc_count']/N
df

Unnamed: 0,doc_count,df
angeles,1,0.333333
in,1,0.333333
los,1,0.333333
new,2,0.666667
post,1,0.333333
the,3,1.0
times,2,0.666667
york,2,0.666667


In [12]:
df['idf'] = 1/df['df']
df['log_idf'] = np.log10(df['idf'])
df

Unnamed: 0,doc_count,df,idf,log_idf
angeles,1,0.333333,3.0,0.477121
in,1,0.333333,3.0,0.477121
los,1,0.333333,3.0,0.477121
new,2,0.666667,1.5,0.176091
post,1,0.333333,3.0,0.477121
the,3,1.0,1.0,0.0
times,2,0.666667,1.5,0.176091
york,2,0.666667,1.5,0.176091


In [13]:
df['idf'] = 1/df['df']
df['log_idf'] = np.log10(df['idf'])

 

df

Unnamed: 0,doc_count,df,idf,log_idf
angeles,1,0.333333,3.0,0.477121
in,1,0.333333,3.0,0.477121
los,1,0.333333,3.0,0.477121
new,2,0.666667,1.5,0.176091
post,1,0.333333,3.0,0.477121
the,3,1.0,1.0,0.0
times,2,0.666667,1.5,0.176091
york,2,0.666667,1.5,0.176091


In [14]:
tfidf = df.join(tf.T)
tfidf['tfidf_d1'] = tfidf['D1'] * tfidf['log_idf']
tfidf['tfidf_d2'] = tfidf['D2'] * tfidf['log_idf']
tfidf['tfidf_d3'] = tfidf['D3'] * tfidf['log_idf']

 

tfidf[['tfidf_d1', 'tfidf_d2', 'tfidf_d3']]

Unnamed: 0,tfidf_d1,tfidf_d2,tfidf_d3
angeles,0.0,0.0,0.11928
in,0.158881,0.0,0.0
los,0.0,0.0,0.11928
new,0.029407,0.044023,0.0
post,0.0,0.11928,0.0
the,0.0,0.0,0.0
times,0.029407,0.0,0.044023
york,0.029407,0.044023,0.0


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
corpus

Unnamed: 0,texto,pp
n1,La compañía Boring de Elon Musk construirá una...,compañía boring elon musk construirá conexión ...
n2,La compañía Boring de Elon Musk construirá un ...,compañía boring elon musk construirá enlace al...
n3,La empresa Boring de Elon Musk aprobó la const...,empresa boring elon musk aprobó construcción t...
n4,Tanto la manzana como la naranja son frutas,manzana naranja frutas


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

 

tfidf_vect = TfidfVectorizer()
tfidf = tfidf_vect.fit_transform(corpus['texto'].values)

 

tfidf_matrix = pd.DataFrame(tfidf.toarray(), columns=tfidf_vect.get_feature_names())
tfidf_matrix.index = corpus.index

 

tfidf_matrix.T.round(3)

Unnamed: 0,D1,D2,D3
angeles,0.0,0.0,0.584
in,0.811,0.0,0.0
los,0.0,0.0,0.584
new,0.308,0.48,0.0
post,0.0,0.632,0.0
the,0.239,0.373,0.345
times,0.308,0.0,0.445
york,0.308,0.48,0.0


In [17]:
n1 = "La compañía Boring de Elon Musk construirá una conexión de alta velocidad en el aeropuerto de Chicago"
n2 = "La compañía Boring de Elon Musk construirá un enlace de alta velocidad al aeropuerto de Chicago"
n3 = "La empresa Boring de Elon Musk aprobó la construcción del tránsito de alta velocidad entre el centro de Chicago y el aeropuerto O'Hare."
n4 = "Tanto la manzana como la naranja son frutas"

 

corpus = {'n1': n1,
          'n2': n2,
          'n3': n3,
          'n4': n4}

 

corpus = pd.DataFrame.from_dict(corpus, orient='index', columns=['texto'])

In [18]:
import re
from nltk.corpus import stopwords
stopwords_sp = stopwords.words('spanish')

 

def pre_procesado(texto):
    texto = texto.lower()
    texto = re.sub(r"[\W\d_]+", " ", texto)
    texto = " ".join([palabra for palabra in texto.split() if palabra not in stopwords_sp])
    return texto

 

corpus['pp'] = corpus['texto'].apply(lambda texto: pre_procesado(texto))

 

corpus

Unnamed: 0,texto,pp
n1,La compañía Boring de Elon Musk construirá una...,compañía boring elon musk construirá conexión ...
n2,La compañía Boring de Elon Musk construirá un ...,compañía boring elon musk construirá enlace al...
n3,La empresa Boring de Elon Musk aprobó la const...,empresa boring elon musk aprobó construcción t...
n4,Tanto la manzana como la naranja son frutas,manzana naranja frutas


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

 

tfidf_vect = TfidfVectorizer()
tfidf = tfidf_vect.fit_transform(corpus.pp.values)

 

tfidf_matrix = pd.DataFrame(data=tfidf.toarray(), columns=tfidf_vect.get_feature_names())

 

tfidf_matrix = tfidf_matrix.T.round(3)
tfidf_matrix.columns = corpus.index

 

tfidf_matrix

Unnamed: 0,n1,n2,n3,n4
aeropuerto,0.283,0.283,0.215,0.0
alta,0.283,0.283,0.215,0.0
aprobó,0.0,0.0,0.336,0.0
boring,0.283,0.283,0.215,0.0
centro,0.0,0.0,0.336,0.0
chicago,0.283,0.283,0.215,0.0
compañía,0.349,0.349,0.0,0.0
conexión,0.443,0.0,0.0,0.0
construcción,0.0,0.0,0.336,0.0
construirá,0.349,0.349,0.0,0.0


Distancia Euclidiana 

In [21]:
from sklearn.metrics.pairwise import euclidean_distances
dist_euc = euclidean_distances(tfidf_matrix.T.values)
dist_euc = pd.DataFrame(dist_euc, columns = tfidf_matrix.columns, index = tfidf_matrix.columns)
dist_euc

Unnamed: 0,n1,n2,n3,n4
n1,0.0,0.626497,1.072192,1.413952
n2,0.626497,0.0,1.072192,1.413952
n3,1.072192,1.072192,0.0,1.414121
n4,1.413952,1.413952,1.414121,0.0


Distancia del coseno

In [22]:
from sklearn.metrics.pairwise import cosine_distances

 

dist_cos = cosine_distances(tfidf_matrix.T.values)
dist_cos = pd.DataFrame(dist_cos, columns = tfidf_matrix.columns, index = tfidf_matrix.columns)
dist_cos

Unnamed: 0,n1,n2,n3,n4
n1,0.0,0.196156,0.574388,1.0
n2,0.196156,0.0,0.574388,1.0
n3,0.574388,0.574388,0.0,1.0
n4,1.0,1.0,1.0,0.0


DIstancia de Jaccard 

In [23]:
def jaccard_distance(list1, list2):
    s1 = set(list1)
    s2 = set(list2)
    return 1 - len(s1.intersection(s2)) / len(s1.union(s2))

 

jaccard_distance(corpus.loc['n1']['pp'].split(), corpus.loc['n3']['pp'].split())

0.5625

Distancia de Levenshtein

In [25]:
import nltk

nltk.edit_distance(corpus.loc['n1']['pp'].split(), corpus.loc['n2']['pp'].split())
nltk.edit_distance(corpus.loc['n1']['pp'].split(), corpus.loc['n3']['pp'].split())
nltk.edit_distance(corpus.loc['n1']['pp'].split(), corpus.loc['n4']['pp'].split())
nltk.edit_distance(corpus.loc['n2']['pp'].split(), corpus.loc['n3']['pp'].split())
nltk.edit_distance(corpus.loc['n2']['pp'].split(), corpus.loc['n4']['pp'].split())

10