In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.spatial.distance import cosine

In [3]:
# Read processed.csv (path relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="processed.csv")

In [13]:
# Take the "text" column as a plain Python list, in reverse row order.
documents = list(df["text"].iloc[::-1])

In [14]:
# Keep only the entries that are real strings. Anything else is echoed,
# followed by a marker line, and left out of the corpus.
good_documents = []
for doc in documents:
    if isinstance(doc, str):
        good_documents.append(doc)
        continue
    print(doc)
    print("carramba!")

In [15]:
# Fit a TF-IDF model on the cleaned corpus and keep the document-term matrix.
# IDF weighting is on; IDF smoothing is explicitly switched off.
tfidf = TfidfVectorizer(
    smooth_idf=False,
    use_idf=True,
)
trans = tfidf.fit_transform(raw_documents=good_documents)

In [16]:
# Convert the TF-IDF matrix to a dense NumPy array; the DataFrame, SVD and PCA
# cells below all operate on this dense copy.
trans_np = trans.toarray()

In [17]:
# One row per kept document, one column per vocabulary term.
dfTFIDF = pd.DataFrame(
    data=trans_np,
    columns=tfidf.get_feature_names_out(),
    index=np.arange(len(good_documents)),
)

In [18]:
# Vectorise the query with the already-fitted TF-IDF model, then score every
# document row by cosine similarity (1 - cosine distance).
query = "Which instrument are used in Jazz most often?"
query = tfidf.transform([query]).toarray().ravel()
1 - dfTFIDF.apply(cosine, axis=1, args=(query,))

0    0.097264
1    0.273358
dtype: float64

In [11]:
# Spot-check a single document by position; this raises IndexError when the
# corpus holds 1106 or fewer documents.
good_documents[1106]

IndexError: list index out of range

Calculating SVD

In [29]:
# SVD input: transpose so that rows are terms and columns are documents.
to_svc = np.transpose(trans_np)

In [31]:
# Reduced SVD (full_matrices=False): to_svc == u @ diag(s) @ vh.
u, s, vh = np.linalg.svd(to_svc, full_matrices=False, compute_uv=True)

In [37]:
# Number of singular values returned by the decomposition.
np.shape(s)

(1496,)

In [89]:
# Work on fresh copies of the two factor matrices and build the dense diagonal
# matrices holding the singular values (s_c) and their reciprocals (s_c_inv).
u_c = u.copy()
vh_c = vh.copy()
s_shape = (s.size, s.size)
s_c = np.diag(s)
s_c_inv = np.diag(1 / s)

In [55]:
# Multiply the three factors back together, left to right.
reconstruction = np.matmul(np.matmul(u_c, s_c), vh_c)

In [57]:
# Sanity check: the factors reproduce the original matrix within the default
# floating-point tolerances.
bool(np.isclose(to_svc, reconstruction).all())

True

In [83]:
# Count the singular values greater than 1 (the off-diagonal zeros never
# qualify); this count is the number of concepts kept below.
np.sum(s_c > 1)

383

In [85]:
# Shape of the right factor before truncation.
np.shape(vh_c)

(1496, 1496)

In [111]:
# Rank-k truncation: keep only the concepts whose singular value exceeds 1.
# The cut-off is derived from the diagonal of s_c instead of being typed in by
# hand four times, so the slices cannot drift out of sync with the data (it is
# 383 for the run recorded in this notebook). np.linalg.svd returns singular
# values in descending order, so the first n_concepts entries are exactly the
# ones above the threshold.
n_concepts = int((np.diag(s_c) > 1).sum())
u_c_k = u_c[:, :n_concepts]
s_c_k = s_c[:n_concepts, :n_concepts]
s_c_inv_k = s_c_inv[:n_concepts, :n_concepts]
vh_c_k = vh_c[:n_concepts, :]

In [112]:
# Fold the query into the truncated concept space: q_k = q @ U_k @ S_k^-1.
query = "Which instrument are used in Jazz most often?"
query = tfidf.transform([query]).toarray().ravel()
query_svd = np.matmul(np.matmul(query, u_c_k), s_c_inv_k)

In [113]:
# One row per document, one column per kept concept.
dfSVD = pd.DataFrame(
    data=np.transpose(vh_c_k),
    index=np.arange(len(good_documents)),
)

In [121]:
# Cosine distances are sorted ascending first and only then turned into
# similarities, so the displayed series runs from most to least similar.
1 - dfSVD.apply(cosine, axis=1, args=(query_svd,)).sort_values()

262     0.256950
902     0.247742
778     0.247742
886     0.235292
600     0.217865
          ...   
1027   -0.113587
691    -0.113740
123    -0.125571
417    -0.126511
429    -0.130846
Length: 1496, dtype: float64

In [140]:
# Inspect the text of document 123, one of the lowest-scoring rows in the
# recorded SVD ranking above.
good_documents[123]

'Biguine big-IN French biɡin Antillean Creole bigin rhythm-centric style music originated Saint Pierre Martinique century It fuses Bèlè French ballroom dance steps African rhythms Two main types French antillean biguine identified based instrumentation contemporary musical practice called drum biguine orchestrated biguine Each refers characteristics specific origin The drum biguine bidgin bèlè Creole comes series bèlè dances performed since early colonial times slaves inhabited great sugar plantations Musically bidgin bèlè distinguished orchestrated biguine following ways instrumentation cylindrical single-membraned drum bèlè rhythm sticks tibwa call-and-response singing style soloist improvisation nasal voice quality According study Rosemain biguine figured fertility rituals practiced West Africa ritual significance since disappeared Martinique Bidgin bèlè originates slave bèlè dances characterized use bèlè drums tibwa rhythm sticks along call response nasal vocals improvised instrume

Calculating PCA

In [59]:
from sklearn.decomposition import PCA

In [61]:
# Fit a 1000-component PCA on the dense TF-IDF matrix.
# With svd_solver left at its default ("auto"), scikit-learn may choose the
# randomized solver for a matrix of this size, which makes the fitted
# components vary from run to run. Fixing random_state keeps the projection,
# and therefore the ranking further down, reproducible.
pca = PCA(n_components=1000, random_state=0)
pca.fit(trans_np)

PCA(n_components=1000)

In [63]:
# Fraction of the total variance captured by the retained components.
np.sum(pca.explained_variance_ratio_)

0.9403229903846362

In [64]:
# (documents, vocabulary terms) of the dense TF-IDF matrix.
np.shape(trans_np)

(1496, 66609)

In [122]:
# Project every document onto the fitted principal components.
docs_transformed = pca.transform(X=trans_np)

In [128]:
# Vectorise the query with the fitted TF-IDF model and project it with the
# fitted PCA; transform() needs a 2-D input, hence the single-row reshape.
query = "Which instrument are used in Jazz most often?"
query = tfidf.transform([query]).toarray().ravel()
query_pca = pca.transform(query.reshape(1, -1))

In [129]:
# One row per document, one column per principal component.
dfPCA = pd.DataFrame(
    data=docs_transformed,
    index=np.arange(len(good_documents)),
)

In [131]:
# Rank documents by cosine similarity to the query in PCA space.
# pca.transform returns a 2-D (1, n_components) array, but
# scipy.spatial.distance.cosine only accepts 1-D vectors (SciPy >= 1.9 raises
# "Input vector should be 1-D" otherwise), so flatten the query once up front.
# Distances are sorted ascending before being turned into similarities, so the
# displayed series runs from most to least similar.
query_pca_flat = np.ravel(query_pca)
1 - dfPCA.apply(cosine, axis=1, args=(query_pca_flat,)).sort_values()

886     0.328625
890     0.247540
374     0.221042
903     0.218829
907     0.217452
          ...   
1324   -0.137649
1226   -0.137649
1326   -0.141872
1422   -0.142796
1323   -0.155055
Length: 1496, dtype: float64

In [144]:
# Inspect the text of document 1323, the lowest-scoring row in the recorded
# PCA ranking above.
good_documents[1323]

