In [1]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
from nltk.corpus import brown
 
# One string per Brown corpus file: the file's tokens joined by single spaces.
data = [' '.join(brown.words(fileid)) for fileid in brown.fileids()]

In [3]:
len(data)

500

In [4]:
# Bag-of-words term counts for every document in `data`.
#   min_df=5    -> ignore terms found in fewer than 5 documents
#   max_df=0.9  -> ignore terms found in more than 90% of the documents
# A token is a run of at least three ASCII letters and/or hyphens.
# The pattern is a raw string: in a normal string literal '\-' is an invalid
# escape sequence, which Python reports as a Deprecation/SyntaxWarning. The
# regular expression itself is unchanged.
vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words='english', lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)

In [15]:
# NOTE(review): data_vectorized[1] is already a single-row matrix, so the
# trailing [:20] slices rows (not columns) and leaves the full row unchanged.
# If the first 20 columns were intended, that would be
# data_vectorized[1, :20] -- confirm intent.
data_vectorized[1][:20]

<1x10625 sparse matrix of type '<class 'numpy.int64'>'
	with 571 stored elements in Compressed Sparse Row format>

In [16]:
# Build a Latent Semantic Indexing (LSI) model: a truncated SVD of the
# document-term count matrix, keeping NUM_TOPICS components.
NUM_TOPICS = 10
# TruncatedSVD uses a randomized solver by default, so without a fixed seed
# every re-fit yields a slightly different projection. Pin random_state so
# that re-running the notebook reproduces the same document/query vectors.
lsi_model = TruncatedSVD(n_components=NUM_TOPICS, random_state=0)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (number of documents, NUM_TOPICS)

(500, 10)


In [17]:
# Example query sentence, converted to a single-row term-count matrix with
# the vectorizer that was fitted on the corpus (its vocabulary is reused;
# nothing is re-fitted here).
text = "The economy is working better than ever"
a = vectorizer.transform([text])

In [20]:
print(a)

  (0, 933)	1
  (0, 3095)	1
  (0, 10532)	1


In [21]:
b = lsi_model.transform(a)

In [26]:
b[0]

array([ 0.07701771,  0.01117207, -0.0182993 ,  0.00038927,  0.02189971,
        0.0315538 , -0.06086479,  0.00171008, -0.00868334,  0.0150389 ])

In [6]:
print(lsi_Z[0])

[ 23.306844     1.59517984  21.7764597   -0.12922833   0.87456984
  11.20116348   4.00098305  -2.10129853   1.69774492 -14.67177308]


In [7]:
# Project the example sentence into the LSI topic space, one step at a time:
# raw text -> term counts (fitted vocabulary) -> topic coordinates.
text = "The economy is working better than ever"
query_counts = vectorizer.transform([text])
x = lsi_model.transform(query_counts)[0]  # first (only) row as a 1-D vector
print(x)

[ 0.0770177   0.01116973 -0.01833687  0.00091544  0.02300427  0.0317107
 -0.06423964  0.00012078 -0.00786014  0.01639095]


In [8]:
from sklearn.metrics.pairwise import euclidean_distances
 
def most_similar(x, Z, top_n=5):
    """Return the rows of ``Z`` that lie closest to the vector ``x``.

    Parameters
    ----------
    x : array-like
        Query vector; it is reshaped to a single row before comparison.
    Z : array-like
        Matrix whose rows are compared against ``x``.
    top_n : int, default 5
        Number of leading entries to keep from the ranking.

    Returns
    -------
    list of (int, float)
        ``(row_index, euclidean_distance)`` pairs ordered by increasing
        distance, i.e. the closest row comes first.
    """
    query_row = x.reshape(1, -1)
    # euclidean_distances returns a (1, n_rows) matrix; take its only row.
    row_distances = euclidean_distances(query_row, Z)[0]
    ranking = list(enumerate(row_distances))
    ranking.sort(key=lambda entry: entry[1])  # stable, ascending by distance
    return ranking[:top_n]
 
# Rank the corpus documents by their distance to the query vector x and show
# the beginning of the closest one.
# NOTE(review): the printed query vector is orders of magnitude smaller than
# the printed document vector, so a Euclidean ranking may mostly reflect
# vector length rather than topic; cosine distance may be a better fit --
# confirm.
similarities = most_similar(x, lsi_Z)
# Despite its name, `similarity` holds a Euclidean distance (smaller = closer).
document_id, similarity = similarities[0]
print(data[document_id][:1000])  # first 1000 characters of the best match

6.4 . The primary decomposition theorem We are trying to study a linear operator T on the finite-dimensional space V , by decomposing T into a direct sum of operators which are in some sense elementary . We can do this through the characteristic values and vectors of T in certain special cases , i.e. , when the minimal polynomial for T factors over the scalar field F into a product of distinct monic polynomials of degree 1 . What can we do with the general T ? ? If we try to study T using characteristic values , we are confronted with two problems . First , T may not have a single characteristic value ; ; this is really a deficiency in the scalar field , namely , that it is not algebraically closed . Second , even if the characteristic polynomial factors completely over F into a product of polynomials of degree 1 , there may not be enough characteristic vectors for T to span the space V . This is clearly a deficiency in T . The second situation is illustrated by the operator T on Af ( 