In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### Fitting and transforming a list of documents (in this case a list of sentences)

In [2]:
v = CountVectorizer()

sent = ["The sky is blue.", "The sun is bright today.", \
        "The sun in the sky is bright.", "We can see the shining sun, the bright sun."]
v.fit(sent)

transformed = v.transform(sent)

print(v.vocabulary_, '\nWord sky has id %d' % v.vocabulary_['sky'])

{'the': 9, 'sky': 7, 'is': 4, 'blue': 0, 'sun': 8, 'bright': 1, 'today': 10, 'in': 3, 'we': 11, 'can': 2, 'see': 5, 'shining': 6} 
Word sky has id 7


### Printing the vocabulary of the fitted vectorizer and the term frequency matrix

In [3]:
## Getting the vocabulary dictionary and sorting it based on value (word id)
print(sorted(v.vocabulary_.items(), key=lambda x: x[1])) 

print(transformed.toarray()) # The term frequency matrix for the four sentences

[('blue', 0), ('bright', 1), ('can', 2), ('in', 3), ('is', 4), ('see', 5), ('shining', 6), ('sky', 7), ('sun', 8), ('the', 9), ('today', 10), ('we', 11)]
[[1 0 0 0 1 0 0 1 0 1 0 0]
 [0 1 0 0 1 0 0 0 1 1 1 0]
 [0 1 0 1 1 0 0 1 1 2 0 0]
 [0 1 1 0 0 1 1 0 2 2 0 1]]


### The sparse matrix, a tuple of (doc_id, word_id) and the corresponding word count

In [4]:
print(transformed)

  (0, 0)	1
  (0, 4)	1
  (0, 7)	1
  (0, 9)	1
  (1, 1)	1
  (1, 4)	1
  (1, 8)	1
  (1, 9)	1
  (1, 10)	1
  (2, 1)	1
  (2, 3)	1
  (2, 4)	1
  (2, 7)	1
  (2, 8)	1
  (2, 9)	2
  (3, 1)	1
  (3, 2)	1
  (3, 5)	1
  (3, 6)	1
  (3, 8)	2
  (3, 9)	2
  (3, 11)	1


### Transforming a new sentence using the same vectorizer (will maintain word ids)

In [5]:
sent2 = ['The moon is bright today']

print(v.transform(sent2))

  (0, 1)	1
  (0, 4)	1
  (0, 9)	1
  (0, 10)	1


### The word moon doesn't appear in original vocabulary, so will not be found even after transforming new sentence

In [6]:
print('moon' in v.vocabulary_)

False


### Generating the Tfidf Matrix

In [8]:

vv = TfidfVectorizer(norm = None)
tfidf = vv.fit_transform(sent)
print(sorted(vv.vocabulary_.items(), key=lambda x : x[1]))

for row in tfidf.toarray():
    print(["%.4f"% val for val in row])

[('blue', 0), ('bright', 1), ('can', 2), ('in', 3), ('is', 4), ('see', 5), ('shining', 6), ('sky', 7), ('sun', 8), ('the', 9), ('today', 10), ('we', 11)]
['1.9163', '0.0000', '0.0000', '0.0000', '1.2231', '0.0000', '0.0000', '1.5108', '0.0000', '1.0000', '0.0000', '0.0000']
['0.0000', '1.2231', '0.0000', '0.0000', '1.2231', '0.0000', '0.0000', '0.0000', '1.2231', '1.0000', '1.9163', '0.0000']
['0.0000', '1.2231', '0.0000', '1.9163', '1.2231', '0.0000', '0.0000', '1.5108', '1.2231', '2.0000', '0.0000', '0.0000']
['0.0000', '1.2231', '1.9163', '0.0000', '0.0000', '1.9163', '1.9163', '0.0000', '2.4463', '2.0000', '0.0000', '1.9163']


### Building the Tfidf matrix for the first 200 documents in the simple-wiki

In [9]:
import os

vv = TfidfVectorizer()

docs = [] # a list of the text content of every document
names = [] # maintains a list of file names
for root, dirs, files in os.walk('single-docs'):
    for file in files[:200]:
        with open(os.path.join(root,file), 'r') as f:
            docs.append(f.read())
            names.append(file)
            
vv.fit(docs)

transformed = vv.transform(docs)

print(transformed.shape) # 200 documents by the number of unique words
print(transformed.toarray()) # will be mostly zeros as a document contains only a small subset of the number of words



### Fitting a KMeans clustering model

In [None]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=10, n_jobs=-2)
km.fit(transformed) # Takes as input the tf-idf matrix of the first 200 documents

print(km.labels_) # Document at index X in the list has cluster label Y

### Printing the titles of articles of each cluster

In [9]:
from collections import defaultdict

titles = defaultdict(list) # A dictionary of {cluster id: [list of file names]}

for (label, fname) in zip(km.labels_, names): # A list of tuples where every tuple is the file name and its cluster label
    
    # The title is the first line in each file
    titles[label].append(open('datasets/simple-wiki/single-docs/'+fname, 'r').readline().strip())
    
for key in titles:
    print(key, '\n', '\n'.join(titles[key]))
    print('-----')

NameError: name 'km' is not defined

#  Lab 4 content

### Example for cosine similarity and argmax functions

In [11]:
import numpy as np
from scipy.spatial.distance import cosine, cdist

print(1-cosine([1,2], [1,3])) # 1 - cosine(x,y) because cosine(x,y) in scipy returns distance not similarity
print(np.argmax([0.8,0.85,0.9]))

0.989949493661
2


### Transforming a query to tf-idf (using the same vectorizer)

In [12]:
q = ['president of']
tq = vv.transform(q)
print(tq, '\n', vv.vocabulary_['president'])

  (0, 4041)	0.95664224072
  (0, 3665)	0.291265554556 
 4041


### Printing file content having maximum similarity with query

In [18]:
import numpy as np

# Generates similarities between query tf-idf matrix and document tf-idf matrix
# Since there's only 1 query, access the first element of the array
# Format is a list of 200 elements, where every element is the cosine similarity between the query and the ith document
sims = 1-cdist(tq.toarray(), transformed.toarray(), metric='cosine')[0]

max_sim = max(sims)
# print('Maximum cosine similarity is', max_sim)
max_pos = np.argmax(sims)
# print('Document index with max similarity is %d, with file name %s' % (max_pos, names[max_pos]))

# print('\nFile:\n')

# print(open('single-docs/'+names[max_pos], 'r').read())

print(sims)


[ 0.06885819  0.          0.02191335  0.03012404  0.01739799  0.03286284
  0.01393504  0.03360573  0.07005985  0.02528124  0.02578343  0.00904867
  0.02149231  0.          0.04868966  0.          0.04121301  0.02451704
  0.02337811  0.07599082  0.00933449  0.02252396  0.0493637   0.00839028
  0.00922074  0.05406458  0.          0.0311793   0.03187265  0.00844349
  0.03916728  0.00979012  0.04553795  0.          0.02134364  0.01126531
  0.02539291  0.04297382  0.01939566  0.          0.          0.00988081
  0.01497567  0.01577973  0.01120685  0.01735907  0.01400123  0.00952379
  0.00636691  0.02047459  0.02146042  0.          0.08668022  0.03771595
  0.01448459  0.02382447  0.03391476  0.          0.02788774  0.01094955
  0.          0.          0.          0.          0.          0.03758774
  0.          0.02389195  0.02891978  0.04080641  0.00716038  0.
  0.05156767  0.04855129  0.01303399  0.00872571  0.0838551   0.0223763
  0.14460949  0.03320612  0.          0.02709157  0.        

### Ranking results

In [19]:
ids = np.argsort(sims)[::-1] # Returns a list of sorted document ids according to maximum similarity
print(ids)
# Printing the top 10 most similar docs
# print('--------\n'.join([open('single-docs/'+names[ids[i]], 'r').read() for i in range(10)]))


[106  78  52 118  76 104  19 163   8   0 137  84  25 107  72  22  14  73
 178  32 141 160 170  37 116 128  16  69 150 177  30 143 184  93 139  53
  65 181 171  56   7  79   5  28  91  27 153 182 117 102  94 124   3 168
  68 142 135  58  81 149  89 127 185  10  36   9 147 108  17  67  98  55
 190  18 114 186 154 133 194 161  21  77 109   2  12  50  34  99 165  49
 148 100 187 101  38 111 183 152   4  45 174 193 192 129 164 132  43  90
 169 103  87  42 162  54 105  46   6  88  83 189  74 112 151 196 180  35
  44  59 130 157 140 122 167  41  31 120  47  20  24  11 145  75  29  23
 110  96 176  70  48 198 188 191 195   1  33  15 175 197 166 173  26 172
  13 179 156  39 125 131  82  85  86 126  92 123 134  95  97 121 119 115
 113  80 136  40  61 159  51 158 155  57  60  62 138  63  64  66 146  71
 144 199]
