In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from operator import itemgetter
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
import seaborn as sn
import glob as glob

In [None]:
# Bag-of-words vectorizer for the corpus.
#   input='filename'        -> items passed to fit/transform are file paths to read
#   strip_accents='unicode' -> accents are removed via Unicode normalisation
#   lowercase=True          -> text is lower-cased before tokenising
#   min_df=.2               -> terms found in fewer than 20% of documents are dropped
vec = CountVectorizer(
    input='filename',
    strip_accents='unicode',
    lowercase=True,
    min_df=.2,
)

In [None]:
# Collect the corpus files in sorted order. glob returns matches in arbitrary,
# filesystem-dependent order; sorting keeps the row order of `dtm` stable
# between runs, so a given row index always refers to the same document.
input_files = sorted(glob.glob("../data/na*/data/texts/*.txt"))
if not input_files:
    # Fail early with an actionable message instead of letting the vectorizer
    # raise an unrelated-looking error on an empty corpus.
    raise FileNotFoundError(
        "No text files matched '../data/na*/data/texts/*.txt'; "
        "check the working directory and the data path."
    )

# Document-term matrix: one row per entry of `input_files`, one column per
# vocabulary term learned by `vec`.
dtm = vec.fit_transform(input_files)

In [None]:
# Size of the document-term matrix: rows are documents, columns are vocabulary terms.
dc, vc = dtm.shape[0], dtm.shape[1]
print(f"document count: {dc} vocabulary count: {vc}")

In [None]:
# K-means clustering of the document-term matrix into 10 clusters.
#
# The model is fitted exactly once, with a fixed seed, so that `fitted`,
# `centroids` and `labels` all describe the same clustering and are
# reproducible across runs. Calling fit_transform() and then fit_predict()
# refits the model from a fresh random initialisation, which can leave the
# labels out of step with the centroids captured from the earlier fit.
means = KMeans(n_clusters=10, random_state=1)

# Distance from every document to every cluster centre
# (one row per document, one column per cluster).
fitted = means.fit_transform(dtm)

# Cluster centres in vocabulary space (one row per cluster, one column per term).
centroids = means.cluster_centers_

# Same distances as `fitted`; kept as an independent copy under its existing name
# so later cells that use `fitted_dist` keep working.
fitted_dist = fitted.copy()

# Cluster index assigned to each document by the single fit above.
labels = means.labels_

In [None]:
# List the documents that fall into each cluster, one cluster at a time.
for cluster_id in set(labels):
    print("Class:", cluster_id)
    # Paths whose assigned label matches the cluster currently being printed.
    members = [
        path
        for path, assigned in zip(input_files, labels)
        if assigned == cluster_id
    ]
    for path in members:
        print(path)
    # Blank separator between clusters.
    print("\n")

In [None]:
# Pairwise cosine similarity between all documents (rows of `dtm`),
# converted to a distance: 0 for identical direction, larger for less similar.
doc_similarity = cosine_similarity(dtm)
dist_matrix = 1 - doc_similarity

In [None]:
# display the similarity matrix (white=similar, dark blue=different)
%matplotlib inline
sn.clustermap(dist_matrix,cmap='Blues',fmt='g',linewidths=.5)

In [None]:
# Scatter plot of the cluster centres using their first two coordinates.
# NOTE(review): columns 0 and 1 are just the first two feature columns of the
# centroid matrix, not a 2-D projection of the full space — confirm this is intended.
centroid_x = centroids[:, 0]
centroid_y = centroids[:, 1]
plt.scatter(centroid_x, centroid_y, s=200, alpha=0.5, color='red')
plt.show()

In [None]:
# Embed the documents in two dimensions with multidimensional scaling, using
# the precomputed pairwise distances; the seed is fixed for a repeatable layout.
mds = MDS(
    random_state=1,
    dissimilarity="precomputed",
    n_components=2,
)
pos = mds.fit_transform(dist_matrix)

# One point per document: first embedding coordinate against the second.
xs, ys = pos.T
plt.scatter(xs, ys)

In [None]:
# Rank every document by its distance to one reference document and print the
# ranking, closest first (the reference itself is included in the list).
query_row = 21  # row of dist_matrix / position in input_files used as the reference
ranked = sorted(enumerate(dist_matrix[query_row]), key=lambda pair: pair[1])
for doc_index, distance in ranked:
    print('{0:.2f} {1}'.format(distance, input_files[doc_index]))