In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import nltk

In [2]:
pd.options.display.max_colwidth=200;
%matplotlib inline

In [3]:
pet_analysis = pd.read_csv('pet_analysis.csv')
pet_analysis.dropna(subset=['Description'], inplace=True) #removing blank entries

In [4]:
wpt=nltk.WordPunctTokenizer()
stop_words=nltk.corpus.stopwords.words('english')
def normalize_document (doc):
    #lowercase and remove special characters\whitespace
    doc=re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A) #re.I ignore case sensitive, ASCII-only matching
    doc=doc.lower()
    doc=doc.strip()
    #tokenize document
    tokens=wpt.tokenize(doc)
    #filter stopwords out of document
    filtered_tokens=[token for token in tokens if token not in stop_words]
    #re-create documenr from filtered tokens
    doc=' '.join(filtered_tokens)
    return doc
normalize_corpus=np.vectorize(normalize_document)
norm_corpus=normalize_corpus(pet_analysis['Description'])
norm_corpus

array(['nibble month old ball cuteness energetic playful rescued couple cats months ago could get neutered time clinic fully scheduled result little kitty enough space funds care cats household looking responsible people take nibbles care',
       'found alone yesterday near apartment shaking bring home provide temporary care',
       'pregnant mother dumped irresponsible owner roadside near shops subang jaya gave birth roadside healthy adorable puppies already dewormed vaccinated ready go home tying caging long hours guard dogs however acceptable cage tie precautionary purposes interested adopt pls call',
       ..., 'mix breed good temperament kittens love humans friendly',
       'shyadventures independentshe hates cagesbut loves climbing trees rooftopshowever loving',
       'fili loves laying around also loves sun laidback quiet'],
      dtype='<U4018')

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix=cv.fit_transform(norm_corpus)
cv_matrix

<14981x25098 sparse matrix of type '<class 'numpy.int64'>'
	with 444101 stored elements in Compressed Sparse Row format>

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
tt=TfidfTransformer(norm='l2', use_idf=True)
tt_mtarix=tt.fit_transform(cv_matrix)
tt_mtarix=tt_mtarix.toarray()
vocab=cv.get_feature_names()
pd.DataFrame(np.round(tt_mtarix,2), columns=vocab)

Unnamed: 0,10,11,12,13,22,2岁,30am,35,95,________________,...,都有告訴她,除了分享一下我的小故事,难怪它那么瘦,难怪我常看见它四处游荡,非誠勿擾,领养,领养一条生命比起购买来得更有意义,领养不出就真的必须打回原形,领养代替购买,领养者必须不定期
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
#Calculate TF-IDF for raw data, using TfidVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tv=TfidfVectorizer (min_df=0., max_df=1., norm='l2', 
                    #max_df ignore terms that appear in more than x% of the documents, 1 means ignore nothing!
                    #min_df ignore terms that appear in less than x% of the documents 
                  use_idf=True, smooth_idf=True)
tv_matrix=tv.fit_transform(norm_corpus)
tv_matrix=tv_matrix.toarray()
vocab=tv.get_feature_names()
pd.DataFrame (np.round(tv_matrix,2), columns=vocab)

Unnamed: 0,10,11,12,13,22,2岁,30am,35,95,________________,...,都有告訴她,除了分享一下我的小故事,难怪它那么瘦,难怪我常看见它四处游荡,非誠勿擾,领养,领养一条生命比起购买来得更有意义,领养不出就真的必须打回原形,领养代替购买,领养者必须不定期
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
from sklearn.metrics.pairwise import cosine_similarity 

In [10]:
similarity_matrix = cosine_similarity(tv_matrix)
similarity_df = pd.DataFrame(similarity_matrix)
similarity_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14971,14972,14973,14974,14975,14976,14977,14978,14979,14980
0,1.000000,0.037060,0.000000,0.023645,0.115458,0.000000,0.019433,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.028733,0.0,0.053014,0.010914,0.000000,0.000000,0.000000
1,0.037060,1.000000,0.044731,0.000000,0.025708,0.000000,0.000000,0.000000,0.031992,0.034484,...,0.010944,0.000000,0.018482,0.047838,0.0,0.014737,0.010128,0.000000,0.000000,0.000000
2,0.000000,0.044731,1.000000,0.063013,0.033673,0.000000,0.042148,0.131569,0.019402,0.039149,...,0.262213,0.028073,0.009894,0.025609,0.0,0.056903,0.005422,0.000000,0.000000,0.000000
3,0.023645,0.000000,0.063013,1.000000,0.013524,0.000000,0.010931,0.096133,0.032111,0.056052,...,0.053523,0.000000,0.057982,0.000000,0.0,0.043868,0.000000,0.066515,0.000000,0.000000
4,0.115458,0.025708,0.033673,0.013524,1.000000,0.000000,0.028019,0.072991,0.000000,0.094451,...,0.046207,0.000000,0.009502,0.004444,0.0,0.061853,0.036867,0.025034,0.025904,0.038280
5,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.026508,0.000000,0.061654,0.218287,...,0.000000,0.000000,0.000000,0.040157,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
6,0.019433,0.000000,0.042148,0.010931,0.028019,0.026508,1.000000,0.023367,0.000000,0.030374,...,0.057482,0.000000,0.000000,0.000000,0.0,0.030542,0.018536,0.000000,0.000000,0.000000
7,0.000000,0.000000,0.131569,0.096133,0.072991,0.000000,0.023367,1.000000,0.000000,0.019991,...,0.053426,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
8,0.000000,0.031992,0.019402,0.032111,0.000000,0.061654,0.000000,0.000000,1.000000,0.024590,...,0.000000,0.056182,0.000000,0.028576,0.0,0.017800,0.000000,0.000000,0.000000,0.000000
9,0.000000,0.034484,0.039149,0.056052,0.094451,0.218287,0.030374,0.019991,0.024590,1.000000,...,0.058653,0.000000,0.055737,0.000000,0.0,0.022999,0.039874,0.000000,0.000000,0.000000


In [11]:
from scipy.cluster.hierarchy import dendrogram, linkage
Z = linkage (similarity_matrix, 'ward')
pd.DataFrame(Z, columns=['Documents\Cluster 1','Documents\Cluster 2',
                        'Distance', 'Cluster Size'], dtype='object')
plt.figure (figsize = (8,3))
plt.title ('Hierarchical Clustering Dendrogram')
plt.xlabel('Data Point')
plt.ylabel('Distance')
dendrogram(Z)
plt.axhline(y=1.0,c='k',ls='--',lw=0.5);

KeyboardInterrupt: 