In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#collections provides defaultdict, used to group sentence indices by cluster label
import collections
#Natural Language ToolKit = nltk
import nltk 
nltk.download('stopwords')
nltk.download('punkt')
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint

#Load the quotes CSV into a DataFrame; a "Quotes" text column is read from it below
dataset = pd.read_csv(filepath_or_buffer = "E:\\ML Zero to Hero\\Quotes.csv")
print(dataset)

#Extract the "Quotes" column as a plain Python list (one entry per row);
#the bare name on the last line makes the notebook display the list
dataset_list = dataset.loc[:, "Quotes"].tolist()
dataset_list

                                               Quotes
0         Graphics designers are most creative people
1   Artificial Intelligence or AI is the last inve...
2   Snooker is a billiards sport for normally two ...
3   Snooker is played on a large (12 feet by 6 fee...
4    FOREX is the stock market for trading currencies
5   Software Engineering is hotter and hotter topi...
6                                       Love is blind
7   Snooker is popular in the United Kingdom and m...
8   The flying or operating of aircraft is known a...
9   AI is likely to be either the best or worst th...
10               Design is Intelligence made visible.
11            Falling in love is like being on drugs.
12  There is only one happiness in Life to Love an...
13  Boeing 777 is considered world's largest econo...
14  Warren Buffet is famous for making good invest...
15  The biggest of the many uses of aviation are i...
16  All giant majors in Silicon Valley is focusing...
17  Investing in stocks and 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['Graphics designers are most creative people',
 'Artificial Intelligence or AI is the last invention - humans could ever make',
 'Snooker is a billiards sport for normally two players.',
 'Snooker is played on a large (12 feet by 6 feet) table that is covered with a smooth green material.',
 'FOREX is the stock market for trading currencies',
 'Software Engineering is hotter and hotter topic in Silicon Valley',
 'Love is blind',
 'Snooker is popular in the United Kingdom and many other countries',
 'The flying or operating of aircraft is known as aviation.',
 'AI is likely to be either the best or worst thing happen to humanity',
 'Design is Intelligence made visible.',
 'Falling in love is like being on drugs.',
 'There is only one happiness in Life to Love and to be loved.',
 "Boeing 777 is considered world's largest economical plane in the world of Aviation.",
 'Warren Buffet is famous for making good investments.He knows stock markets',
 'The biggest of the many uses of aviation a

In [6]:
#Defining function tokenizer for text
def tokenizer(text):
    """Split ``text`` into word tokens, drop English stop words, and stem the rest.

    The stop-word comparison is an exact, case-sensitive match, and this
    function does not lowercase its input itself.

    Args:
        text: The string to tokenize.

    Returns:
        list: Porter-stemmed tokens in their original order, excluding every
        token that appears in NLTK's English stop-word list.
    """
    #tokenization is the process of splitting a large sample of text into words.
    tokens = word_tokenize(text)

    #Load the stop-word list once per call (instead of once per token) and keep
    #it in a set so each membership test is a hash lookup rather than a list scan.
    english_stop_words = set(stopwords.words('english'))

    stemmer = PorterStemmer()
    return [stemmer.stem(t) for t in tokens if t not in english_stop_words]

In [11]:
#---------------------Train the k-means model and build the TF-IDF matrix within the same function--------------------------#

def cluster_sectences(dataset_list, k, random_state = 42):
    """Cluster sentences with k-means over their TF-IDF vectors.

    Args:
        dataset_list: The raw text documents (sentences) to cluster.
        k: Number of clusters requested from k-means.
        random_state: Seed forwarded to ``KMeans``. Defaults to a fixed value
            so that re-running the notebook reproduces the same clusters;
            pass ``None`` to get unseeded (run-to-run varying) results.

    Returns:
        dict: Maps each cluster label (a plain ``int``) to the list of
        positions in ``dataset_list`` assigned to that cluster. Only labels
        that received at least one sentence appear as keys.
    """
    #Build the TF-IDF vectorizer.
    #Stop words are common words (I, my, the, and, ...) that get filtered out.
    tfidf_vectorizer = TfidfVectorizer(tokenizer = tokenizer, stop_words = stopwords.words('english'), lowercase = True)

    #Build a TF-IDF matrix for the sentences:
    #transforms text into feature vectors that can be used as estimator input.
    tfidf_matrix = tfidf_vectorizer.fit_transform(dataset_list)

    #Fit k-means. n_init is set explicitly so the number of restarts does not
    #depend on which scikit-learn version's default is installed, and
    #random_state seeds the centroid initialisation for reproducible labels.
    kmeans = KMeans(n_clusters = k, n_init = 10, random_state = random_state)
    kmeans.fit(tfidf_matrix)

    clusters = collections.defaultdict(list)

    for i, label in enumerate(kmeans.labels_):
        #labels_ holds NumPy integers; cast so the returned keys are plain ints.
        clusters[int(label)].append(i)

    return dict(clusters)

#Testing the model
k = 7
clusters = cluster_sectences(dataset_list, k)

for cluster in range(k):
    print("Cluster ", cluster, ":\n")
    #cluster_sectences returns a plain dict, so a label that received no
    #sentences is simply absent; fall back to an empty list rather than
    #raising KeyError.
    for i, sentence in enumerate(clusters.get(cluster, []), start = 1):
        print("\t", i, " : ", dataset_list[sentence])



Cluster  0 :

	 1  :  Artificial Intelligence or AI is the last invention - humans could ever make
	 2  :  AI is likely to be either the best or worst thing happen to humanity
	 3  :  Google will fulfill its mission only when its search engine is AI - complete You guys know what that means? That's Artificial Intelligence.
	 4  :  Auomation is the biggest blessing given by Artificial Inteligence.
	 5  :  AI would have a low error rate compared to humans if coded properly. 
Cluster  1 :

	 1  :  Graphics designers are most creative people
	 2  :  Design is Intelligence made visible.
	 3  :  Being in love is the number one reason why people wed.
	 4  :  Graphics Designing is high rated freelance subject
Cluster  2 :

	 1  :  Love is blind
	 2  :  Falling in love is like being on drugs.
	 3  :  There is only one happiness in Life to Love and to be loved.
	 4  :  Loving from a long distance actually strengthens a relationship
	 5  :  Real love is able to awaken your soul.
Cluster  3 :

	 1 