<a href="https://colab.research.google.com/github/khuloodnasher/Text-Clustering/blob/main/Text_Clustering_by_K_means.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import re

In [2]:
corpus=""" 
Amazon  posted quarterly net sales of $125.6 billion.
up 44% from the same period in the prior year and well ahead of the $119.7 billion Wall Street analysts had projected.
Net income in the quarter hit $7.2 billion nearly double the $3.7 billion Wall Street predicted
and more than double the $3.3 billion in income the company earned in the year-ago quarter.
Earnings per diluted share were $14.09.After a week of practices here in Mobile. 100plus prospects for the 2021 NFL Draft capped off a crucial job interview in Saturday's Reese's Senior Bowl.
The National team, coached by the Miami Dolphins staff. defeated the Carolina Panthers-led American team, 27-24. 
""".split("\n")[1:-1]

In [3]:
corpus

['Amazon  posted quarterly net sales of $125.6 billion.',
 'up 44% from the same period in the prior year and well ahead of the $119.7 billion Wall Street analysts had projected.',
 'Net income in the quarter hit $7.2 billion nearly double the $3.7 billion Wall Street predicted',
 'and more than double the $3.3 billion in income the company earned in the year-ago quarter.',
 "Earnings per diluted share were $14.09.After a week of practices here in Mobile. 100plus prospects for the 2021 NFL Draft capped off a crucial job interview in Saturday's Reese's Senior Bowl.",
 'The National team, coached by the Miami Dolphins staff. defeated the Carolina Panthers-led American team, 27-24. ']

In [4]:
# clearing and tokenizing
l_A = corpus[0].lower().split()
l_B = corpus[1].lower().split()
l_C = corpus[2].lower().split()

# Calculating bag of words
word_set = set(l_A).union(set(l_B)).union(set(l_C))

word_dict_A = dict.fromkeys(word_set, 0)
word_dict_B = dict.fromkeys(word_set, 0)
word_dict_C = dict.fromkeys(word_set, 0)

for word in l_A:
    word_dict_A[word] += 1

for word in l_B:
    word_dict_B[word] += 1

for word in l_C:
    word_dict_C[word] += 1

In [5]:
word_dict_A

{'$119.7': 0,
 '$125.6': 1,
 '$3.7': 0,
 '$7.2': 0,
 '44%': 0,
 'ahead': 0,
 'amazon': 1,
 'analysts': 0,
 'and': 0,
 'billion': 0,
 'billion.': 1,
 'double': 0,
 'from': 0,
 'had': 0,
 'hit': 0,
 'in': 0,
 'income': 0,
 'nearly': 0,
 'net': 1,
 'of': 1,
 'period': 0,
 'posted': 1,
 'predicted': 0,
 'prior': 0,
 'projected.': 0,
 'quarter': 0,
 'quarterly': 1,
 'sales': 1,
 'same': 0,
 'street': 0,
 'the': 0,
 'up': 0,
 'wall': 0,
 'well': 0,
 'year': 0}

In [6]:
word_dict_B

{'$119.7': 1,
 '$125.6': 0,
 '$3.7': 0,
 '$7.2': 0,
 '44%': 1,
 'ahead': 1,
 'amazon': 0,
 'analysts': 1,
 'and': 1,
 'billion': 1,
 'billion.': 0,
 'double': 0,
 'from': 1,
 'had': 1,
 'hit': 0,
 'in': 1,
 'income': 0,
 'nearly': 0,
 'net': 0,
 'of': 1,
 'period': 1,
 'posted': 0,
 'predicted': 0,
 'prior': 1,
 'projected.': 1,
 'quarter': 0,
 'quarterly': 0,
 'sales': 0,
 'same': 1,
 'street': 1,
 'the': 3,
 'up': 1,
 'wall': 1,
 'well': 1,
 'year': 1}

In [7]:
word_dict_C

{'$119.7': 0,
 '$125.6': 0,
 '$3.7': 1,
 '$7.2': 1,
 '44%': 0,
 'ahead': 0,
 'amazon': 0,
 'analysts': 0,
 'and': 0,
 'billion': 2,
 'billion.': 0,
 'double': 1,
 'from': 0,
 'had': 0,
 'hit': 1,
 'in': 1,
 'income': 1,
 'nearly': 1,
 'net': 1,
 'of': 0,
 'period': 0,
 'posted': 0,
 'predicted': 1,
 'prior': 0,
 'projected.': 0,
 'quarter': 1,
 'quarterly': 0,
 'sales': 0,
 'same': 0,
 'street': 1,
 'the': 2,
 'up': 0,
 'wall': 1,
 'well': 0,
 'year': 0}

In [8]:
def compute_tf(word_dict, l):
    tf = {}
    sum_nk = len(l)
    for word, count in word_dict.items():
        tf[word] = count/sum_nk
    return tf
  
tf_A = compute_tf(word_dict_A, l_A)
tf_B = compute_tf(word_dict_B, l_B)
tf_C = compute_tf(word_dict_C, l_C)


In [9]:
tf_A

{'$119.7': 0.0,
 '$125.6': 0.125,
 '$3.7': 0.0,
 '$7.2': 0.0,
 '44%': 0.0,
 'ahead': 0.0,
 'amazon': 0.125,
 'analysts': 0.0,
 'and': 0.0,
 'billion': 0.0,
 'billion.': 0.125,
 'double': 0.0,
 'from': 0.0,
 'had': 0.0,
 'hit': 0.0,
 'in': 0.0,
 'income': 0.0,
 'nearly': 0.0,
 'net': 0.125,
 'of': 0.125,
 'period': 0.0,
 'posted': 0.125,
 'predicted': 0.0,
 'prior': 0.0,
 'projected.': 0.0,
 'quarter': 0.0,
 'quarterly': 0.125,
 'sales': 0.125,
 'same': 0.0,
 'street': 0.0,
 'the': 0.0,
 'up': 0.0,
 'wall': 0.0,
 'well': 0.0,
 'year': 0.0}

In [10]:
tf_B

{'$119.7': 0.045454545454545456,
 '$125.6': 0.0,
 '$3.7': 0.0,
 '$7.2': 0.0,
 '44%': 0.045454545454545456,
 'ahead': 0.045454545454545456,
 'amazon': 0.0,
 'analysts': 0.045454545454545456,
 'and': 0.045454545454545456,
 'billion': 0.045454545454545456,
 'billion.': 0.0,
 'double': 0.0,
 'from': 0.045454545454545456,
 'had': 0.045454545454545456,
 'hit': 0.0,
 'in': 0.045454545454545456,
 'income': 0.0,
 'nearly': 0.0,
 'net': 0.0,
 'of': 0.045454545454545456,
 'period': 0.045454545454545456,
 'posted': 0.0,
 'predicted': 0.0,
 'prior': 0.045454545454545456,
 'projected.': 0.045454545454545456,
 'quarter': 0.0,
 'quarterly': 0.0,
 'sales': 0.0,
 'same': 0.045454545454545456,
 'street': 0.045454545454545456,
 'the': 0.13636363636363635,
 'up': 0.045454545454545456,
 'wall': 0.045454545454545456,
 'well': 0.045454545454545456,
 'year': 0.045454545454545456}

In [11]:
tf_C

{'$119.7': 0.0,
 '$125.6': 0.0,
 '$3.7': 0.0625,
 '$7.2': 0.0625,
 '44%': 0.0,
 'ahead': 0.0,
 'amazon': 0.0,
 'analysts': 0.0,
 'and': 0.0,
 'billion': 0.125,
 'billion.': 0.0,
 'double': 0.0625,
 'from': 0.0,
 'had': 0.0,
 'hit': 0.0625,
 'in': 0.0625,
 'income': 0.0625,
 'nearly': 0.0625,
 'net': 0.0625,
 'of': 0.0,
 'period': 0.0,
 'posted': 0.0,
 'predicted': 0.0625,
 'prior': 0.0,
 'projected.': 0.0,
 'quarter': 0.0625,
 'quarterly': 0.0,
 'sales': 0.0,
 'same': 0.0,
 'street': 0.0625,
 'the': 0.125,
 'up': 0.0,
 'wall': 0.0625,
 'well': 0.0,
 'year': 0.0}

In [12]:
import numpy as np
import re


corpus=""" 
Amazon  posted quarterly net sales of $125.6 billion.
up 44% from the same period in the prior year and well ahead of the $119.7 billion Wall Street analysts had projected.
Net income in the quarter hit $7.2 billion nearly double the $3.7 billion Wall Street predicted
and more than double the $3.3 billion in income the company earned in the year-ago quarter.
Earnings per diluted share were $14.09.After a week of practices here in Mobile. 100plus prospects for the 2021 NFL Draft capped off a crucial job interview in Saturday's Reese's Senior Bowl.
The National team, coached by the Miami Dolphins staff. defeated the Carolina Panthers-led American team, 27-24. 
""".split("\n")[1:-1]



In [13]:
corpus

['Amazon  posted quarterly net sales of $125.6 billion.',
 'up 44% from the same period in the prior year and well ahead of the $119.7 billion Wall Street analysts had projected.',
 'Net income in the quarter hit $7.2 billion nearly double the $3.7 billion Wall Street predicted',
 'and more than double the $3.3 billion in income the company earned in the year-ago quarter.',
 "Earnings per diluted share were $14.09.After a week of practices here in Mobile. 100plus prospects for the 2021 NFL Draft capped off a crucial job interview in Saturday's Reese's Senior Bowl.",
 'The National team, coached by the Miami Dolphins staff. defeated the Carolina Panthers-led American team, 27-24. ']

In [14]:
# cleaning
def preprocessing(line):
    line = line.lower()
    line = re.sub(r"[{}]", " ", line)
    return line

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans





tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessing)
tfidf = tfidf_vectorizer.fit_transform(corpus)

kmeans = KMeans(n_clusters=2).fit(tfidf)


In [16]:

lines_for_predicting = ["The National team, coached by the Miami Dolphins staff. defeated the Carolina Panthers-led American team, 27-24.", "and more than double the $3.3 billion in income the company earned in the year-ago quarter."]
kmeans.predict(tfidf_vectorizer.transform(lines_for_predicting))

array([0, 0], dtype=int32)

In [17]:
predict_1 = ["Amazon  posted quarterly net sales of $125.6 billion.", "up 44% from the same period in the prior year and well ahead of the $119.7 billion Wall Street analysts had projected."]
kmeans.predict(tfidf_vectorizer.transform(predict_1))

array([1, 0], dtype=int32)

In [18]:
corpus=""" 
Amazon  posted quarterly net sales of $125.6 billion.
up 44% from the same period in the prior year and well ahead of the $119.7 billion Wall Street analysts had projected.
Net income in the quarter hit $7.2 billion nearly double the $3.7 billion Wall Street predicted
and more than double the $3.3 billion in income the company earned in the year-ago quarter.
Earnings per diluted share were $14.09.After a week of practices here in Mobile. 100plus prospects for the 2021 NFL Draft capped off a crucial job interview in Saturday's Reese's Senior Bowl.
The National team, coached by the Miami Dolphins staff. defeated the Carolina Panthers-led American team, 27-24. 
"""


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

In [20]:
documents = ["Amazon  posted quarterly net sales of $125.6 billion.",
             "up 44% from the same period in the prior year and well ahead of the $119.7 billion Wall Street analysts had projected.",
             "Net income in the quarter hit $7.2 billion nearly double the $3.7 billion Wall Street predicted.",
             "more than double the $3.3 billion in income the company earned in the year-ago quarter.",
             "Earnings per diluted share were $14.09.After a week of practices here in Mobile. 100plus prospects for the 2021 NFL Draft capped off a crucial job interview in Saturday's Reese's Senior Bowl.",
             "The National team, coached by the Miami Dolphins staff. defeated the Carolina Panthers-led American team, 27-24."]

In [21]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)


In [22]:
true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=2, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [23]:
# Top terms per cluster
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
order_centroids

array([[ 0, 27, 34, 38, 22, 42, 31, 30, 58, 46, 16, 15, 49,  5, 51, 52,
        53,  4, 20, 25,  1, 18, 17, 19, 11, 14, 13, 12, 10,  9,  8,  7,
         6,  3,  2, 21, 59, 23, 43, 57, 56, 55, 54, 50, 48, 47, 45, 44,
        41, 24, 40, 39, 37, 36, 35, 33, 32, 28, 26, 29],
       [14, 37, 29, 47, 24, 59, 57, 55, 56, 41, 11, 50, 48,  3, 26, 19,
         9, 43, 28, 36, 44, 45, 40, 13,  2, 10,  8, 21, 23, 12, 18, 35,
         7, 32, 33,  6, 39, 54, 17, 15,  5,  4,  1, 53, 16, 20, 22, 25,
        52, 27, 58, 30, 31, 34, 38, 42, 46, 49, 51,  0]])

In [24]:
terms = vectorizer.get_feature_names()
terms

['09',
 '100plus',
 '119',
 '125',
 '14',
 '2021',
 '24',
 '27',
 '44',
 'ago',
 'ahead',
 'amazon',
 'american',
 'analysts',
 'billion',
 'bowl',
 'capped',
 'carolina',
 'coached',
 'company',
 'crucial',
 'defeated',
 'diluted',
 'dolphins',
 'double',
 'draft',
 'earned',
 'earnings',
 'hit',
 'income',
 'interview',
 'job',
 'led',
 'miami',
 'mobile',
 'national',
 'nearly',
 'net',
 'nfl',
 'panthers',
 'period',
 'posted',
 'practices',
 'predicted',
 'prior',
 'projected',
 'prospects',
 'quarter',
 'quarterly',
 'reese',
 'sales',
 'saturday',
 'senior',
 'share',
 'staff',
 'street',
 'team',
 'wall',
 'week',
 'year']

In [25]:
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print('------------------------------------------')

print("\n")
print("Prediction")

Cluster 0:
 09
 earnings
 mobile
 nfl
 diluted
 practices
 job
 interview
 week
 prospects
------------------------------------------
Cluster 1:
 billion
 net
 income
 quarter
 double
 year
 wall
 street
 team
 posted
------------------------------------------


Prediction


In [26]:
Y = vectorizer.transform(["The National team, coached by the Miami Dolphins staff."])
prediction = model.predict(Y)
print(prediction)

[1]
