In [2]:
#import libraries

import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [47]:
#libraries to create a dictionary and a TF-IDF model
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
import re

In [4]:
from collections import defaultdict
import itertools

In [5]:
# Load the training data.
# The CSV location is kept in one named constant so it is easy to spot and edit.
DATA_PATH = "C:/Users/270168/Downloads/personal/nlp-starter-test/social_media_clean_text.csv"
df = pd.read_csv(DATA_PATH)

In [6]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,text,choose_one,class_label
0,just happened a terrible car crash,Relevant,1
1,our deeds are the reason of this earthquake m...,Relevant,1
2,"heard about earthquake is different cities, s...",Relevant,1
3,"there is a forest fire at spot pond, geese are...",Relevant,1
4,forest fire near la ronge sask canada,Relevant,1


In [77]:
# Copy the raw text into its own column for preprocessing.
# The column is selected by name rather than by position: iloc[:, 0] picks the
# wrong column whenever the CSV carries a leading index column (the preview
# output lists "Unnamed: 0" first), and the vectorizer cell also reads "text".

df["processed_text"] = df["text"]

In [78]:
# Stop-word set and lemmatizer are built on first use and then shared by every
# call, instead of being rebuilt for each document.
_PROCESS_RESOURCES = {}


def _process_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PROCESS_RESOURCES:
        # A frozenset gives constant-time membership tests; the original list
        # was scanned linearly for every token.
        _PROCESS_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        _PROCESS_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _PROCESS_RESOURCES["stop_words"], _PROCESS_RESOURCES["lemmatizer"]


def process(G):
    """Tokenize, filter and lemmatize a single document.

    Parameters
    ----------
    G : str
        Text of one document. It is split on single spaces only, so
        punctuation stays attached to the neighbouring word.

    Returns
    -------
    list of str
        Lemmatized tokens, in their original order, that are not English
        stop words and contain at least one ASCII letter. A non-string
        input (for example ``None`` or a float NaN from a missing cell)
        yields an empty list instead of raising ``AttributeError``.
    """
    if not isinstance(G, str):
        return []

    stop_words, lemmatizer = _process_resources()
    kept_tokens = [
        token
        for token in G.split(" ")
        # The letter check also discards the empty strings that consecutive
        # spaces produce, as well as purely numeric/punctuation tokens.
        if token not in stop_words and re.search('[a-zA-Z]', token)
    ]
    return [lemmatizer.lemmatize(token) for token in kept_tokens]

In [79]:
# Tokenize from the untouched "text" column so this cell can be re-run safely:
# reading from "processed_text" itself means a second run would hand process()
# the token lists produced by the first run instead of the raw strings.
df["processed_text"] = df["text"].apply(process)

In [81]:
# Plain Python list of token lists, one entry per document.
data_text = df["processed_text"].tolist()

In [82]:
# Build the token <-> integer id mapping from the tokenized documents.
dictionary = Dictionary(documents=df["processed_text"])

In [83]:
# Look up the integer id assigned to the token "flood" (None if it is absent).
token_to_id = dictionary.token2id
token_to_id.get("flood")

72

In [84]:
# Convert every tokenized document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, data_text))

In [85]:
# Sum each token id's count over all documents in the corpus.
gross_count = defaultdict(int)
for bow in corpus:
    for token_id, freq in bow:
        gross_count[token_id] += freq

In [86]:
# Order the (token id, total count) pairs from most to least frequent.
sorted_gross_count = list(gross_count.items())
sorted_gross_count.sort(key=lambda pair: pair[1], reverse=True)

In [87]:
# Print the ten most frequent tokens together with their corpus-wide counts.

top_entries = sorted_gross_count[:10]
for token_id, total in top_entries:
    print(dictionary.get(token_id), total)

fire 443
amp 442
like 436
i'm 309
u 307
get 303
new 268
one 253
people 245
via 235


In [88]:
# Fit a gensim TF-IDF model on the bag-of-words corpus.

tfidf = TfidfModel(corpus=corpus)


In [91]:
# Ask the dictionary for its values; the recorded output is only the
# ValuesView repr, so wrap this in list(...) to see the tokens themselves.
dictionary.values()

ValuesView(<gensim.corpora.dictionary.Dictionary object at 0x0000026202CD4DA0>)

In [92]:
from sklearn import feature_extraction

In [93]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [94]:
# TF-IDF vectorizer that tokenizes with process() and builds 1- to 3-grams.
# min_df / max_df are given as fractions of the documents.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=process,
    ngram_range=(1, 3),
    min_df=0.01,
    max_df=0.8,
    max_features=200000,
    use_idf=True,
)

In [95]:
# Learn the vocabulary from the raw text column and vectorize it in one pass.
raw_documents = df["text"]
tfidf_matrix = tfidf_vectorizer.fit_transform(raw_documents)

In [96]:
# Show the matrix dimensions as (documents, features).
matrix_shape = tfidf_matrix.shape
print(matrix_shape)

(9282, 78)


In [97]:
# Vocabulary terms in feature-column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps `terms` a plain list of str, as the old method returned.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()

In [98]:
# Print every vocabulary term the vectorizer kept.
print(terms)

['accident', 'amp', 'attack', 'back', 'body', 'bomb', 'building', 'burning', 'california', "can't", 'car', 'collapse', 'could', 'crash', 'day', 'dead', 'death', 'disaster', 'emergency', 'even', 'fire', 'first', 'flood', 'full', 'get', 'go', 'going', 'good', 'got', 'hiroshima', 'home', 'house', "i'm", 'injury', 'killed', 'know', 'last', 'life', 'like', 'look', 'love', 'make', 'man', 'mass', 'need', 'never', 'new', 'news', 'nuclear', 'one', 'people', 'police', 'right', 'rt', 'say', 'see', 'service', 'still', 'storm', 'suicide', 'take', 'think', 'time', 'today', 'train', 'two', 'u', 'via', 'video', 'w', 'want', 'war', 'way', 'weapon', 'woman', 'world', 'would', 'year']


In [99]:
# K-means clustering of the TF-IDF document vectors.

from sklearn.cluster import KMeans

n_cluster = 5
# K-means starts from randomly chosen centroids, so without a fixed
# random_state the cluster ids and sizes change on every run.
# n_init is spelled out because its default differs between scikit-learn versions.
km = KMeans(n_clusters=n_cluster, n_init=10, random_state=42)
km.fit(tfidf_matrix)
# One cluster id per document, in row order.
clusters = km.labels_.tolist()

In [100]:
# Attach each document's cluster id to the frame, aligned on the existing index.
df['cluster'] = pd.Series(clusters, index=df.index)

In [101]:
# Number of documents assigned to each cluster, largest first.
cluster_sizes = df['cluster'].value_counts()
cluster_sizes

0    7971
2     365
4     361
3     354
1     231
Name: cluster, dtype: int64

In [102]:
# For every centroid, rank feature indices by weight: argsort gives ascending
# order per row, and reversing the columns turns that into descending order.
ascending_order = km.cluster_centers_.argsort()
order_centroids = ascending_order[:, ::-1]



In [123]:
# Show the ten highest-weighted terms of each cluster centroid.

for cluster_id in range(n_cluster):
    print("Words in Cluster {}: ".format(cluster_id), end='')
    top_terms = [terms[feature_idx] for feature_idx in order_centroids[cluster_id, :10]]
    print(top_terms)
# A single blank line after the whole listing.
print()
    
    

Words in Cluster 0: ['get', 'via', "i'm", 'new', 'u', 'news', 'people', 'emergency', 'video', 'storm']
Words in Cluster 1: ['one', 'like', 'day', 'year', 'get', "i'm", 'fire', 'people', 'got', 'time']
Words in Cluster 2: ['amp', 'rt', 'fire', 'u', 'w', 'new', 'like', 'take', 'time', "i'm"]
Words in Cluster 3: ['fire', 'building', 'california', 'service', 'burning', 'u', 'two', 'say', 'news', 'time']
Words in Cluster 4: ['like', 'look', "i'm", 'get', 'people', 'fire', 'would', 'burning', 'think', 'back']

