In [6]:
#import libraries

import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [7]:
#libraries to create a dictionary and a TF-IDF model
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
import re



In [8]:
from collections import defaultdict
import itertools

In [9]:
# Location of the cleaned social-media training CSV.
DATA_PATH = "C:/Users/270168/Downloads/personal/nlp-starter-test/social_media_clean_text.csv"

# Read the training data into a DataFrame.
df = pd.read_csv(DATA_PATH)

In [10]:
# Preview the first rows to confirm the file loaded with the expected columns.

df.head()

Unnamed: 0,text,choose_one,class_label
0,just happened a terrible car crash,Relevant,1
1,our deeds are the reason of this earthquake m...,Relevant,1
2,"heard about earthquake is different cities, s...",Relevant,1
3,"there is a forest fire at spot pond, geese are...",Relevant,1
4,forest fire near la ronge sask canada,Relevant,1


In [11]:
# Copy the raw text into a working column for preprocessing.
# The column is selected by name rather than by position so the right data is
# picked even if the CSV gains a leading index column; this also matches the
# later cell that vectorizes df["text"].

df["processed_text"] = df["text"]

In [12]:
def process(G):
    """Tokenize a text string into lemmatized, stop-word-free tokens.

    Parameters
    ----------
    G : str
        Raw text. ``None`` and NaN (pandas' missing-value marker) are treated
        as empty text. Any other non-string input still raises, so passing
        the wrong column is not silently hidden.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order, after removing English
        stop words and tokens that contain no ASCII letter.
    """
    # NaN is the only float that is not equal to itself.
    if G is None or (isinstance(G, float) and G != G):
        return []

    # The stop-word set, lemmatizer and compiled pattern do not change between
    # calls, so they are built once and cached on the function object instead
    # of being re-created for every document.
    if not hasattr(process, "_stop_words"):
        process._stop_words = frozenset(stopwords.words('english'))
        process._lemmatizer = WordNetLemmatizer()
        process._has_letter = re.compile('[a-zA-Z]')

    stop_words = process._stop_words
    has_letter = process._has_letter.search
    lemmatize = process._lemmatizer.lemmatize

    # split() with no argument splits on any whitespace run, so tabs, newlines
    # and repeated spaces never end up inside a token.
    return [
        lemmatize(token)
        for token in G.split()
        if token not in stop_words and has_letter(token)
    ]

In [13]:
# Tokenize from the untouched "text" column so this cell can be re-run safely:
# reading and overwriting "processed_text" in place would feed already
# tokenized lists back into process() on a second run.
df["processed_text"] = df["text"].apply(process)

In [14]:
# Plain Python list of token lists, one entry per row.
data_text = df["processed_text"].tolist()

In [15]:
# Build the token <-> integer id mapping from the tokenized documents.
dictionary = Dictionary(documents=df["processed_text"])

In [16]:
# Sanity check: look up the integer id assigned to the token "flood".
dictionary.token2id.get("flood")

72

In [17]:
# Convert every tokenized document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, data_text))

In [18]:
# Sum each token id's occurrences over every document in the corpus.
gross_count = defaultdict(int)
for bow in corpus:
    for token_id, freq in bow:
        gross_count[token_id] += freq

In [19]:
# Rank (token id, total count) pairs from most to least frequent.
sorted_gross_count = list(gross_count.items())
sorted_gross_count.sort(key=lambda pair: pair[1], reverse=True)

In [20]:
# Show the ten most frequent tokens next to their corpus-wide counts.

top_entries = sorted_gross_count[:10]
for token_id, total in top_entries:
    print(dictionary.get(token_id), total)

fire 443
amp 442
like 436
i'm 309
u 307
get 303
new 268
one 253
people 245
via 235


In [21]:
# Fit a gensim TF-IDF model on the bag-of-words corpus.
# NOTE(review): `tfidf` is not referenced by any later cell shown here.

tfidf = TfidfModel(corpus)


In [22]:
# Inspect the dictionary's values view.
# NOTE(review): the recorded output is only the ValuesView wrapper's repr, so
# no tokens are actually shown; wrapping the call in list(...) would
# presumably display them -- confirm before changing.
dictionary.values()

ValuesView(<gensim.corpora.dictionary.Dictionary object at 0x000002B68B9FF048>)

In [23]:
from sklearn import feature_extraction

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
# Vectorizer settings, one per line so each knob is easy to find and tune.
vectorizer_options = dict(
    tokenizer=process,      # reuse the notebook's own tokenizer/lemmatizer
    ngram_range=(1, 3),
    min_df=0.01,
    max_df=0.8,
    max_features=200000,
    use_idf=True,
)
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)

In [26]:
# Fit the vectorizer on the raw "text" column and build the document-term
# TF-IDF matrix in one step (tokenization is delegated to process()).
tfidf_matrix = tfidf_vectorizer.fit_transform(df["text"])

In [27]:
# Report the matrix dimensions: (number of documents, number of retained terms).
print(tfidf_matrix.shape)

(9282, 78)


In [28]:
# Vocabulary terms, ordered to match the columns of tfidf_matrix.
# get_feature_names() was deprecated and later removed from scikit-learn, so
# prefer get_feature_names_out() when it exists and fall back otherwise.
# .tolist() keeps `terms` a plain Python list in both cases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()

In [29]:
# List the terms that survived the vectorizer's frequency filters.
print(terms)

['accident', 'amp', 'attack', 'back', 'body', 'bomb', 'building', 'burning', 'california', "can't", 'car', 'collapse', 'could', 'crash', 'day', 'dead', 'death', 'disaster', 'emergency', 'even', 'fire', 'first', 'flood', 'full', 'get', 'go', 'going', 'good', 'got', 'hiroshima', 'home', 'house', "i'm", 'injury', 'killed', 'know', 'last', 'life', 'like', 'look', 'love', 'make', 'man', 'mass', 'need', 'never', 'new', 'news', 'nuclear', 'one', 'people', 'police', 'right', 'rt', 'say', 'see', 'service', 'still', 'storm', 'suicide', 'take', 'think', 'time', 'today', 'train', 'two', 'u', 'via', 'video', 'w', 'want', 'war', 'way', 'weapon', 'woman', 'world', 'would', 'year']


In [30]:
#K Means Cluster
# Partition the TF-IDF document vectors into n_cluster groups.

from sklearn.cluster import KMeans
n_cluster = 5
# random_state pins the centroid initialisation so a Restart & Run All yields
# the same cluster labels; n_init is set explicitly because its default
# differs between scikit-learn releases.
km = KMeans(n_clusters=n_cluster, n_init=10, random_state=42)
km.fit(tfidf_matrix)
# One integer label per document, in the same row order as tfidf_matrix.
clusters = km.labels_.tolist()

In [31]:
# Attach each document's K-Means label to its row; `clusters` follows the row
# order of df["text"], which is what the TF-IDF matrix was built from.
df['cluster'] = clusters

In [32]:
# Count documents per cluster label to see how balanced the clusters are.
df['cluster'].value_counts()

0    8192
1     376
4     274
3     220
2     220
Name: cluster, dtype: int64

In [33]:
# For every cluster centroid, rank the feature indices from highest to lowest
# weight: argsort() orders each row ascending and [:, ::-1] reverses it.
# NOTE(review): `order_centroids` is not used by any later cell shown here.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]



In [34]:
from sklearn.decomposition import LatentDirichletAllocation

In [35]:
# Fit a two-topic LDA model with online variational updates.
# random_state fixes the stochastic parts of the fit so the topics are
# reproducible across runs.
# NOTE(review): the model is trained on TF-IDF weights rather than raw term
# counts -- confirm that this is intended.
lda_model = LatentDirichletAllocation(
    n_components=2,
    max_iter=10,
    learning_method='online',
    learning_offset=50,
    random_state=42,
)
lda_model.fit(tfidf_matrix)

In [36]:
# Load pyLDAvis' scikit-learn adapter and enable inline rendering in the notebook.
# NOTE(review): newer pyLDAvis releases reportedly ship this adapter under a
# different module name -- confirm against the installed version.
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [37]:
# Build the interactive topic visualisation from the fitted LDA model, the
# TF-IDF matrix it was trained on, and the vectorizer that produced that matrix.
pyLDAvis.sklearn.prepare(lda_model,tfidf_matrix, tfidf_vectorizer)

  nbits = re.compile('(\d+)bit').search(abits).group(1)
  "\s+stepping\s+(?P<STP>\d+)", re.IGNORECASE)
