In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('./data/nlp_tweets_2012_2017.csv')
df.head()

Unnamed: 0,index,ID,datetime,has_media,is_reply,is_retweet,medias,nbr_favorite,nbr_reply,nbr_retweet,text,url,user_id,usernameTweet,regex_clean_tweets,lemmed_tweet,nouns,proper_nouns,geopolitical_ents
0,1869,252633076105097216,2012-10-01 00:00:00,no media,False,False,no media,1.0,0.0,1.0,@ ckundo Inspired by your ConEd summary ou...,/seecmb/status/252633076105097216,294434762.0,seecmb,ckundo Inspired ConEd summary outage info Ruby...,ckundo Inspired ConEd summary outage info Ruby...,summary outage info gem power outage detail da...,Inspired ConEd Ruby,
1,3094,254638373040422912,2012-10-06 13:00:00,no media,False,False,no media,0.0,0.0,0.0,Partial power outage at #MediaLoft That's...,/LuisPaganStudio/status/254638373040422912,212391633.0,LuisPaganStudio,Partial power outage MediaLoft That twice week...,Partial power outage MediaLoft That twice week...,power outage week,MediaLoft Damn ConEd,
2,2528,256621964268032000,2012-10-12 01:00:00,no media,False,False,no media,0.0,0.0,0.0,"Um, can someone call ConEd , there's a power...",/marlonthereal/status/256621964268032000,14144742.0,marlonthereal,Um someone call ConEd power outage Bronx Sourc...,Um someone call ConEd power outage Bronx Sourc...,someone power outage post,ConEd Bronx Source YHqV,
3,723,259866379086340096,2012-10-20 23:00:00,no media,False,False,no media,0.0,0.0,0.0,Power outage took over my entire block as s...,/meligyh/status/259866379086340096,280700235.0,meligyh,Power outage took entire block soon loaded Con...,Power outage took entire block soon loaded Con...,Power outage block,ConEd,
4,487,261909073295974401,2012-10-26 15:00:00,no media,False,False,no media,0.0,1.0,10.0,Mayor: A major concern given this type of stor...,/NYCMayorsOffice/status/261909073295974401,55338739.0,NYCMayorsOffice,Mayor A major concern given type storm possibi...,Mayor A major concern given type storm possibi...,concern type storm possibility power outage,Mayor ConEdison,


In [156]:
X_raw = df.text
X_clean = df.regex_clean_tweets
X_lem = df.lemmed_tweet
X_noun = df.nouns

In [244]:
def vectorizer(input):
    # km = KMeans(n_clusters=3)
    cv = CountVectorizer(ngram_range=(1,2),
                         max_features=3000)
    X_vec = cv.fit_transform(input)
    
#     params = {'n_grams': [(1,1), (1,2), (2,2), (1,3)],
#               'max_features': [None, 500, 1000, 2000, 3000]}
 
    return pd.DataFrame(X_vec.todense(), columns=cv.get_feature_names())

In [245]:
X_raw_df = vectorizer(X_raw)

In [246]:
X_clean_df = vectorizer(X_clean)

In [247]:
X_lem_df = vectorizer(X_lem)

In [248]:
X_noun_df = vectorizer(X_noun.dropna())

In [249]:
X_clean_df.shape

(2082, 3000)

In [250]:
pipe = Pipeline([('scaler', StandardScaler()),
                 ('cluster', KMeans())])

In [251]:
params = {'cluster__n_clusters': [3, 4, 5]}
gs = GridSearchCV(pipe, param_grid=params)

### KMeans Clustering

In [452]:
def clusterer(input):

    ss = StandardScaler()
    input_sc = ss.fit_transform(input)
    km = KMeans(n_clusters=4, random_state=42)
    km.fit(input_sc)

    return km.labels_

In [453]:
raw_cluster = clusterer(X_raw_df)

In [454]:
set(raw_cluster)

{0, 1, 2, 3}

In [455]:
len(raw_cluster)

2082

In [456]:
sum(raw_cluster == 0)

18

In [457]:
clean_cluster = clusterer(X_clean_df)

In [458]:
set(clean_cluster)

{0, 1, 2, 3}

In [459]:
len(clean_cluster)

2082

In [460]:
print(sum(clean_cluster == 0))
print(sum(clean_cluster == 1))
print(sum(clean_cluster == 2))
print(sum(clean_cluster == 3))

42
2034
3
3


In [461]:
silhouette_score(X_clean_df, clean_cluster)

0.05476045218616988

In [469]:
for i in range(4):
    a = X_clean_df[X_clean_df.cluster == i].sum() > 0
    print(f'CLUSTER {i} has {sum(clean_cluster == i)} tweets with the following words:')
    print(a[a == True])
    print('=========')

CLUSTER 0 has 42 tweets with the following words:
coned                   True
coned warns             True
consolidated            True
consolidated ediso      True
customers               True
customers possible      True
ediso                   True
ediso tghs              True
lower                   True
lower manhattan         True
manhattan               True
manhattan customers     True
new                     True
new york                True
outages                 True
outages tonight         True
possible                True
possible power          True
power                   True
power outages           True
reuters                 True
reuters consolidated    True
tghs                    True
tonight                 True
tonight new             True
warns                   True
warns lower             True
york                    True
york reuters            True
dtype: bool
CLUSTER 1 has 2034 tweets with the following words:
coned                  True
coned warns      

In [240]:
X_clean_df['cluster'] = clean_cluster

array([1, 1, 1, ..., 1, 1, 1], dtype=int32)

In [236]:
lem_cluster = clusterer(X_lem_df)

In [237]:
set(lem_cluster)

{0, 1, 2, 3}

In [238]:
len(lem_cluster)

2082

In [243]:
sum(lem_cluster == 1)

2066

In [214]:
noun_cluster = clusterer(X_noun_df)

In [215]:
set(noun_cluster)

{0, 1, 2, 3}

In [216]:
len(noun_cluster)

2070

In [217]:
sum(noun_cluster == 0)

7

## DBSCAN Clustering

In [424]:
def DBSCAN_clusterer(input):

    ss = StandardScaler()
    input_sc = ss.fit_transform(input)
    dbscan = DBSCAN(eps=0.4, min_samples=6)
    dbscan.fit(input_sc)

    return dbscan.labels_

In [425]:
raw_cluster = DBSCAN_clusterer(X_raw_df)

In [426]:
set(raw_cluster)

{-1, 0, 1, 2, 3}

In [427]:
len(raw_cluster)

2082

In [428]:
sum(raw_cluster == 0)

7

In [429]:
clean_cluster = DBSCAN_clusterer(X_clean_df)

In [430]:
set(clean_cluster)

{-1, 0, 1, 2, 3, 4, 5}

In [431]:
len(clean_cluster)

2082

In [432]:
for i in set(clean_cluster):
    print(i, sum(clean_cluster == i))

0 7
1 7
2 8
3 7
4 6
5 6
-1 2041


In [433]:
X_clean_df['cluster'] = clean_cluster
X_clean_df.head()

Unnamed: 0,aaaa,aaaa ijeyfe,abc,abc ny,able,about,above,above screenshot,abt,abt power,...,zone,zoom,zoom details,zpzuap,zq,zup,zv,zv conedison,zyqz,cluster
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1


In [435]:
for i in range(5):
    a = X_clean_df[X_clean_df.cluster == i].sum() > 0
    print(f'CLUSTER {i} has {sum(clean_cluster == i)} tweets with the following words:')
    print(a[a == True])
    print('=========')

CLUSTER 0 has 7 tweets with the following words:
coned                   True
coned warns             True
consolidated            True
consolidated ediso      True
customers               True
customers possible      True
ediso                   True
ediso tghs              True
lower                   True
lower manhattan         True
manhattan               True
manhattan customers     True
new                     True
new york                True
outages                 True
outages tonight         True
possible                True
possible power          True
power                   True
power outages           True
reuters                 True
reuters consolidated    True
tghs                    True
tonight                 True
tonight new             True
warns                   True
warns lower             True
york                    True
york reuters            True
dtype: bool
CLUSTER 1 has 7 tweets with the following words:
coned                  True
coned warns          

In [436]:
silhouette_score(X_clean_df, clean_cluster)

0.0540537949664378

In [402]:
X_clean_df['cluster'] = clean_cluster

In [403]:
lem_cluster = DBSCAN_clusterer(X_lem_df)

In [404]:
set(lem_cluster)

{-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}

In [405]:
len(lem_cluster)

2082

In [406]:
for i in set(lem_cluster):
    print(i, sum(lem_cluster == i))

0 5
1 7
2 7
3 5
4 5
5 5
6 5
7 8
8 6
9 7
10 5
11 7
12 6
13 5
14 6
15 5
16 5
17 5
18 5
19 5
20 5
-1 1963


In [407]:
noun_cluster = DBSCAN_clusterer(X_noun_df)

In [408]:
set(noun_cluster)

{-1,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25}

In [409]:
len(noun_cluster)

2070

In [410]:
for i in set(noun_cluster):
    print(i, sum(noun_cluster == i))

0 5
1 7
2 10
3 10
4 59
5 5
6 7
7 14
8 5
9 7
10 12
11 7
12 12
13 10
14 5
15 68
16 16
17 5
18 5
19 28
20 5
21 6
22 6
23 5
24 5
25 5
-1 1741


In [411]:
X_noun_df['cluster'] = noun_cluster
X_noun_df.head()

Unnamed: 0,abc,abc pic,abt,abt chelsea,abt jcpl,abt outage,abt power,abt pseg,acbowl,access,...,zip code,zip fixit,zips,zmml,zone,zone power,zq,zqccse,zuqbnywg,cluster
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1


In [471]:
for i in range(29):
    a = X_noun_df[X_noun_df.cluster == i].sum() > 0
    print(f'CLUSTER {i} has {sum(noun_cluster == i)} tweets with the following words:')
    print(list(pd.DataFrame(a[a == True]).index.values))
    print('=========')

CLUSTER 0 has 5 tweets with the following words:
['outage', 'possibility', 'possibility power', 'power', 'power outage']
CLUSTER 1 has 7 tweets with the following words:
['outage', 'power', 'power outage', 'wire', 'wire power', 'cluster']
CLUSTER 2 has 10 tweets with the following words:
['apps', 'apps stormcenter', 'default', 'default html', 'html', 'map', 'map apps', 'outage', 'outage map', 'power', 'power outage', 'stormcenter', 'stormcenter default', 'cluster']
CLUSTER 3 has 10 tweets with the following words:
['map', 'outage', 'outage map', 'power', 'power outage', 'cluster']
CLUSTER 4 has 59 tweets with the following words:
['outage', 'power', 'power outage', 'cluster']
CLUSTER 5 has 5 tweets with the following words:
['detail', 'detail goo', 'electric', 'goo', 'goo electric', 'map', 'map detail', 'outage', 'outage map', 'power', 'power outage', 'time', 'time power', 'cluster']
CLUSTER 6 has 7 tweets with the following words:
['coned', 'coned customer', 'customer', 'customer powe

In [441]:
a = X_noun_df[X_noun_df.cluster == 1].sum() > 0

In [450]:
list(pd.DataFrame(a[a == True]).index.values)

['outage', 'power', 'power outage', 'wire', 'wire power', 'cluster']