In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV

In [80]:
# read in the pre-processed tweet data
df = pd.read_csv('./data/nlp_tweets_2012_2017.csv')
df.head()

Unnamed: 0,index,ID,datetime,has_media,is_reply,is_retweet,medias,nbr_favorite,nbr_reply,nbr_retweet,text,url,user_id,usernameTweet,regex_clean_tweets,lemmed_tweet,nouns,proper_nouns,geopolitical_ents
0,1869,252633076105097216,2012-10-01 00:00:00,no media,False,False,no media,1.0,0.0,1.0,@ ckundo Inspired by your ConEd summary ou...,/seecmb/status/252633076105097216,294434762.0,seecmb,ckundo Inspired ConEd summary outage info Ruby...,ckundo Inspired ConEd summary outage info Ruby...,summary outage info gem power outage detail da...,Inspired ConEd Ruby,
1,3094,254638373040422912,2012-10-06 13:00:00,no media,False,False,no media,0.0,0.0,0.0,Partial power outage at #MediaLoft That's...,/LuisPaganStudio/status/254638373040422912,212391633.0,LuisPaganStudio,Partial power outage MediaLoft That twice week...,Partial power outage MediaLoft That twice week...,power outage week,MediaLoft Damn ConEd,
2,2528,256621964268032000,2012-10-12 01:00:00,no media,False,False,no media,0.0,0.0,0.0,"Um, can someone call ConEd , there's a power...",/marlonthereal/status/256621964268032000,14144742.0,marlonthereal,Um someone call ConEd power outage Bronx Sourc...,Um someone call ConEd power outage Bronx Sourc...,someone power outage post,ConEd Bronx Source YHqV,
3,723,259866379086340096,2012-10-20 23:00:00,no media,False,False,no media,0.0,0.0,0.0,Power outage took over my entire block as s...,/meligyh/status/259866379086340096,280700235.0,meligyh,Power outage took entire block soon loaded Con...,Power outage took entire block soon loaded Con...,Power outage block,ConEd,
4,487,261909073295974401,2012-10-26 15:00:00,no media,False,False,no media,0.0,1.0,10.0,Mayor: A major concern given this type of stor...,/NYCMayorsOffice/status/261909073295974401,55338739.0,NYCMayorsOffice,Mayor A major concern given type storm possibi...,Mayor A major concern given type storm possibi...,concern type storm possibility power outage,Mayor ConEdison,


In [79]:
# create dataframes for raw text tweets, regex clean text tweets, lemmatized tweets, and nouns
X_raw = df.text
X_clean = df.regex_clean_tweets
X_lem = df.lemmed_tweet
X_noun = df.nouns

In [6]:
# define a function to convert the 
def vectorizer(input):
    
    cvec = CountVectorizer(ngram_range=(1,2),
                         max_features=3000)
    X_vec = cvec.fit_transform(input)
    dataframe = pd.DataFrame(X_vec.todense(), columns=cvec.get_feature_names())
    
    return dataframe

In [72]:

X_raw_df = vectorizer(X_raw)

X_clean_df = vectorizer(X_clean)

X_lem_df = vectorizer(X_lem)

X_noun_df = vectorizer(X_noun.dropna())

In [11]:
X_clean_df.shape

(2082, 3000)

In [12]:
pipe = Pipeline([('scaler', StandardScaler()),
                 ('cluster', KMeans())])

In [13]:
params = {'cluster__n_clusters': [3, 4, 5]}
gs = GridSearchCV(pipe, param_grid=params)

### KMeans Clustering

In [33]:
def km_clusterer(input):

    ss = StandardScaler()
    input_sc = ss.fit_transform(input)
    km = KMeans(n_clusters=4, random_state=42)
    km.fit(input_sc)
    print(f'{len(set(km.labels_))} clusters: {set(km.labels_)}')
    print('=================================')
    for cluster in set(km.labels_):
        print(f'cluster {cluster} contains {sum(km.labels_ == cluster)} tweets')
    print('=================================')
    print(f'silhouette score: {silhouette_score(input, km.labels_)}')

    return km.labels_

In [34]:
raw_cluster = km_clusterer(X_raw_df)

4 clusters: {0, 1, 2, 3}
cluster 0 contains 18 tweets
cluster 1 contains 26 tweets
cluster 2 contains 3 tweets
cluster 3 contains 2035 tweets
silhouette score: 0.029888164317810613


In [35]:
clean_cluster = km_clusterer(X_clean_df)

4 clusters: {0, 1, 2, 3}
cluster 0 contains 42 tweets
cluster 1 contains 2034 tweets
cluster 2 contains 3 tweets
cluster 3 contains 3 tweets
silhouette score: 0.0585029960797748


In [38]:
lem_cluster = km_clusterer(X_lem_df)

4 clusters: {0, 1, 2, 3}
cluster 0 contains 6 tweets
cluster 1 contains 2069 tweets
cluster 2 contains 4 tweets
cluster 3 contains 3 tweets
silhouette score: 0.12002518421102606


In [39]:
noun_cluster = km_clusterer(X_noun_df)

4 clusters: {0, 1, 2, 3}
cluster 0 contains 1 tweets
cluster 1 contains 2065 tweets
cluster 2 contains 2 tweets
cluster 3 contains 2 tweets
silhouette score: 0.06417553291580111


In [71]:
# X_clean_df['cluster'] = clean_cluster

# for i in range(4):
#     a = X_clean_df[X_clean_df.cluster == i].sum() > 0
#     print(f'CLUSTER {i} has {sum(clean_cluster == i)} tweets with the following words:')
#     print(a[a == True])
#     print('=========')

## DBSCAN Clustering

In [48]:
def DBSCAN_clusterer(input):

    ss = StandardScaler()
    input_sc = ss.fit_transform(input)
    dbscan = DBSCAN(eps=0.4, min_samples=6)
    dbscan.fit(input_sc)
    print(f'{len(set(dbscan.labels_))} clusters: {set(dbscan.labels_)}')
    print('=================================')
    for cluster in set(dbscan.labels_):
        if cluster >= 0:
            print(f'cluster {cluster} contains {sum(dbscan.labels_ == cluster)} tweets')
        elif cluster == -1:
            print('')
            print(f'{sum(dbscan.labels_ == cluster)} tweets classified as noise')
    print('=================================')
    print(f'silhouette score: {silhouette_score(input, dbscan.labels_)}')


    return dbscan.labels_

In [49]:
raw_cluster = DBSCAN_clusterer(X_raw_df)

5 clusters: {0, 1, 2, 3, -1}
cluster 0 contains 7 tweets
cluster 1 contains 7 tweets
cluster 2 contains 8 tweets
cluster 3 contains 9 tweets

2051 tweets classified as noise
silhouette score: -0.05125191123110729


In [51]:
clean_cluster = DBSCAN_clusterer(X_clean_df)

7 clusters: {0, 1, 2, 3, 4, 5, -1}
cluster 0 contains 7 tweets
cluster 1 contains 7 tweets
cluster 2 contains 8 tweets
cluster 3 contains 7 tweets
cluster 4 contains 6 tweets
cluster 5 contains 6 tweets

2041 tweets classified as noise
silhouette score: -0.02386756315517961


In [54]:
lem_cluster = DBSCAN_clusterer(X_lem_df)

9 clusters: {0, 1, 2, 3, 4, 5, 6, 7, -1}
cluster 0 contains 7 tweets
cluster 1 contains 7 tweets
cluster 2 contains 8 tweets
cluster 3 contains 6 tweets
cluster 4 contains 7 tweets
cluster 5 contains 7 tweets
cluster 6 contains 6 tweets
cluster 7 contains 6 tweets

2028 tweets classified as noise
silhouette score: -0.276279511206427


In [55]:
noun_cluster = DBSCAN_clusterer(X_noun_df)

17 clusters: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1}
cluster 0 contains 7 tweets
cluster 1 contains 10 tweets
cluster 2 contains 10 tweets
cluster 3 contains 59 tweets
cluster 4 contains 7 tweets
cluster 5 contains 14 tweets
cluster 6 contains 7 tweets
cluster 7 contains 12 tweets
cluster 8 contains 7 tweets
cluster 9 contains 12 tweets
cluster 10 contains 10 tweets
cluster 11 contains 68 tweets
cluster 12 contains 16 tweets
cluster 13 contains 28 tweets
cluster 14 contains 6 tweets
cluster 15 contains 6 tweets

1791 tweets classified as noise
silhouette score: -0.19551692987201336


In [433]:
X_clean_df['cluster'] = clean_cluster
X_clean_df.head()

Unnamed: 0,aaaa,aaaa ijeyfe,abc,abc ny,able,about,above,above screenshot,abt,abt power,...,zone,zoom,zoom details,zpzuap,zq,zup,zv,zv conedison,zyqz,cluster
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1


In [435]:
for i in range(5):
    a = X_clean_df[X_clean_df.cluster == i].sum() > 0
    print(f'CLUSTER {i} has {sum(clean_cluster == i)} tweets with the following words:')
    print(a[a == True])
    print('=========')

CLUSTER 0 has 7 tweets with the following words:
coned                   True
coned warns             True
consolidated            True
consolidated ediso      True
customers               True
customers possible      True
ediso                   True
ediso tghs              True
lower                   True
lower manhattan         True
manhattan               True
manhattan customers     True
new                     True
new york                True
outages                 True
outages tonight         True
possible                True
possible power          True
power                   True
power outages           True
reuters                 True
reuters consolidated    True
tghs                    True
tonight                 True
tonight new             True
warns                   True
warns lower             True
york                    True
york reuters            True
dtype: bool
CLUSTER 1 has 7 tweets with the following words:
coned                  True
coned warns          

In [76]:
X_clean_df['cluster'] = clean_cluster

In [57]:
X_noun_df['cluster'] = noun_cluster
X_noun_df.head()

Unnamed: 0,abc,abc pic,abt,abt chelsea,abt jcpl,abt outage,abt power,abt pseg,acbowl,access,...,zip code,zip fixit,zips,zmml,zone,zone power,zq,zqccse,zuqbnywg,cluster
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1


In [70]:
for i in range(len(set(noun_cluster)) - 1):
    a = X_noun_df[X_noun_df.cluster == i].sum() > 0
    print(f'CLUSTER {i} has {sum(noun_cluster == i)} tweets with the following words:')
    print(list(pd.DataFrame(a[a == True]).index.values))
    print('=========')

CLUSTER 0 has 7 tweets with the following words:
['outage', 'power', 'power outage', 'wire', 'wire power']
CLUSTER 1 has 10 tweets with the following words:
['apps', 'apps stormcenter', 'default', 'default html', 'html', 'map', 'map apps', 'outage', 'outage map', 'power', 'power outage', 'stormcenter', 'stormcenter default', 'cluster']
CLUSTER 2 has 10 tweets with the following words:
['map', 'outage', 'outage map', 'power', 'power outage', 'cluster']
CLUSTER 3 has 59 tweets with the following words:
['outage', 'power', 'power outage', 'cluster']
CLUSTER 4 has 7 tweets with the following words:
['coned', 'coned customer', 'customer', 'customer power', 'outage', 'outage tonight', 'power', 'power outage', 'tghs', 'tonight', 'tonight tghs', 'cluster']
CLUSTER 5 has 14 tweets with the following words:
['coned', 'coned customer', 'customer', 'customer power', 'outage', 'outage tonight', 'power', 'power outage', 'tonight', 'cluster']
CLUSTER 6 has 7 tweets with the following words:
['cluster

In [441]:
a = X_noun_df[X_noun_df.cluster == 1].sum() > 0

In [63]:
list(pd.DataFrame(a[a == True]).index.values)

['area', 'outage', 'outage area', 'power', 'power outage', 'cluster']