# sustainability_cluster_eda

In [1]:
from pymongo import MongoClient
client = MongoClient()
db = client.environment
sustainability_collection = db.sustainability

In [26]:
import pandas as pd
import re
from nltk.tokenize import RegexpTokenizer
from collections import Counter

from textblob import TextBlob

In [3]:
import pickle

with open('sustainability_clusters.pickle','rb') as read_file:
    clusters = pickle.load(read_file)

In [4]:
len(clusters)

10474

In [5]:
tweet_list = []
for tweet in sustainability_collection.aggregate([{'$match': {'lang': 'en'}}]):
    tweet_list.append(tweet['text'])

In [6]:
tweet_df = pd.DataFrame()

In [7]:
tweet_df = pd.DataFrame(tweet_list, columns = ['tweet'])    

In [8]:
tweet_df['clusters'] = clusters

In [9]:
tweet_df.head()

Unnamed: 0,tweet,clusters
0,RT @angelurena: This visit concludes this week...,0
1,Are you in Melbourne or Sydney? Join us at #SP...,9
2,Why Vienna Does So Well on Quality-of-Life Ran...,0
3,RT @APLShipping: APL achieved a 50.7% reductio...,0
4,RT @AmazonWatch: “Hydroelectricity may appear ...,0


In [10]:
tweet_df.head()

Unnamed: 0,tweet,clusters
0,RT @angelurena: This visit concludes this week...,0
1,Are you in Melbourne or Sydney? Join us at #SP...,9
2,Why Vienna Does So Well on Quality-of-Life Ran...,0
3,RT @APLShipping: APL achieved a 50.7% reductio...,0
4,RT @AmazonWatch: “Hydroelectricity may appear ...,0


In [11]:
TextBlob(tweet_df.iloc[0][0]).sentiment

Sentiment(polarity=0.0, subjectivity=0.0)

In [12]:
tweet_df['polarity'] = tweet_df.tweet.apply(lambda x: TextBlob(x).polarity)
tweet_df['subjectivity'] = tweet_df.tweet.apply(lambda x: TextBlob(x).subjectivity)

In [13]:
tweet_df.head()

Unnamed: 0,tweet,clusters,polarity,subjectivity
0,RT @angelurena: This visit concludes this week...,0,0.0,0.0
1,Are you in Melbourne or Sydney? Join us at #SP...,9,0.5,0.9
2,Why Vienna Does So Well on Quality-of-Life Ran...,0,0.0,0.0
3,RT @APLShipping: APL achieved a 50.7% reductio...,0,0.0,0.0
4,RT @AmazonWatch: “Hydroelectricity may appear ...,0,0.233333,0.85


In [14]:
clustered_tweets = tweet_df.filter(items = ['clusters', 'polarity', 'subjectivity'])

In [18]:
clustered_tweets.groupby(by = clusters).mean()

Unnamed: 0,clusters,polarity,subjectivity
0,0,0.128961,0.32417
1,1,0.124985,0.228718
2,2,0.998322,0.898658
3,3,-0.02381,0.380952
4,4,0.08512,0.276329
5,5,0.0,0.0
6,6,0.126186,0.333083
7,7,0.571651,0.601654
8,8,-0.184163,0.011104
9,9,0.307062,0.568531


In [22]:
topic_counts = Counter(clusters)

for topic, count in sorted(topic_counts.items()):
    print("Topic Number:", topic, "Percent:", (count / sum(topic_counts.values())))

Topic Number: 0 Percent: 0.681687989306855
Topic Number: 1 Percent: 0.03255680733244224
Topic Number: 2 Percent: 0.014225701737636052
Topic Number: 3 Percent: 0.010693144930303608
Topic Number: 4 Percent: 0.04525491693717777
Topic Number: 5 Percent: 0.012220737063204125
Topic Number: 6 Percent: 0.06759595188084781
Topic Number: 7 Percent: 0.041435936604926484
Topic Number: 8 Percent: 0.013080007637960665
Topic Number: 9 Percent: 0.027592132900515563
Topic Number: 10 Percent: 0.02281840748520145
Topic Number: 11 Percent: 0.018617529119725033
Topic Number: 12 Percent: 0.012220737063204125


In [27]:
'''

Tweet Tokenizer Adapted from :  https://github.com/adonoho/TweetTokenizers/blob/master/PottsTweetTokenizer.py
   
This Tokenizer preserves Twitter language including:  usernames, hashtags, symbols html, etc.

'''

regex_code = (

    r"""(?:<[^>]+>)""",                             # HTML tags
    r"""(?:http[s]?://t.co/[a-zA-Z0-9]+)""",        # URLs 
    r"""(?:http[s]\S+?)""",                         # URLs
    r"""(?:@[\w_]+)""",                             # Twitter username
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",           # Twitter hashtags
    r"""(?:\$[a-zA-Z]{1,6}([._][a-zA-Z]{1,2})?)""", # Twitter symbols / cashtags
    r"""(?:[a-z][a-z'\-_]+[a-z])""",                # Words with apostrophes or dashes
    r"""(?:[+\-]?\d+[,/.:-]\d+[+\-]?)""",           # Numbers, including fractions, decimals
    r"""(?:[\w_]+)""",                              # Words without apostrophes or dashes
    r"""(?:\.(?:\s*\.){1,})"""                      # Ellipsis
    )

#regex_text = re.sub(r'[%s]'% (regex_pattern), ' ')

regex_pattern = re.compile(r"""(%s)"""%"|".join(regex_code), re.VERBOSE | re.I | re.UNICODE)

print(regex_pattern.pattern)

regex_tokenizer = RegexpTokenizer(pattern=regex_pattern.pattern,gaps=True, discard_empty = True)

((?:<[^>]+>)|(?:http[s]?://t.co/[a-zA-Z0-9]+)|(?:http[s]\S+?)|(?:@[\w_]+)|(?:\#+[\w_]+[\w\'_\-]*[\w_]+)|(?:\$[a-zA-Z]{1,6}([._][a-zA-Z]{1,2})?)|(?:[a-z][a-z'\-_]+[a-z])|(?:[+\-]?\d+[,/.:-]\d+[+\-]?)|(?:[\w_]+)|(?:\.(?:\s*\.){1,}))


In [28]:
tweet_df.tweet[0]

'RT @angelurena: This visit concludes this week’s stops in Miami and the Caribbean by President @BillClinton to promote sustainability and r…'

In [29]:
regex_tokenizer.tokenize(tweet_df.tweet[0])

['RT',
 ' ',
 '@angelurena',
 ': ',
 'This',
 ' ',
 'visit',
 ' ',
 'concludes',
 ' ',
 'this',
 ' ',
 'week',
 '’',
 's',
 ' ',
 'stops',
 ' ',
 'in',
 ' ',
 'Miami',
 ' ',
 'and',
 ' ',
 'the',
 ' ',
 'Caribbean',
 ' ',
 'by',
 ' ',
 'President',
 ' ',
 '@BillClinton',
 ' ',
 'to',
 ' ',
 'promote',
 ' ',
 'sustainability',
 ' ',
 'and',
 ' ',
 'r',
 '…']