# sustainability_cluster_eda

In [1]:
from pymongo import MongoClient
client = MongoClient()
db = client.environment
sustainability_collection = db.sustainability

In [2]:
import pandas as pd
import re
from nltk.tokenize import RegexpTokenizer
from collections import Counter

from textblob import TextBlob

In [3]:
import pickle

with open('sustainability_clusters.pickle','rb') as read_file:
    clusters = pickle.load(read_file)

In [4]:
(clusters)

array([12, 12, 12, ..., 12, 12, 12], dtype=int32)

In [5]:
tweet_list = []
for tweet in sustainability_collection.aggregate([{'$match': {'lang': 'en'}}]):
    tweet_list.append(tweet['text'])

In [6]:
tweet_df = pd.DataFrame()

In [7]:
tweet_df = pd.DataFrame(tweet_list, columns = ['tweet'])    

In [8]:
tweet_df['clusters'] = clusters

In [9]:
tweet_df.head(20)

Unnamed: 0,tweet,clusters
0,RT @angelurena: This visit concludes this week...,12
1,Are you in Melbourne or Sydney? Join us at #SP...,12
2,Why Vienna Does So Well on Quality-of-Life Ran...,12
3,RT @APLShipping: APL achieved a 50.7% reductio...,12
4,RT @AmazonWatch: “Hydroelectricity may appear ...,12
5,RT @cleanAirGurgaon: Right to Clean Enviornmen...,12
6,RT @thefishsite: A new study suggests seafood ...,12
7,RT @rsb_org: Did you hear about our new partne...,12
8,RT @NewEnglandDairy: Tips to reduce food waste...,12
9,Sustainability Can Be Beautiful: Dirt + Pressu...,12


In [10]:
tweet_df.head()

Unnamed: 0,tweet,clusters
0,RT @angelurena: This visit concludes this week...,12
1,Are you in Melbourne or Sydney? Join us at #SP...,12
2,Why Vienna Does So Well on Quality-of-Life Ran...,12
3,RT @APLShipping: APL achieved a 50.7% reductio...,12
4,RT @AmazonWatch: “Hydroelectricity may appear ...,12


In [11]:
TextBlob(tweet_df.iloc[0][0]).sentiment

Sentiment(polarity=0.0, subjectivity=0.0)

In [12]:
tweet_df['polarity'] = tweet_df.tweet.apply(lambda x: TextBlob(x).polarity)
tweet_df['subjectivity'] = tweet_df.tweet.apply(lambda x: TextBlob(x).subjectivity)

In [13]:
tweet_df.head()

Unnamed: 0,tweet,clusters,polarity,subjectivity
0,RT @angelurena: This visit concludes this week...,12,0.0,0.0
1,Are you in Melbourne or Sydney? Join us at #SP...,12,0.5,0.9
2,Why Vienna Does So Well on Quality-of-Life Ran...,12,0.0,0.0
3,RT @APLShipping: APL achieved a 50.7% reductio...,12,0.0,0.0
4,RT @AmazonWatch: “Hydroelectricity may appear ...,12,0.233333,0.85


In [14]:
clustered_tweets = tweet_df.filter(items = ['clusters', 'polarity', 'subjectivity'])

In [15]:
clustered_tweets.groupby(by = clusters).mean()

Unnamed: 0,clusters,polarity,subjectivity
0,0,-0.000241,0.227166
1,1,-0.251852,0.262963
2,2,0.277126,0.432977
3,3,-0.1,0.05
4,4,0.303711,0.460179
5,5,0.5,0.5
6,6,-0.1,0.2
7,7,0.0,0.0
8,8,0.5,0.5
9,9,0.998322,0.898658


In [16]:
topic_counts = Counter(clusters)

for topic, count in sorted(topic_counts.items()):
    print("Topic Number:", topic, "Percent:", (count / sum(topic_counts.values())))

Topic Number: 0 Percent: 0.041054038571701354
Topic Number: 1 Percent: 0.005823945006683215
Topic Number: 2 Percent: 0.023391254535039145
Topic Number: 3 Percent: 0.004487301890395264
Topic Number: 4 Percent: 0.1315638724460569
Topic Number: 5 Percent: 0.0036280313156387247
Topic Number: 6 Percent: 0.00343708229902616
Topic Number: 7 Percent: 0.012316211571510407
Topic Number: 8 Percent: 0.004678250907007829
Topic Number: 9 Percent: 0.014225701737636052
Topic Number: 10 Percent: 0.005537521481764369
Topic Number: 11 Percent: 0.004582776398701547
Topic Number: 12 Percent: 0.7249379415696009
Topic Number: 13 Percent: 0.010693144930303608
Topic Number: 14 Percent: 0.0049646744319266755
Topic Number: 15 Percent: 0.004678250907007829


In [17]:
'''

Tweet Tokenizer Adapted from :  https://github.com/adonoho/TweetTokenizers/blob/master/PottsTweetTokenizer.py
   
This Tokenizer preserves Twitter language including:  usernames, hashtags, symbols html, etc.

'''

regex_code = (

    r"""(?:<[^>]+>)""",                             # HTML tags
    r"""(?:http[s]?://t.co/[a-zA-Z0-9]+)""",        # URLs 
    r"""(?:http[s]\S+?)""",                         # URLs
    r"""(?:@[\w_]+)""",                             # Twitter username
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",           # Twitter hashtags
    r"""(?:\$[a-zA-Z]{1,6}([._][a-zA-Z]{1,2})?)""", # Twitter symbols / cashtags
    r"""(?:[a-z][a-z'\-_]+[a-z])""",                # Words with apostrophes or dashes
    r"""(?:[+\-]?\d+[,/.:-]\d+[+\-]?)""",           # Numbers, including fractions, decimals
    r"""(?:[\w_]+)""",                              # Words without apostrophes or dashes
    r"""(?:\.(?:\s*\.){1,})"""                      # Ellipsis
    )

#regex_text = re.sub(r'[%s]'% (regex_pattern), ' ')

#regex_pattern = re.compile(r"""(%s)"""%"|".join(regex_code), re.VERBOSE | re.I | re.UNICODE)
regex_pattern = re.compile(r"""(%s)"""%"| ".join(regex_code))#, re.VERBOSE | re.I | re.UNICODE)

print(regex_pattern.pattern)

regex_tokenizer = RegexpTokenizer(pattern=regex_pattern.pattern,gaps=True, discard_empty = True)

((?:<[^>]+>)| (?:http[s]?://t.co/[a-zA-Z0-9]+)| (?:http[s]\S+?)| (?:@[\w_]+)| (?:\#+[\w_]+[\w\'_\-]*[\w_]+)| (?:\$[a-zA-Z]{1,6}([._][a-zA-Z]{1,2})?)| (?:[a-z][a-z'\-_]+[a-z])| (?:[+\-]?\d+[,/.:-]\d+[+\-]?)| (?:[\w_]+)| (?:\.(?:\s*\.){1,}))


In [18]:
tweet_df.tweet[0]

'RT @angelurena: This visit concludes this week’s stops in Miami and the Caribbean by President @BillClinton to promote sustainability and r…'

In [19]:
regex_tokenizer.tokenize(tweet_df.tweet[0])

['RT',
 ' @angelurena',
 ':',
 ' This',
 ' visit',
 ' concludes',
 ' this',
 ' week',
 '’s',
 ' stops',
 ' in',
 ' Miami',
 ' and',
 ' the',
 ' Caribbean',
 ' by',
 ' President',
 ' @BillClinton',
 ' to',
 ' promote',
 ' sustainability',
 ' and',
 ' r',
 '…']

In [20]:
with open ('tweet_df.pickle', 'wb') as to_write:
    pickle.dump(tweet_df, to_write)

In [21]:
with open ('clustered_tweets.pickle', 'wb') as to_write:
    pickle.dump(clustered_tweets, to_write)