# sustainability_topic_modeling

In [1]:
#Load in nlp pipeline

#import sustainability_nlp_pipeline
from sustainability_nlp_pipeline import nlp_pipeline as nlp_func
from sustainability_nlp_pipeline import cleaned_text

In [3]:
#Load in other packages

import re
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD

In [4]:
#Connect to MongoDB

from pymongo import MongoClient
client = MongoClient()
db = client.environment
sustainability_collection = db.sustainability

Create Regex Pattern Specific to Twitter Language:

In [4]:
'''

Tweet Tokenizer Adapted from :  https://github.com/adonoho/TweetTokenizers/blob/master/PottsTweetTokenizer.py
   
This Tokenizer preserves Twitter language including:  usernames, hashtags, symbols html, etc.

'''

regex_code = (

    r"""(?:<[^>]+>)""",                             # HTML tags
    r"""(?:http[s]?://t.co/[a-zA-Z0-9]+)""",        # URLs 
    r"""(?:http[s]\S+?)""",                         # URLs
    r"""(?:@[\w_]+)""",                             # Twitter username
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",           # Twitter hashtags
    r"""(?:\$[a-zA-Z]{1,6}([._][a-zA-Z]{1,2})?)""", # Twitter symbols / cashtags
    r"""(?:[a-z][a-z'\-_]+[a-z])""",                # Words with apostrophes or dashes
    r"""(?:[+\-]?\d+[,/.:-]\d+[+\-]?)""",           # Numbers, including fractions, decimals
    r"""(?:[\w_]+)""",                              # Words without apostrophes or dashes
    r"""(?:\.(?:\s*\.){1,})"""                      # Ellipsis
    )

#regex_text = re.sub(r'[%s]'% (regex_pattern), ' ')

regex_pattern = re.compile(r"""(%s)"""%"|".join(regex_code), re.VERBOSE | re.I | re.UNICODE)

In [5]:
print(regex_pattern.pattern)

((?:<[^>]+>)|(?:http[s]?://t.co/[a-zA-Z0-9]+)|(?:http[s]\S+?)|(?:@[\w_]+)|(?:\#+[\w_]+[\w\'_\-]*[\w_]+)|(?:\$[a-zA-Z]{1,6}([._][a-zA-Z]{1,2})?)|(?:[a-z][a-z'\-_]+[a-z])|(?:[+\-]?\d+[,/.:-]\d+[+\-]?)|(?:[\w_]+)|(?:\.(?:\s*\.){1,}))


In [6]:
regex_tokenizer = RegexpTokenizer(pattern=regex_pattern.pattern,gaps=True, discard_empty = True)

Define Stop Words:

In [7]:
stop_words = stopwords.words('english')

#Add additional stop words in here:

additional_stop_words = ['RT', 'rt', "’"]

total_stop_words = stop_words + additional_stop_words

In [8]:
english_cursor = sustainability_collection.aggregate([{'$match': {'lang': 'en'}},{'$sample':{'size':5}}])

In [9]:
for tweet in english_cursor:
    print (tweet['text'])

The latest Supply Chain Mgt Trends to Edge! https://t.co/qyCL7Nwu3i Thanks to @SparklePatriot @DockflowBE… https://t.co/8DXdu84IIu
RT @PavoIoT: The #UAE government has implemented a 15-year strategy that focuses on sustainability and self-sufficiency. Still, the strateg…
RT @SCCEurope: 2016 winner of the European Green Capital, Ljubljana continues to flourish as an example of sustainability! check out the sm…
The best and worse welfare can be found on small "family" farms size ownership or location do not dictate welfare o… https://t.co/Oj0v3jUTuc
Want to work at PGE? We're #hiring in #Portland, OR! Click for details: https://t.co/OpYpc0QL7V #energy #utilities… https://t.co/fXWnZnlxGf


**Define Parameters to use in NLP Function**

In [10]:
#TF-IDF

nlp_tfidf = nlp_func(vectorizer=TfidfVectorizer(min_df=0, max_df=0.5), cleaning_function=cleaned_text, tokenizer=TreebankWordTokenizer().tokenize, stemmer=PorterStemmer())

In [11]:
english_cursor = sustainability_collection.aggregate([{'$match': {'lang': 'en'}},{'$sample':{'size':5}}])

for tweet in english_cursor:
    nlp_tfidf.fit_vectorizer(tweet['text'])
    vectorized_tweets_tfidf = nlp_tfidf.transform_vectorizer(tweet['text']).toarray()

In [12]:
vectorized_tweets_tfidf

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]])

In [13]:
#Count Vectorizer

nlp_cv = nlp_func(vectorizer=CountVectorizer(), cleaning_function=cleaned_text, tokenizer=TreebankWordTokenizer().tokenize, stemmer=PorterStemmer())

In [14]:
english_cursor = sustainability_collection.aggregate([{'$match': {'lang': 'en'}},{'$sample':{'size':5}}])

for tweet in english_cursor:
    nlp_cv.fit_vectorizer(tweet['text'])
    vectorized_tweets_cv = nlp_cv.transform_vectorizer(tweet['text']).toarray()

In [15]:
vectorized_tweets_cv

array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

In [16]:
CountVectorizer.get_feature_names

<function sklearn.feature_extraction.text.CountVectorizer.get_feature_names(self)>

In [17]:
nlp_cv.get_features()

In [None]:
#screw the pipeline i need to get this to work

In [30]:
english_cursor = sustainability_collection.aggregate([{'$match': {'lang': 'en'}}])

for tweet in english_cursor:
    print(cleaned_text(tweet['text'],tokenizer=TreebankWordTokenizer().tokenize, stemmer=PorterStemmer()))

['angelurena', 'visit', 'conclud', 'week', 'stop', 'miami', 'caribbean', 'presid', 'billclinton', 'promot', 'sustain', 'r…']
['melbourn', 'sydney', 'join', 'us', 'spfaustralia', 'learn', 'latest', 'trend', 'eh', 'riskmanagement…']
['vienna', 'well', 'qualityoflif', 'rank', 'affordableh', 'publictransport', 'sustain']
['aplship', 'apl', 'achiev', '507', 'reduct', 'co2', 'emiss', 'per', 'transport', 'contain', 'per', 'kilometr', '2017', 'compar', '2009', 'read…']
['amazonwatch', '“', 'hydroelectr', 'may', 'appear', 'rel', 'clean', 'process', 'megadam', 'built', 'far', 'belomonte…']
['cleanairgurgaon', 'right', 'clean', 'enviorn', 'also', 'come', 'immens', 'respons', 'ascitizen', 'consumersproducersw', 'thank', 'ha…']
['thefishsit', 'new', 'studi', 'suggest', 'seafood', 'consum', 'across', 'globe', 'unit', 'think', 'biggest', 'threat', 'ocean', 'po…']
['rsborg', 'hear', 'new', 'partnership', 'qldrfa', 'well', 'work', 'togeth', 'support', 'sustain', 'certif', 'fo…']
['newenglanddairi', 'ti

['abdulelsay', 'victori', 'today', 'work', 'continu', 'congratul', 'gretchenwhitm', 'primari', 'win', 'tomorro…']
['alexverbeek', '🌎', '4', 'innov', 'way', 'peopl', 'reus', 'plastic', 'rather', 'throw', 'away', 'plastic', 'zeroplast', 'innovatio…']
['willoughbyc', 'gailgilesgidney', 'also', 'peopl', 'go', 'internet', 'know', 'theyr', 'look', 'i…']
['kashthefuturist', 'doctor', 'warn', 'plasticpollut', 'kill', 'human', 'race', 'sustain', 'climatechang', 'plastic', 'wast', 'sdg', 'ci…']
['love', 'shot', 'stellathelight', 'enjoy', 'soft', 'amp', 'easi', 'slide', 'beauti', 'day', 'ethicalfashion…']
['amazonwatch', '“', 'hydroelectr', 'may', 'appear', 'rel', 'clean', 'process', 'megadam', 'built', 'far', 'belomonte…']
['south', 'west', 'builder', 'lead', 'way', 'environment', 'sustain']
['openpackaginnet', 'readi', 'sustain', 'blockchain', 'let', 'talk', 'one', 'time', 'blockchain', 'packag', 'industry…']
['willoughbyc', 'gailgilesgidney', 'pamphlet', 'provid', 'council', 'pickup', 'could',

['techstar', 'techstar', 'sustain', 'acceler', '10', 'startup', 'drive', 'innov', 'planet', 'via', 'forb', 'mar…']
['treat', 'road', 'oil', 'ga', 'wastewat', 'may', 'spread', 'harm', 'pollut', 'thirteen', 'state', 'allow', 'practic', 'help…']
['ignitioncoin', 'success', 'pass', 'block', '225000', 'neoscrypt', 'asicresist', 'pow', 'algo', 'along', 'sever', 'amaz', 'hardenin…']
['academi', 'manag', 'go', 'sustainablevik', '2025', 'great', 'opportun', 'showcas', 'global', 'sustainabi…']
['jblefevre60', 'need', 'pilot', 'licens', 'fli', 'car', 'drone', 'sustain', 'ipfconline1', 'evankirstel', 'helenewpli', 'severinelienar…']
['look', 'raini', 'day', 'read', 'materi', 'head', 'websit', 'download', 'white', 'paper', 'outlines…']
['mikesav47032563', 'import', 'reason', 'escal', 'inequ', 'global', 'threat', 'social', 'sustain']
['colinlemahieu', 'partnership', 'nano', 'sustain', 'confirm']
['love', 'peopl', 'love', 'talk', 'sustain', 'seem', 'ditsi', 'start', 'talk', 'sustain', 'i…']
['perthto

['buzzonearth', 'corpor', 'social', 'respons', 'need', 'hour', '⏱️', 'csr', 'corpor', 'globalgo', 'sustainabilit…']
['savepl21174455', 'think', 'hot', 'wait', 'reach', 'hothous', 'earth', 'via', 'forb', 'climatechangeisr', 'stopadani', 'fossilfre', 'protect…']
['pamkeithfl', 'result', 'polici', 'base', 'short', 'term', 'greed', 'long', 'term', 'sustain', 'let', 'gop', 'control', 'al…']
['kendallmi', 'talk', 'sustain', 'organ', 'nongmo', 'provid', 'robust', 'incom', 'stream', 'farmers…']
['cisgroupuk', 'netherland', '🇳🇱', 'build', 'road', 'recycl', 'wast', 'fridayfeel', 'tech', 'innov', 'sustain', 'construction…']
['evankirstel', '5foot', 'robot', 'snake', 'design', 'find', 'sourc', 'pollut', 'contamin', 'water', 'robot', 'environmentprotecti…']
['restoringnola', 'ignor', 'climatechang', 'denier', 'california', 'hellish', 'summer', 'realli', 'grave', 'warn', 'latim']
['restoringnola', 'ignor', 'climatechang', 'denier', 'california', 'hellish', 'summer', 'realli', 'grave', 'warn', 'latim

['jblefevre60', 'compon', 'need', 'build', 'smartciti', 'cybersecur', 'iot', '5g', 'industry40', 'machinelearn', 'bigdata', 'infosec…']
['greenjourney', 'come', 'deforest', 'longer', 'bulldoz', 'need', 'focu', 'machet', 'support', 'coolearth…']
['greenjourney', 'come', 'deforest', 'longer', 'bulldoz', 'need', 'focu', 'machet', 'support', 'coolearth…']
['greenjourney', 'come', 'deforest', 'longer', 'bulldoz', 'need', 'focu', 'machet', 'support', 'coolearth…']
['greenjourney', 'come', 'deforest', 'longer', 'bulldoz', 'need', 'focu', 'machet', 'support', 'coolearth…']
['jblefevre60', 'compon', 'need', 'build', 'smartciti', 'cybersecur', 'iot', '5g', 'industry40', 'machinelearn', 'bigdata', 'infosec…']
['die', 'consumpt', 'guzzl', 'snake', 'oil', 'natcounterpunch', 'need', 'make', 'en…']
['come', 'deforest', 'longer', 'bulldoz', 'need', 'focu', 'machet', 'support…']
['jblefevre60', 'need', 'pilot', 'licens', 'fli', 'car', 'drone', 'sustain', 'ipfconline1', 'evankirstel', 'helenewpli', 'sev

['impakterdotcom', 'climat', 'chang', 'global', 'warm', 'erad', 'human', 'climatechang', 'late', 'articl', 'claudeforthomm', 'impakt…']
['california', 'take', 'cool', 'period', 'gener', 'much', 'energi', 'solarpower…']
['evankirstel', '5foot', 'robot', 'snake', 'design', 'find', 'sourc', 'pollut', 'contamin', 'water', 'robot', 'environmentprotecti…']
['mountain', 'studi', 'sourc', 'imag']
['jblefevre60', 'compon', 'need', 'build', 'smartciti', 'cybersecur', 'iot', '5g', 'industry40', 'machinelearn', 'bigdata', 'infosec…']
['abdulelsay', 'courag', 'empathi', 'respect', 'honesti', 'justic', 'equiti', 'sustain', '☝🏽☝🏽we', 'sought', 'put', 'ou…']
['themanualguid', 'konabrewingco', 'abl', 'produc', '7k', 'bottl', 'beer', 'solar', 'power']
['chriskingphoto', 'recent', 'feedbackorg', 'publish', 'report', 'rank', 'supermarket', 'accord', 'perform', 'reduc', 'amount', 'of…']
['janinerogan', 'first', 'good', 'excercis', 'amaz', 'mental', 'health', 'sustain', 'argument', 'drivin…']
['batterseanet

['koborigrillscsr', 'sustain', 'team', 'levistraussco', 'hire', 'analyst', 'posit', 'sanfrancisco', 'want', 'help', 'us', 'c…']
['llcpr', 'mark', 'calendar', 'usgbc', 'louisiana', 'chapter', 'forward', 'confer', 'oct', '19th', 'tulan', 'univers', 'registr', 'open']
['natheal', 'top', '10', 'countri', 'robot', 'put', 'work', 'ai', 'autom', 'robot', 'iot', 'iiot', '4ir', 'futureofwork', 'sustainabil…']
['job', 'might', 'great', 'fit', 'burn', 'crew', 'member', '46655', 'conservation…']
['bobstglob', 'ensur', 'healthi', 'live', 'promot', 'wellb', 'annual', 'physic', 'health', 'examin', 'staff', 'member', 'long…']
['jblefevre60', 'need', 'pilot', 'licens', 'fli', 'car', 'drone', 'sustain', 'ipfconline1', 'evankirstel', 'helenewpli', 'severinelienar…']
['haroldsinnott', 'big', 'tube', 'could', 'reduc', 'giant', 'pacif', 'garbag', 'patch', '50', '5', 'year', 'sustain', 'ocean', 'environ', 'plast…']
['kashthefuturist', 'doctor', 'warn', 'plasticpollut', 'kill', 'human', 'race', 'sustain', 'cl

['jblefevre60', 'need', 'pilot', 'licens', 'fli', 'car', 'drone', 'sustain', 'ipfconline1', 'evankirstel', 'helenewpli', 'severinelienar…']
['shyam17', 'indian', 'fishermen', 'take', 'plastic', 'sea', 'use', 'build', 'road', 'keralaplasticsrecycling…']
['second', 'genesis™', 'radio', 'show', 'also', 'commun', 'websit', 'thehumanaccelerator™']
['taigacompani', 'thought', 'leadership', 'overr', 'sustain', 'brand', 'via', 'johnfriedman', 'ecomonday', 'sustainabili…']
['deviousprez', 'rise', 'sea', 'level', 'could', 'leav', 'internet', 'cabl', 'underwat', 'within', '15', 'year', 'studi', 'say', 'climatechang', 'sustain', 'h…']
['aeriscto', 'mustread', 'ebook', 'iot', '2018', 'learn', 'busi', 'need', 'know', 'implement', 'optim', 'grow', 'int…']
['second', 'genesis™', 'radio', 'show', 'also', 'commun', 'websit', 'thehumanaccelerator™']
['backdrop', 'oper', '“', 'greater', 'agil', 'stronger', 'integr', '”', 'cargil', 'firmli', 'path', 'to…']
['arikr', 'brilliant', 'schwarzenegg', 'coalth', '

['smetradeacademi', 'tripl', 'bottom', 'line', 'environment', 'social', 'econom', 'sustain', 'import', 'msme', '🌎', '🌍', '🌏…']
['damiencabadi', 'would', 'buy', 'biofabr', 'leather', 'mashabl', 'sustain', 'animalright', 'ecolog', 'environ', 'tech4good', 'jblefevr…']
['savepl21174455', 'think', 'hot', 'wait', 'reach', 'hothous', 'earth', 'via', 'forb', 'climatechangeisr', 'stopadani', 'fossilfre', 'protect…']
['damiencabadi', 'would', 'buy', 'biofabr', 'leather', 'mashabl', 'sustain', 'animalright', 'ecolog', 'environ', 'tech4good', 'jblefevr…']
['see', 'latest', 'usa', 'job', 'click', 'appli', 'coordinador', 'de', 'restauracion', '46770']
['globaldop', 'time', 'unit', 'get', 'legal', 'savetheplanet', 'legalizeit', 'sustain', 'cannabi', 'cannabiscommun', 'weed', 'st…']
['beyondcapitaljo', 'sustain', 'isnt', 'buzzword', 'u', 'need', '2', 'feed', 'w', 'balanc', 'valu', '4', 'ur', 'busi', '2', 'surviv', 'entmagazinem', 'simpli…']
['nialfinegan', 'thank', 'rubi', 'amp', 'charli', 'kingsvil',

['duflotvaleria', 'scientist', 'discov', 'game', 'chang', 'enzym', 'break', 'plastic', 'day', '😍😍😍', 'plasticfre', 'innov', 'te…']
['ifrstanton', 'ifr', 'green', 'financ', 'roundtabl', 'transcript', 'onlin', 'paywal', 'access', 'via', 'link', 'left', 'download', 'd…']
['haroldsinnott', 'compon', 'need', 'build', 'smartciti', 'cybersecur', 'iot', '5g', 'industry40', 'machinelearn', 'bigdata', 'infose…']
['aluminiumshow', 'eualuminium', 'partner', 'aluminium', 'trade', 'fair', 'voic', 'entir', 'aluminium', 'valu', 'chain', 'circular', 'nat…']
['ecosavvyrebel', 'opportun', 'attend', 'educ', 'event', 'week', 'ago', 'takeaway', 'need', 'greenbond', 'companie…']
['kailendorg', 'onethird', 'world', 'food', 'current', 'lost', 'wast', 'foodwast', 'cost', 'world', '940bn', 'everi', 'year', 'cut', 'food', 'wast…']
['haroldsinnott', 'compon', 'need', 'build', 'smartciti', 'cybersecur', 'iot', '5g', 'industry40', 'machinelearn', 'bigdata', 'infose…']
['farmafrica', 'onethird', 'world', 'food', 'cur

['kidscaninnov', 'hi', 'global', 'innov', 'camp', 'leader', 'week', '1', 'lesson', 'guid', 'aug', '1317', '2018', 'readi', 'download', 'pleas', 'visit']
['ksanderson727', 'fourthwav', 'great', 'exampl', 'make', 'data', 'access', 'drive', 'deforest', 'action', 'thank', 'traseearth']
['bound4blu', 'pleas', 'announc', 'bound4blu', 'wingsail', 'system', 'select', 'one', '1000', 'effici', 'solu…']
['mvollmer1', 'climat', 'chang', 'look', 'like', 'vr', 'climatechang', 'virtualr', 'vr', 'augmentedr', 'ar', 'sustain', 'ht…']
['mridagroup', 'devendra', 'play', 'crucial', 'role', 'mrida', 'csr', 'partnership', 'ilampf', 'develop', 'faridpur', 'inayat', 'khan', 'bareilli', 'distr…']
['everi', 'tech', 'busi', 'advertis', 'faster', 'processor', 'better', 'batteri', 'life', 'look', 'f…']
['teresanwanjohi', 'need', 'take', 'actiononclim', 'asap', 'altern', 'rich', 'poor', 'includ', 'becom', 'refuge', 'own…']
['mridagroup', 'devendra', 'play', 'crucial', 'role', 'mrida', 'csr', 'partnership', 'ilampf'

['good', 'idea']
['solut', 'reus', 'refus']
['⚡️', '“', 'complet', 'mahoshiva', 'cufflink', 'seri', '05', '”', 'mahoshiva', 'handmad', 'slowfashion', 'musthav', 'wood…']
['disillus', 'youngster', 'leav', 'socialmedia', 'corpor', 'amplifi', 'brand', 'mess…']
['univers', 'project', 'cater', 'healthi', 'sustain', 'environ', 'design', 'energi', 'sustainability…']
['⚡️', '“', 'complet', 'mahoshiva', 'cufflink', 'seri', '06', '”', 'mahoshiva', 'handmad', 'slowfashion', 'musthav', 'wood…']
['oscarwezenbeek', 'proud', 'plastic', 'captur', 'oper', 'indonesia', 'excel', 'collabor', 'govt', 'thousand', 'island', 'akzono…']
['⚡️', '“', 'mahoshiva', 'cufflink', 'flamedmapl', 'mapl', '2017', '”', 'mahoshiva', 'handmad', 'slowfashion', 'musthav', 'wood…']
['grownund', 'otm', 'magazin', 'intern', 'public', 'tfl', 'transport', 'london', 'undergroundfarm', 'see', 'ear…']
['⚡️', '“', 'mahoshiva', 'cufflink', 'walnut', 'mapl', '2017', '”', 'mahoshiva', 'handmad', 'slowfashion', 'musthav', 'wood…']
['kasht

['realli', 'problem', 'must', 'tackl']
['suebhatia', 'kudo', 'countri', 'lead', 'charg', 'make', 'world', 'environment', 'friendlyther', 'still', 'much', 'work', 'in…']
['gerritgrella', 'creativ', 'solut', 'reduc', 'engergi', 'cost', '👍👍👍', 'kirkdborn', 'paulapiccard', 'evankirstel', 'lightboxdigi', 'innovecosys…']
['sargentdisc', 'book', '121', 'lead', 'uk', 'product', 'financ', 'amp', 'sustain', 'expert', 'wearealbert', 'bfi', 'filminukbfc', 'dig…']
['“', 'strang', 'myth', 'manag', 'environment', 'social', 'issu', 'alway', 'even', 'mostli', 'cost', 'money…']
['research', 'show', 'high', 'sustain', 'compani', 'profit', 'outperform', 'in…']
['launch', 'new', 'studi', 'investig', 'alga', 'pasta', 'could', 'meet', 'tast', 'consum', 'read', 'blog…']
['“', 'strang', 'myth', 'manag', 'environment', 'social', 'issu', 'alway', 'even', 'mostli', 'cost', 'money…']
['rdjconsult', 'solar', 'run', 'applianc', 'facil', 'would', 'like', 'know', 'post', 'pic', 'timelin', 'us', 'celebr', 'y…']
['launc

['impact', 'back', 'long', 'term', 'realist', 'fair', 'fund', 'school', 'sustain', 'ict…']
['fauz09412080', 'join', 'activ', 'plant', 'pakistan', '14th', 'august', '2018', 'tloop25', 'plantforpakistan', 'isustain', 'sustain']
['latest', 'star', 'maker', 'news', 'thank', 'absolutecomm…']
['engineeringncl', 'new', 'blog', 'newcastl', 'univers', 'shed', 'light', 'new', 'scienc', 'futur', 'engin', 'first', 'articl', 'went', 'li…']
['hedhntzgroupltd', 'contrast', 'hedhntz', 'executivesearch', 'hr', 'sustain', 'construct', 'properti', 'fashion', 'retail', 'pr', 'media', 'legal', 'se…']
['westaygreen', 'latest', 'stay', 'green', 'daili', 'thank', 'walkoflifecoach', 'sustain', 'csr']
['paulapiccard', 'uniqu', 'skylight', 'reduc', 'energi', 'cost', '☀️', 'solarenergi', 'sustain', 'tech', 'cc', 'kirkdborn', 'jblefevre60', 'evankir…']
['regrann', 'fgtalk', 'oh', 'ye', 'inde', 'start', 'today', 'use', 'wisely🙏🏾👌🏽👍🏾🙌🏾🤗regrann', 'khanshahrukh1', '…']
['bigmicrosurvey', 'uneviron', 'improv', 'wast', 

['congratul', 'thekeystonectr', 'monarch', 'collabor', 'farmer', 'monarch', '2018', 'big', 'sustainability…']
['saudiaramco', 'team', 'mazdausa', 'aisten', 'develop', 'effici', 'lowcarbon', 'engin', 'research', 'wi…']
['would', 'describ', 'product', 'driven', 'disciplin', 'interest', 'work', 'team', 'that…']
['🔥🌐', 'retweet', 'share', 'support', '🌐🔥', 'scottha55665467', 'crisi', 'nicaragua', 'sosnicaragua', 'sustain', 'viralexpnotify…']
['caelusgreenroom', 'alliancebernstein', 'join', 'coalit', 'advanc', 'divers', 'inclus', 'workplac', 'sustain…']
['3blmedia', 'watch', 'breweryviv', 'serv', 'sustain', 'everi', 'beer', 'umsea', 'grad', 'inspir', 'sustain', 'busi', 'b…']
['love', 'new', 'electr', 'vehiclesand', 'look', 'like', 'came', 'movi', 'car']
['caelusgreenroom', 'alliancebernstein', 'join', 'coalit', 'advanc', 'divers', 'inclus', 'workplac', 'sustain…']
['kashthefuturist', 'countri', 'stop', 'plastic', 'pollut', 'ocean', 'sustain', 'wast', 'circulareconomi', 'seasav…']
['sealnetwo

['ecosia', 'want', 'plant', 'tree', 'ecosia', 'launch', 'onlin', 'tshirt', 'store', 'sustain', 'fashion', 'ecosia']
['julesboykoff', 'combat', 'deadli', 'heat', 'tokyo', 'olymp', 'honcho', 'experi', 'spray', 'water', 'sidewalk', 'tri', 'cool', 'n…']
['haroldsinnott', 'futurist', 'check', 'cool', 'sixstep', 'forecast', 'methodolog', 'creat', 'amywebb', 'via', 'm…']
['fig', 'season', 'favourit', 'season', '😍', 'obsess', 'fig', 'italian', 'local', 'sustain', 'eatloc', 'season', 'zerowaste…']
['want', 'help', 'citi', 'act', 'climat', 'chang', 'give', 'data', 'sustain']
['40', 'sustain', 'expert', 'contribut', 'comprehens', 'resourc', 'sustain', 'engin', 'method', 'practica…']
['elllebest', 'interest', 'read', 'edi', 'academ', 'call', 'busi', 'drop', 'standalon', 'sustain', 'csr', 'report']
['demav', 'respons', 'innov', 'great', 'context', 'approach', 'compani', 'sustain', 'great', 'talk', 'andr', 'martinuzzi', 'ao…']
['berlinessainni', 'week', 'lead', 'workshop', '“', 'design', 'sustain', 

['saven4n5shop', 'sainsburi', 'invit', 'respect', 'islingtonbc', 'plan', 'expert', 'saven4n5shop', 'commun', 'wantne…']
['copperenergi', 'didyouknow', 'factor', 'sustain', 'ev', 'globalelectrif', 'watercrisi', 'healthconcern', 'greenarchit…']
['onlin', 'usgbc', 'new', 'member', 'volunt', 'orient', '3', '415', 'pm', 'august', '14', 'usgbc…']
['latest', 'solar', 'wind', 'renew', 'daili', 'thank', 'mabalicia', 'berntothefuture…']
['arikr', 'ye', 'renew', 'energi', 'futur', 'actonclim', 'carbon', 'pollut', 'green', 'sustain', 'environ', 'profr…']
['knowledgestart', 'super', 'cool', 'scienc']
['glengilmor', '❤️', '🚲', '🚲', '🚲', 'japan', '🇯🇵', 'autom', 'bicycl', 'park', 'system', 'engin', 'smartciti', 'smartciti', 'via', 'evankirstel', 'sharing…']
['arikr', 'ye', 'renew', 'energi', 'futur', 'actonclim', 'carbon', 'pollut', 'green', 'sustain', 'environ', 'profr…']
['great', 'tip', 'sustain', 'effort', 'creat', 'posit', 'experi']
['montag', 'top20', 'global', 'brand', 'montagehotel', 'sustain'

['caelusgreenroom', 'anoth', 'satisfi', 'currentst', 'client', 'think', 'inspir', 'multicultur', 'market', 'sustain', 'meet']
['caelusgreenroom', 'anoth', 'satisfi', 'currentst', 'client', 'think', 'inspir', 'multicultur', 'market', 'sustain', 'meet']
['haroldsinnott', 'futurist', 'check', 'cool', 'sixstep', 'forecast', 'methodolog', 'creat', 'amywebb', 'via', 'm…']
['caelusgreenroom', 'anoth', 'satisfi', 'currentst', 'client', 'think', 'inspir', 'multicultur', 'market', 'sustain', 'meet']
['behindthescen', 'interview', 'take', 'rooftop', 'tallest', 'build', 'campu', 'see', 'their…']
['urbanfutureconf', 'good', 'read', 'world', 'sustainablec', 'sustainablebuild', '🏙️', 'oslo', 'seattl', 'bogot…']
['pope', 'visit', 'sustain', 'confer', 'go', 'two', 'week', 'greenheroes…']
['theonlywayiseco', 'lot', 'work', 'done', 'make', 'progress', '3', 'favourit', 'ecotrend', 'far', 'year', '…']
['bring', 'outdoor', 'indoor', '100', 'pure', 'wool', 'carpet', 'sumptuous', 'soft', 'sustain', 'high', 'p

['jaieileenee', 'brain', 'scienc', 'women', 'ye', 'agre', 'plantstrong', 'permacultur', 'sustain', 'ecospiritu', 'hemp', 'answer', 'ar…']
['realli', 'cool', 'toronto', 'plan', 'start', 'use', 'organ', 'wast', 'green', 'bin', 'make', 'fuel', 'next', 'fall', 'biog…']
['matthewschrimpf', 'keep', 'milliondollarhom', 'look', 'like', 'millionbuck', 'keep', 'year', 'hottest', 'design', 'trend', 'susta…']
['thank', 'much', 'model4greenlvng', 'featur', 'us', 'blog', 'one', 'best', 'podcast', 'health', 'sustainabil…']
['recommend', 'anyon', 'job', 'loyal', 'donor', 'offic', 'arizona', '46686']
['wiomax', 'disrupt', 'sharedmobl', 'industri', 'docklessbik', 'scooter', 'program', 'bikeshar', 'smartmob…']
['☀️', 'work', 'lower', 'greenhous', 'ga', 'emiss', 'project', 'like', 'transport', 'electrif', 'goal', 'r…']
['atrin', 'need', 'pilot', 'licens', 'fli', 'car', 'credit', 'nowthisnew', 'rin', 'navig', 'drone', 'drone', 'sustain', 'flyingcar', '…']
['capmaisonhotel', 'campaign', 'show', 'plastic', '

['“', 'small', 'act', 'multipli', 'million', 'peopl', 'transform', 'world', '”', 'howard', 'zinn', 'renewables…']
['benev', 'there', 'widespread', 'desir', 'expect', 'worker', 'consum', 'support', 'healthier', 'planet', 'learn', 'company…']
['monday', 'word', 'peopl', 'planet', 'profit', 'environ', 'climatechang', 'sustain']
['dougblackab', 'wish', 'cancattlemen', 'great', 'canadian', 'beef', 'industri', 'confer', 'cdnbeefconf', 'start', 'tomorrow', 'london', 'ontario', 'htt…']
['9cnewsua', 'like', '5', 'plz', 'rtit', 'job', 'high', 'probabl', 'replac', 'robot', 'insur', 'underwrit', 'farm', 'labor', 'ai', 'automat…']
['catch', 'food', 'client', 'madhi', 'food', 'recent', 'trip', 'patagonia', 'un', 'sanction', 'peac', 'boat', 'sustain', 'cook…']
['amandasiebert', 'mayor', 'offic', 'tri', 'take', 'credit', 'return', 'salmon', 'whale', 'beaver', 'vancouv', 'area', 'umm']
['believ', 'say', 'bioplast', 'answer', 'problem', 'clever', 'market', 'ploy…']
['recommend', 'anyon', 'job', 'land', 

['wadetroxel', 'aclca', 'head', 'citi', 'lcaxviii', 'expert', 'sustain', 'lca', 'highlight', 'the…']
['privileg', 'belo', 'horizont', 'share', 'sustain', 'develop', 'ambit', 'colleagu…']
['offgrid', 'ecofriendli', 'technolog', 'smart', 'cabin', 'design', 'remot', 'live', 'featur', 'autonom', 'utilitie…']
['econom', 'environ', 'amp', 'social', 'equiti', 'sustmem', 'magazin', 'stori', 'via', 'gerbrandh…']
['chang', 'system', 'sustain', 'develop', 'sustain', 'southafrica', '14']
['digitalis', 'sustain', 'work', 'togeth', 'benefit', 'one', 'anoth', 'find', 'digit', 'sustain…']
['sustain', 'activ', 'involv', 'assetmanag', 'build', 'recognis', 'import', 'of…']
['find', 'hottest', 'impact', 'job', 'right', 'hotjob', 'impactcar', 'jobseek', 'jobsearch…']
['need', 'palm', 'oil']
['seedsandchip', 'global', 'popul', 'set', 'hit', '95', 'billion', '2050', 'demand', 'fresh', 'food', 'water', 'outstrip', 'supplie…']
['haroldsinnott', 'compon', 'need', 'build', 'smartciti', 'cybersecur', 'iot', '5g',

['iihsin', 'art', 'histori', 'background', 'priyanka', 'saha', 'aim', 'creat', 'visual', 'dialogu', 'citi', 'peopl', 'keep', 'ide…']
['sunpow', 'pivot', '‘', 'solar', 'energi', 'servic', 'upon', 'exit', 'utilityscal', 'develop', 'solarenergi', 'solarpower…']
['cargil', 'food', 'system', 'complex', 'work', 'deliv', 'valuabl', 'insight', 'custom', 'help', 'increas', 'effici', 't…']
['hope', 'someon', 'back', 'lawrenc', 'quest', 'scaleup', 'reduc', 'food', 'wast', 'uganda', 'beyond…']
['impakterdotcom', 'women', 'save', 'planet', 'annehidalgo', 'mayor', 'pari', 'presid', 'c40cities…']
['impakterdotcom', 'water', 'hold', 'key', 'sustain', 'develop', 'lockwoodem', 'cbm', 'repres', 'un', 'impakt']
['impakterdotcom', 'climat', 'chang', 'global', 'warm', 'erad', 'human', 'climatechang', 'late', 'articl', 'claudeforthomm', 'impakt…']
['citiesgrpuow', 'look', 'fwd', 'host', 'ness', '2nd', 'scienc', 'sustain', 'develop', 'day', 'aim', 'earli', 'career', 'research', 'no…']
['pavegen', 'great', 'sp

['sustev', 'measur', 'interest', 'measur', 'inform', 'amp', 'sustain', 'is…']
['drsplace', 'land', '“', 'use', '”', 'beef', 'gener', 'human', 'food', 'grassland', 'ecosystem', 'without', 'rumin', 'land', 'footprint…']
['whatsnyourpapr', 'know', 'paper', 'calcul', 'includ', 'tissu', 'product', 'papercalcul', 'sustain', 'whatsinyourpape…']
['nyc', 'paint', '6', 'million', 'squar', 'feet', 'rooftop', 'white', 'brilliant', 'heatfight', 'plan']
['drsplace', 'land', '“', 'use', '”', 'beef', 'gener', 'human', 'food', 'grassland', 'ecosystem', 'without', 'rumin', 'land', 'footprint…']
['energysvgtrust', 'join', 'transport', 'expert', 'promin', 'busi', 'govern', 'repres', 'discuss', 'fleet', 'sustain', 'innova…']
['brilliant', 'idea', 'contracostacounti', 'could', 'give', 'away', 'tree', 'sustain', 'supejohngioia', 'dianeburgis…']
['cobocent', 'know', 'cobocent', 'green', 'roof', 'home', '80000', 'honey', 'bee', 'manag', 'partner', 'conserv', 'b…']
['latest', 'headsupdad', 'commun', 'news', 'th

['drsharwood', 'circuitari', 'advanc', 'technolog', 'structur', 'natur', 'materi', 'digit', 'tech', 'ai', 'vr', 'su…']
['pavegen', 'great', 'sponsor', 'meco', 'award', 'themercurymal', 'great', 'work', 'sustain', 'innov', 'r…']
['anyon', 'sustain', 'jobsearch']
['thank', 'forb', 'adrfellowship', 'incred', 'opportun', 'inspir', 'other', 'listen', 'understand', 'tran…']
['fbaddach', 'want', 'contribut', 'make', 'sustain', 'world', 'need', 'inspir', 'like', 'here', 'good', 'sourc', 'susta…']
['here', 'christoph', 'broadbent', 'robertsbridg', 'real', 'found', 'father', 'environment', 'may', 'well', 'neve…']
['csrinact', 'miss', 'initi', 'articl', 'greenbond', 'click', '👉', 'read', 'nigeria', '🇳🇬emerg', 'leader', 'i…']
['zahrahmusa', 'big', 'rush', 'morn', 'ship', 'pass', 'wait', 'sigh', 'netherland', 'sustain']
['aagbi', 'appli', 'becom', 'first', 'ever', 'fellow', 'environment', 'sustain', 'anaesthesia', 'friday', '17', 'august', '2018', 'anaesthesia', 'environ…']
['bmay', 'here', 'christ

['duflotvaleria', '👍🌏thi', 'ac', 'unit', 'doesnt', 'use', 'electr', '—', 'cool', 'better', 'sustain', 'v', 'nowthisnew', 'ht', 'evankirstel…']
['seedsandchip', 'global', 'popul', 'set', 'hit', '95', 'billion', '2050', 'demand', 'fresh', 'food', 'water', 'outstrip', 'supplie…']
['ai', 'help', 'creat', 'better', 'world—if', 'build', 'right', 'via', 'singularityhub', 'betterworld', 'sustain', 'ai']
['unit', 'divers', 'voic', 'drive', 'import', 'initi', 'forward', 'feel', 'lucki', 'work', 'great', 'colleagues…']
['duflotvaleria', '👍🌏thi', 'ac', 'unit', 'doesnt', 'use', 'electr', '—', 'cool', 'better', 'sustain', 'v', 'nowthisnew', 'ht', 'evankirstel…']
['skylinemed', 'couldnt', 'agre', 'gundersenhealth', 'one', 'first', 'implement', 'streamway', 'help', 'minim', 'suction', 'can…']
['warn', 'plastic', 'crisi', 'realli', 'hit', 'home', 'becom', 'humanhealth', 'crisi', 'excel', 'articl', 'a…']
['stjohnnz', 'today', 'talk', 'student', 'volunt', 'opportun', 'aut', '2018', 'volunt', 'expo', 'gre

['duflotvaleria', '👍🌏thi', 'ac', 'unit', 'doesnt', 'use', 'electr', '—', 'cool', 'better', 'sustain', 'v', 'nowthisnew', 'ht', 'evankirstel…']
['antoniobandera', 'thank', 'fashion', 'love', 'support', '💙']
['janetgrang', 'open', 'data', 'could', 'save', 'ocean', 'via', 'marex', 'sustain', 'maritim', 'ship', 'environment…']
['bicyclechar', 'teach', 'bike', 'owner', 'bike', 'mainten', 'skill', 'give', 'power', 'identifi', 'problem', 'bike', 'becom…']
['duflotvaleria', 'scientist', 'discov', 'game', 'chang', 'enzym', 'break', 'plastic', 'day', '😍😍😍', 'plasticfre', 'innov', 'te…']
['antoniobandera', 'love', 'support', 'alway', 'fashion', '💙', 'gracia']
['wiomax', 'disrupt', 'sharedmobl', 'industri', 'docklessbik', 'scooter', 'program', 'bikeshar', 'smartmob…']
['greenhash', 'want', 'know', '2018', 'crypto', 'see', 'massiv', 'shift', 'awar', 'mathemat', 'race', 'mine', 'limit', 'elit', 'amp', 'techie…']
['drdcwahl', 'activ', 'revisit', 'consciou', 'particip', 'collect', 'intellig', 'cc', 'c

['zeroco2', 'share', 'agre', 'electr', 'soon', 'made', 'peopl', 'peopl', 'blockchain', 'renew', 'cleantech', 'pro…']
['kashthefuturist', 'els', 'think', 'reus', 'plastic', 'sustain', 'climatechang', 'plastic', 'wast', 'sdg', 'circulareconomi', '…']
['thai', 'union', 'includ', 'two', 'world', 'follow', 'sustain', 'indic', '2017', 'compani', 'named…']
['okchinyer', 'rise', 'above😊', 'mindingmybusi', 'okrecycl', 'focu', 'okchinyereconsult', 'tax', 'bookkeep', 'account', 'sustainabili…']
['andistaub', 'reread', 'converg', 'digit', 'amp', 'sustain', 'ai', 'iot', 'bigdata', 'fintech', 'insurtech', 'industry40', 'cybersec…']
['damiencabadi', 'countri', 'get', 'hydrogen', 'train', '🤔', 'world', 'econom', 'forum', 'congrat', 'germani', 'tech4good', 'sustain', 'ecology…']
['mahworldc', 'ride', 'bicycl', 'benefit', 'individu', 'world', 'larg', 'mwcchennai', 'partnership', 'w', 'pedl…']
['fudao', 'honour', 'singapor', 'highest', 'design', 'accolad', 'presid', 'design', 'award', 'design', 'y…']
['e

['sunflaircook', 'sun', 'bake', 'bliss', 'home', 'anywher', 'solarcook', 'solaroven', 'bake', 'cook', 'cinnamonrol', 'cinnamonbunss', 'offgri…']
['carolin', 'nettl', 'mission', 'plant', 'tree', 'wale', 'soulect', 'treevac', 'tree', 'sustain']
['axalta', 'support', 'environmentalstewardship', 'program', 'associ', 'nonprofit', 'organ', 'around', 'globe', 'promo…']
['sustain', 'help', 'drive', 'harm', 'mke', 'local', 'economi', 'featur', 'eco', 'erick', 'shambarg', 'source…']
['great', 'piec', 'eldaili', 'smart', 'packag', 'grow', 'popular', 'cant', 'disregard', 'function', 'or…']
['ladolfreezon', 'watch', 'md', 'amyjadesimi', 'arisetv', 'discuss', 'profosinbajo', 'recent', 'visit', 'localcont', 'develop', 'nigeria', 'lad…']
['lystek', 'amp', 'fairfieldsuisun', 'sewer', 'district', 'receiv', 'award', 'excel', 'innov', 'sustain', 'f…']
['theb1m', 'architect', 'engin', 'develop', 'construct', 'ever', 'greener', 'built', 'environ', 'watch', 'happen', 'tree', 'meet', 'bu…']
['coupl', 'member'

['student', 'cornellsg', 'immers', 'describ', 'realworld', 'perspect', 'sustain', 'gain', 'trek', 'ny', 'and…']
['thorsvortex', 'label', 'anyway', 'sustain', 'food', 'farmer', 'gaea😂', 'theolog', 'art', 'salt', 'seattleartsleadershipteam', 'salmon…']
['bradzarnett', 'globalsdgaward', 'search', '4', 'next', 'gener', 'sustain', 'leader', 'mayb', 'compani', 'figur', 't…']
['mathieuflamini', 'lose', 'fall', 'fail', 'get', 'hardworkpaysoff', 'sport', 'sustain', 'makeachang', '💪🏽🌎', 'h…']
['top', 'stori', 'drug', 'amp', 'pharma', 'co', 'news', 'basfcanada', 'globalcompactca', 'cawst', 'edaalberta', 'sherrittintl', 'q2…']
['fear', 'sale', 'tini', 'terror', 'ho', 'game', 'springst', 'renew', 'gold', 'melbourn', 'ausbiz…']
['theb1m', 'architect', 'engin', 'develop', 'construct', 'ever', 'greener', 'built', 'environ', 'watch', 'happen', 'tree', 'meet', 'bu…']
['xfxie', 'work', 'bikeshar', 'dataanalyt', 'implic', 'datadriven', 'decis', 'support', 'appear', 'journal', 'transport', 'geography…']
['

['homepod', 'homepod', 'optim', 'circular', 'economi', 'pattern', 'connect', 'peopl', 'product', 'creativ', 'opportun', 'romania…']
['homepod', 'endtoend', 'build', 'softwar', 'develop', 'want', 'reduc', 'cost', 'homepod', 'homepod', 'startups…']
['green', 'roof', 'planter', 'box', 'local', 'veggi', 'what', 'love', 'incorpor', 'agricultur', 'live', 'impro…']
['homepod', 'mitig', 'oper', 'risk', 'design', 'manag', 'subscript', 'predict', 'eprocur', 'softwar', 'homepod']
['homepod', 'autonom', 'build', 'softwar', 'allow', 'properti', 'develop', 'mitig', 'risk', 'reduc', 'construct', 'cost', 'startup', 'homepod…']
['homepod', 'bore', 'design', 'jump', 'softwar', 'softwar', 'sign', 'startup', 'homepod', 'proptech…']
['make', 'aquafe', 'sustain', 'scientist', 'develop', 'feed', 'use', 'marin', 'microalga', 'coproduct…']
['homepod', 'provid', 'circular', 'economi', 'autonom', 'framework', 'build', 'reduc', 'construct', 'demolit', 'wast', 'europ', 'b…']
['homepod', 'founder', 'attend', 'works

In [53]:
count_vectorizer = CountVectorizer(ngram_range=(1, 2),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.5)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6)

In [61]:
#Count Vectorizer
english_cursor = sustainability_collection.aggregate([{'$match': {'lang': 'en'}},{'$sample':{'size':100}}])

cleaned_tweets = []

for tweet in english_cursor:
    cleaned_tweets.append(cleaned_text(tweet['text'],tokenizer=TreebankWordTokenizer().tokenize, stemmer=PorterStemmer()))
    cv_data = count_vectorizer.fit_transform(cleaned_tweets)
#    tfidf_data = tfidf_vectorizer.fit_transform(cleaned_tweets)

AttributeError: 'list' object has no attribute 'lower'

In [55]:
#TF-IDF
english_cursor = sustainability_collection.aggregate([{'$match': {'lang': 'en'}},{'$sample':{'size':100}}])

cleaned_tweets = []

for tweet in english_cursor:
    cleaned_tweets.append(cleaned_text(tweet['text'],tokenizer=TreebankWordTokenizer().tokenize, stemmer=PorterStemmer()))
#    cv_data = count_vectorizer.fit_transform(cleaned_tweets)
    tfidf_data = tfidf_vectorizer.fit_transform(cleaned_tweets)

In [48]:
for doc in english_cursor:
    print (doc)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




{'_id': ObjectId('5b734d953386b775333287de'), 'created_at': 'Tue Aug 14 21:45:57 +0000 2018', 'id': 1029484293079347200, 'id_str': '1029484293079347200', 'text': 'Finland is the greenest country in the world! Got to make a visit there! #TravelTuesday #GoGreen #EcoFriendly #Sustainability 🍃', 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1029153354407133184, 'id_str': '1029153354407133184', 'name': 'The Pure92 🍃', 'screen_name': 'thepure92', 'location': 'Atlanta, GA', 'url': None, 'description': 'The purest form of life relies on the basis of the 92 natural elements on the periodic table 🌾 #EnvironmentalAwareness #GoGreen', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 3, 'friends_count': 22, 'listed_count

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [60]:
cleaned_tweets

['sweat',
 'small',
 'stuff',
 'actual',
 'make',
 'differ',
 'banplast',
 'plasticban',
 'nostraw',
 'refusethestraw…']

In [56]:
count_vectorizer.get_feature_names()

['amp',
 'better',
 'council',
 'manag',
 'roadverg',
 'time',
 'wildflow',
 'wildlif']

In [57]:
tfidf_vectorizer.get_feature_names()

['actual',
 'banplast',
 'differ',
 'make',
 'nostraw',
 'plasticban',
 'refusethestraw',
 'small',
 'stuff',
 'sweat']

Models

In [11]:
#Define Models

n_comp = 20

lda = LatentDirichletAllocation(n_topics=10,
                                max_iter=10,
                                random_state=42,
                               learning_method='online')
lsa = TruncatedSVD(n_components=n_comp)
nmf = NMF(n_components=n_comp)

In [27]:
#Fit / Transform Models

lda_cv = lda.fit_transform(cv_data)
lsa_tfidf = lsa.fit_transform(tfidf_data)
lsa_cv = lsa.fit_transform(cv_data)
nmf_cv = nmf.fit_transform(cv_data)
nmf_tfidf = nmf.fit_transform(tfidf_data)



ValueError: n_components must be < n_features; got 20 >= 13

In [8]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [26]:
display_topics(lsa,count_vectorizer.get_feature_names(),10)

AttributeError: 'TruncatedSVD' object has no attribute 'components_'

In [None]:
n_comp = 20
lsa_tfidf = TruncatedSVD(n_components=n_comp)
lsa_cv = TruncatedSVD(n_components=n_comp)
nmf_cv = NMF(n_components=n_comp)

lsa_tfidf_data = lsa_tfidf.fit_transform(tfidf_data)
lsa_cv_data = lsa_cv.fit_transform(cv_data)
nmf_cv_data = nmf_cv.fit_transform(cv_data)