# MongoDB playing with Tags in python

In [286]:
%matplotlib inline
import pymongo
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
from bson import json_util, ObjectId
import pandas as pd
from pandas import DataFrame
from pandas.io.json import json_normalize
import numpy as np
import requests
import json, os
import configparser
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import seaborn as sns
import warnings
import random
import pprint
from datetime import datetime
random.seed(datetime.now())
warnings.filterwarnings('ignore')

# Make plots larger
plt.rcParams['figure.figsize'] = (10, 6)

In [287]:
client = MongoClient('localhost', 27017)

In [288]:
db=client.tweets

In [289]:
db.collection_names(include_system_collections=False)

['tweets']

In [290]:
try:
    result = db.tweets.drop()
    print ("analytics tweets dropped")
except:
    pass

analytics tweets dropped


In [291]:
def load_json(j):
    p=os.path.join("data/", j)
    print (p)
    with open(p, 'rU') as f:
      data = [json.loads(row) for row in f]
    return data 

In [292]:
tweets_j=load_json('db_tweets.json')
tweets_j[0]

data/db_tweets.json


{'contributors': None,
 'coordinates': None,
 'created_at': 'Sun Mar 15 21:41:22 +0000 2015',
 'entities': {'hashtags': [{'indices': [128, 136], 'text': 'Bigdata'},
   {'indices': [137, 140], 'text': 'IoT'},
   {'indices': [139, 140], 'text': 'CIO'}],
  'symbols': [],
  'trends': [],
  'urls': [{'display_url': 'bit.ly/1jvQFcK',
    'expanded_url': 'http://bit.ly/1jvQFcK',
    'indices': [105, 127],
    'url': 'http://t.co/gLf50KtZKx'}],
  'user_mentions': [{'id': 474651213,
    'id_str': '474651213',
    'indices': [3, 15],
    'name': 'Marc Wilczek',
    'screen_name': 'MarcWilczek'}]},
 'favorite_count': 0,
 'favorited': False,
 'filter_level': 'low',
 'geo': None,
 'id': 577223085054558208,
 'id_str': '577223085054558208',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'lang': 'en',
 'place': None,
 'possibly_sensitive': False,
 'retweet_count': 0,
 'retweeted': Fa

In [293]:
print(tweets_j[0]['text'])

RT @MarcWilczek: In 2014, the digital universe equaled 1.7 megabytes a minute for every person on Earth: http://t.co/gLf50KtZKx #Bigdata #I…


In [294]:
def tweet_json(tid,text,created,favorite_count,retweet_count,urls,tags):
    j={
    "tweet_id" : tid,
    "text" : text,
    "favorite_count" : favorite_count,  
    "retweet_count" : retweet_count,
    "urls" : urls, 
    "tags" : tags,         
    "created_at" : created}
    return j

In [295]:
# Twitter dates are of the form Sun Mar 15 21:41:54 +0000 2015
datestrings=['Sun Mar 15 21:41:54 +0000 2015','Tue Mar 29 08:11:25 +0000 2011']
from datetime import timedelta
from email.utils import parsedate_tz
from dateutil.parser import parse

def to_datetime(datestring):
    time_tuple = parsedate_tz(datestring.strip())
    dt = datetime(*time_tuple[:6])
    return dt - timedelta(seconds=time_tuple[-1])

ts=to_datetime(datestrings[0])
print (ts.strftime("%Y-%m-%d %H:%M"))
ts=to_datetime(datestrings[1])
print (ts.strftime("%Y-%m-%d %H:%M"))

2015-03-15 21:41
2011-03-29 08:11


In [296]:
hashtags={}
starter_tags=['Bigdata','big data','algorithm','big data','AI','MongoDB','SQL','artificial intelligence','machine learning']
for tag in starter_tags:
    hashtags[tag]=0
urls={}
tags={}
print(hashtags)

{'Bigdata': 0, 'big data': 0, 'algorithm': 0, 'AI': 0, 'MongoDB': 0, 'SQL': 0, 'artificial intelligence': 0, 'machine learning': 0}


In [297]:
sample_tweet_text="RT @TheHesterView Tutorials on big data, big data, AI, MongoDB, SQL, artificial intelligence, machine learning. hackathons, crowdsourcing, #bigdata http://t.co/6HWjCv3BL5 Lets join "
print (sample_tweet_text)

RT @TheHesterView Tutorials on big data, big data, AI, MongoDB, SQL, artificial intelligence, machine learning. hackathons, crowdsourcing, #bigdata http://t.co/6HWjCv3BL5 Lets join 


In [298]:
import re
def tokenize(txt):  
  txt=re.sub(r'\n', ' ',txt)
  txt=re.compile(r'[\.][ ]+').sub(' ',txt)      
  txt=re.compile(r'[\,][ ]+').sub(' ',txt)    
  txt=re.compile(r'[_+;=!@$%^&\*\"\?]').sub(' ',txt)  
  splitter=re.compile(r'[ ]+')
  # Split the words by non-alpha characters
  words=splitter.split(txt)
  return words
print (tokenize(sample_tweet_text))

['RT', 'TheHesterView', 'Tutorials', 'on', 'big', 'data', 'big', 'data', 'AI', 'MongoDB', 'SQL', 'artificial', 'intelligence', 'machine', 'learning', 'hackathons', 'crowdsourcing', '#bigdata', 'http://t.co/6HWjCv3BL5', 'Lets', 'join', '']


In [299]:
s='http://t.co/6HWjCv3BL5'
print (s[0:4].lower())

http


In [300]:
from nltk.corpus import stopwords
stop_words_list = list(stopwords.words('english'))
stop_words={}
for tag in stop_words_list:
    stop_words[tag]=0
print (stop_words.keys())

dict_keys(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'sh

In [301]:
def update_urls_tags(url_list,urls,hashtag_list,hashtags,tag_list,tags):  
  for url in url_list:
      if url in urls:
        urls[url]=urls[url]+1
      else:
        urls[url]=1	 
  for tag in tag_list:
      if tag in tags:
        tags[tag]=tags[tag]+1
      else:
        tags[tag]=1	 
  for hashtag in hashtag_list:
      if hashtag in hashtags:
        hashtags[hashtag]=hashtags[hashtag]+1
      else:
        hashtags[hashtag]=1        
  return urls,hashtags,tags

In [302]:
def extract_tags_urls(dct,words,stop):
  i=0
  tags={} 
  tokens={}     
  urls={}     
  size=len(words)   
  while i < size:
    ngram = words[i]
    i=i+1
    if len(ngram) < 1: continue
    if len(ngram) > 4:        
      if ngram[0:4].lower()=='http':
        if ngram in urls:
          urls[ngram]=urls[ngram]+1
        else:
          urls[ngram]=1	 
    if ngram[0]=='#':
  #    ngram=re.sub(r'\#', '',ngram)     if you want to remove the # 
      tags[ngram]=1 
    if ngram.lower() not in stop:    
        tokens[ngram]=1	          
    if ngram in dct:
      tags[ngram]=1
    if i < (size-1):
      ngram = words[i] + ' ' + words[i+1]
      if words[i].lower() not in stop:    
        tokens[ngram]=1	        
      if ngram in dct:
        tags[ngram]=1
    if i < (size-2):
      ngram = words[i] + ' ' + words[i+1] + ' ' + words[i+2]           
      if ngram in dct:
        tags[ngram]=1
  return list(tags.keys()),list(urls.keys()),list(tokens.keys())
print (extract_tags_urls(hashtags,(tokenize(sample_tweet_text)),stop_words))

(['big data', 'AI', 'MongoDB', 'SQL', 'artificial intelligence', 'machine learning', '#bigdata'], ['http://t.co/6HWjCv3BL5'], ['RT', 'TheHesterView Tutorials', 'TheHesterView', 'Tutorials on', 'Tutorials', 'big data', 'big', 'data big', 'data', 'data AI', 'AI MongoDB', 'AI', 'MongoDB SQL', 'MongoDB', 'SQL artificial', 'SQL', 'artificial intelligence', 'artificial', 'intelligence machine', 'intelligence', 'machine learning', 'machine', 'learning hackathons', 'learning', 'hackathons crowdsourcing', 'hackathons', 'crowdsourcing #bigdata', 'crowdsourcing', '#bigdata http://t.co/6HWjCv3BL5', '#bigdata', 'http://t.co/6HWjCv3BL5 Lets', 'http://t.co/6HWjCv3BL5', 'Lets join', 'Lets', 'join ', 'join'])


In [303]:
cnt=0
for tweet in tweets_j:
    ts=datetime.now()
    try:
        ts=to_datetime(tweet['created_at'])
    except:
        continue  
    favorite_count=0
    try:
        favorite_count=int(tweet['favorite_count'])
    except:
        pass 
    retweet_count=0
    try:
        retweet_count=int(tweet['retweet_count'])
    except:
        pass  
    tweet_tags,tweet_urls,tweet_ngrams=extract_tags_urls(hashtags,(tokenize(tweet['text'])),stop_words)
    urls,hashtags,tags=update_urls_tags(tweet_urls,urls,tweet_tags,hashtags,tweet_ngrams,tags)
    try:
        j=tweet_json(tweet['id'],tweet['text'],ts,favorite_count,retweet_count,tweet_urls,tweet_tags)
        result = db.tweets.insert_one(j)
        cnt+=1
    except:
        pass       
print ("%d tweets inserted."%cnt)

55 tweets inserted.


In [304]:
for tweet in db.tweets.find().limit(9):
    pprint.pprint(tweet)

{'_id': ObjectId('5ac6fc98a313fc0a38a3e47d'),
 'created_at': datetime.datetime(2015, 3, 15, 21, 41, 22),
 'favorite_count': 0,
 'retweet_count': 0,
 'tags': ['#Bigdata', '#I…'],
 'text': 'RT @MarcWilczek: In 2014, the digital universe equaled 1.7 megabytes '
         'a minute for every person on Earth: http://t.co/gLf50KtZKx #Bigdata '
         '#I…',
 'tweet_id': 577223085054558208,
 'urls': ['http://t.co/gLf50KtZKx']}
{'_id': ObjectId('5ac6fc98a313fc0a38a3e47e'),
 'created_at': datetime.datetime(2015, 3, 15, 21, 41, 30),
 'favorite_count': 0,
 'retweet_count': 0,
 'tags': [],
 'text': 'Tanginang database to',
 'tweet_id': 577223118327926785,
 'urls': []}
{'_id': ObjectId('5ac6fc98a313fc0a38a3e47f'),
 'created_at': datetime.datetime(2015, 3, 15, 21, 41, 31),
 'favorite_count': 0,
 'retweet_count': 0,
 'tags': ['#AgTech', '#AgBots', '#Robotics', '#Automation', '#Io…'],
 'text': 'RT @chrisco: The future of farming: robots + big data: '
         'http://t.co/MiEIZmKzEb via @KurzweilAINe

In [305]:
for key, value in hashtags.items():
    print ("%s count %d"%(key, value))

Bigdata count 0
big data count 1
algorithm count 1
AI count 0
MongoDB count 2
SQL count 1
artificial intelligence count 0
machine learning count 0
#Bigdata count 1
#I… count 1
#AgTech count 1
#AgBots count 1
#Robotics count 1
#Automation count 1
#Io… count 1
#cloud count 4
#bigdata count 14
#aws count 3
#ec2 count 2
#BigData count 13
#natgeosxsw count 1
#Analytics count 6
#amazon count 2
#IT count 2
#HRtech count 2
#HR count 2
#People count 2
#Cloud count 3
#data count 1
#DataScientists count 1
#Hackathon count 3
#BDH count 1
#sme count 2
#thamesvalley count 2
#php count 5
#database count 5
#IoT count 1
#job#SeniorOracleDBA count 1
#banktech count 1
#fintech count 1
#ITJob count 3
#Job count 3
#Plymouth count 1
#hiring count 1
#Wilmington count 1
#job count 2
#Sioux count 1
#Wearables count 1
#IOT count 1
#3DPrinting count 1
#Selfie: count 1
#Today count 1
#Zimbabwe count 1
#TeaParty count 1
#tcot count 1
#PJNet count 1


In [306]:
def trim_dct(dct,thresh): 
  tmp={}    
  keys=dct.keys()
  for key in keys:
    if dct[key] > thresh:
        tmp[key]=dct[key]
  return tmp

In [307]:
'''
for key, value in tags.items():
    print ("%s count %d"%(key, value))
'''    

'\nfor key, value in tags.items():\n    print ("%s count %d"%(key, value))\n'

In [308]:
tags=trim_dct(tags,3)

In [309]:
for key, value in tags.items():
    print ("%s count %d"%(key, value))

RT count 32
database count 6
via count 10
#bigdata count 14
#BigData count 13
#Analytics count 6
- count 9


In [310]:
for key, value in urls.items():
    print ("%s count %d"%(key, value))

http://t.co/gLf50KtZKx count 1
http://t.co/MiEIZmKzEb count 1
http://t.co/i37gbFthQw count 1
http://t.co/uYQIXOSMHs count 1
http://t.co/uQSGCKEiOn count 1
http://t.co/6HWjCv3BL5 count 1
http://t.co/rLNk2UtnkL count 1
http://t.co/92XV9CUUpO count 1
http://t.co/fi0MuG9QUH count 1
http://t.co/jRLp7… count 1
http://t.co/w4xw93Tw19 count 2
http://t.co/wdK8JZM7db count 1
http://t.co/tkSfjPpods count 1
http://t.co/mM0… count 1
http… count 1
https://t.co/BGmnXAOlIY count 1
http:… count 1
http://t.co/EBChICLch2 count 1
http://t.co/SgcfxbdQvZ count 1
http://t.c… count 1
http://t.co/Jfh5Xc2hQz count 1
http://t.co/SIHVND8mDY count 2
http://t.co/… count 2
http://t.co/OGcvtI1663 count 1
http://t.co/n5kJRoYydW count 1
https://t.co/TzrPR5rtY8 count 1
http://t.co/UQg7q3EvHp count 1
http://t.co/BZN44MeEYF count 2
http://t.co/rPVg5Kd7q1 count 1
http://t.co/gXAsZVt1IQ count 1
http://t.co/Qh5Lkvcyyz count 1
http://t.co/d6O7BOcfKQ count 2
http://t.co/CUES54iN0m count 1
http://t.co/Oq7Dkb4TRy count 1
http://

Updated October 3, 2017