In [1]:
import pandas as pd
import numpy as np
from pymongo import MongoClient

%matplotlib inline

# Open the connection to the MongoDB server and select its `twitter` database.
mongo_client = MongoClient(host='18.236.138.158', port=27016)
database_reference = mongo_client['twitter']

In [2]:
from mongo_aggregation_verbs import *

In [3]:
# Collection queried by every cell below.
collection_reference = database_reference['instructor_test_group']

In [4]:
# Total number of documents in the collection.
# `Collection.count()` was deprecated in PyMongo 3.7 and removed in 4.0;
# `count_documents({})` is the supported way to get the exact count.
collection_reference.count_documents({})

20000

In [5]:
# Exact `source` value that the two filters below compare against.
INSTAGRAM_SOURCE = '<a href="http://instagram.com" rel="nofollow">Instagram</a>'

# Query filters: tweets whose source equals / does not equal the Instagram value.
source_is_instagram = {'source': INSTAGRAM_SOURCE}
source_is_not_instagram = {'source': {'$ne': INSTAGRAM_SOURCE}}

In [6]:
# (Instagram tweet count, non-Instagram tweet count).
# `Cursor.count()` was deprecated in PyMongo 3.7 and removed in 4.0, so the
# filters are passed straight to `count_documents` instead of `find(...).count()`.
(collection_reference.count_documents(source_is_instagram),
 collection_reference.count_documents(source_is_not_instagram))

(1907, 18093)

## Tweet Locations

In [7]:
# Reusable query fragments for the location analysis.
not_null = {'$ne': None}                               # operator: value is not null
nonnull_geo = {'geo': not_null}                        # filter: tweets with a non-null `geo`
keep_geo_and_text = {'geo': 1, 'text': 1, '_id': 0}    # projection: geo and text only, no _id

# Ready-made filter stages for each side of the comparison. `MATCH` comes from
# `mongo_aggregation_verbs` (presumably the "$match" stage name; confirm there).
match_insta = {MATCH: source_is_instagram}
match_not_insta = {MATCH: source_is_not_instagram}

In [8]:
# Count the Instagram-sourced tweets that carry a non-null `geo` document.
# The count is emitted under the "geo" key of the single result document.
cursor = collection_reference.aggregate(
    [
        match_insta,
        {MATCH: nonnull_geo},
        {COUNT: "geo"},
    ]
)
next(cursor)

{'geo': 1907}

In [9]:
# Count the non-Instagram tweets that carry a non-null `geo` document.
# The count is emitted under the "geo" key of the single result document.
cursor = collection_reference.aggregate(
    [
        match_not_insta,
        {MATCH: nonnull_geo},
        {COUNT: "geo"},
    ]
)
next(cursor)

{'geo': 1045}

In [10]:
def group_and_count(key):
    """Build an aggregation stage that groups by ``key`` and counts each group.

    Parameters
    ----------
    key
        Grouping expression stored under ``"_id"``; the notebook passes either
        a ``'$field'`` path string or a dict of such paths.

    Returns
    -------
    dict
        A one-entry dict mapping the ``GROUP`` verb to the group specification,
        whose ``"count"`` field adds 1 for every document in the group.
    """
    group_spec = {"_id": key, "count": {"$sum": 1}}
    return {GROUP: group_spec}

In [11]:
def parse_geo_from_tweets(tweets):
    """Flatten ``group_and_count('$geo')`` results into a DataFrame.

    Parameters
    ----------
    tweets : list of dict
        Aggregation results. Each record holds the grouped geo document under
        ``'_id'`` and the group size under ``'count'``.

    Returns
    -------
    pandas.DataFrame
        One row per record: one column per key of the ``'_id'`` document plus
        a ``'count'`` column. An empty input yields an empty frame with the
        ``coordinates``, ``type`` and ``count`` columns.
    """
    grouped = pd.DataFrame(tweets)
    if grouped.empty:
        # An empty result has no '_id' column, so indexing it would raise
        # KeyError; return the expected (empty) shape instead.
        return pd.DataFrame(columns=['coordinates', 'type', 'count'])
    geo = pd.DataFrame(list(grouped['_id'].values))
    # Both frames share the default 0..n-1 index, so the counts line up row by row.
    geo['count'] = grouped['count']
    return geo

In [12]:
# Locations shared by more than 14 geo-tagged non-Instagram tweets,
# most frequent first.
cursor = collection_reference.aggregate(
    [
        match_not_insta,
        {MATCH: nonnull_geo},
        group_and_count('$geo'),
        {MATCH: {"count": {"$gt": 14}}},
        {SORT: {"count": -1}},
    ]
)
not_insta = parse_geo_from_tweets(list(cursor))
not_insta

Unnamed: 0,coordinates,type,count
0,"[34.0522342, -118.2436849]",Point,206
1,"[37.3813444, -122.1802812]",Point,39
2,"[34.1425078, -118.255075]",Point,31
3,"[36.778261, -119.4179324]",Point,21
4,"[35.426667, -116.89]",Point,17
5,"[34.0508369, -118.263032]",Point,16
6,"[34.0194543, -118.4911912]",Point,15


In [13]:
# Locations shared by more than 14 geo-tagged Instagram tweets,
# most frequent first.
cursor = collection_reference.aggregate(
    [
        match_insta,
        {MATCH: nonnull_geo},
        group_and_count('$geo'),
        {MATCH: {"count": {"$gt": 14}}},
        {SORT: {"count": -1}},
    ]
)
insta = parse_geo_from_tweets(list(cursor))
insta

Unnamed: 0,coordinates,type,count
0,"[34.0522, -118.243]",Point,465
1,"[36.48863024, -119.72972051]",Point,37
2,"[34.09799334, -118.33866453]",Point,35
3,"[34.07305556, -118.39944444]",Point,29
4,"[34.0221, -118.481]",Point,27
5,"[34.0402214, -118.2545227]",Point,16
6,"[33.9442368, -118.3975983]",Point,15


In [14]:
import folium
# Initial map centre passed to folium as `location`, and the initial zoom level.
starting_loc = [34.0689, -118.4452]
la_map = folium.Map(zoom_start=12, location=starting_loc)

In [15]:
# One marker per hotspot: red for non-Instagram, blue for Instagram.
# The popup shows how many tweets share that location.
for frame, marker_color in ((not_insta, 'red'), (insta, 'blue')):
    for loc, count in frame[['coordinates', 'count']].values:
        popup = folium.Popup(str(count), parse_html=True)
        icon = folium.Icon(color=marker_color)
        folium.Marker(loc, popup=popup, icon=icon).add_to(la_map)



In [16]:
# Render the map with the red (non-Instagram) and blue (Instagram) hotspot markers.
la_map

In [17]:
def parse_geo_from_tweets(tweets):
    """Flatten grouped-by-geo aggregation results into a DataFrame.

    NOTE(review): this repeats the ``parse_geo_from_tweets`` helper defined
    earlier in the notebook with the same behaviour; the redefinition looks
    redundant.

    ``tweets`` is a list of records holding the grouped document under
    ``'_id'`` and the group size under ``'count'``. The result has one column
    per key of the ``'_id'`` document plus a ``'count'`` column.
    """
    grouped = pd.DataFrame(tweets)
    id_documents = grouped['_id'].tolist()
    return pd.DataFrame(id_documents).assign(count=grouped['count'])

In [18]:
# Up to ten non-Instagram users with more than 14 geo-tagged tweets,
# most active first.
cursor = collection_reference.aggregate(
    [
        match_not_insta,
        {MATCH: nonnull_geo},
        group_and_count('$user.id'),
        {MATCH: {"count": {"$gt": 14}}},
        {SORT: {"count": -1}},
        {LIMIT: 10},
    ]
)
not_insta_top_users = pd.DataFrame(list(cursor))
not_insta_top_users

Unnamed: 0,_id,count
0,4549072827,29
1,787687147,29
2,1414684496,27
3,3066057658,27
4,789990810,27
5,4191239027,25
6,21298660,21
7,3864064936,19
8,21298373,19
9,3380828067,17


In [19]:
# Ten Instagram users with the most geo-tagged tweets, most active first.
# Unlike the non-Instagram query, no minimum-count filter is applied.
cursor = collection_reference.aggregate(
    [
        match_insta,
        {MATCH: nonnull_geo},
        group_and_count('$user.id'),
#         { MATCH : { "count" : { "$gt" : 10 } } },
        {SORT: {"count": -1}},
        {LIMIT: 10},
    ]
)
insta_top_users = pd.DataFrame(list(cursor))
insta_top_users

Unnamed: 0,_id,count
0,1455659006,10
1,613833206,8
2,843390093012353024,6
3,4561143733,6
4,19640448,5
5,226456467,5
6,37016954,4
7,760160463833313280,4
8,30723561,4
9,2267807461,4


In [20]:
# User ids (the `_id` column of each top-user table) as numpy arrays.
not_insta_top_users_ids = not_insta_top_users['_id'].values
insta_top_users_ids = insta_top_users['_id'].values

In [21]:
# Plain Python ints for the `$in` filters used further down
# (presumably because numpy integers cannot be sent in a query; confirm).
not_insta_top_users_ids_list = list(map(int, not_insta_top_users_ids))
insta_top_users_ids_list = list(map(int, insta_top_users_ids))

In [22]:
def parse_geo_from_tweets(tweets):
    """Flatten grouped aggregation results into a DataFrame of locations.

    This definition replaces the earlier helper of the same name, so it
    accepts both group-key shapes used in the notebook. Without that,
    re-running an earlier location cell after this one raises
    ``KeyError: 'geo'``.

    Parameters
    ----------
    tweets : list of dict
        Aggregation results with the group key under ``'_id'`` and the group
        size under ``'count'``. The key is either a geo document itself, or a
        compound ``{'user_id': ..., 'geo': {...}}`` document.

    Returns
    -------
    pandas.DataFrame
        One row per record with the geo document's keys as columns, a
        ``'user_id'`` column when the key is compound, and a ``'count'``
        column.
    """
    grouped = pd.DataFrame(tweets)
    keys = pd.DataFrame(list(grouped['_id'].values))
    if 'geo' in keys.columns:
        # Compound key: unpack the nested geo document and carry the user id along.
        geo = pd.DataFrame(list(keys['geo'].values))
        geo['user_id'] = keys['user_id']
    else:
        # Plain geo key: the key columns already are the geo fields.
        geo = keys
    geo['count'] = grouped['count']
    return geo

In [23]:
# Tweet count per (user, location) pair for the top non-Instagram users.
cursor = collection_reference.aggregate(
    [
        match_not_insta,
        {MATCH: nonnull_geo},
        {PROJECT: {"user_id": "$user.id", "geo": 1, "text": 1, "_id": 0}},
        {MATCH: {"user_id": {"$in": not_insta_top_users_ids_list}}},
        group_and_count({"user_id": "$user_id", "geo": "$geo"}),
    ]
)

not_insta_top_user_geo = parse_geo_from_tweets(list(cursor))

In [24]:
# Tweet count per (user, location) pair for the top Instagram users.
cursor = collection_reference.aggregate(
    [
        match_insta,
        {MATCH: nonnull_geo},
        {PROJECT: {"user_id": "$user.id", "geo": 1, "text": 1, "_id": 0}},
        {MATCH: {"user_id": {"$in": insta_top_users_ids_list}}},
        group_and_count({"user_id": "$user_id", "geo": "$geo"}),
    ]
)
insta_top_user_geo = parse_geo_from_tweets(list(cursor))

In [25]:
# Preview the first rows of the per-user location counts for the top non-Instagram users.
not_insta_top_user_geo.head()

Unnamed: 0,coordinates,type,user_id,count
0,"[34.19743613, -118.58178967]",Point,4549072827,1
1,"[34.03491, -118.27746]",Point,4191239027,1
2,"[35.7476654, -118.060997]",Point,1414684496,1
3,"[34.0995, -118.32813]",Point,4191239027,1
4,"[34.187044, -118.3812562]",Point,789990810,1


In [26]:
# Preview the first rows of the per-user location counts for the top Instagram users.
insta_top_user_geo.head()

Unnamed: 0,coordinates,type,user_id,count
0,"[34.04453451, -118.26677639]",Point,226456467,1
1,"[34.06895637, -118.40267947]",Point,1455659006,1
2,"[34.0221, -118.481]",Point,1455659006,1
3,"[34.07305556, -118.39944444]",Point,1455659006,5
4,"[34.08718311, -118.46354276]",Point,19640448,1


In [27]:
# Marker colour for each top non-Instagram user, keyed by user id.
# The keys are the ten ids shown in the `not_insta_top_users` table. The values
# are colour names that are later passed to `folium.Icon(color=...)`.
# A user id missing from this dict makes the colour lookup in the next cell
# raise KeyError, so the dict must be updated if the top-user list changes.
colors_not_insta = {
    4549072827 : 'red',
    3066057658 : 'blue',
    1414684496 : 'green',
    21298660 : 'purple',
    3380828067 : 'orange',
    787687147 : 'darkred',
    21298373 : 'lightred',
    3864064936 : 'beige',
    4191239027 : 'darkblue',
    789990810 : 'darkgreen',
}

# Disabled: hex-colour mapping for the top Instagram users. Nothing active in
# this part of the notebook reads it (its only use is also commented out).
# colors_insta = {
#     760160463833313280 : '#0000ff',
#     30723561 : '#0010ff',
#     613833206 : '#0020ff',
#     2267807461 : '#0030ff',
#     4561143733 : '#0040ff',
#     1455659006 : '#0050ff',
#     37016954 : '#0060ff',
#     19640448 : '#0070ff',
#     843390093012353024 : '#0080ff',
#     226456467 : '#0090ff',
# }

In [28]:
# Attach each row's marker colour by looking up its user id in `colors_not_insta`.
# An id that is not in the dict raises KeyError.
not_insta_top_user_geo['color'] = not_insta_top_user_geo['user_id'].apply(
    lambda user_id: colors_not_insta[user_id]
)
# insta_top_user_geo['color'] = insta_top_user_geo.user_id.apply(lambda x: colors_insta[x])

In [29]:
# Random preview of up to 10 rows. The fixed seed makes the preview identical
# on every run, and the size cap stops `sample` from raising when the frame
# has fewer than 10 rows.
not_insta_top_user_geo.sample(min(10, len(not_insta_top_user_geo)), random_state=42)

Unnamed: 0,coordinates,type,user_id,count,color
60,"[34.1583323, -118.5155901]",Point,21298660,1,purple
39,"[34.19533677, -118.57134192]",Point,4549072827,1,red
53,"[34.165357, -118.6089752]",Point,789990810,1,darkgreen
73,"[34.0522342, -118.2436849]",Point,21298373,10,lightred
59,"[34.19948803, -118.58392246]",Point,4549072827,1,red
109,"[34.1898566, -118.451357]",Point,789990810,2,darkgreen
87,"[38.8105011, -122.8171692]",Point,1414684496,1,green
57,"[34.0522342, -118.2436849]",Point,789990810,16,darkgreen
51,"[34.19798999, -118.57596955]",Point,4549072827,1,red
71,"[34.0536608, -118.4133894]",Point,787687147,2,darkred


In [30]:
# Start again from a fresh map (same centre and zoom as before) so the
# per-user markers are not drawn on top of the earlier hotspot markers.
starting_loc = [34.0689, -118.4452]
la_map = folium.Map(zoom_start=12, location=starting_loc)

In [31]:
# One marker per (user, location) pair of the top non-Instagram users,
# coloured per user. The popup shows the tweet count for that pair.
for loc, color, count in not_insta_top_user_geo[['coordinates', 'color', 'count']].values:
    popup = folium.Popup(str(count), parse_html=True)
    if not count < 3:
        # Locations with 3 or more tweets from the same user are left off the map.
        continue
    icon = folium.Icon(color=color)
    folium.Marker(loc, popup=popup, icon=icon).add_to(la_map)
# Disabled alternatives from the original cell:
#   - draw the skipped locations with icon='warning'
#   - add blue markers for the `insta` hotspot frame



In [32]:
# Render the rebuilt map with the per-user markers (locations with fewer than 3 tweets per user).
la_map