In [79]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import requests
import re
import unicodedata
from unidecode import unidecode
import warnings
warnings.filterwarnings('ignore')
import time

In [97]:
client = MongoClient()
db = client['capstone']
sid = SentimentIntensityAnalyzer()           

def pull_geodata(coll_name):
    coll=db[coll_name]
    query = coll.find({"full_data.geo" : {"$exists" : True,"$ne" : None}})
    df = pd.DataFrame(list(query))
    return df

def clean_text(inputString):
    final = ""
    for letter in inputString:
        try:
         letter.encode("ascii")
         final += letter
        except UnicodeEncodeError:
         final += ''
    return re.sub(r"http\S+", "", final)

def prep_twitter_df(df):
    df['coords'] = df['full_data'].apply(lambda x: x['geo']['coordinates'])
    
    code = []
    for i in df['full_data']:
        if i['place']:
            code.append(i['place']['country_code'])
        else:
            code.append(None)
    code = np.array(code)
    us_df = df[code=="US"]
    
    us_df['lat'] = us_df.coords.apply(lambda x: x[0])
    us_df['long'] = us_df.coords.apply(lambda x: x[1])
    
    us_df['text'] = us_df['text'].apply(clean_text)
    
    
    us_df['sentiment'] = us_df['text'].apply(lambda x: sid.polarity_scores(x))
    
    return us_df
    

In [134]:
#join census data
def get_census(coords):
    url='https://geo.fcc.gov/api/census/area?lat={}&lon={}&format=json'.format(coords[0],coords[1])
    res = requests.get(url)
    if res.json()['results']==[]:
        return None
    else:
        return res.json()['results'][0]['block_fips'][:-4]

def census_df(df):
    census = []
    count=0
    req_count=0
    while len(census)< df.shape[0]:
        census.append(get_census(df['coords'][count]))
        if req_count % 1000 == 0:
            print("Current count: ",req_count)
            time.sleep(30)

    df['census']=census
    return df


### HEALTHY FOODS

In [140]:
healthy_df = pull_geodata('healthy')

In [146]:
healthy_df.head()

Unnamed: 0,_id,full_data,keyword,sentiment,text
0,5c9cde8a626b30bd695ddf36,{'created_at': 'Thu Mar 28 13:07:03 +0000 2019...,almond,"{'neg': 0.0, 'neu': 0.678, 'pos': 0.322, 'comp...","W E D D I N G A L M O N D 🥂\nPink, bright, so..."
1,5c9cde8a626b30bd695ddf3b,{'created_at': 'Thu Mar 28 13:04:25 +0000 2019...,almond,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",@NutellaUSA #almond #pearl #waffles @ Inn At T...
2,5c9cde8a626b30bd695ddf42,{'created_at': 'Thu Mar 28 12:59:58 +0000 2019...,almond,"{'neg': 0.0, 'neu': 0.791, 'pos': 0.209, 'comp...",Good Spring Morning to all the Beautiful Souls...
3,5c9cde8a626b30bd695ddf6c,{'created_at': 'Thu Mar 28 12:34:53 +0000 2019...,almond,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",So good!!!!!\nTwo scoops Level-1 vanilla ice c...
4,5c9cde8b626b30bd695ddfd5,{'created_at': 'Thu Mar 28 11:44:10 +0000 2019...,almond,"{'neg': 0.0, 'neu': 0.763, 'pos': 0.237, 'comp...",Hello beautiful soul. 💕 Thursday reminder...yo...


In [147]:
us_healthy = prep_twitter_df(healthy_df)

In [148]:
us_healthy.shape

(3682, 8)

In [121]:
census=[]

In [128]:
for coords in us_healthy['coords'][3000:].values:
    census.append(get_census(coords))

In [129]:
len(census)

3682

In [None]:
census_healthy = census_df(us_healthy)

In [131]:
census_healthy = us_healthy

In [143]:
census_healthy.shape

(3682, 9)

In [133]:
census_healthy.head()

Unnamed: 0,_id,full_data,keyword,sentiment,text,coords,lat,long,census
0,5c9cde8a626b30bd695ddf36,{'created_at': 'Thu Mar 28 13:07:03 +0000 2019...,almond,"{'neg': 0.0, 'neu': 0.655, 'pos': 0.345, 'comp...","W E D D I N G A L M O N D \nPink, bright, sop...","[40.7537, -73.97254]",40.7537,-73.97254,36061009200
1,5c9cde8a626b30bd695ddf3b,{'created_at': 'Thu Mar 28 13:04:25 +0000 2019...,almond,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",@NutellaUSA #almond #pearl #waffles @ Inn At T...,"[44.14179362, -71.17940884]",44.141794,-71.179409,33003955100
2,5c9cde8a626b30bd695ddf42,{'created_at': 'Thu Mar 28 12:59:58 +0000 2019...,almond,"{'neg': 0.0, 'neu': 0.785, 'pos': 0.215, 'comp...",Good Spring Morning to all the Beautiful Souls...,"[42.1396065, -87.9076157]",42.139606,-87.907616,17031802506
3,5c9cde8a626b30bd695ddf6c,{'created_at': 'Thu Mar 28 12:34:53 +0000 2019...,almond,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",So good!!!!!\nTwo scoops Level-1 vanilla ice c...,"[34.924, -81.0282]",34.924,-81.0282,45091060501
4,5c9cde8b626b30bd695ddfd5,{'created_at': 'Thu Mar 28 11:44:10 +0000 2019...,almond,"{'neg': 0.0, 'neu': 0.755, 'pos': 0.245, 'comp...",Hello beautiful soul. Thursday reminder...you...,"[39.9008, -74.8239]",39.9008,-74.8239,34005703802


### UNHEALTHY FOODS

In [149]:
unhealthy_df = pull_geodata('unhealthy')

In [150]:
unhealthy_df.shape

(7299, 5)

In [151]:
us_unhealthy = prep_twitter_df(unhealthy_df)

In [152]:
us_unhealthy.shape

(4540, 8)

In [153]:
census_unhealthy = census_df(us_unhealthy)

KeyError: 0

### GROCERY STORES

In [105]:
grocery_df = pull_geodata('grocery_stores')

In [106]:
grocery_df.shape

(447, 5)

In [107]:
us_grocery = prep_twitter_df(grocery_df)

In [108]:
us_grocery.shape

(417, 8)

### FAST FOOD STORES

In [109]:
fastfood_df = pull_geodata('ff_stores')

In [110]:
fastfood_df.shape

(8067, 5)

In [111]:
us_fastfood = prep_twitter_df(fastfood_df)

In [112]:
us_fastfood.shape

(4334, 8)