In [79]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import requests
import re
import unicodedata
from unidecode import unidecode
import warnings
warnings.filterwarnings('ignore')
import time

In [97]:
# Connect to MongoDB with MongoClient's default arguments and select the
# 'capstone' database; pull_geodata reads its collections through `db`.
client = MongoClient()
db = client['capstone']
# Single shared VADER analyzer, used by prep_twitter_df to score tweet text.
sid = SentimentIntensityAnalyzer()           

def pull_geodata(coll_name):
    """Load the geotagged documents of one collection into a DataFrame.

    Args:
        coll_name: Name of the collection to read from the module-level ``db``.

    Returns:
        A pandas DataFrame with one row per document whose ``full_data.geo``
        field exists and is not null.
    """
    has_geo = {"full_data.geo": {"$exists": True, "$ne": None}}
    cursor = db[coll_name].find(has_geo)
    # Materialise the cursor before handing it to pandas.
    return pd.DataFrame(list(cursor))

def clean_text(inputString):
    """Strip non-ASCII characters and URLs from a string.

    Args:
        inputString: Text to clean.

    Returns:
        The text with every character outside the ASCII range dropped and
        every run matching ``http\\S+`` removed. Whitespace around a removed
        URL is left untouched.
    """
    # Code points below 128 are exactly the characters that encode as ASCII.
    ascii_only = "".join(ch for ch in inputString if ord(ch) < 128)
    return re.sub(r"http\S+", "", ascii_only)

def prep_twitter_df(df):
    """Filter a raw tweet frame to US tweets and add derived columns.

    Args:
        df: DataFrame with a ``text`` column and a ``full_data`` column of
            dicts. Each ``full_data`` dict must have
            ``['geo']['coordinates']`` and a ``'place'`` entry that is either
            falsy or a dict with a ``'country_code'`` key.

    Returns:
        A new, independent DataFrame holding only the rows whose place
        country code is ``"US"``, with these columns added or replaced:
        ``lat`` and ``long`` (first and second element of ``coords``),
        ``text`` (passed through ``clean_text``) and ``sentiment`` (the
        result of ``sid.polarity_scores`` on the cleaned text).

    Side effects:
        A ``coords`` column is written onto the caller's ``df``. This is the
        original behaviour and is kept so existing callers still find it.
    """
    df['coords'] = df['full_data'].apply(lambda tweet: tweet['geo']['coordinates'])

    # Rows without place information are treated as non-US.
    is_us = np.array(
        [
            bool(tweet['place']) and tweet['place']['country_code'] == "US"
            for tweet in df['full_data']
        ],
        dtype=bool,
    )

    # Explicit copy: the columns below are added to an independent frame,
    # not to a filtered slice of `df` (chained assignment).
    us_df = df.loc[is_us].copy()

    us_df['lat'] = [pair[0] for pair in us_df['coords']]
    us_df['long'] = [pair[1] for pair in us_df['coords']]

    us_df['text'] = us_df['text'].apply(clean_text)
    us_df['sentiment'] = us_df['text'].apply(sid.polarity_scores)

    return us_df
    

In [98]:
#join census data
def get_census(coords, timeout=10):
    """Look up the census area code for one coordinate pair via the FCC API.

    Args:
        coords: Sequence whose first element is sent as ``lat`` and whose
            second element is sent as ``lon``.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises a timeout error. Without it a stalled connection would
            block the whole batch indefinitely.

    Returns:
        The first result's ``block_fips`` string with its last four
        characters removed, or ``None`` when the response has no results.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    url = 'https://geo.fcc.gov/api/census/area'
    query = {'lat': coords[0], 'lon': coords[1], 'format': 'json'}
    res = requests.get(url, params=query, timeout=timeout)
    # Surface HTTP failures (e.g. rate limiting) instead of misreading the body.
    res.raise_for_status()

    # Parse the body once; a missing or empty 'results' list both mean "no match".
    results = res.json().get('results')
    if not results:
        return None
    # Presumably trims the block-level FIPS down to the tract-level prefix - TODO confirm.
    return results[0]['block_fips'][:-4]

def census_df(df, pause_every=50, pause_seconds=2 * 60,
              hourly_limit=999, hourly_pause_seconds=60 * 60):
    """Add a ``census`` column by calling ``get_census`` on every row's coords.

    Args:
        df: DataFrame with a ``coords`` column; each value is passed to
            ``get_census`` unchanged.
        pause_every: Take a short break after this many requests.
        pause_seconds: Length of the short break, in seconds.
        hourly_limit: After this many requests, take the long break and
            restart the request counter.
        hourly_pause_seconds: Length of the long break, in seconds.

    Returns:
        The same ``df`` object, with the new ``census`` column holding one
        ``get_census`` result per row, in row order.
    """
    total = len(df)
    census = []
    req_count = 0

    # Iterate the values positionally: the frame's index labels need not start
    # at 0 or be contiguous (e.g. after filtering), so label lookups are unsafe.
    for position, coords in enumerate(df['coords'], start=1):
        census.append(get_census(coords))
        req_count += 1

        if position == total:
            # Nothing left to fetch, so there is no point in waiting.
            break

        if req_count >= hourly_limit:
            print("sleep time")
            time.sleep(hourly_pause_seconds)
            req_count = 0
        elif req_count % pause_every == 0:
            print("Just spacing out my calls a bit")
            time.sleep(pause_seconds)

    df['census'] = census
    return df


### HEALTHY FOODS

In [9]:
# Load every document with a non-null full_data.geo from the 'healthy' collection.
healthy_df = pull_geodata('healthy')

In [10]:
healthy_df.shape

(6377, 5)

In [103]:
# Keep the US rows only and add the lat/long, cleaned text and sentiment columns.
us_healthy = prep_twitter_df(healthy_df)

In [104]:
us_healthy.shape

(3682, 8)

### UNHEALTHY FOODS

In [11]:
# Load every document with a non-null full_data.geo from the 'unhealthy' collection.
unhealthy_df = pull_geodata('unhealthy')

In [12]:
unhealthy_df.shape

(7299, 5)

In [84]:
# Keep the US rows only and add the lat/long, cleaned text and sentiment columns.
us_unhealthy = prep_twitter_df(unhealthy_df)

In [99]:
us_unhealthy.shape

(4540, 8)

### GROCERY STORES

In [105]:
# Load every document with a non-null full_data.geo from the 'grocery_stores' collection.
grocery_df = pull_geodata('grocery_stores')

In [106]:
grocery_df.shape

(447, 5)

In [107]:
# Keep the US rows only and add the lat/long, cleaned text and sentiment columns.
us_grocery = prep_twitter_df(grocery_df)

In [108]:
us_grocery.shape

(417, 8)

### FAST FOOD STORES

In [109]:
# Load every document with a non-null full_data.geo from the 'ff_stores' collection.
fastfood_df = pull_geodata('ff_stores')

In [110]:
fastfood_df.shape

(8067, 5)

In [111]:
# Keep the US rows only and add the lat/long, cleaned text and sentiment columns.
us_fastfood = prep_twitter_df(fastfood_df)

In [112]:
us_fastfood.shape

(4334, 8)