In [118]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import requests
import re
import unicodedata
from unidecode import unidecode
import warnings
warnings.filterwarnings('ignore')
import time
import ast

In [119]:
# Shared handles used by the helper functions in this notebook.

# VADER sentiment analyzer used to score tweet text.
sid = SentimentIntensityAnalyzer()

# Mongo client created with its default connection settings; the tweet
# collections are read from the 'capstone' database.
client = MongoClient()
db = client['capstone']

In [120]:
def pull_geodata(coll_name):
    """Load every geo-tagged document of one Mongo collection into a frame.

    Parameters
    ----------
    coll_name : str
        Name of the collection inside the notebook-level ``db`` database.

    Returns
    -------
    pandas.DataFrame
        One row per document whose ``full_data.geo`` field exists and is
        not null.
    """
    geo_filter = {"full_data.geo" : {"$exists" : True,"$ne" : None}}
    documents = list(db[coll_name].find(geo_filter))
    return pd.DataFrame(documents)

def clean_text(inputString):
    """Remove non-ASCII characters (emojis, accented letters, ...) and URLs.

    Parameters
    ----------
    inputString : str
        Raw text.

    Returns
    -------
    str
        ``inputString`` with every character that cannot be encoded as ASCII
        dropped, and then every ``http...`` run of non-whitespace characters
        removed. Surrounding whitespace is left untouched.
    """
    # errors="ignore" drops un-encodable characters in a single pass, which
    # replaces the per-character try/except and repeated string concatenation.
    ascii_only = inputString.encode("ascii", "ignore").decode("ascii")
    return re.sub(r"http\S+", "", ascii_only)

def prep_twitter_df(df, analyzer=None):
    """Keep US tweets and add coordinate, URL-free text and sentiment columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``full_data`` column of tweet dicts (each providing
        ``geo.coordinates`` and a ``place`` entry) and a ``text`` column.
        A ``coords`` column is added to this frame in place.
    analyzer : object, optional
        Anything exposing ``polarity_scores(text)``. Defaults to the
        notebook-level ``sid`` analyzer.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the rows whose place country code is ``"US"``,
        keeping the original index, with these columns added or replaced:
        ``coords``, ``lat`` (first coordinate), ``long`` (second coordinate),
        ``text`` (URLs removed) and ``sentiment`` (the analyzer's result for
        the cleaned text).
    """
    if analyzer is None:
        analyzer = sid

    df['coords'] = df['full_data'].apply(lambda tweet: tweet['geo']['coordinates'])

    def _country_code(tweet):
        # Tweets without a place entry have no country code and are excluded.
        place = tweet.get('place')
        return place['country_code'] if place else None

    is_us = np.array(
        [_country_code(tweet) == "US" for tweet in df['full_data']],
        dtype=bool,
    )

    # Take an explicit copy so the column assignments below modify a frame we
    # own rather than a slice of `df` (avoids SettingWithCopyWarning).
    us_df = df.loc[is_us].copy()

    us_df['lat'] = us_df['coords'].apply(lambda pair: pair[0])
    us_df['long'] = us_df['coords'].apply(lambda pair: pair[1])

    # Strip URLs before scoring so links do not influence the sentiment.
    us_df['text'] = us_df['text'].apply(lambda text: re.sub(r"http\S+", "", text))
    us_df['sentiment'] = us_df['text'].apply(analyzer.polarity_scores)
    return us_df

In [121]:
def get_census(coords, timeout=10):
    """Look up the census code for one coordinate pair via the FCC Area API.

    Parameters
    ----------
    coords : sequence
        ``coords[0]`` is sent as ``lat`` and ``coords[1]`` as ``lon``.
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` raises a
        timeout error, so one stalled call cannot hang a long geocoding run.

    Returns
    -------
    str or None
        The first result's ``block_fips`` with its last four characters
        removed, or ``None`` when the response carries no results.
    """
    response = requests.get(
        'https://geo.fcc.gov/api/census/area',
        # Let requests build and URL-encode the query string.
        params={'lat': coords[0], 'lon': coords[1], 'format': 'json'},
        timeout=timeout,
    )
    # Parse the body once; a missing or empty 'results' means "no match".
    results = response.json().get('results')
    if not results:
        return None
    # Dropping the trailing four characters keeps the coarser prefix of the
    # block code (the notebook's stated goal is mapping tweets to tracts).
    return results[0]['block_fips'][:-4]

def census_df(df, batch_size=1000, pause_seconds=30, lookup=None):
    """Add a ``census`` column by geocoding every row's ``coords`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``coords`` column. The ``census`` column is added to
        this frame in place.
    batch_size : int, optional
        Number of lookups between progress messages / pauses. A falsy value
        disables both.
    pause_seconds : float, optional
        How long to sleep after each full batch to throttle requests.
    lookup : callable, optional
        Function mapping one ``coords`` value to a census code. Defaults to
        ``get_census``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``census`` column.
    """
    if lookup is None:
        lookup = get_census

    total = len(df)
    census = []
    for req_count, coords in enumerate(df['coords'], start=1):
        census.append(lookup(coords))
        if batch_size and req_count % batch_size == 0:
            print("Current count: ",req_count)
            # No point in waiting once the final row has been processed.
            if req_count < total:
                time.sleep(pause_seconds)

    df['census'] = census
    return df


In [153]:
def pull_to_csv(coll_name,path_for_csv):
    """Run the whole pipeline for one collection and save the result as CSV.

    Chains ``pull_geodata`` -> ``prep_twitter_df`` -> ``census_df`` and
    writes the resulting frame to ``path_for_csv``.

    Parameters
    ----------
    coll_name : str
        Mongo collection to read.
    path_for_csv : str
        Destination passed to ``DataFrame.to_csv``.

    Returns
    -------
    pandas.DataFrame
        The frame that was written to disk.
    """
    geocoded = census_df(prep_twitter_df(pull_geodata(coll_name)))
    geocoded.to_csv(path_for_csv)
    return geocoded

### HEALTHY FOODS

In [9]:
# Step-by-step run for the 'healthy' collection (same stages as pull_to_csv).
# 1. Load the documents that carry geo data.
healthy_df = pull_geodata('healthy')

# 2. Keep US tweets; add lat/long, URL-free text and sentiment.
us_healthy = prep_twitter_df(healthy_df)

# 3. Geocode every row through get_census (one API request per tweet).
census_healthy = census_df(us_healthy)

# 4. Persist the geocoded frame.
census_healthy.to_csv("data/twitter_mongo/census_healthy1.csv")

OR

In [None]:
# Single-call alternative to the step-by-step cell: pull, prep, geocode and save.
census_healthy = pull_to_csv('healthy',"data/twitter_mongo/census_healthy1.csv")

### HEALTHY FOODS Second Pull

In [11]:
# Step-by-step run for the 'healthy2' collection (same stages as pull_to_csv).
# 1. Load the documents that carry geo data.
healthy_df2 = pull_geodata('healthy2')

# 2. Keep US tweets; add lat/long, URL-free text and sentiment.
us_healthy2 = prep_twitter_df(healthy_df2)

# 3. Geocode every row through get_census (one API request per tweet).
census_healthy2 = census_df(us_healthy2)

# 4. Persist the geocoded frame.
# NOTE(review): this writes under data/, while the single-call alternative for
# this pull writes under data/twitter_mongo/ -- confirm which is intended.
census_healthy2.to_csv("data/census_healthy2.csv")

OR

In [None]:
# Single-call alternative to the step-by-step cell: pull, prep, geocode and save.
census_healthy2 = pull_to_csv('healthy2',"data/twitter_mongo/census_healthy2.csv")

### UNHEALTHY FOODS

In [149]:
# Step-by-step run for the 'unhealthy' collection (same stages as pull_to_csv).
# 1. Load the documents that carry geo data.
unhealthy_df = pull_geodata('unhealthy')

# 2. Keep US tweets; add lat/long, URL-free text and sentiment.
us_unhealthy = prep_twitter_df(unhealthy_df)

# 3. Geocode every row through get_census (one API request per tweet).
census_unhealthy = census_df(us_unhealthy)

# 4. Persist the geocoded frame.
# NOTE(review): this writes under data/, while the single-call alternative for
# this pull writes under data/twitter_mongo/ -- confirm which is intended.
census_unhealthy.to_csv("data/census_unhealthy1.csv")

OR

In [None]:
# Single-call alternative to the step-by-step cell: pull, prep, geocode and save.
census_unhealthy = pull_to_csv('unhealthy',"data/twitter_mongo/census_unhealthy1.csv")

### UNHEALTHY FOODS Second Pull

In [26]:
# Step-by-step run for the 'unhealthy2' collection (same stages as pull_to_csv).
# 1. Load the documents that carry geo data.
unhealthy_df2 = pull_geodata('unhealthy2')

# 2. Keep US tweets; add lat/long, URL-free text and sentiment.
us_unhealthy2 = prep_twitter_df(unhealthy_df2)

# 3. Geocode every row through get_census (one API request per tweet).
census_unhealthy2 = census_df(us_unhealthy2)

# 4. Persist the geocoded frame.
# NOTE(review): this writes under data/, while the single-call alternative for
# this pull writes under data/twitter_mongo/ -- confirm which is intended.
census_unhealthy2.to_csv("data/census_unhealthy2.csv")

OR

In [None]:
# Single-call alternative to the step-by-step cell: pull, prep, geocode and save.
census_unhealthy2 = pull_to_csv('unhealthy2',"data/twitter_mongo/census_unhealthy2.csv")

### GROCERY STORES

In [43]:
# Step-by-step run for the 'grocery_stores' collection (same stages as pull_to_csv).
# 1. Load the documents that carry geo data.
grocery_df = pull_geodata('grocery_stores')

# 2. Keep US tweets; add lat/long, URL-free text and sentiment.
us_grocery = prep_twitter_df(grocery_df)

# 3. Geocode every row through get_census (one API request per tweet).
census_grocery = census_df(us_grocery)

# 4. Persist the geocoded frame.
# NOTE(review): this writes under data/, while the single-call alternative for
# this pull writes under data/twitter_mongo/ -- confirm which is intended.
census_grocery.to_csv("data/census_grocery1.csv")

OR

In [None]:
# Single-call alternative to the step-by-step cell: pull, prep, geocode and save.
# The grocery result goes to its own file (census_grocery1.csv) so it cannot
# overwrite the healthy second-pull output.
census_grocery = pull_to_csv('grocery_stores',"data/twitter_mongo/census_grocery1.csv")

### Grocery Second Pull

In [38]:
# Step-by-step run for the 'grocery_stores2' collection (same stages as pull_to_csv).
# 1. Load the documents that carry geo data.
grocery_df2 = pull_geodata('grocery_stores2')

# 2. Keep US tweets; add lat/long, URL-free text and sentiment.
us_grocery2 = prep_twitter_df(grocery_df2)

# 3. Geocode every row through get_census (one API request per tweet).
census_grocery2 = census_df(us_grocery2)

# 4. Persist the geocoded frame.
# NOTE(review): this writes under data/, while the single-call alternative for
# this pull writes under data/twitter_mongo/ -- confirm which is intended.
census_grocery2.to_csv("data/census_grocery2.csv")

OR

In [None]:
# Single-call alternative to the step-by-step cell: pull, prep, geocode and save.
census_grocery2 = pull_to_csv('grocery_stores2',"data/twitter_mongo/census_grocery2.csv")

### FAST FOOD STORES

In [109]:
# Step-by-step run for the 'ff_stores' collection (same stages as pull_to_csv).
# 1. Load the documents that carry geo data.
fastfood_df = pull_geodata('ff_stores')

# 2. Keep US tweets; add lat/long, URL-free text and sentiment.
us_fastfood = prep_twitter_df(fastfood_df)

# 3. Geocode every row through get_census (one API request per tweet).
census_fastfood = census_df(us_fastfood)

# 4. Persist the geocoded frame.
census_fastfood.to_csv("data/twitter_mongo/census_fastfood1.csv")

OR

In [None]:
# Single-call alternative to the step-by-step cell: pull, prep, geocode and save.
# Bound to `census_fastfood`, the same name the step-by-step cell defines, so
# either cell leaves the notebook in the same state.
census_fastfood = pull_to_csv('ff_stores',"data/twitter_mongo/census_fastfood1.csv")

### FAST FOODS Second Pull

In [49]:
# Step-by-step run for the 'ff_stores2' collection (same stages as pull_to_csv).
# 1. Load the documents that carry geo data.
fastfood_df2 = pull_geodata('ff_stores2')

# 2. Keep US tweets; add lat/long, URL-free text and sentiment.
us_fastfood2 = prep_twitter_df(fastfood_df2)

# 3. Geocode every row through get_census (one API request per tweet).
census_fastfood2 = census_df(us_fastfood2)

# 4. Persist the geocoded frame.
census_fastfood2.to_csv("data/twitter_mongo/census_fastfood2.csv")

OR

In [None]:
# Single-call alternative to the step-by-step cell: pull, prep, geocode and save.
census_fastfood2 = pull_to_csv('ff_stores2',"data/twitter_mongo/census_fastfood2.csv")

In [None]:
# NOTE(review): bare name with no definition visible in this notebook; it looks
# like a leftover scratch cell and raises NameError if `test` is undefined.
# Consider deleting this cell.
test