In [1]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import requests
import re
import unicodedata
from unidecode import unidecode
import warnings
warnings.filterwarnings('ignore')
import time
import ast

In [2]:
# Census-tract target table; the CSV's first column is used as the index.
target = pd.read_csv('../data/target.csv', index_col=0)
# Every tract id becomes a string; ids that are exactly 10 characters long
# get a leading "0" so they line up with 11-character tract codes.
target['CensusTract'] = (
    target['CensusTract']
    .apply(str)
    .apply(lambda tract: "0" + tract if len(tract) == 10 else tract)
)

In [37]:
# Preview the first rows of the target table loaded above.
target.head()

Unnamed: 0,CensusTract,LILATracts_1And10
0,1001020100,0
1,1001020200,0
2,1001020300,0
3,1001020400,0
4,1001020500,0


In [3]:
# VADER sentiment scorer used when preparing the tweet frames.
sid = SentimentIntensityAnalyzer()

# Mongo client built with default connection settings; every collection
# used in this notebook is read from / written to the 'capstone' database.
client = MongoClient()
db = client['capstone']

In [44]:
#join census data
def get_census(coords, timeout=30):
    """Look up the census tract code for one coordinate pair via the FCC area API.

    Parameters
    ----------
    coords : sequence
        ``coords[0]`` is sent as ``lat`` and ``coords[1]`` as ``lon``.
    timeout : float, optional
        Seconds to wait for the HTTP request before ``requests`` gives up.
        Without a timeout a single stalled request would block a long batch
        of lookups indefinitely.

    Returns
    -------
    str or None
        ``block_fips`` of the first result with its last four characters
        removed (the tract-level prefix), or ``None`` when the API returns
        no results for the point.
    """
    url='https://geo.fcc.gov/api/census/area?lat={}&lon={}&format=json'.format(coords[0],coords[1])
    res = requests.get(url, timeout=timeout)
    # Decode the body once and reuse it instead of re-parsing for each access.
    results = res.json()['results']
    if not results:
        return None
    return results[0]['block_fips'][:-4]

#resolve every row's lat,long pair to a US census tract through the fcc api
def census_df(df):
    """Add a 'census' column with the tract looked up for each row's 'coords'.

    Rows are processed in positional order, one ``get_census`` call per row.
    After every 1000 lookups the running count is printed and the loop
    sleeps for 30 seconds before continuing. The frame is modified in place
    and the same object is returned.
    """
    tracts = []
    for position in range(df.shape[0]):
        tracts.append(get_census(df['coords'].iloc[position]))
        completed = position + 1
        if completed % 1000 == 0:
            print("Current count: ",completed)
            time.sleep(30)

    df['census']=tracts
    return df


In [101]:
#load the geo-tagged documents of a mongo collection
def pull_geodata(coll_name):
    """Return a DataFrame of the documents in ``db[coll_name]`` whose
    "full_data.geo" field exists and is not null (one row per document)."""
    has_geo = {"full_data.geo" : {"$exists" : True,"$ne" : None}}
    documents = list(db[coll_name].find(has_geo))
    return pd.DataFrame(documents)

#strip non-ascii characters (emojis, accents, ...) and urls
def clean_text(inputString):
    """Return ``inputString`` without non-ASCII characters and without URLs.

    Characters that cannot be encoded as ASCII are dropped first; afterwards
    every run starting with "http" up to the next whitespace is removed.
    Expects a ``str``.
    """
    ascii_only = inputString.encode("ascii", "ignore").decode("ascii")
    return re.sub(r"http\S+", "", ascii_only)

def _pad_tract(value):
    """Return ``value`` as a string, prefixed with "0" when it is exactly 10 characters long."""
    text = str(value)
    return "0" + text if len(text) == 10 else text


#get tweets only from US with geocoords
#split out lat,long,text, and sentiment
def prep_twitter_df(df,category,target=None):
    """Turn raw geo-tagged tweet documents into the per-tweet analysis table.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'full_data' (dict with ``['geo']['coordinates']``
        and a 'place' entry that is either falsy or has 'country_code'),
        'text' and 'keyword'. A 'coords' column is added to this frame in
        place.
    category : object
        Value stored in the 'category' column of every output row.
    target : pandas.DataFrame, optional
        Table with 'CensusTract' and 'LILATracts_1And10' columns. When
        omitted it is read from '../data/target.csv'. A frame passed in is
        copied, never modified.

    Returns
    -------
    pandas.DataFrame
        Columns: keyword, text, lat, long, census, comp, category,
        LILATracts_1And10, county. Only tweets whose place country code is
        "US", that fall inside the lat/long bounding box, that resolved to a
        census tract, and whose tract appears in ``target`` are kept.
    """
    if target is None:
        target = pd.read_csv('../data/target.csv')
    else:
        target = target.copy()
    target['CensusTract'] = target['CensusTract'].apply(_pad_tract)

    df['coords'] = df['full_data'].apply(lambda x: x['geo']['coordinates'])

    # dtype=object keeps the comparison below element-wise even when the
    # list is empty or contains only None.
    code = np.array(
        [tweet['place']['country_code'] if tweet.get('place') else None
         for tweet in df['full_data']],
        dtype=object,
    )
    # Work on a real copy so the column assignments below never write into
    # a view of the caller's frame.
    us_df = df[code == "US"].copy()

    us_df['lat'] = us_df['coords'].apply(lambda x: x[0])
    us_df['long'] = us_df['coords'].apply(lambda x: x[1])

    us_df['text'] = us_df['text'].apply(lambda x: re.sub(r"http\S+", "", x))

    us_df['sentiment'] = us_df['text'].apply(sid.polarity_scores)

    # Parse any score that arrives as a string. This is checked per row:
    # the previous `us_df['sentiment'][0]` was a label lookup and raised
    # KeyError whenever the row labelled 0 had been filtered out.
    us_df['sentiment'] = us_df['sentiment'].apply(
        lambda score: ast.literal_eval(score) if isinstance(score, str) else score
    )

    in_bounds = (
        (us_df['long'] > -161) & (us_df['long'] < -68)
        & (us_df['lat'] > 20) & (us_df['lat'] < 64)
    )
    us_df = us_df[in_bounds].copy()

    us_df = census_df(us_df)

    # Failed lookups (None) become NaN here so dropna can remove them.
    # Previously they were stringified to "None" and float("None") raised.
    us_df['census'] = pd.to_numeric(us_df['census'], errors='coerce')

    # NOTE(review): this drops rows with a missing value in ANY column, not
    # only 'census' - kept as in the original; confirm that is intended.
    us_df = us_df.dropna(axis=0)

    # Round-trip through int to normalise the code, then restore the
    # leading zero for 10-digit values.
    us_df['census'] = us_df['census'].astype('int64').astype(str).apply(_pad_tract)

    us_df['comp'] = us_df['sentiment'].apply(lambda x: x['compound'])

    us_df['category'] = category

    us_df = pd.merge(us_df,target,how='inner',left_on="census",right_on="CensusTract")

    # County id = first five characters of the tract code.
    us_df['county'] = us_df['census'].apply(lambda x: x[:5])

    us_df = us_df[['keyword','text','lat','long','census','comp','category','LILATracts_1And10','county']]

    return us_df

In [102]:
#one-call pipeline: mongo collection -> prepared frame -> csv on disk
def pull_to_csv(coll_name,path_for_csv,category):
    """Pull the geo-tagged documents of ``coll_name``, prepare them with
    ``prep_twitter_df`` under the given ``category``, write the result to
    ``path_for_csv`` and return the prepared frame."""
    prepared = prep_twitter_df(pull_geodata(coll_name), category)
    prepared.to_csv(path_for_csv)
    return prepared

#load a cleaned csv from disk and insert its rows into a mongo collection
def clean_to_mongo(coll_name, path_to_csv):
    """Insert every row of the CSV at ``path_to_csv`` into ``db[coll_name]``.

    The CSV's first column is read as the index and is not stored; each
    remaining row becomes one document keyed by column name.
    """
    destination = db[coll_name]
    records = pd.read_csv(path_to_csv, index_col=0).to_dict(orient='records')
    destination.insert_many(records)
    
#dump an entire mongo collection to a csv file
def mongo_to_csv(coll_name,csv_path):
    """Read every document of ``db[coll_name]`` into a DataFrame and write
    that frame to ``csv_path``."""
    documents = list(db[coll_name].find())
    pd.DataFrame(documents).to_csv(csv_path)

### HEALTHY FOODS

In [None]:
# Pull 'healthy_clean', prepare it under the "healthy" category, and save the CSV.
healthy = pull_to_csv('healthy_clean',"../data/twitter_mongo/healthy_final.csv","healthy")

### UNHEALTHY FOODS

In [None]:
# Pull 'unhealthy_clean', prepare it under the "unhealthy" category, and save the CSV.
unhealthy = pull_to_csv('unhealthy_clean',"../data/twitter_mongo/unhealthy_final.csv","unhealthy")

### GROCERY STORES

In [None]:
# pull_to_csv requires a category; the call previously omitted it and raised TypeError.
# NOTE(review): "grocery" is assumed from the variable name — confirm the intended label.
grocery = pull_to_csv('grocery_stores_clean',"../data/twitter_mongo/grocery_stores_final.csv","grocery")

### FAST FOOD STORES

In [None]:
# pull_to_csv requires a category; the call previously omitted it and raised TypeError.
# NOTE(review): "fast_food" is assumed from the variable name — confirm the intended label.
fast_food = pull_to_csv('ff_stores_clean',"../data/twitter_mongo/ff_stores_final.csv","fast_food")

### Healthy cleaned data to mongo
Example:

In [None]:
coll=db['healthy_clean']

# Insert the cleaned CSV rows into the collection.
# Re-running this cell inserts the same rows again.
clean_to_mongo('healthy_clean',"../data/census_healthy2.csv")

# Print the document count explicitly: a bare expression in the middle of a
# cell is never displayed. count_documents({}) replaces Collection.count(),
# which no longer exists in current PyMongo releases.
print(coll.count_documents({}))

#combined csv with all data from mongo
mongo_to_csv('healthy_clean',"../data/healthy_clean.csv")