In [1]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import requests
import re
import unicodedata
from unidecode import unidecode
import warnings
warnings.filterwarnings('ignore')
import time
import ast
from model_prep import prep_data

In [None]:
# Shared notebook state used by the helper functions defined below.
# MongoClient() is called with no arguments, i.e. pymongo's default connection settings.
client = MongoClient()
# Database whose collections pull_geodata() reads from.
db = client['capstone']
# Single VADER analyzer instance reused by prep_twitter_df() to score tweet text.
sid = SentimentIntensityAnalyzer()   

In [2]:
def pull_geodata(coll_name):
    """Load the geotagged documents of one MongoDB collection into a DataFrame.

    Parameters
    ----------
    coll_name : str
        Name of the collection to read from the module-level ``db``.

    Returns
    -------
    pandas.DataFrame
        One row per document whose ``full_data.geo`` field exists and is
        not null.
    """
    # Only keep documents that actually carry a geo payload.
    geotagged = {"full_data.geo" : {"$exists" : True,"$ne" : None}}
    cursor = db[coll_name].find(geotagged)
    return pd.DataFrame(list(cursor))

def clean_text(inputString):
    """Drop non-ASCII characters from ``inputString`` and strip URLs.

    Each element of ``inputString`` is kept only if it can be encoded as
    ASCII; afterwards every ``http...`` run up to the next whitespace is
    removed.

    Parameters
    ----------
    inputString : str
        Text to clean.

    Returns
    -------
    str
        The ASCII-only text with URLs removed.
    """
    def _encodes_as_ascii(char):
        # A character is kept exactly when encoding it to ASCII succeeds.
        try:
            char.encode("ascii")
        except UnicodeEncodeError:
            return False
        return True

    ascii_only = "".join(filter(_encodes_as_ascii, inputString))
    return re.sub(r"http\S+", "", ascii_only)

def prep_twitter_df(df, analyzer=None):
    """Keep US tweets and add coordinate, cleaned-text and sentiment columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``full_data`` column of dicts exposing
        ``['geo']['coordinates']`` and ``['place']`` (either a dict with a
        ``country_code`` key or a falsy value), plus a ``text`` column of
        strings. The frame is not modified.
    analyzer : object, optional
        Object with a ``polarity_scores(text)`` method. Defaults to the
        module-level ``sid`` analyzer.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows whose place country code is
        ``"US"`` (original index labels kept), with these columns added or
        overwritten:

        * ``coords``    -- ``full_data['geo']['coordinates']``
        * ``lat``       -- ``coords[0]``
        * ``long``      -- ``coords[1]``
        * ``text``      -- original text with ``http...`` URLs removed
        * ``sentiment`` -- value returned by ``analyzer.polarity_scores``
    """
    if analyzer is None:
        analyzer = sid

    # Explicit bool dtype keeps the mask valid even for an empty frame.
    is_us = np.array(
        [
            bool(tweet['place']) and tweet['place']['country_code'] == "US"
            for tweet in df['full_data']
        ],
        dtype=bool,
    )

    # Work on an independent copy: assigning columns to a boolean-mask slice
    # is chained assignment, and the caller's frame should stay untouched.
    us_df = df[is_us].copy()

    us_df['coords'] = us_df['full_data'].apply(lambda tweet: tweet['geo']['coordinates'])
    us_df['lat'] = us_df['coords'].apply(lambda pair: pair[0])
    us_df['long'] = us_df['coords'].apply(lambda pair: pair[1])

    us_df['text'] = us_df['text'].apply(lambda text: re.sub(r"http\S+", "", text))

    # polarity_scores already yields a dict, so it is stored as-is; passing it
    # through ast.literal_eval (which only accepts strings/AST nodes) would
    # raise ValueError.
    us_df['sentiment'] = us_df['text'].apply(lambda text: analyzer.polarity_scores(text))
    return us_df
    

In [175]:
#join census data
def get_census(coords, timeout=30):
    """Look up the census area code for one coordinate pair via the FCC API.

    Parameters
    ----------
    coords : sequence
        Latitude at index 0 and longitude at index 1.
    timeout : float, optional
        Seconds to wait for the FCC service before giving up, so one stalled
        connection cannot hang a long batch of lookups.

    Returns
    -------
    str or None
        The first result's ``block_fips`` with its last four characters
        removed (presumably leaving the tract-level code), or ``None`` when
        the service returns no results.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a non-success HTTP status.
    """
    res = requests.get(
        'https://geo.fcc.gov/api/census/area',
        params={'lat': coords[0], 'lon': coords[1], 'format': 'json'},
        timeout=timeout,
    )
    # Surface HTTP failures directly instead of as a JSON/KeyError later on.
    res.raise_for_status()

    # Parse the body once; a missing or empty result list means "no match".
    results = res.json().get('results')
    if not results:
        return None
    return results[0]['block_fips'][:-4]

def census_df(df, batch_size=1000, pause_seconds=30):
    """Add a ``census`` column by looking up every row's coordinates.

    ``get_census`` is called once per value of ``df['coords']``, in row
    order. After every ``batch_size`` lookups the running count is printed
    and, if rows remain, execution pauses for ``pause_seconds`` (presumably
    to avoid hammering the remote service).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``coords`` column. It is modified in place.
    batch_size : int, optional
        Number of lookups between progress messages / pauses.
    pause_seconds : float, optional
        Length of each pause in seconds.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``census`` column.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    total = df.shape[0]
    census = []
    for done, coords in enumerate(df['coords'], start=1):
        census.append(get_census(coords))
        if done % batch_size == 0:
            print("Current count: ", done)
            # No point waiting once the last row has been processed.
            if done < total:
                time.sleep(pause_seconds)

    df['census'] = census
    return df


### HEALTHY FOODS

In [140]:
healthy_df = pull_geodata('healthy')

In [146]:
healthy_df.head()

Unnamed: 0,_id,full_data,keyword,sentiment,text
0,5c9cde8a626b30bd695ddf36,{'created_at': 'Thu Mar 28 13:07:03 +0000 2019...,almond,"{'neg': 0.0, 'neu': 0.678, 'pos': 0.322, 'comp...","W E D D I N G A L M O N D 🥂\nPink, bright, so..."
1,5c9cde8a626b30bd695ddf3b,{'created_at': 'Thu Mar 28 13:04:25 +0000 2019...,almond,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",@NutellaUSA #almond #pearl #waffles @ Inn At T...
2,5c9cde8a626b30bd695ddf42,{'created_at': 'Thu Mar 28 12:59:58 +0000 2019...,almond,"{'neg': 0.0, 'neu': 0.791, 'pos': 0.209, 'comp...",Good Spring Morning to all the Beautiful Souls...
3,5c9cde8a626b30bd695ddf6c,{'created_at': 'Thu Mar 28 12:34:53 +0000 2019...,almond,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",So good!!!!!\nTwo scoops Level-1 vanilla ice c...
4,5c9cde8b626b30bd695ddfd5,{'created_at': 'Thu Mar 28 11:44:10 +0000 2019...,almond,"{'neg': 0.0, 'neu': 0.763, 'pos': 0.237, 'comp...",Hello beautiful soul. 💕 Thursday reminder...yo...


In [None]:
us_healthy = prep_twitter_df(healthy_df)

In [148]:
us_healthy.shape

(3682, 8)

In [121]:
census=[]

In [128]:
# NOTE(review): this loop only looks up rows from position 3000 onward, yet the
# recorded length of `census` afterwards is 3682 -- the full row count of
# us_healthy. The list therefore presumably already held results from earlier
# runs of this cell with other slices; on a fresh Restart & Run All it would
# end up with only the rows past position 3000.
for coords in us_healthy['coords'][3000:].values:
    census.append(get_census(coords))

In [129]:
len(census)

3682

In [None]:
census_healthy = census_df(us_healthy)

In [131]:
# NOTE(review): this binds census_healthy to the very same object as us_healthy
# (an alias, not a copy). The 9-column shape and the `census` column shown in the
# outputs that follow suggest the manually built `census` list was attached to
# us_healthy in a cell that is not part of this notebook -- TODO confirm.
census_healthy = us_healthy

In [143]:
census_healthy.shape

(3682, 9)

In [133]:
census_healthy.head()

Unnamed: 0,_id,full_data,keyword,sentiment,text,coords,lat,long,census
0,5c9cde8a626b30bd695ddf36,{'created_at': 'Thu Mar 28 13:07:03 +0000 2019...,almond,"{'neg': 0.0, 'neu': 0.655, 'pos': 0.345, 'comp...","W E D D I N G A L M O N D \nPink, bright, sop...","[40.7537, -73.97254]",40.7537,-73.97254,36061009200
1,5c9cde8a626b30bd695ddf3b,{'created_at': 'Thu Mar 28 13:04:25 +0000 2019...,almond,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",@NutellaUSA #almond #pearl #waffles @ Inn At T...,"[44.14179362, -71.17940884]",44.141794,-71.179409,33003955100
2,5c9cde8a626b30bd695ddf42,{'created_at': 'Thu Mar 28 12:59:58 +0000 2019...,almond,"{'neg': 0.0, 'neu': 0.785, 'pos': 0.215, 'comp...",Good Spring Morning to all the Beautiful Souls...,"[42.1396065, -87.9076157]",42.139606,-87.907616,17031802506
3,5c9cde8a626b30bd695ddf6c,{'created_at': 'Thu Mar 28 12:34:53 +0000 2019...,almond,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",So good!!!!!\nTwo scoops Level-1 vanilla ice c...,"[34.924, -81.0282]",34.924,-81.0282,45091060501
4,5c9cde8b626b30bd695ddfd5,{'created_at': 'Thu Mar 28 11:44:10 +0000 2019...,almond,"{'neg': 0.0, 'neu': 0.755, 'pos': 0.245, 'comp...",Hello beautiful soul. Thursday reminder...you...,"[39.9008, -74.8239]",39.9008,-74.8239,34005703802


In [160]:
census_healthy.to_csv("data/census_healthy1.csv")

### UNHEALTHY FOODS

In [149]:
unhealthy_df = pull_geodata('unhealthy')

In [150]:
unhealthy_df.shape

(7299, 5)

In [151]:
us_unhealthy = prep_twitter_df(unhealthy_df)

In [152]:
us_unhealthy.shape

(4540, 8)

In [157]:
census_unhealthy = census_df(us_unhealthy)

Current count:  1000
Current count:  2000
Current count:  3000
Current count:  4000


In [159]:
census_unhealthy.head()

Unnamed: 0,_id,full_data,keyword,sentiment,text,coords,lat,long,census
2,5c9d4874aaedff02ea18ce65,{'created_at': 'Thu Mar 28 22:03:04 +0000 2019...,bacon,"{'neg': 0.063, 'neu': 0.695, 'pos': 0.242, 'co...",This evening's special nocterrabrewing is the ...,"[40.15962, -83.07848]",40.15962,-83.07848,39041011423
3,5c9d4874aaedff02ea18ce77,{'created_at': 'Thu Mar 28 22:01:26 +0000 2019...,bacon,"{'neg': 0.101, 'neu': 0.791, 'pos': 0.108, 'co...","8 ounce filet mignon, topped with a bacon, gre...","[47.92657866, -97.03240413]",47.926579,-97.032404,38035010100
4,5c9d4874aaedff02ea18ce99,{'created_at': 'Thu Mar 28 21:57:26 +0000 2019...,bacon,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",HAMBURGUESA A LA DIABLA ! ! ! Flat iron steak ...,"[33.63696256, -117.60853489]",33.636963,-117.608535,6059032053
5,5c9d4874aaedff02ea18ceaa,{'created_at': 'Thu Mar 28 21:54:15 +0000 2019...,bacon,"{'neg': 0.0, 'neu': 0.77, 'pos': 0.23, 'compou...",Well...no pancakes and bacon!\nGuess the @pink...,"[33.7680875, -118.1751669]",33.768088,-118.175167,6037576601
6,5c9d4876aaedff02ea18cf9c,{'created_at': 'Thu Mar 28 21:15:42 +0000 2019...,bacon,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",New in my Repertoire!!!!\nCherrywod smoked se...,"[29.50307025, -98.68740387]",29.50307,-98.687404,48029181715


In [162]:
census_unhealthy.to_csv("data/census_unhealthy1.csv")

### GROCERY STORES

In [105]:
grocery_df = pull_geodata('grocery_stores')

In [106]:
grocery_df.shape

(447, 5)

In [107]:
us_grocery = prep_twitter_df(grocery_df)

In [108]:
us_grocery.shape

(417, 8)

In [167]:
census_grocery = census_df(us_grocery)

In [168]:
census_grocery.head()

Unnamed: 0,_id,full_data,keyword,sentiment,text,coords,lat,long,census
0,5c9bb3aaaaedff5572f9e6e1,{'created_at': 'Wed Mar 27 15:00:08 +0000 2019...,safeway,"{'neg': 0.0, 'neu': 0.903, 'pos': 0.097, 'comp...","See our latest #Rocklin, CA #branchbanking job...","[38.7907339, -121.2357828]",38.790734,-121.235783,6061021103
1,5c9bb3aaaaedff5572f9e703,{'created_at': 'Wed Mar 27 07:50:41 +0000 2019...,safeway,"{'neg': 0.732, 'neu': 0.268, 'pos': 0.0, 'comp...",Fuck no! Just stop M&amp;Ms. @ Safeway,"[37.76910525, -122.42826619]",37.769105,-122.428266,6075016900
2,5c9bb3abaaedff5572f9e7a4,{'created_at': 'Tue Mar 26 23:19:28 +0000 2019...,safeway,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","I'm at @Safeway in San Francisco, CA","[37.7746023, -122.4652111]",37.774602,-122.465211,6075045200
4,5c9bb3acaaedff5572f9e7d8,{'created_at': 'Tue Mar 26 20:10:22 +0000 2019...,safeway,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","I'm at @Safeway in Falls Church, VA","[38.8535005, -77.1287346]",38.853501,-77.128735,51059451502
5,5c9bb3acaaedff5572f9e7d9,{'created_at': 'Tue Mar 26 20:10:02 +0000 2019...,safeway,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",I'm at Starbucks (inside safeway) in Falls Chu...,"[38.853191, -77.129397]",38.853191,-77.129397,51059451502


In [169]:
census_grocery.to_csv("data/census_grocery1.csv")

### FAST FOOD STORES

In [109]:
fastfood_df = pull_geodata('ff_stores')

In [110]:
fastfood_df.shape

(8067, 5)

In [111]:
us_fastfood = prep_twitter_df(fastfood_df)

In [112]:
us_fastfood.shape

(4334, 8)

In [171]:
census_fastfood = census_df(us_fastfood)

Current count:  1000
Current count:  2000
Current count:  3000
Current count:  4000


In [172]:
census_fastfood.head()

Unnamed: 0,_id,full_data,keyword,sentiment,text,coords,lat,long,census
19,5c9b30feaaedff5572f7d255,{'created_at': 'Wed Mar 27 03:00:50 +0000 2019...,mcdonald's,"{'neg': 0.193, 'neu': 0.807, 'pos': 0.0, 'comp...",Waiting on our dinner. At least it's not McDo...,"[33.877187, -118.21795]",33.877187,-118.21795,6037543202
24,5c9b3103aaedff5572f7d389,{'created_at': 'Wed Mar 27 01:19:47 +0000 2019...,mcdonald's,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","I'm at McDonald's in Chicago, IL","[41.8835986, -87.6258091]",41.883599,-87.625809,17031320100
27,5c9b3104aaedff5572f7d3e3,{'created_at': 'Wed Mar 27 00:46:29 +0000 2019...,mcdonald's,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",#Chichan #Tone 2 #mood\n#Valle #Poser #Fake #d...,"[31.70018254, -106.33942634]",31.700183,-106.339426,48141003901
36,5c9b3113aaedff5572f7d8a0,{'created_at': 'Tue Mar 26 19:18:41 +0000 2019...,mcdonald's,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",Here for a late lunch! (@ McDonald's in Chicag...,"[42.00715077, -87.69046558]",42.007151,-87.690466,17031020302
38,5c9b3114aaedff5572f7d936,{'created_at': 'Tue Mar 26 18:38:09 +0000 2019...,mcdonald's,"{'neg': 0.237, 'neu': 0.763, 'pos': 0.0, 'comp...","McDonald's, it ain't .. #cigar #botl @eighty...","[36.0363, -95.7836]",36.0363,-95.7836,40143007503


In [173]:
census_fastfood.to_csv("data/census_fastfood1.csv")

In [None]:
df_train,y_train,df_test,y_test= prep_data()