### Import and Clean Data Script

#### Author: Lauren Thomas
#### Created: 01/05/2021
#### Last updated: 27/06/2021

###### File description: This file imports, cleans and pre-processes the data that will be used in the ML models.

In [None]:
import os
import gzip
import glob
import json
import pickle

import geopandas as gp
import pandas as pd
import numpy as np

import osmium as osm
from os import sep
from shapely.geometry import Point,Polygon,MultiPolygon

#### Import data

In [None]:
# Project working directory (the Dropbox thesis folder)
cwd = sep.join(["C:", "Users", "ltswe", "Dropbox", "Oxford", "Thesis"])
# Data lives on a flash drive, addressed by its drive letter
data_dir = "D:"

In [None]:
# Restructure JSONs into JSOLs (where each line = one tweet) for each month-year from Jan 2007 to Dec 2013
# Build the "YYYY-MM" labels in month-major order: all seven years of January first,
# then all seven years of February, and so on (84 labels in total)
ym_list = [f"{year}-{month:02d}" for month in range(1, 13) for year in range(2007, 2014)]

# Create a dictionary that will contain all the JSONs for a given list of month-year pairs

def create_json(ym_list, json_pickle_str):
    '''
    Collect the raw tweet JSONs for each month-year in ym_list and pickle them as one dictionary.

    Inputs:
    ym_list: list of "YYYY-MM" strings; each is used as a folder-name prefix under
        {data_dir}/raw_tweets and as a key of the resulting dictionary
    json_pickle_str: name (without extension) of the pickle file written to {data_dir}/pickle

    Output: nothing is returned; the dictionary {ym: [tweet, tweet, ...]} is only written to disk.
    '''
    all_jsons = dict()
    for ym in ym_list:
        print(ym)
        # List the jsons that fall into that y-m, excluding all outputs that end in 00000.json
        json_list = [j for j in glob.glob(f'{data_dir}{sep}raw_tweets{sep}{ym}*{sep}*.json', recursive=True)
                     if not j.endswith('00000.json')]
        # Gather the tweets (the 'data' entry of every file) for this year-month
        temp_json_list = list()
        for j in json_list:
            # Context manager closes each input file as soon as it has been parsed
            with open(j, encoding='utf-8') as json_file:
                temp_json_list.extend(json.load(json_file)['data'])
        # Add the tweets to the larger dictionary with the key as the year-month
        all_jsons[ym] = temp_json_list
    # Pickle the dictionary; closing the file guarantees the pickle is fully flushed to disk
    with open(f"{data_dir}{sep}pickle{sep}{json_pickle_str}.pickle", "wb") as tweets_json_pickle:
        pickle.dump(all_jsons, tweets_json_pickle)


In [None]:
# Create JSONs in 12 chunks (one for each month)
# json_01 = create_json(ym_list[0:7], 'json_01')
# json_02 = create_json(ym_list[7:14], 'json_02')
# json_03 = create_json(ym_list[14:21], 'json_03')
# json_04 = create_json(ym_list[21:28], 'json_04')
# json_05 = create_json(ym_list[28:35], 'json_05')
# json_06 = create_json(ym_list[35:42], 'json_06')
# json_07 = create_json(ym_list[42:49], 'json_07')
# json_08 = create_json(ym_list[49:56], 'json_08')
# json_09 = create_json(ym_list[56:63], 'json_09')
# json_10 = create_json(ym_list[63:70], 'json_10')
# json_11 = create_json(ym_list[70:77], 'json_11')
# json_12 = create_json(ym_list[77:84], 'json_12')

In [None]:
def unpickle_json(num):
    '''
    Load and return the pickled tweet dictionary stored in {data_dir}/pickle/json_{num}.pickle.

    Inputs:
    num: the suffix of the pickle file name (e.g. "03" for json_03.pickle)
    '''
    # Context manager closes the file once the object has been loaded
    with open(f"{data_dir}{sep}pickle{sep}json_{num}.pickle", "rb") as json_pickle:
        return pickle.load(json_pickle)

In [None]:
# Load json_03.pickle from the pickle directory; it is indexed below by the March month-year keys
json_03 = unpickle_json("03")

In [None]:
# Tweet fields of interest: author_id, id, lang, public_metrics.like_count, public_metrics.quote_count,
# public_metrics.reply_count, public_metrics.retweet_count, text
# Test keys: March of every year from 2007 to 2013
ym_list_test = [f"{year}-03" for year in range(2007, 2014)]

In [None]:
# Per-tweet columns, filled in lockstep so that position i of every list describes the same tweet.
# The year-month column is named tweet_ym_list (not ym_list) so that it does not overwrite the
# module-level ym_list of month-year labels.
x_list, y_list, geo_list, author_list, tweet_list, lang_list = [], [], [], [], [], []
like_list, quote_list, reply_list, retweet_list, text_list, tweet_ym_list = [], [], [], [], [], []
# Same order as the tuple of values gathered for each tweet below
tweet_columns = (x_list, y_list, geo_list, author_list, tweet_list, lang_list,
                 like_list, quote_list, reply_list, retweet_list, text_list, tweet_ym_list)

# Make lists of important things
for ym in ym_list_test:
    print(ym)
    for j in json_03[ym]:
        # Read every field BEFORE appending anything: a tweet that is missing any key is then
        # skipped as a whole, instead of leaving some lists one element longer than others.
        try:
            geo_id = Point(j["geo"]["coordinates"]["coordinates"])
            metrics = j['public_metrics']
            tweet_values = (geo_id.x, geo_id.y, geo_id, j['author_id'], j['id'], j['lang'],
                            metrics['like_count'], metrics['quote_count'], metrics['reply_count'],
                            metrics['retweet_count'], j['text'], ym)
        except KeyError:
            # No point coordinates or a required field is absent: leave this tweet out
            continue
        for column, value in zip(tweet_columns, tweet_values):
            column.append(value)

# Create dataframe for month
tst_df = pd.DataFrame(
    {'ym': tweet_ym_list,
     'tweet_id': tweet_list,
     'author_id': author_list,
     'lang': lang_list,
     'like_count': like_list,
     'quote_count': quote_list,
     'reply_count': reply_list,
     'retweet_count': retweet_list,
     'text': text_list,
     'x': x_list,
     'y': y_list,
     'geometry': geo_list
    }
)
# Geo-enabled view holding only the coordinates and the point geometry
tst_geodf = gp.GeoDataFrame(tst_df[['x', 'y', 'geometry']])

In [None]:
# NOTE: this rebinds tst_geodf to the 'geometry' column of tst_df, replacing whatever
# tst_geodf referred to before this cell ran
tst_geodf = tst_df['geometry']
# Show the column as the cell output
tst_geodf

In [None]:
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO

# Download and unzip shape files from census tract website
# Helper: fetch one zip archive over HTTP and unpack it on disk
def download_url(url, save_path):
    '''
    Download the zip archive at url and extract all of its members into save_path.

    The archive is held in memory; nothing but the extracted members is written to disk.
    '''
    with urlopen(url) as response:
        archive_bytes = BytesIO(response.read())
    with ZipFile(archive_bytes) as archive:
        archive.extractall(save_path)
    

# Create a function that, when given a list of urls, downloads and unzips each url in the list
# and stacks the resulting shapefiles into a single dataframe
def download_unzip(url_start, url_list, save_path):
    '''
    Download, unzip and read a series of zipped shapefiles, returning them stacked in one dataframe.

    Inputs:
    url_start = first part of the url (that is the same for all the zip files)
    url_list = second part (the part that changes & is the name of the unzipped file)
    save_path = directory the archives are extracted into and the .shp files are read from

    Output: the rows of all shapefiles, in url_list order, with each file's own index kept.
    An empty dataframe is returned when url_list is empty.
    '''
    # Collect the frames and concatenate once: DataFrame.append was removed in pandas 2.0,
    # and appending inside the loop re-copied all accumulated rows on every iteration
    shape_frames = []
    for url in url_list:
        download_url(url_start+url+".zip", save_path)
        shape_frames.append(gp.read_file(f"{save_path}{sep}{url}.shp"))
    if not shape_frames:
        # pd.concat raises on an empty list; keep the old "empty dataframe" result
        return pd.DataFrame()
    return pd.concat(shape_frames)

# Create list of unique urls (tl_2013_##_tract) where ## = 01-78, with some numbers not included
# to_drop = list of #'s not to include
to_drop = [3, 7, 14,43,52,57,58,59,61,62,63,64,65,67,68,70,71,73,74,75,76,77]
url_list = ["tl_2013_" + ("0"+str(i))[-2:] + "_tract" for i in range(1,79) if i not in to_drop]
url_start = "https://www2.census.gov/geo/tiger/TIGER2013/TRACT/"

# Create shapefile dataframe of the census tracts for every code in url_list
shape_df = download_unzip(url_start, url_list,
                          f"{data_dir}{sep}shape_files")

# Keep only the tracts of the five NYC counties.
# County codes repeat from state to state and shape_df holds every downloaded state, so the
# state code (36 = New York) has to be part of the filter.
# County FIPS codes: 005 Bronx, 047 Kings (Brooklyn), 061 New York (Manhattan),
# 081 Queens, 085 Richmond (Staten Island).
# The earlier filter listed 069 instead of 061 and had no state check.
nyc_county_fips = ['005', '047', '061', '081', '085']
nyc_counties = shape_df[(shape_df['STATEFP'] == '36')
                        & shape_df['COUNTYFP'].isin(nyc_county_fips)].reset_index(drop=True)

# Pickle dataframe; the context manager closes (and flushes) the file after writing
with open(f"{data_dir}{sep}pickle{sep}nyc_counties.pickle", "wb") as nyc_counties_pickle:
    pickle.dump(nyc_counties, nyc_counties_pickle)


In [None]:
# Reload the pickled NYC tract dataframe; the context manager closes the file after reading
with open(f"{data_dir}{sep}pickle{sep}nyc_counties.pickle", "rb") as nyc_counties_in:
    nyc_counties = pickle.load(nyc_counties_in)

In [None]:
# Inspect the tract polygons. The original cell ended in an empty subscript ("[]"), which is a
# syntax error; the whole geometry column is shown instead.
nyc_counties.geometry

In [None]:
# Example tweet location used to try out the tract lookup: first tweet stored under '2011-02'.
# all_jsons only exists inside create_json, so the tweets are read back from their pickle.
# NOTE(review): assumes the February month-years were saved as json_02 (ym_list[7:14]) — confirm.
json_02 = unpickle_json("02")
p1 = Point(json_02['2011-02'][0]['geo']['coordinates']['coordinates'])

In [None]:
# Create a function that downloads, unzips all the shape files from census tract website,
# then creates a dataframe from the shape file, which it appends to the others
# then create a function that takes a point from the tweet, runs it through all the rows, finds relevant
# census tract, then returns the relevant census tract
# then assign census tract to tweet & creates field in JSON
# do this for all tweets

In [None]:
# Print the GEOID of every tract whose polygon contains the test point p1
poly_column = nyc_counties.columns.get_loc("geometry")
geoid_column = nyc_counties.columns.get_loc("GEOID")
tract_polygons = nyc_counties.iloc[:, poly_column]
tract_geoids = nyc_counties.iloc[:, geoid_column]
# Walk the two columns side by side instead of indexing row by row
for poly, geoid in zip(tract_polygons, tract_geoids):
    if p1.within(poly):
        print(geoid)

In [None]:
# Index the tweets by their 'id' so that a tweet can be looked up directly
# NOTE(review): temp_json2 is not defined in the visible part of this file — confirm its source
test_dict = {tweet['id']: tweet for tweet in temp_json2}
# test_dict

In [None]:
# Bring in crime and 311 data, which uses the Socrata API in NYC Open Data
# Create a function that uses the Socrata API, which is written in SoQL, a SQL-like language, to query data
from sodapy import Socrata

def socrata_API_df(source_domain, dataset_id, select_string, where_string, limit=1000):
    '''
    Query a Socrata dataset and return the result as a dataframe.

    Inputs:
    source_domain: This tells Socrata the source of the dataset you're querying
    dataset_id: This is the unique id of the dataset
    select_string: This string tells Socrata which variables you are selecting from the dataset
    where_string: This string is equivalent to the "where" command in SQL
    limit = This tells Socrata how many results to query. The default is 1000 b/c Socrata automatically sets it to 1000

    Outputs a dataframe with the queried results (one row per returned record)
    '''
    # The API token is the first line of the key file. strip() removes the trailing newline
    # that readline() keeps, so the token is sent exactly as issued.
    with open(f'{cwd}{sep}tokens{sep}socrata_apikey.txt', 'r') as keyFile:
        token = keyFile.readline().strip()

    client = Socrata(source_domain, token)
    try:
        # Allow up to 50 seconds per request so that large queries are less likely to time out
        client.timeout = 50
        results = client.get(dataset_id, limit = limit, select = select_string, where = where_string)
    finally:
        # Release the underlying HTTP session even if the query fails
        client.close()
    df = pd.DataFrame.from_records(results)
    return df


In [None]:
# Create function to shorten lat/long
# Cut lat/long to `decimals` decimal places (3 decimals is roughly 100 m, 4 decimals roughly 10 m)
def round_lat_long(df, decimals=3):
    '''
    Round the "latitude" and "longitude" columns of df in place.

    Inputs:
    df: dataframe with "latitude" and "longitude" columns whose values can be converted to float
    decimals: number of decimal places to keep. The default of 3 is what this function has
        always applied, even though the earlier comment spoke of 4 decimals.

    Output: nothing is returned; the two columns of df are overwritten with rounded floats.
    '''
    for column in ("latitude", "longitude"):
        df[column] = df[column].apply(lambda x: round(float(x), decimals))


In [None]:
# Pull in 311 and Null Data 
# 2007, 2008, & 2009 are separate; 2010-on are in a single file. 
# The only thing that changes between 2007-09 is the dataset ID, & the id + where string for 2010-on
# so write a function that calls upon the 311 socrata API data
# complaint type string -- separated for ease of understanding. Complaint types drawn from literature
# 311 complaint types to keep (drawn from the literature), in the order they appear in the query
complaint_types = [
    'Noise - Street/Sidewalk',
    'Noise - Residential',
    'Noise - Vehicle',
    'Street Condition',
    'Homeless Encampment',
    'Drinking',
    'Noise',
    'Noise - Park',
    'Noise - House of Worship',
    'HEATING',
    'GENERAL CONSTRUCTION',
    'CONSTRUCTION',
    'Boilers',
    'For Hire Vehicle Complaint',
    'Bike Rack Condition',
    'Illegal Parking',
    'Building/Use',
    'ELECTRIC',
    'PLUMBING',
]
# SoQL condition matching any of the complaint types above
complaint_type_str = " OR ".join(f"complaint_type = '{complaint}'" for complaint in complaint_types)

def pull_311(dataset_id, where_string = f'latitude IS NOT NULL AND ({complaint_type_str})'):
    '''
    Query one NYC Open Data 311 dataset through socrata_API_df and return the resulting dataframe.

    Inputs:
    dataset_id: the Socrata id of the 311 dataset to query
    where_string: the filter; by default, rows with a latitude and one of the selected complaint types
    '''
    # Columns requested from the dataset, including the year and month extracted from created_date
    selected_columns = (
        'unique_key',
        'created_date',
        'complaint_type',
        'date_extract_y(created_date) as year',
        'date_extract_m(created_date) as month',
        'descriptor',
        'latitude',
        'longitude',
    )
    query_args = {
        'source_domain': "data.cityofnewyork.us",
        'dataset_id': dataset_id,
        'select_string': ', '.join(selected_columns),
        'where_string': where_string,
        'limit': 4000000,
    }
    return socrata_API_df(**query_args)

# 2007-2013: one dataset each for 2007, 2008 and 2009, and a single dataset for 2010 onwards
nyc_311_07 = pull_311("aiww-p3af")
nyc_311_08 = pull_311('uzcy-9puk')
nyc_311_09 = pull_311('3rfa-3xsf')
# The 2010-on dataset runs past 2013, so the years are restricted in the where string
nyc_311_10_13 = pull_311('erm2-nwe9', \
                where_string = f'({complaint_type_str}) AND latitude IS NOT NULL AND (year = 2010 OR year = 2011 OR year = 2012 OR year = 2013)')

# Combine all four. A single pd.concat replaces the chained DataFrame.append calls, which were
# removed in pandas 2.0; as before, each frame keeps its own index values.
nyc_311 = pd.concat([nyc_311_07, nyc_311_08, nyc_311_09, nyc_311_10_13])


In [None]:
# Show the distinct complaint types present in the combined 311 data
nyc_311.complaint_type.unique()

In [None]:
# Pickle the combined 311 data; the context manager closes (and flushes) the file after writing
with open(f"{data_dir}{sep}pickle{sep}nyc_311.pickle", "wb") as nyc_311_pickle:
    pickle.dump(nyc_311, nyc_311_pickle)

In [None]:
# Pull in NYC historical crime data (also uses Socrata data)
# Columns to fetch, renamed where useful, plus the year and month extracted from the complaint date
select_string = ('cmplnt_num, cmplnt_fr_dt AS date, date_extract_y(cmplnt_fr_dt) AS year,'
                 'date_extract_m(cmplnt_fr_dt) AS month,  pd_cd AS class, pd_desc, law_cat_cd AS level, crm_atpt_cptd_cd AS completed, latitude, longitude')
# Keep rows that have a latitude and fall in the years 2007 through 2013
crime_year_clause = ' OR '.join(f'year = {crime_year}' for crime_year in range(2007, 2014))
where_string = f'latitude IS NOT NULL AND ({crime_year_clause})'
nyc_crime = socrata_API_df(
    source_domain = "data.cityofnewyork.us",
    dataset_id = 'qgea-i56i',
    select_string = select_string,
    where_string = where_string,
    limit = 4000000,
)


In [None]:
# Pickle the crime data; the context manager closes (and flushes) the file after writing
with open(f"{data_dir}{sep}pickle{sep}nyc_crime.pickle", "wb") as nyc_crime_pickle:
    pickle.dump(nyc_crime, nyc_crime_pickle)

In [None]:
# Bring in HUD vacant addresses data
# List the paths of all the .csv files in the HUD vacant data directory
# (glob.glob already returns a list, so no further filtering is applied)
hud_list = glob.glob(f'{data_dir}{sep}hud_vacant_data{sep}*.csv')

In [None]:
# Read every HUD file, tag its rows with the year and month/quarter taken from the file name,
# and stack everything into one dataframe with a fresh 0..n-1 index
hud_frames = []
for file in hud_list:
    temp_file = pd.read_csv(file, sep = None, engine='python')
    # Slice the file NAME rather than the full path, so the offsets no longer depend on the
    # length of data_dir. With data_dir = "D:" these are the same characters that the old
    # path slices [32:34] and [28:30] picked out.
    # NOTE(review): assumes name characters 9-10 hold the month/quarter and 13-14 the
    # two-digit year — confirm against the actual file names.
    file_name = os.path.basename(file)
    temp_file['year'] = "20" + file_name[13:15]
    temp_file['month'] = file_name[9:11]
    hud_frames.append(temp_file)
# One concat instead of DataFrame.append inside the loop (removed in pandas 2.0 and quadratic)
hud_df = pd.concat(hud_frames, ignore_index=True) if hud_frames else pd.DataFrame()


In [None]:
# Create a FIPS code variable that is equal to the string of geoid
# (note that b/c it's in integers, we need to re-add leading zero for states with fips codes < 10)
# Prepending "0" and keeping the last 11 characters pads 10-digit codes and leaves 11-digit ones as is
hud_df['fips_code'] = hud_df["GEOID"].apply(lambda x: ("0" + str(x))[-11:])

# Create file with only NY (fips codes starting with "36")
ny_hud = hud_df[hud_df['fips_code'].apply(lambda x: x[0:2] == "36")].reset_index(drop=True)

# Pickle ny hud file; the context manager closes (and flushes) the file after writing
with open(f"{data_dir}{sep}pickle{sep}nyc_hud.pickle", "wb") as nyc_hud_pickle:
    pickle.dump(ny_hud, nyc_hud_pickle)

In [None]:
# Unpickle ny hud; the context manager closes the file after reading
with open(f"{data_dir}{sep}pickle{sep}nyc_hud.pickle", "rb") as nyc_hud_in:
    ny_hud = pickle.load(nyc_hud_in)

In [None]:
# Show the NY HUD data as the cell output
ny_hud

In [None]:
# Load the pickled raw tweets dataframe; the context manager closes the file after reading
with open(f'{data_dir}{sep}pickle{sep}raw_tweets_df.pickle', "rb") as raw_tweets_in:
    raw_df = pickle.load(raw_tweets_in)

In [None]:
# List the column names of the loaded object
raw_df.columns