In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from scipy.ndimage.filters import gaussian_filter
import matplotlib.cm as cm
from matplotlib.colors import Normalize
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# import seaborn as sns
%matplotlib inline

In [21]:
spark = SparkSession.builder\
    .appName("demographics")\
    .getOrCreate()

In [22]:
demog = spark.read.csv('/Users/vvviren/Desktop/Demographic_Statistics_By_Zip_Code.csv',header='true')

demog = demog.filter(demog['count participants'] != '0')

cols=[]
for c in demog.columns:
    if 'COUNT' in c:
        cols.append(c)

cols.remove('COUNT PARTICIPANTS')
cols

demog = demog.drop(*cols)


In [23]:
fp = spark.read.csv('/Users/vvviren/Desktop/Film_Permits.csv',header='true')

cols = ['EventId', 'EventAgency', 'CommunityBoard', 'PolicePrecint', 'Country']
fp.drop(*cols)

DataFrame[EventType: string, StartDateTime: string, EndDateTime: string, EnteredOn: string, ParkingHeld: string, Borough: string, CommunityBoard(s): string, PolicePrecinct(s): string, Category: string, SubCategoryName: string, ZipCode(s): string]

In [24]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import split


fp=fp.withColumn('tmp', split('ZipCode(s)', ', '))
                                               
fp=fp.withColumn('zips', explode('tmp'))

In [25]:
left_join = fp.join(demog, fp['zips'] == demog['JURISDICTION NAME'],how='left')


In [26]:
from pyspark.sql.functions import col
import pyspark.sql.functions as F

cntfp = left_join.groupby(left_join.zips).count()
sortcntfp = cntfp.sort(col('count').desc())
sortcntfp = sortcntfp.filter(col('count')>1100)
sortcntfp.show()

+-----+-----+
| zips|count|
+-----+-----+
|11222| 7271|
|11101| 5694|
|10036| 4174|
|10019| 3969|
|10001| 3165|
|10013| 2998|
|10011| 2765|
|11201| 2516|
|10003| 2459|
|10023| 2401|
|10014| 2017|
|10012| 1976|
|11217| 1922|
|10002| 1908|
|11211| 1762|
|11106| 1581|
|11249| 1566|
|10022| 1352|
|10010| 1246|
|11237| 1188|
+-----+-----+
only showing top 20 rows



In [27]:
from pyspark.sql.functions import col

filterfp = left_join.join(sortcntfp, left_join['zips']==sortcntfp['zips']).drop(sortcntfp.zips)
df = sortcntfp.join(demog, sortcntfp['zips'] == demog['JURISDICTION NAME'],how='left')

In [28]:
from pyspark.sql.types import DoubleType
df = df.withColumn("percent_us_citizen", df["PERCENT US CITIZEN"].cast(DoubleType()))
df = df.withColumn("percent_other_citizen", df["PERCENT OTHER CITIZEN STATUS"].cast(DoubleType()))
df = df.withColumn("percent_unknown_citizen", df["PERCENT CITIZEN STATUS UNKNOWN"].cast(DoubleType()))
df = df.withColumn("percent_hispanic_latino", df["PERCENT HISPANIC LATINO"].cast(DoubleType()))
df = df.withColumn("percent_american_indian", df["PERCENT AMERICAN INDIAN"].cast(DoubleType()))
df = df.withColumn("percent_male", df["PERCENT MALE"].cast(DoubleType()))
df = df.withColumn("percent_female", df["PERCENT FEMALE"].cast(DoubleType()))
df = df.withColumn("percent_asian_non_hispanic", df['PERCENT ASIAN NON HISPANIC'].cast(DoubleType()))
df = df.withColumn("percent_white_non_hispanic", df['PERCENT WHITE NON HISPANIC'].cast(DoubleType()))
df = df.withColumn("percent_black_non_hispanic", df['PERCENT BLACK NON HISPANIC'].cast(DoubleType()))
df = df.withColumn("percent_ethinicity_unknown", df['PERCENT ETHNICITY UNKNOWN'].cast(DoubleType()))

In [29]:
df.sort(col('percent_us_citizen').desc()).select('zips','count','percent_us_citizen').show()
df.sort(col('percent_hispanic_latino').desc()).select('zips','count','percent_hispanic_latino').show()
df.sort(col('percent_male').desc()).select('zips','count','percent_male').show()

+-----+-----+------------------+
| zips|count|percent_us_citizen|
+-----+-----+------------------+
|10011| 2765|               1.0|
|11201| 2516|               1.0|
|10036| 4174|               1.0|
|11211| 1762|               1.0|
|10003| 2459|               1.0|
|10023| 2401|               1.0|
|10022| 1352|               1.0|
|10013| 2998|               1.0|
|11101| 5694|               1.0|
|10001| 3165|              0.95|
|10002| 1908|              0.94|
|10018| 1154|              0.67|
|11222| 7271|              null|
|10014| 2017|              null|
|11106| 1581|              null|
|11249| 1566|              null|
|10012| 1976|              null|
|10010| 1246|              null|
|11385| 1162|              null|
|11217| 1922|              null|
+-----+-----+------------------+
only showing top 20 rows

+-----+-----+-----------------------+
| zips|count|percent_hispanic_latino|
+-----+-----+-----------------------+
|10023| 2401|                   0.43|
|10001| 3165|                 

In [30]:
df.stat.corr('count', 'percent_us_citizen')

0.1685760188209232

In [31]:
df.stat.corr('count', 'percent_hispanic_latino')

0.05192380017097818

In [32]:
df.stat.corr('count', 'percent_white_non_hispanic')

-0.11742141292817125

In [33]:
df.stat.corr('count', 'percent_asian_non_hispanic')

0.10755895535994457

In [34]:
df.stat.corr('count', 'percent_black_non_hispanic')

0.1892730768700985