In [91]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from scipy.ndimage.filters import gaussian_filter
import matplotlib.cm as cm
from matplotlib.colors import Normalize
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# import seaborn as sns
%matplotlib inline



In [92]:
# Build the Spark session used by every later cell (or reuse one that already
# exists), registered under the application name "popularity".
spark = (
    SparkSession.builder
    .appName("popularity")
    .getOrCreate()
)

In [93]:
# Load the film-permit CSV, using its first line as the column names.  No
# schema and no inferSchema option are passed to the reader.
# NOTE(review): absolute, user-specific path -- adjust before running elsewhere.
fp = spark.read.option("header", "true").csv('/Users/vvviren/Desktop/Film_Permits.csv')

In [94]:
# Columns removed from the permit frame before any further processing.
cols = [
    'EventType', 'StartDateTime', 'EnteredOn', 'EventAgency',
    'ParkingHeld', 'Borough', 'CommunityBoard(s)', 'PolicePrecinct(s)',
    'Category', 'SubCategoryName', 'Country',
]
fp = fp.drop(*cols)

In [95]:
# Preview the first 20 rows of the trimmed frame, truncating long cell values.
fp.show(n=20, truncate=True)

+-------+--------------------+--------------------+
|EventID|         EndDateTime|          ZipCode(s)|
+-------+--------------------+--------------------+
| 446040|10/20/2018 04:00:...|               10012|
| 446168|10/20/2018 02:00:...|        10034, 10463|
| 186438|10/31/2014 02:00:...|               11378|
| 445255|10/20/2018 06:00:...|               11201|
| 128794|11/17/2013 06:00:...|        10001, 10121|
|  43547|01/10/2012 07:00:...|        11101, 11222|
|  66846|07/27/2012 07:00:...|               11217|
| 104342|06/21/2013 09:00:...|               10036|
| 244863|09/16/2015 09:00:...|               10462|
| 446379|10/20/2018 10:00:...|               10014|
| 446359|10/20/2018 12:00:...|        10036, 10105|
| 203743|02/20/2015 07:00:...|        10014, 11101|
| 446069|10/20/2018 09:00:...|        11203, 11218|
| 445165|10/21/2018 06:00:...|        10001, 10121|
|  82397|01/07/2013 09:00:...|               11219|
| 137539|02/01/2014 06:00:...|               10001|
| 446355|10/

In [96]:
from pyspark.sql.functions import to_timestamp, year

# Replace each permit's end timestamp with just its calendar year: parse
# EndDateTime into a temporary "date" column, take its year, then drop both
# the raw string and the temporary column.
# NOTE(review): the pattern has no AM/PM field and ends with a space --
# confirm it matches the raw EndDateTime values.
fp = (
    fp
    .withColumn("date", to_timestamp("EndDateTime", "MM/dd/yyyy HH:mm:ss "))
    .withColumn("year", year("date"))
    .drop("EndDateTime", "date")
)
fp.show()

+-------+--------------------+----+
|EventID|          ZipCode(s)|year|
+-------+--------------------+----+
| 446040|               10012|2018|
| 446168|        10034, 10463|2018|
| 186438|               11378|2014|
| 445255|               11201|2018|
| 128794|        10001, 10121|2013|
|  43547|        11101, 11222|2012|
|  66846|               11217|2012|
| 104342|               10036|2013|
| 244863|               10462|2015|
| 446379|               10014|2018|
| 446359|        10036, 10105|2018|
| 203743|        10014, 11101|2015|
| 446069|        11203, 11218|2018|
| 445165|        10001, 10121|2018|
|  82397|               11219|2013|
| 137539|               10001|2014|
| 446355|               10036|2018|
| 446372|               11201|2018|
| 220618|               11104|2015|
| 446318|10017, 10022, 101...|2018|
+-------+--------------------+----+
only showing top 20 rows



In [97]:
from pyspark.sql.functions import explode, split

# Turn the ", "-separated zip list into one row per (permit, zip code):
# split the string into an array, explode it into the new "zips" column,
# and discard the original list column.
fp = (
    fp
    .withColumn('zips', explode(split('ZipCode(s)', ', ')))
    .drop('ZipCode(s)')
)

fp.show()

+-------+----+-----+
|EventID|year| zips|
+-------+----+-----+
| 446040|2018|10012|
| 446168|2018|10034|
| 446168|2018|10463|
| 186438|2014|11378|
| 445255|2018|11201|
| 128794|2013|10001|
| 128794|2013|10121|
|  43547|2012|11101|
|  43547|2012|11222|
|  66846|2012|11217|
| 104342|2013|10036|
| 244863|2015|10462|
| 446379|2018|10014|
| 446359|2018|10036|
| 446359|2018|10105|
| 203743|2015|10014|
| 203743|2015|11101|
| 446069|2018|11203|
| 446069|2018|11218|
| 445165|2018|10001|
+-------+----+-----+
only showing top 20 rows



In [101]:
import pyspark.sql.functions as func


def count_permits_by_zip(permits, permit_year):
    """Count the rows of ``permits`` that fall in one year, per zip code.

    Args:
        permits: DataFrame with ``year`` and ``zips`` columns.
        permit_year: Year to keep; rows are selected where ``year`` equals it.

    Returns:
        DataFrame with the columns ``zips`` and ``permits_<permit_year>``,
        sorted by ``zips``.
    """
    return (
        permits
        .filter(func.col("year") == permit_year)
        .groupby("zips")
        .agg(func.count(func.lit(1)).alias("permits_{}".format(permit_year)))
        .sort("zips")
    )


# Each year keeps its own variable name because later cells refer to these
# frames individually.
fp_2012_fp = count_permits_by_zip(fp, 2012)
fp_2013_fp = count_permits_by_zip(fp, 2013)
fp_2014_fp = count_permits_by_zip(fp, 2014)
fp_2015_fp = count_permits_by_zip(fp, 2015)
fp_2016_fp = count_permits_by_zip(fp, 2016)
fp_2017_fp = count_permits_by_zip(fp, 2017)
fp_2018_fp = count_permits_by_zip(fp, 2018)
fp_2019_fp = count_permits_by_zip(fp, 2019)

# Show the per-zip counts, one table per year, in chronological order.
for yearly_counts in (
    fp_2012_fp, fp_2013_fp, fp_2014_fp, fp_2015_fp,
    fp_2016_fp, fp_2017_fp, fp_2018_fp, fp_2019_fp,
):
    yearly_counts.show()

+-----+------------+
| zips|permits_2012|
+-----+------------+
|    0|           6|
|00083|          34|
|10001|         297|
|10002|         244|
|10003|         270|
|10004|          98|
|10005|          98|
|10006|          20|
|10007|         135|
|10009|          69|
|10010|         172|
|10011|         288|
|10012|         235|
|10013|         359|
|10014|         273|
|10016|         115|
|10017|         157|
|10018|         135|
|10019|         545|
|10020|         131|
+-----+------------+
only showing top 20 rows

+-----+------------+
| zips|permits_2013|
+-----+------------+
|    0|           8|
|00083|          29|
|10001|         398|
|10002|         278|
|10003|         284|
|10004|          79|
|10005|          96|
|10006|          35|
|10007|         157|
|10009|         101|
|10010|         172|
|10011|         365|
|10012|         231|
|10013|         441|
|10014|         322|
|10016|         139|
|10017|         185|
|10018|         146|
|10019|         617|
|10020| 

In [102]:
# The 15 zip codes with the most permit rows across all years, as a
# single-column frame.  The sort column is reached through `func` (already
# imported for the aggregation) so the cell does not depend on a separate
# `from pyspark.sql.functions import col`.
high_zips = (
    fp.groupby("zips")
    .agg(func.count(func.lit(1)).alias("permits"))
    .sort(func.col("permits").desc())
    .select("zips")
    .limit(15)
)
high_zips.show()

+-----+
| zips|
+-----+
|11222|
|11101|
|10036|
|10019|
|10001|
|10013|
|10011|
|11201|
|10003|
|10023|
|10014|
|10012|
|11217|
|10002|
|11211|
+-----+



In [105]:
# For every year, keep only the zip codes listed in `high_zips` and show that
# year's permit counts ordered by zip code.
# NOTE(review): both sides of the join derive from `fp`; joining on the column
# name may be more robust than comparing the two `zips` columns -- confirm.
for yearly in (
    fp_2012_fp, fp_2013_fp, fp_2014_fp, fp_2015_fp,
    fp_2016_fp, fp_2017_fp, fp_2018_fp, fp_2019_fp,
):
    matched = yearly.join(high_zips, yearly["zips"]==high_zips["zips"])
    matched.drop(high_zips.zips).sort("zips").show()

+------------+-----+
|permits_2012| zips|
+------------+-----+
|         297|10001|
|         244|10002|
|         270|10003|
|         288|10011|
|         235|10012|
|         359|10013|
|         273|10014|
|         545|10019|
|         322|10023|
|         455|10036|
|         641|11101|
|         301|11201|
|         242|11211|
|         143|11217|
|         686|11222|
+------------+-----+

+------------+-----+
|permits_2013| zips|
+------------+-----+
|         398|10001|
|         278|10002|
|         284|10003|
|         365|10011|
|         231|10012|
|         441|10013|
|         322|10014|
|         617|10019|
|         326|10023|
|         615|10036|
|         761|11101|
|         389|11201|
|         223|11211|
|         228|11217|
|         747|11222|
+------------+-----+

+------------+-----+
|permits_2014| zips|
+------------+-----+
|         442|10001|
|         227|10002|
|         340|10003|
|         407|10011|
|         253|10012|
|         340|10013|
|         2