In [37]:
# Tear down a Spark session/context left over from a previous run of this notebook.
# This cell sits before the cell that creates `spark` and `sc`, so on a fresh kernel
# neither name is bound yet; stop only what actually exists instead of raising NameError.
for _leftover in ("spark", "sc"):
    if _leftover in globals():
        globals()[_leftover].stop()

In [1]:
# findspark.init() has to run before the pyspark imports in the next cell:
# it locates the Spark installation and makes the pyspark package importable.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
import sys
import string
from csv import reader
from functools import reduce
from pyspark.sql import functions as f
from collections import defaultdict
import datetime

In [3]:
# Build (or reuse) the SparkSession for this notebook and keep a handle on its
# SparkContext; both are passed to TableCollections below.
spark = (
    SparkSession.builder
    .appName("TableCollections")
    .config("spark.io.compression.codec", "snappy")
    .config("spark.shuffle.service.enabled", "false")
    .config("spark.dynamicAllocation.enabled", "false")
    .config("spark.rdd.compress", "true")
    .getOrCreate()
)
sc = spark.sparkContext

In [4]:
# Load the two homework CSVs; the first row is the header and column types are inferred.
parkingTable = (
    spark.read
    .format('csv')
    .options(header='true', inferschema='true')
    .load('/user/ecc290/HW1data/parking-violations-header.csv')
)
openTable = (
    spark.read
    .format('csv')
    .options(header='true', inferschema='true')
    .load('/user/ecc290/HW1data/open-violations-header.csv')
)

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
import sys
import string
from csv import reader
from functools import reduce
from pyspark.sql import functions as f
from collections import defaultdict
import datetime
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, TimestampType, StringType

class TableCollections:
    """Registry of Spark tables with per-column min/max metadata persisted as CSV.

    For every registered table ``<name>`` the class writes (once) the files
    ``<name>_num_metadata.csv`` (numeric and boolean columns) and
    ``<name>_time_metadata.csv`` (timestamp columns). Each line has the form
    ``colName^min^max``. The range queries read those files back instead of
    scanning the tables again.

    Attributes:
        spark: the SparkSession used to read/write metadata.
        sc: the SparkContext (used to reach the Hadoop FileSystem through the JVM).
        tableNames: names of the tables known to this collection, without duplicates.
        fs: Hadoop FileSystem handle used for existence checks.
    """

    _NUM_SUFFIX = "_num_metadata.csv"
    _TIME_SUFFIX = "_time_metadata.csv"
    _STRING_SUFFIX = "_string_metadata.csv"
    _TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

    def __init__(self,spark,sc):
        self.spark = spark
        self.sc = sc
        self.tableNames = []
        self.fs = self.sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jsc.hadoopConfiguration())

    # ------------------------------------------------------------------ helpers

    def _exists(self, fileName):
        """Return True when ``fileName`` exists on the Hadoop file system."""
        return self.fs.exists(self.sc._jvm.org.apache.hadoop.fs.Path(fileName))

    def _rememberTable(self, name):
        """Record ``name`` once; duplicates would repeat rows in the range queries."""
        if name not in self.tableNames:
            self.tableNames.append(name)

    def _minMaxByColumn(self, df, cols):
        """Return ``[(colName, min, max), ...]`` for ``cols`` using a single aggregation."""
        aggregates = []
        for colName in cols:
            aggregates.append(f.min(df[colName]))
            aggregates.append(f.max(df[colName]))
        row = df.agg(*aggregates).collect()[0]
        # Aggregates were appended pairwise, so column i owns positions 2i and 2i+1.
        return [(colName, row[2 * i], row[2 * i + 1]) for i, colName in enumerate(cols)]

    def _writeMetadata(self, records, path):
        """Append ``records`` (colName, min, max) to the '^'-separated CSV at ``path``."""
        if not records:
            # Nothing to write; also avoids schema inference on an empty list.
            return
        metaDf = self.spark.createDataFrame(records, ["colName", "min", "max"])
        metaDf.write.save(path=path, header="false", format='csv', mode='append', sep = '^')

    def _columnsWithinRange(self, suffix, boundType, lower, upper):
        """Collect (colName, tableName) rows whose stored min > lower and max < upper.

        ``suffix`` selects the metadata file kind and ``boundType`` is the Spark
        type used to parse its min/max fields.
        """
        schema = StructType([
            StructField("colName", StringType(), True),
            StructField("min", boundType, True),
            StructField("max", boundType, True)])

        matches = []
        for tableName in self.tableNames:
            fileName = tableName + suffix
            if not self._exists(fileName):
                continue
            meta = self.spark.read.csv(fileName, header=False, schema=schema, sep='^')
            matches.append(
                meta.where(meta["min"] > lower)
                    .where(meta["max"] < upper)
                    .select(meta["colName"])
                    .withColumn("tableName", f.lit(tableName)))

        if not matches:
            # No registered table has this kind of metadata: return an empty
            # frame with the usual columns instead of failing.
            emptySchema = StructType([
                StructField("colName", StringType(), True),
                StructField("tableName", StringType(), False)])
            return self.spark.createDataFrame([], emptySchema)
        return reduce(lambda left, right: left.union(right), matches)

    # --------------------------------------------------------------- public API

    def add_registered_table_name(self, name):
        """Re-attach a table whose metadata files already exist.

        Returns True (and remembers ``name``) when at least one of the num/time/string
        metadata files for ``name`` exists, otherwise returns False.
        """
        suffixes = (self._NUM_SUFFIX, self._TIME_SUFFIX, self._STRING_SUFFIX)
        if any(self._exists(name + suffix) for suffix in suffixes):
            self._rememberTable(name)
            return True
        return False

    def register(self, df, name):
        """Register ``df`` as temp view ``name`` and create its metadata files if missing.

        Column names are stripped of surrounding whitespace, dots and backticks first.
        Existing metadata files are left untouched and a message is printed instead.
        """
        # Clean up column names so that we can prevent future errors
        for colName, _ in df.dtypes:
            cleaned = colName.strip().replace(".", "").replace("`", "")
            if cleaned != colName:
                df = df.withColumnRenamed(colName, cleaned)

        # track down which tables have been registered to the class
        self._rememberTable(name)
        numFileName = name + self._NUM_SUFFIX
        timeFileName = name + self._TIME_SUFFIX
        df.createOrReplaceTempView(name) # can be problematic

        # put column names into appropriate bin
        num_cols, time_cols, bool_cols = [], [], []
        for colName, dtype in df.dtypes:
            if dtype == 'timestamp':
                time_cols.append(colName)
            elif dtype == 'boolean':
                bool_cols.append(colName)
            elif dtype != 'string':
                num_cols.append(colName)
            # string columns have no min/max metadata, so they are not collected

        # For each datatype of columns, process metadata
        if not self._exists(numFileName):
            self.createNumMetadata(df, num_cols, numFileName)
            # boolean min/max are stored as 0.0/1.0 in the numeric metadata file
            self.createBoolMetadata(df, bool_cols, numFileName)
        else:
            print("num metadata file exists for table {}".format(name))
        if not self._exists(timeFileName):
            self.createTimeMetadata(df, time_cols, timeFileName)
        else:
            print("timestamp metadata file exists for table {}".format(name))

    def createBoolMetadata(self, df, bool_cols, boolFilename):
        """Append min/max (as floats) of each boolean column to ``boolFilename``.

        Columns whose min or max is null (no non-null values) are skipped.
        """
        if not bool_cols:
            return
        records = [
            (colName, float(low), float(high))
            for colName, low, high in self._minMaxByColumn(df, bool_cols)
            if low is not None and high is not None]
        self._writeMetadata(records, boolFilename)

    def createTimeMetadata(self, df, time_cols, timeFileName):
        """Append min/max of each timestamp column to ``timeFileName``.

        Values are written as 'YYYY-mm-dd HH:MM:SS' strings. Columns whose min or
        max is null (no non-null values) are skipped.
        """
        if not time_cols:
            return
        records = [
            (colName, low.strftime(self._TIME_FORMAT), high.strftime(self._TIME_FORMAT))
            for colName, low, high in self._minMaxByColumn(df, time_cols)
            if low is not None and high is not None]
        self._writeMetadata(records, timeFileName)

    def createNumMetadata(self, df, num_cols, numFileName):
        """Append min/max (as floats) of each numeric column to ``numFileName``.

        Uses ``DataFrame.describe()``; columns for which it reports no min/max, or
        a value that is not a number, are skipped.
        """
        if not num_cols:
            # describe() would still launch a job even with nothing to summarise
            return
        describeTable = df[num_cols].describe().collect()
        # describe() rows are ordered: count, mean, stddev, min, max
        minRow = describeTable[3].asDict()
        maxRow = describeTable[4].asDict()

        records = []
        for colName in num_cols:
            low, high = minRow.get(colName), maxRow.get(colName)
            if low is None or high is None:
                continue
            try:
                records.append((colName, float(low), float(high)))
            except ValueError:
                # describe() returns strings; skip anything that is not numeric text
                continue
        self._writeMetadata(records, numFileName)

    def timeColWithinRange(self, minTime, maxTime):
        """Return timestamp columns whose stored min > minTime and max < maxTime.

        Returns a DataFrame with columns ``colName`` and ``tableName`` (empty when
        no registered table has timestamp metadata).

        Raises:
            TypeError: if either bound is not a ``datetime.datetime``.
        """
        if not isinstance(minTime, datetime.datetime) or not isinstance(maxTime, datetime.datetime):
            raise TypeError("minTime, maxTime must be timestamp")
        return self._columnsWithinRange(self._TIME_SUFFIX, TimestampType(), minTime, maxTime)

    def numColWithinRange(self, minNum, maxNum):
        """Return numeric/boolean columns whose stored min > minNum and max < maxNum.

        Returns a DataFrame with columns ``colName`` and ``tableName`` (empty when
        no registered table has numeric metadata).

        Raises:
            TypeError: if either bound is a ``str`` or a ``datetime.datetime``.
        """
        # int, bigint, float, long
        rejected = (datetime.datetime, str)
        if isinstance(minNum, rejected) or isinstance(maxNum, rejected):
            raise TypeError("minNum, maxNum must be number")
        return self._columnsWithinRange(self._NUM_SUFFIX, DoubleType(), minNum, maxNum)

In [16]:
# Smoke test on the two homework tables: register both, then list the numeric
# columns and the timestamp columns whose stored min/max fall inside the given ranges.
tc = TableCollections(spark, sc)
for table_df, table_name in ((openTable, "open"), (parkingTable, "parking")):
    tc.register(table_df, table_name)

numeric_hits = tc.numColWithinRange(0, 1000000000000)
numeric_hits.show()

range_start = datetime.datetime(1994, 1, 1)
range_end = datetime.datetime(2018, 5, 1)
tc.timeColWithinRange(range_start, range_end).show()

num metadata file exists for table open
num metadata file exists for table parking
timestamp metadata file exists for table parking
+--------------+---------+
|       colName|tableName|
+--------------+---------+
|summons_number|     open|
|   fine_amount|     open|
|summons_number|  parking|
|violation_code|  parking|
+--------------+---------+

+----------+---------+
|   colName|tableName|
+----------+---------+
|issue_date|  parking|
+----------+---------+



In [17]:
# Register every dataset listed in `data_ids`, starting at dataset id '54k3-2wtq'
# (ids that appear before it in the file are skipped).
tc = TableCollections(spark, sc)
dirname = '/user/jp4989/open_data_csv/'
found = False
with open('data_ids', 'r') as g:
    # Read and strip all ids up front so the file is closed before the Spark jobs start.
    ids = [line.strip() for line in g]
for each in ids:
    if not each:
        # A blank line would otherwise turn into a load of '<dirname>.csv'.
        continue
    if each == '54k3-2wtq':
        found = True
    if not found:
        continue
    filename = dirname + each + '.csv'
    df = spark.read.format('csv').options(header='true',inferschema='true').load(filename)
    # head(1) checks for at least one row without converting the frame to an RDD.
    if df.head(1):
        tc.register(df, 'nyc_' + each.replace('-','_'))

In [5]:
# Start from a fresh collection and re-attach the tables whose metadata files
# already exist, walking `data_ids` in order and stopping after id '6m56-5mfb'.
tc = TableCollections(spark, sc)
dirname = '/user/jp4989/open_data_csv/'
with open('data_ids', 'r') as g:
    ids = g.readlines()
    for each in ids:
        dataset_id = each.strip()
        table_name = 'nyc_' + dataset_id.replace('-', '_')
        tc.add_registered_table_name(table_name)
        if dataset_id == '6m56-5mfb':
            break

In [6]:
# Show the columns, across all tables attached to `tc`, whose stored min and max
# both lie strictly between 0 and 100.
tc.numColWithinRange(0, 100).show()

+--------------------+-------------+
|             colName|    tableName|
+--------------------+-------------+
|City Council Dist...|nyc_22rf_yxcy|
| Community Districts|nyc_22rf_yxcy|
|  Borough Boundaries|nyc_22rf_yxcy|
|    Police Precincts|nyc_22rf_yxcy|
|         % Level 3+4|nyc_26kp_bgdh|
|           % Level 2|nyc_26kp_bgdh|
|           % Level 1|nyc_26kp_bgdh|
|           % Level 3|nyc_26kp_bgdh|
|  Average Class Size|nyc_276h_y36a|
|          Level3+4_%|nyc_27h8_t3wt|
|            Level2_%|nyc_27h8_t3wt|
|            Level3_%|nyc_27h8_t3wt|
|            district|nyc_27h8_t3wt|
|            Level4_%|nyc_27h8_t3wt|
|            Level1_%|nyc_27h8_t3wt|
|            latitude|nyc_29km_avyc|
|   Pct Level 3 and 4|nyc_2bh6_qmgg|
|         Pct Level 3|nyc_2bh6_qmgg|
|         Pct Level 4|nyc_2bh6_qmgg|
|         Pct Level 2|nyc_2bh6_qmgg|
+--------------------+-------------+
only showing top 20 rows



In [7]:
# Candidate NYC longitude columns: columns whose stored min and max both lie
# strictly between -76 and -70. Matching is by value range only, so a column that
# is not a longitude but has values in that range is listed as well.
tc.numColWithinRange(-76, -70).show()

+---------+-------------+
|  colName|    tableName|
+---------+-------------+
|longitude|nyc_29km_avyc|
|Longitude|nyc_2fra_mtpn|
|Longitude|nyc_2n4x_d97d|
|Longitude|nyc_2q48_ip9a|
|  Borough|nyc_35f6_8qd2|
|Longitude|nyc_35sw_rdxj|
|Longitude|nyc_36hn_wea6|
|Longitude|nyc_37fm_7uaa|
|Longitude|nyc_37it_gmcp|
|Longitude|nyc_3qfc_4tta|
|Longitude|nyc_3rfa_3xsf|
|Longitude|nyc_3spy_rjpw|
|Longitude|nyc_48pb_zy2g|
|Longitude|nyc_4wf2_7kdu|
|Longitude|nyc_4zdr_zwdi|
|Longitude|nyc_56u5_n9sa|
|Longitude|nyc_57mv_nv28|
|Longitude|nyc_59kj_x8nc|
|Longitude|nyc_59t5_r7nb|
|Longitude|nyc_5e24_x4wa|
+---------+-------------+
only showing top 20 rows



In [8]:
# Candidate NYC latitude columns: columns whose stored min and max both lie
# strictly between 39 and 43. Matching is by value range only, not by column name.
tc.numColWithinRange(39, 43).show()

+--------+-------------+
| colName|    tableName|
+--------+-------------+
|latitude|nyc_29km_avyc|
|Latitude|nyc_2fra_mtpn|
|Latitude|nyc_2n4x_d97d|
|Latitude|nyc_2q48_ip9a|
|Latitude|nyc_35sw_rdxj|
|Latitude|nyc_36hn_wea6|
|Latitude|nyc_37fm_7uaa|
|Latitude|nyc_37it_gmcp|
|Latitude|nyc_3qfc_4tta|
|Latitude|nyc_3rfa_3xsf|
|Latitude|nyc_3spy_rjpw|
|Latitude|nyc_48pb_zy2g|
|Latitude|nyc_4wf2_7kdu|
|Latitude|nyc_4zdr_zwdi|
|Latitude|nyc_56u5_n9sa|
|Latitude|nyc_57mv_nv28|
|Latitude|nyc_59kj_x8nc|
|Latitude|nyc_59t5_r7nb|
|Latitude|nyc_5e24_x4wa|
|Latitude|nyc_5fn4_dr26|
+--------+-------------+
only showing top 20 rows

