In [13]:
import pandas as pd
import numpy as np
from pydataset import data

import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T

### Spark up Spark

In [3]:
# Reuse the active SparkSession if there is one; otherwise create it.
session_builder = pyspark.sql.SparkSession.builder
spark = session_builder.getOrCreate()

### Get Mongo CSV file

In [10]:
# Read the case data, taking column names from the header row and
# letting Spark infer each column's type from the file contents.
df = spark.read.csv(
    'spark_csv/case.csv',
    inferSchema=True,
    header=True,
)
df.printSchema()

root
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: string (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: string (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: integer (nullable = true)



In [12]:
# Peek at the first five cases, one field per line, without clipping long values.
df.show(n=5, truncate=False, vertical=True)

-RECORD 0-----------------------------------------------------
 case_id              | 1014127332                            
 case_opened_date     | 1/1/18 0:42                           
 case_closed_date     | 1/1/18 12:29                          
 SLA_due_date         | 9/26/20 0:42                          
 case_late            | NO                                    
 num_days_late        | -998.5087616000001                    
 case_closed          | YES                                   
 dept_division        | Field Operations                      
 service_request_type | Stray Animal                          
 SLA_days             | 999.0                                 
 case_status          | Closed                                
 source_id            | svcCRMLS                              
 request_address      | 2315  EL PASO ST, San Antonio, 78207  
 council_district     | 5                                     
-RECORD 1----------------------------------------------

### Create a schema

In [16]:
# Explicit schema: every listed field is declared as a (nullable) string.
source_fields = ["source_id", "source_username"]
schema = T.StructType(
    [T.StructField(field_name, T.StringType()) for field_name in source_fields]
)

### Apply a schema

In [17]:
# Read the source data with the hand-written schema instead of inferring types.
src = spark.read.csv(
    'spark_csv/source.csv',
    schema=schema,
    header=True,
)
src.printSchema()

root
 |-- source_id: string (nullable = true)
 |-- source_username: string (nullable = true)



In [26]:
# Preview the first ten source rows.
src.show(n=10)

+---------+-------------------+
|source_id|    source_username|
+---------+-------------------+
|   100137|   Merlene Blodgett|
|   103582|        Carmen Cura|
|   106463|    Richard Sanchez|
|   119403|     Betty De Hoyos|
|   119555|     Socorro Quiara|
|   119868|Michelle San Miguel|
|   120752|     Eva T. Kleiber|
|   124405|          Lori Lara|
|   132408|      Leonard Silva|
|   135723|       Amy Cardenas|
+---------+-------------------+
only showing top 10 rows



Will Spark figure out YES is True and NO is False?

In [18]:
# Cast the raw case_closed strings to boolean, then cross-tabulate the
# original value against the cast result to see how each string was mapped.
closed_as_bool = F.col('case_closed').cast('boolean').alias('case_closed_b')
(
    df.select('case_closed', closed_as_bool)
    .groupBy('case_closed')
    .pivot('case_closed_b')
    .count()
    .show()
)

+-----------+-----+------+
|case_closed|false|  true|
+-----------+-----+------+
|        YES| null|823594|
|         NO|18110|  null|
+-----------+-----+------+



Wow! It did!

In [21]:
# Replace each YES/NO string flag with a boolean that is true where the value is "YES".
for flag_col in ("case_closed", "case_late"):
    df = df.withColumn(flag_col, F.expr(f'{flag_col} == "YES"'))
df.show(3, vertical=True)

+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|   case_id|case_opened_date|case_closed_date|SLA_due_date|case_late|      num_days_late|case_closed|   dept_division|service_request_type|   SLA_days|case_status|source_id|     request_address|council_district|
+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|1014127332|     1/1/18 0:42|    1/1/18 12:29|9/26/20 0:42|    false| -998.5087616000001|       true|Field Operations|        Stray Animal|      999.0|     Closed| svcCRMLS|2315  EL PASO ST,...|               5|
|1014127333|     1/1/18 0:46|     1/3/18 8:11| 1/5/18 8:30|    false|-2.0126041669999997|       true|     Storm Water|Removal Of Obstru...|4.322222222| 

In [24]:
# Rename the SLA_* columns so they carry the same case_ prefix as the other case fields.
renamed_columns = {
    'SLA_due_date': 'case_due_date',
    'SLA_days': 'case_days',
}
for old_name, new_name in renamed_columns.items():
    df = df.withColumnRenamed(old_name, new_name)
df.show(3, vertical=True)

+----------+----------------+----------------+-------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|   case_id|case_opened_date|case_closed_date|case_due_date|case_late|      num_days_late|case_closed|   dept_division|service_request_type|  case_days|case_status|source_id|     request_address|council_district|
+----------+----------------+----------------+-------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|1014127332|     1/1/18 0:42|    1/1/18 12:29| 9/26/20 0:42|    false| -998.5087616000001|       true|Field Operations|        Stray Animal|      999.0|     Closed| svcCRMLS|2315  EL PASO ST,...|               5|
|1014127333|     1/1/18 0:46|     1/3/18 8:11|  1/5/18 8:30|    false|-2.0126041669999997|       true|     Storm Water|Removal Of Obstru...|4.322222

In [27]:
# Store council_district as a string instead of the inferred integer type.
district_as_text = F.col("council_district").cast("string")
df = df.withColumn("council_district", district_as_text)
df.show(3, vertical=True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 1/1/18 0:42          
 case_closed_date     | 1/1/18 12:29         
 case_due_date        | 9/26/20 0:42         
 case_late            | false                
 num_days_late        | -998.5087616000001   
 case_closed          | true                 
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 case_days            | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 5                    
-RECORD 1------------------------------------
 case_id              | 1014127333           
 case_opened_date     | 1/1/18 0:46          
 case_closed_date     | 1/3/18 8:11          
 case_due_date        | 1/5/18 8:30          
 case_late            | false                
 num_days_late        | -2.0126041

In [29]:
date_columns = ["case_opened_date", "case_closed_date", "case_due_date"]

print("--- Before handling dates")
df.select(*date_columns).show(5)

# The raw strings look like "1/1/18 0:42": month/day/two-digit year, 24-hour time.
fmt = "M/d/yy H:mm"
for date_col in date_columns:
    # Overwrite each string column with its parsed timestamp.
    df = df.withColumn(date_col, F.to_timestamp(date_col, fmt))

print("--- After")
df.select(*date_columns).show(5)

--- Before handling dates
+----------------+----------------+-------------+
|case_opened_date|case_closed_date|case_due_date|
+----------------+----------------+-------------+
|     1/1/18 0:42|    1/1/18 12:29| 9/26/20 0:42|
|     1/1/18 0:46|     1/3/18 8:11|  1/5/18 8:30|
|     1/1/18 0:48|     1/2/18 7:57|  1/5/18 8:30|
|     1/1/18 1:29|     1/2/18 8:13| 1/17/18 8:30|
|     1/1/18 1:34|    1/1/18 13:29|  1/1/18 4:34|
+----------------+----------------+-------------+
only showing top 5 rows

--- After
+-------------------+-------------------+-------------------+
|   case_opened_date|   case_closed_date|      case_due_date|
+-------------------+-------------------+-------------------+
|2018-01-01 00:42:00|2018-01-01 00:42:00|2018-01-01 00:42:00|
|2018-01-01 00:46:00|2018-01-01 00:46:00|2018-01-01 00:46:00|
|2018-01-01 00:48:00|2018-01-01 00:48:00|2018-01-01 00:48:00|
|2018-01-01 01:29:00|2018-01-01 01:29:00|2018-01-01 01:29:00|
|2018-01-01 01:34:00|2018-01-01 01:34:00|2018-01-01 01:

In [31]:
# Check the column types after the conversions above, then print the
# physical plan (non-extended form) Spark will run to produce df.
df.printSchema()
df.explain(extended=False)

root
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: timestamp (nullable = true)
 |-- case_closed_date: timestamp (nullable = true)
 |-- case_due_date: timestamp (nullable = true)
 |-- case_late: boolean (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: boolean (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- case_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: string (nullable = true)

== Physical Plan ==
*(1) Project [case_id#523, cast(unix_timestamp(case_opened_date#524, M/d/yy H:mm, Some(America/Chicago)) as timestamp) AS case_opened_date#1103, cast(unix_timestamp(cast(unix_timestamp(case_opened_date#524, M/d/yy H:mm, Some(America/Chicago)) as timestamp), M/d/yy H:mm, Some(America/Chicago)) as timestamp) AS case_closed_date#1118, ca

In [38]:
# Count cases per request address and list the addresses with the most cases first.
calls_per_address = (
    df.groupBy('request_address')
    .agg(F.count('case_id').alias('calls'))
    .sort(F.col('calls').desc())
)
calls_per_address.show(truncate=False)

+----------------------------------------+-----+
|request_address                         |calls|
+----------------------------------------+-----+
|10133  FIGARO CANYON, San Antonio, 78251|5149 |
|834  BARREL POINT, San Antonio, 78251   |3183 |
|5800  ENRIQUE M BARR, San Antonio, 78227|2872 |
|928  MARBLE POINT, San Antonio, 78251   |1939 |
|874  BARREL POINT, San Antonio, 78251   |1917 |
|802  BARREL POINT, San Antonio, 78251   |1334 |
|838  BARREL POINT, San Antonio, 78251   |1279 |
|837  BARREL POINT, San Antonio, 78251   |937  |
|10129  BOXING PASS, San Antonio, 78251  |744  |
|10141  FIGARO CANYON, San Antonio, 78251|666  |
|200  DELTA, San Antonio, 78237          |580  |
|800  CHERRY N, San Antonio, 78202       |462  |
|1103  CINCINNATI, San Antonio, 78201    |417  |
|3100  HIAWATHA, San Antonio, 78210      |404  |
|923  MARBLE POINT, San Antonio, 78251   |404  |
|854  BARREL POINT, San Antonio, 78251   |400  |
|915  RANGER POINT, San Antonio, 78251   |371  |
|16240  US HWY 281 N

In [None]:
df.