In [1]:
#import libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, ByteType, LongType, FloatType, ShortType
from pyspark.sql.functions import col, sum as spark_sum

In [2]:
#initialize Spark session
spark = SparkSession.builder \
    .config("spark.driver.memory", "10g") \
	.config("spark.executor.memory", "2g") \
    .config('spark.executor.instances', 5) \
	.config("spark.sql.debug.maxToStringFields", "100")\
    .appName("Flood Data") \
	.getOrCreate()

24/05/05 11:56:07 WARN Utils: Your hostname, MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.105 instead (on interface en0)
24/05/05 11:56:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/05 11:56:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
#declare schema for dataframe
schema = StructType([
    StructField("agriculturestructureindicator", StringType(), True),
    StructField("basefloodelevation", IntegerType(), True),
    StructField("basementenclosurecrawlspacetype", ByteType(), True),
    StructField("cancellationdateoffloodpolicy", DateType(), True),
    StructField("censustract", LongType(), True),
    StructField("condominiumindicator", StringType(), True),
    StructField("construction", StringType(), True),
    StructField("countycode", IntegerType(), True),
    StructField("crsdiscount", FloatType(), True),
    StructField("deductibleamountinbuildingcoverage", ByteType(), True),
    StructField("deductibleamountincontentscoverage", ByteType(), True),
    StructField("elevatedbuildingindicator", StringType(), True),
    StructField("elevationcertificateindicator", StringType(), True),
    StructField("elevationdifference", ByteType(), True),
    StructField("federalpolicyfee", ShortType(), True),
    StructField("floodzone", StringType(), True),
    StructField("hfiaasurcharge", ShortType(), True),
    StructField("houseofworshipindicator", StringType(), True),
    StructField("latitude", ByteType(), True),
    StructField("locationofcontents", StringType(), True),
    StructField("longitude", ShortType(), True),
    StructField("lowestadjacentgrade", IntegerType(), True),
    StructField("lowestfloorelevation", IntegerType(), True),
    StructField("nonprofitindicator", StringType(), True),
    StructField("numberoffloorsininsuredbuilding", ByteType(), True),
    StructField("obstructiontype", IntegerType(), True),
    StructField("occupancytype", ByteType(), True),
    StructField("originalconstructiondate", DateType(), True),
    StructField("originalnbdate", DateType(), True),
    StructField("policycost", IntegerType(), True),
    StructField("policycount", ShortType(), True),
    StructField("policyeffectivedate", DateType(), True),
    StructField("policyterminationdate", DateType(), True),
    StructField("policytermindicator", ByteType(), True),
    StructField("postfirmconstructionindicator", StringType(), True),
    StructField("primaryresidenceindicator", StringType(), True),
    StructField("propertystate", StringType(), True),
    StructField("reportedzipcode", IntegerType(), True),
    StructField("ratemethod", ByteType(), True),
    StructField("regularemergencyprogramindicator", StringType(), True),
    StructField("reportedcity", StringType(), True),
    StructField("smallbusinessindicatorbuilding", StringType(), True),
    StructField("totalbuildinginsurancecoverage", IntegerType(), True),
    StructField("totalcontentsinsurancecoverage", IntegerType(), True),
    StructField("totalinsurancepremiumofthepolicy", IntegerType(), True)])

In [4]:
#Create dataframe from data using schema defined above
df = spark.read.csv("./NFIP/nfip-flood-policies.csv", header=True, schema=schema)

In [5]:
df.printSchema()

root
 |-- agriculturestructureindicator: string (nullable = true)
 |-- basefloodelevation: integer (nullable = true)
 |-- basementenclosurecrawlspacetype: byte (nullable = true)
 |-- cancellationdateoffloodpolicy: date (nullable = true)
 |-- censustract: long (nullable = true)
 |-- condominiumindicator: string (nullable = true)
 |-- construction: string (nullable = true)
 |-- countycode: integer (nullable = true)
 |-- crsdiscount: float (nullable = true)
 |-- deductibleamountinbuildingcoverage: byte (nullable = true)
 |-- deductibleamountincontentscoverage: byte (nullable = true)
 |-- elevatedbuildingindicator: string (nullable = true)
 |-- elevationcertificateindicator: string (nullable = true)
 |-- elevationdifference: byte (nullable = true)
 |-- federalpolicyfee: short (nullable = true)
 |-- floodzone: string (nullable = true)
 |-- hfiaasurcharge: short (nullable = true)
 |-- houseofworshipindicator: string (nullable = true)
 |-- latitude: byte (nullable = true)
 |-- locationofconte

### Column Description

**Description of the columns from the FEMA's National Flood Insurance Policy Database, grouped by their data types and purpose:**

**Geographic and Location Details**
- **censustract (long):** Census tract number indicating the specific area where the property is located, used for demographic analysis.
- **countycode (integer):** Numeric code representing the county in which the property is insured.
- **floodzone (string):** Designation of the flood zone according to FEMA's mapping, crucial for assessing the property's flood risk.
- **latitude (byte), longitude (short):** Geographic coordinates specifying the precise location of the property. These should likely be of type double for accurate geographical representation, suggesting a potential data quality issue.
- **propertystate (string):** The U.S. state where the property is located.
- **reportedcity (string):** The city reported for the insured property.
- **reportedzipcode (integer):** Zip code where the property is situated, used for localizing insurance coverage and risk.

**Property and Construction Details**
- **agriculturestructureindicator (string):** Indicates whether the property is used for agricultural purposes.
- **basementenclosurecrawlspacetype (byte):** Type of basement or crawlspace present at the property, affecting flood risk assessment.
- **construction (string):** Describes the type of construction materials and methods used, which can affect the property's vulnerability to flood damage.
- **numberoffloorsininsuredbuilding (byte):** Total floors in the insured building, important for determining potential flood damage and insurance coverage needs.
- **elevatedbuildingindicator (string):** Indicates whether the building is elevated, a key factor in reducing flood risk.

**Policy Details**
- **policycost (integer):** The total cost of the flood insurance policy.
- **policycount (short):** The number of policies associated with a single property or account.
- **policyeffectivedate (date), policyterminationdate (date):** Start and end dates of the flood insurance coverage.
- **totalbuildinginsurancecoverage (integer), totalcontentsinsurancecoverage (integer):** The amount of insurance coverage for the building and its contents, respectively.
- **totalinsurancepremiumofthepolicy (integer):** Total premium amount for the flood insurance policy.

**Flood Risk Assessment Specifics**
- **basefloodelevation (integer):** The base flood elevation expected for a particular area, critical for understanding flood risk levels.
- **elevationcertificateindicator (string), elevationdifference (byte):** Presence of an elevation certificate and the difference in elevation, respectively, both crucial for assessing compliance with floodplain management.
- **lowestadjacentgrade (integer), lowestfloorelevation (integer):** Measures of elevation that help determine the property's flood exposure.

**Insurance Policy Features**
- **crsdiscount (float):** Community Rating System discount applied to the policy, which can reduce insurance premiums based on community flood preparedness.
- **deductibleamountinbuildingcoverage (byte), deductibleamountincontentscoverage (byte):** Deductible amounts for building and contents coverage, influencing out-of-pocket costs after a flood.
- **hfiaasurcharge (short):** Surcharge applied under the Homeowner Flood Insurance Affordability Act.
- **federalpolicyfee (short):** A fee associated with the federal policy governing flood insurance.

**Special Indicators**
- **condominiumindicator (string), primaryresidenceindicator (string):** Indicate whether the insured property is a condominium or the primary residence of the owner.
- **houseofworshipindicator (string), nonprofitindicator (string):** Indicators of whether the property is used as a house of worship or is owned by a nonprofit organization, affecting policy terms and possibly qualifying for special considerations.
- **postfirmconstructionindicator (string):** Indicates if the building was constructed after the community's first Flood Insurance Rate Map was issued, which can affect insurance rates.
- **smallbusinessindicatorbuilding (string):** Indicates whether the insured building is used for small business purposes.

**Additional Policy and Coverage Information**
- **originalconstructiondate (date), originalnbdate (date):** Dates of original construction and the building's initial notebook entry, important for historical property assessments.
- **cancellationdateoffloodpolicy (date):** Date when the flood policy was cancelled, if applicable.
- **regularemergencyprogramindicator (string):** Indicates the type of FEMA program under which the policy is covered, distinguishing between regular and emergency management programs.
- **ratemethod (byte):** Describes the method used to calculate the insurance rate, impacting how premiums are determined.
- **locationofcontents (string):** Specifies where within the property the insured contents are located, relevant for claims and risk assessments.

### Display number of Variables (Columns)


In [6]:
num_variables = len(df.columns)
print("Number of Columns:", num_variables)

Number of Columns: 45


### Print Schema

Understanding the structure of data (column names and types) is crucial.

### Show the first few rows

In [7]:
df.show(5,vertical=True)

                                                                                

-RECORD 0--------------------------------------------------
 agriculturestructureindicator      | NULL                 
 basefloodelevation                 | NULL                 
 basementenclosurecrawlspacetype    | 2                    
 cancellationdateoffloodpolicy      | NULL                 
 censustract                        | 33013038500          
 condominiumindicator               | N                    
 construction                       | N                    
 countycode                         | 33013                
 crsdiscount                        | 0.0                  
 deductibleamountinbuildingcoverage | 0                    
 deductibleamountincontentscoverage | 0                    
 elevatedbuildingindicator          | N                    
 elevationcertificateindicator      | NULL                 
 elevationdifference                | NULL                 
 federalpolicyfee                   | 13                   
 floodzone                          | X 

### Display number of observations

In [8]:
df.count()

                                                                                

50406943

FEMA's National Flood Insurance Policy Database, containing over 50 million (50,406,943) policy transactions.

### Missing Value in the Dataframe

In [9]:
#find number of missing values in the dataframe
missing_vals = df.select(*(spark_sum(col(i).isNull().cast("int")).alias(i) for i in df.columns))

In [10]:
missing_vals.show(vertical= True)



-RECORD 0--------------------------------------
 agriculturestructureindicator      | 38923313 
 basefloodelevation                 | 50406943 
 basementenclosurecrawlspacetype    | 802      
 cancellationdateoffloodpolicy      | 43614057 
 censustract                        | 467119   
 condominiumindicator               | 6        
 construction                       | 13       
 countycode                         | 48999    
 crsdiscount                        | 0        
 deductibleamountinbuildingcoverage | 15649149 
 deductibleamountincontentscoverage | 18265104 
 elevatedbuildingindicator          | 258      
 elevationcertificateindicator      | 32606397 
 elevationdifference                | 32897994 
 federalpolicyfee                   | 0        
 floodzone                          | 169145   
 hfiaasurcharge                     | 0        
 houseofworshipindicator            | 34476251 
 latitude                           | 50406943 
 locationofcontents                 | 15

                                                                                

**Here's a description of the missing value situation in the FEMA Flood Insurance Policy Database:**

1. **High Missing Values:**
- **Base Flood Elevation, Latitude, Longitude, Lowest Adjacent Grade, Lowest Floor Elevation:** These fields each have around 50,406,943 missing values. This suggests a significant lack of geographic and elevation data, which are critical in flood insurance calculations.
- **Elevation Certificate Indicator, Elevation Difference:** Both fields are missing approximately 32,806,397 and 32,897,994 values respectively, indicating that elevation certificates, which are vital for verifying compliance with floodplain management regulations, are largely absent.
- **Obstruction Type:** Missing around 40,793,526 values, indicating that details about obstructions which can affect flood risk assessments are predominantly not reported.
- **House of Worship Indicator, Nonprofit Indicator:** Each has over 34,476,251 and 34,493,094 missing entries respectively, indicating a lack of identification of these property types, which might have different considerations in policy terms.

2. **Moderate Missing Values:**
- **Deductible Amount in Building Coverage, Deductible Amount in Contents Coverage:** Missing 15,649,149 and 18,265,104 values respectively, which implies incomplete data on policy deductibles that could affect premium calculations and risk assessments.
- **Location of Contents:** With 15,389,767 missing entries, there's substantial missing information on where contents are located within the insured buildings, which is vital for damage assessments.

3. **Low Missing Values:**
- **Census Tract, Flood Zone:** Missing 467,119 and 169,145 entries respectively. Although relatively lower, these still represent significant gaps, especially as these fields are crucial for location-specific risk assessment.
- **Number of Floors in Insured Building:** Missing data on 162,301 entries could affect understanding building structure and associated risk.

4. **Minimal to No Missing Values:**
- Fields like **CRS Discount, Federal Policy Fee**, and various policy-related dates (effectiveness, termination) and costs show zero missing values, indicating complete data in terms of policy transaction details.
- Similarly, **County Code, Construction, Condominium Indicator, Occupancy Type** show minimal missing data (under 50,000), suggesting good coverage of basic property and policyholder information.

Overall, the dataset shows a strong presence of policy and basic property information but suffers from a significant absence of detailed geographic and structural data. This gap in data can hamper effective risk assessment and pricing of flood insurance policies, especially in areas prone to flooding where such data is most critical. Addressing these missing values, either by data imputation where appropriate or by collecting missing data, could significantly enhance the robustness of any analysis or predictive modeling based on this dataset.

### Statistics Summary and Data Distribution

In [24]:
numerical_columns = [col_name for col_name, data_type in df.dtypes if data_type in ['int', 'bigint', 'smallint', 'tinyint', 'float']]

# Select only numerical columns
numerical_df = df.select(*numerical_columns)

# Generate summary statistics
summary_stats = numerical_df.describe()

# Show summary statistics
summary_stats.show(vertical = True)




-RECORD 0--------------------------------------------------
 summary                            | count                
 basefloodelevation                 | 0                    
 basementenclosurecrawlspacetype    | 50406141             
 censustract                        | 49939824             
 countycode                         | 50357944             
 crsdiscount                        | 50406943             
 deductibleamountinbuildingcoverage | 34757794             
 deductibleamountincontentscoverage | 32141839             
 elevationdifference                | 17508949             
 federalpolicyfee                   | 50406943             
 hfiaasurcharge                     | 50406943             
 latitude                           | 0                    
 longitude                          | 0                    
 lowestadjacentgrade                | 0                    
 lowestfloorelevation               | 0                    
 numberoffloorsininsuredbuilding    | 50

                                                                                

The summary statistics for the FEMA National Flood Insurance Policy Database provide a comprehensive overview of various policy and property-related numerical attributes. These statistics include measures of central tendency, dispersion, and range, all of which are critical for understanding the distribution and potential data quality issues within the dataset. Below is a detailed analysis of the key statistical summaries:

**Central Tendency and Dispersion**
1. **Base Flood Elevation, Latitude, Longitude, Lowest Adjacent Grade, Lowest Floor Elevation:** These columns have no recorded data (count = 0), indicating that all values are missing or unrecorded for the entire dataset.


2. **Basement Enclosure Crawl Space Type:**
- Average (Mean): 0.37, indicating a slight bias towards lower classifications.
- Standard Deviation: 0.86, showing moderate variability within the data.
- Range: Min 0 to Max 4, spanning several classification levels.
3. **Census Tract:**
- Average: Approximately 2.6 x 10¹⁰.
- Standard Deviation: About 1.58 x 10¹⁰, suggesting a wide spread across census tracts.
4. **CRS Discount:**
- Average: 0.064, typically low across the dataset.
- Standard Deviation: 0.091, with most data points close to zero but some higher values.
5. **Deductible Amount in Building and Contents Coverage:**
- Building Coverage Average: 1.66 with a deviation of 1.46.
- Contents Coverage Average: 0.98 with a deviation of 1.05.
- Both show low average deductible amounts but with notable variation.
6. **Elevation Difference:**
- Average: 1.69, indicating minor differences in elevation on average.
- Standard Deviation: 3.39, suggesting significant outliers affecting the elevation difference.
7. **Policy Related Figures (Policy Cost, Policy Count, Total Insurance Coverage, etc.):**
- These values have a high mean and standard deviation, indicating a significant spread in the policy costs and coverages, reflecting diverse insurance policies and property valuations.

**Extremes (Minimum and Maximum Values)**
- Notable minimums include negative values in **Federal Policy Fee and HFIAA Surcharge**, possibly indicating refunds or adjustments.
- The maximum values in **Total Building Insurance Coverage and Total Insurance Premium of the Policy** reach into the hundreds of millions, highlighting cases with exceptionally high insurance coverage.

**Implications**
The substantial missing data in critical geographical and elevation columns could significantly hinder risk assessment accuracy. The wide variability in policy costs and coverage levels underscores the diverse nature of the insured properties. Accurate and complete data in these fields are crucial for effective risk management and policy pricing in flood insurance.

This analysis provides a basis for further data cleaning, particularly in addressing missing values and outliers, which are essential for improving data quality and the reliability of subsequent analyses and decision-making processes based on this dataset.

### Correlations Among Variables

In [25]:
numerical_df.show(5,vertical=True)

-RECORD 0-----------------------------------------
 basefloodelevation                 | NULL        
 basementenclosurecrawlspacetype    | 2           
 censustract                        | 33013038500 
 countycode                         | 33013       
 crsdiscount                        | 0.0         
 deductibleamountinbuildingcoverage | 0           
 deductibleamountincontentscoverage | 0           
 elevationdifference                | NULL        
 federalpolicyfee                   | 13          
 hfiaasurcharge                     | 0           
 latitude                           | NULL        
 longitude                          | NULL        
 lowestadjacentgrade                | NULL        
 lowestfloorelevation               | NULL        
 numberoffloorsininsuredbuilding    | 2           
 obstructiontype                    | NULL        
 occupancytype                      | 1           
 policycost                         | 388         
 policycount                   

In [26]:
import pandas as pd

#sample_num_df = numerical_df.sample(False, 0.5,seed=42)
sample_num_df = numerical_df.drop('basefloodelevation','latitude','longitude','lowestadjacentgrade','lowestfloorelevation')
#sample_num_df = sample_num_df.fillna(0)
#sample_num_df = sample_num_df.dropna()


from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Imputer


v_col = "features"
input = sample_num_df.columns
output = [v_col + str(i) for i in range(len(input))]
imputer = Imputer(strategy='mean',inputCols=input,outputCols=output)
changed_df = imputer.fit(sample_num_df).transform(sample_num_df) 

assembler = VectorAssembler(inputCols=output,outputCol=v_col)
numerical_df_vector = assembler.transform(changed_df).select(v_col)
numerical_df_vector

                                                                                

DataFrame[features: vector]

In [27]:
matrix = Correlation.corr(numerical_df_vector,v_col).collect()[0][0]
corr_matrix = matrix.toArray().tolist()
print(corr_matrix)

                                                                                

[[1.0, 0.13623903586845484, 0.13222438896911154, -0.13374568394718683, 0.15131028789055215, 0.0015086650337750639, -0.1581868385179278, 0.021515636223077703, 0.039205129679828934, 0.4205917981132782, 0.5982425523315253, -0.06389760287441462, 0.07213744564440353, 0.008797902979000462, -0.02450568557810686, -0.1891662489640829, -0.06343028257339026, 0.006946886998778825, -0.0989240851702171, 0.0693259281247627], [0.13623903586845484, 1.0, 0.9861052821687086, -0.23633772584537877, -0.015750279701414523, -0.008547830783034996, -0.024669084575636498, -0.04651095355139787, -0.0017065025678084952, 0.13092070392149222, 0.03762336418085031, -0.0215185447700448, -0.009406725795160385, -0.023542521100064735, 0.035625079186621965, 0.08585317162751503, 0.1268144994695155, -0.026642760572685668, 0.014045383330901511, -0.007888058597759437], [0.13222438896911154, 0.9861052821687086, 1.0, -0.2395338046623131, -0.01666870410289107, -0.009070867940185069, -0.022579883994719545, -0.04591061242264458, -0.

In [28]:
sample_num_df.columns

['basementenclosurecrawlspacetype',
 'censustract',
 'countycode',
 'crsdiscount',
 'deductibleamountinbuildingcoverage',
 'deductibleamountincontentscoverage',
 'elevationdifference',
 'federalpolicyfee',
 'hfiaasurcharge',
 'numberoffloorsininsuredbuilding',
 'obstructiontype',
 'occupancytype',
 'policycost',
 'policycount',
 'policytermindicator',
 'reportedzipcode',
 'ratemethod',
 'totalbuildinginsurancecoverage',
 'totalcontentsinsurancecoverage',
 'totalinsurancepremiumofthepolicy']

In [29]:
#columns = ['basefloodelevation',
# 'basementenclosurecrawlspacetype',
# 'censustract',
# 'countycode',
# 'deductibleamountinbuildingcoverage',
# 'deductibleamountincontentscoverage',
# 'elevationdifference',
# 'federalpolicyfee',
# 'hfiaasurcharge',
# 'latitude',
# 'longitude',
# 'lowestadjacentgrade',
# 'lowestfloorelevation',
# 'numberoffloorsininsuredbuilding',
# 'obstructiontype',
# 'occupancytype',
# 'policycost',
# 'policycount',
# 'policytermindicator',
# 'reportedzipcode',
# 'ratemethod',
# 'totalbuildinginsurancecoverage',
# 'totalcontentsinsurancecoverage',
# 'totalinsurancepremiumofthepolicy']
columns = ['basementenclosurecrawlspacetype',
 'censustract',
 'countycode',
 'crsdiscount',
 'deductibleamountinbuildingcoverage',
 'deductibleamountincontentscoverage',
 'elevationdifference',
 'federalpolicyfee',
 'hfiaasurcharge',
 'numberoffloorsininsuredbuilding',
 'obstructiontype',
 'occupancytype',
 'policycost',
 'policycount',
 'policytermindicator',
 'reportedzipcode',
 'ratemethod',
 'totalbuildinginsurancecoverage',
 'totalcontentsinsurancecoverage',
 'totalinsurancepremiumofthepolicy']
df_c = spark.createDataFrame(corr_matrix,columns)
df_c.select('basementenclosurecrawlspacetype',
 'censustract',
 'countycode','crsdiscount').show(20)

+-------------------------------+--------------------+--------------------+--------------------+
|basementenclosurecrawlspacetype|         censustract|          countycode|         crsdiscount|
+-------------------------------+--------------------+--------------------+--------------------+
|                            1.0| 0.13623903586845484| 0.13222438896911154|-0.13374568394718683|
|            0.13623903586845484|                 1.0|  0.9861052821687086|-0.23633772584537877|
|            0.13222438896911154|  0.9861052821687086|                 1.0| -0.2395338046623131|
|           -0.13374568394718683|-0.23633772584537877| -0.2395338046623131|                 1.0|
|            0.15131028789055215|-0.01575027970141...|-0.01666870410289107| 0.12280136982072971|
|           0.001508665033775...|-0.00854783078303...|-0.00907086794018...| 0.10083331434329317|
|            -0.1581868385179278|-0.02466908457563...|-0.02257988399471...| 0.14040974889492416|
|           0.0215156362230777

In [30]:
df_c.select( 'deductibleamountinbuildingcoverage',
 'deductibleamountincontentscoverage',
 'elevationdifference',
 'federalpolicyfee',
 'hfiaasurcharge').show(20)

+----------------------------------+----------------------------------+--------------------+--------------------+--------------------+
|deductibleamountinbuildingcoverage|deductibleamountincontentscoverage| elevationdifference|    federalpolicyfee|      hfiaasurcharge|
+----------------------------------+----------------------------------+--------------------+--------------------+--------------------+
|               0.15131028789055215|              0.001508665033775...| -0.1581868385179278|0.021515636223077703|0.039205129679828934|
|              -0.01575027970141...|              -0.00854783078303...|-0.02466908457563...|-0.04651095355139787|-0.00170650256780...|
|              -0.01666870410289107|              -0.00907086794018...|-0.02257988399471...|-0.04591061242264458|-0.00380940314362...|
|               0.12280136982072971|               0.10083331434329317| 0.14040974889492416|  0.1261809584870922|  0.0489544131764724|
|                               1.0|                0.3

In [31]:
df_c.select('numberoffloorsininsuredbuilding','obstructiontype',
 'occupancytype',
 'policycost',
 'policycount',
 'policytermindicator').show(20)

+-------------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|numberoffloorsininsuredbuilding|     obstructiontype|       occupancytype|          policycost|         policycount| policytermindicator|
+-------------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             0.4205917981132782|  0.5982425523315253|-0.06389760287441462| 0.07213744564440353|0.008797902979000462|-0.02450568557810686|
|            0.13092070392149222| 0.03762336418085031| -0.0215185447700448|-0.00940672579516...|-0.02354252110006...|0.035625079186621965|
|             0.1266704346367964| 0.03611444575519996|-0.02083336431546...|-0.01020270930151...|-0.02335026782739...| 0.04136907348096722|
|           -0.06978076131143907|-0.09715089602625693| 0.04701164776520701| 0.06739894223234527| 0.02814710887032962|-0.03970568669785352|
|            0.071598041893

In [32]:
df_c.select('reportedzipcode',
 'ratemethod',
 'totalbuildinginsurancecoverage',
 'totalcontentsinsurancecoverage',
 'totalinsurancepremiumofthepolicy').show(20)

+--------------------+--------------------+------------------------------+------------------------------+--------------------------------+
|     reportedzipcode|          ratemethod|totalbuildinginsurancecoverage|totalcontentsinsurancecoverage|totalinsurancepremiumofthepolicy|
+--------------------+--------------------+------------------------------+------------------------------+--------------------------------+
| -0.1891662489640829|-0.06343028257339026|          0.006946886998778825|           -0.0989240851702171|              0.0693259281247627|
| 0.08585317162751503|  0.1268144994695155|          -0.02664276057268...|          0.014045383330901511|            -0.00788805859775...|
| 0.07289856703721888| 0.12029321305468112|          -0.02736003235722579|           0.00836107538616118|            -0.00851191616326...|
|-0.13151159218440586| -0.5619767116287033|          0.025212130060008484|           -0.1443948059913775|            0.058638703709192275|
|-0.03062548154001...|-0.33

### Scatterplots 

In [41]:
#num_pdf = sample_num_df.toPandas()
#num_pdf.plot.scatter(x='countycode',y='censustract')
#changed_df.plot.scatter(x='countycode',y='censustract')

#import os
#os.environ['PYARROW_IGNORE_TIMEZONE'] = '1'
#import pyspark.pandas as ps
#changed_num_df = changed_df.sample(False, 0.001)
#num_pdf = ps.DataFrame(changed_num_df)
#num_pdf.plot.scatter(x='countycode',y='censustract')

import matplotlib.pyplot as plt
changed_num_df = changed_df.sample(False, 0.5)
num_pdf = changed_num_df.toPandas()
print(num_pdf)
#plt.scatter(x='countycode',y='censustract')
#plt.title('
#plt.show()

24/05/05 14:56:36 ERROR TaskSetManager: Total size of serialized results of 55 tasks (1025.1 MiB) is bigger than spark.driver.maxResultSize (1024.0 MiB)
24/05/05 14:56:36 WARN TaskSetManager: Lost task 50.0 in stage 87.0 (TID 1979) (192.168.1.105 executor driver): TaskKilled (Tasks result size has exceeded maxResultSize)
24/05/05 14:56:36 ERROR TaskSetManager: Total size of serialized results of 56 tasks (1044.6 MiB) is bigger than spark.driver.maxResultSize (1024.0 MiB)
24/05/05 14:56:36 WARN TaskSetManager: Lost task 55.0 in stage 87.0 (TID 1984) (192.168.1.105 executor driver): TaskKilled (Tasks result size has exceeded maxResultSize)
24/05/05 14:56:37 WARN TaskSetManager: Lost task 61.0 in stage 87.0 (TID 1990) (192.168.1.105 executor driver): TaskKilled (Stage cancelled: Job aborted due to stage failure: Total size of serialized results of 55 tasks (1025.1 MiB) is bigger than spark.driver.maxResultSize (1024.0 MiB))
24/05/05 14:56:37 WARN TaskSetManager: Lost task 56.0 in stage 87

Py4JJavaError: An error occurred while calling o1900.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Total size of serialized results of 55 tasks (1025.1 MiB) is bigger than spark.driver.maxResultSize (1024.0 MiB)
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2463)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1049)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1048)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:448)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:4148)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4322)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4320)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4320)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:4145)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
