In [1]:
spark

In [2]:
from google.cloud import storage

import pyspark
from pyspark.sql import SparkSession

In [17]:
bucket_name = 'ppp-loan-analysis'
gs_path  = f'gs://{bucket_name}/'
raw_folder = 'raw-data/'
clean_folder = 'clean-data/'
final_folder = 'final-data/'
dropped_folder = 'dropped-data/'
gdp_folder = 'gdp-data/'

In [4]:
spark = SparkSession.builder.appName('gdp_eda').getOrCreate()

25/09/04 23:10:05 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
storage_client = storage.Client() 
bucket = storage_client.get_bucket(bucket_name)

# Raw GDP Data

In [7]:
gdp_raw_path = gs_path + raw_folder + gdp_folder + '*.csv'
raw_gdp_df = spark.read.csv(gdp_raw_path, header=True, inferSchema=True)

In [9]:
raw_gdp_df.printSchema()

root
 |-- GeoFIPS: string (nullable = true)
 |-- GeoName: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- TableName: string (nullable = true)
 |-- LineCode: double (nullable = true)
 |-- IndustryClassification: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Unit: string (nullable = true)
 |-- 2001: string (nullable = true)
 |-- 2002: string (nullable = true)
 |-- 2003: string (nullable = true)
 |-- 2004: string (nullable = true)
 |-- 2005: string (nullable = true)
 |-- 2006: string (nullable = true)
 |-- 2007: string (nullable = true)
 |-- 2008: string (nullable = true)
 |-- 2009: string (nullable = true)
 |-- 2010: string (nullable = true)
 |-- 2011: string (nullable = true)
 |-- 2012: string (nullable = true)
 |-- 2013: string (nullable = true)
 |-- 2014: string (nullable = true)
 |-- 2015: string (nullable = true)
 |-- 2016: string (nullable = true)
 |-- 2017: string (nullable = true)
 |-- 2018: string (nullable = true)
 |-- 2019: string

In [11]:
# Print the total number of rows in the dataframe
total_rows = raw_gdp_df.count()
print(f"Total number of rows: {total_rows}")

Total number of rows: 9538


In [10]:
# Print the number of unique GeoFIPS in the dataframe
unique_geo_fips = raw_gdp_df.select("GeoFIPS").distinct().count()
print(f"Number of unique GeoFIPS: {unique_geo_fips}")

[Stage 6:>                                                          (0 + 1) / 1]

Number of unique GeoFIPS: 3182


                                                                                

In [24]:
raw_gdp_df.select("GeoFIPS").show(20)

+------------+
|     GeoFIPS|
+------------+
|" ""00000"""|
|" ""00000"""|
|" ""00000"""|
|" ""01000"""|
|" ""01000"""|
|" ""01000"""|
|" ""01001"""|
|" ""01001"""|
|" ""01001"""|
|" ""01003"""|
|" ""01003"""|
|" ""01003"""|
|" ""01005"""|
|" ""01005"""|
|" ""01005"""|
|" ""01007"""|
|" ""01007"""|
|" ""01007"""|
|" ""01009"""|
|" ""01009"""|
+------------+
only showing top 20 rows



# Clean GDP Data

In [12]:
gdp_clean_path = gs_path + clean_folder + gdp_folder + '*.csv'
clean_gdp_df = spark.read.csv(gdp_clean_path, header=True, inferSchema=True)

In [13]:
clean_gdp_df.printSchema()

root
 |-- facts_gdp_id: integer (nullable = true)
 |-- geofips: integer (nullable = true)
 |-- geo_name: string (nullable = true)
 |-- region: string (nullable = true)
 |-- year_id: integer (nullable = true)
 |-- chain_type_index_gdp: double (nullable = true)
 |-- current_dollar_gdp: double (nullable = true)
 |-- real_gdp: double (nullable = true)



In [14]:
# Print the number of rows in the cleaned dataframe
clean_total_rows = clean_gdp_df.count()
print(f"Total number of rows in cleaned dataframe: {clean_total_rows}")

Total number of rows in cleaned dataframe: 19032


In [30]:
# Print the number of unique GeoFIPS in the dataframe
clean_unique_geo_fips = clean_gdp_df.select("geofips").distinct().count()
print(f"Number of unique GeoFIPS in cleaned dataframe: {clean_unique_geo_fips}")

Number of unique GeoFIPS in cleaned dataframe: 3172


# Dropped Data

In [18]:
# Dropped Data
dropped_gdp_path = gs_path + dropped_folder + gdp_folder + '*.csv'
dropped_gdp_df = spark.read.csv(dropped_gdp_path, header=True, inferSchema=True)

In [19]:
# print the number of rows in the dropped dataframe
dropped_total_rows = dropped_gdp_df.count()
print(f"Total number of rows in dropped dataframe: {dropped_total_rows}")

Total number of rows in dropped dataframe: 18


In [22]:
# Print the number of unique GeoFIPS in the dropped_gdp_df
dropped_unique_geo_fips = dropped_gdp_df.select("GeoFIPS").distinct().count()
print(f"Number of unique GeoFIPS in dropped dataframe: {dropped_unique_geo_fips}")

Number of unique GeoFIPS in dropped dataframe: 6


In [20]:
# Show all the rows in the dropped dataframe
dropped_gdp_df.show()

+------------+--------------------+------+--------------------+-------+-------+-------+-------+-------+-------+-----------+
|     GeoFIPS|             GeoName|Region|         Description|   2017|   2018|   2019|   2020|   2021|   2022|drop_reason|
+------------+--------------------+------+--------------------+-------+-------+-------+-------+-------+-------+-----------+
|" ""02063"""|Chugach Census Ar...|     8|Real GDP (thousan...|   (NA)|   (NA)|   (NA)|1268010|1330368|1406228|    2017_na|
|" ""02063"""|Chugach Census Ar...|     8|Chain-type quanti...|   (NA)|   (NA)|   (NA)|   (NM)|   (NM)|   (NM)|    2017_na|
|" ""02063"""|Chugach Census Ar...|     8|Current-dollar GD...|   (NA)|   (NA)|   (NA)|1291960|1426984|1635885|    2017_na|
|" ""02066"""|Copper River Cens...|     8|Real GDP (thousan...|   (NA)|   (NA)|   (NA)| 106221| 124946| 128311|    2017_na|
|" ""02066"""|Copper River Cens...|     8|Chain-type quanti...|   (NA)|   (NA)|   (NA)|   (NM)|   (NM)|   (NM)|    2017_na|
|" ""020