In [None]:
%pip install google-cloud-storage
%pip install pandas
%pip install numpy
%pip install fsspec
%pip install gcsfs

In [5]:
from google.cloud import storage
import pandas as pd
import numpy as np

In [6]:
# Name of the GCS bucket and the gs:// prefix built from it.
bucket_name = 'ppp-loan-analysis'
gs_path = 'gs://' + bucket_name + '/'

# Folder prefixes appended to gs_path by the cells below (each ends with '/').
raw_folder, clean_folder, final_folder = 'raw-data/', 'clean-data/', 'final-data/'
dropped_folder, gdp_folder = 'dropped-data/', 'gdp-data/'

In [7]:
# Create a Cloud Storage client; no arguments are passed to the constructor.
storage_client = storage.Client() 
# Look up the bucket named by `bucket_name` through that client.
bucket = storage_client.get_bucket(bucket_name)

# Raw GDP Data

In [17]:
# Full gs:// location of the raw GDP CSV: <bucket>/raw-data/gdp-data/<file>.
raw_gdp_path = f'{gs_path}{raw_folder}{gdp_folder}CAGDP1__ALL_AREAS_2001_2023.csv'
# Load it with pandas; the first line of the file supplies the column names.
raw_gdp_df = pd.read_csv(raw_gdp_path, header=0)

In [18]:
# Print the raw GDP dataframe's schema: column names, non-null counts and dtypes
raw_gdp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9538 entries, 0 to 9537
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   GeoFIPS                 9538 non-null   object 
 1   GeoName                 9534 non-null   object 
 2   Region                  9534 non-null   object 
 3   TableName               9534 non-null   object 
 4   LineCode                9534 non-null   float64
 5   IndustryClassification  9534 non-null   object 
 6   Description             9534 non-null   object 
 7   Unit                    9534 non-null   object 
 8   2001                    9534 non-null   object 
 9   2002                    9534 non-null   object 
 10  2003                    9534 non-null   object 
 11  2004                    9534 non-null   object 
 12  2005                    9534 non-null   object 
 13  2006                    9534 non-null   object 
 14  2007                    9534 non-null   

In [19]:
# Report how many rows the raw GDP dataframe holds
total_rows = raw_gdp_df.shape[0]
print(f"Total number of rows: {total_rows}")

Total number of rows: 9538


In [20]:
# Count the distinct GeoFIPS values in the raw dataframe (nunique skips missing values)
unique_geofips = raw_gdp_df.loc[:, 'GeoFIPS'].nunique()
print(f"Number of unique GeoFIPS: {unique_geofips}")

Number of unique GeoFIPS: 3182


In [None]:
# Preview the first 10 GeoFIPS values of the raw dataframe.
# NOTE(review): in the output recorded for this cell the values carry literal
# double quotes (e.g. "01001") — confirm that is expected for the raw file.
print("First 10 GeoFIPS values:")
print(raw_gdp_df['GeoFIPS'].head(10))

First 10 GeoFIPS values:
0     "00000"
1     "00000"
2     "00000"
3     "01000"
4     "01000"
5     "01000"
6     "01001"
7     "01001"
8     "01001"
9     "01003"
Name: GeoFIPS, dtype: object


# Clean GDP Data

In [None]:
# Glob matching every CSV under <bucket>/clean-data/gdp-data/.
clean_gdp_path = f'{gs_path}{clean_folder}{gdp_folder}*.csv'
# NOTE(review): `spark` is not created in any cell shown here — presumably the
# kernel/session provides it; confirm before running on a fresh kernel.
# NOTE(review): with inferSchema the geofips column is typed as integer (see the
# printSchema output), so leading zeros such as 01001 are not kept — confirm.
clean_gdp_df = spark.read.options(header=True, inferSchema=True).csv(clean_gdp_path)

In [13]:
# Print the column names and inferred types of the cleaned GDP dataframe
clean_gdp_df.printSchema()

root
 |-- facts_gdp_id: integer (nullable = true)
 |-- geofips: integer (nullable = true)
 |-- geo_name: string (nullable = true)
 |-- region: string (nullable = true)
 |-- year_id: integer (nullable = true)
 |-- chain_type_index_gdp: double (nullable = true)
 |-- current_dollar_gdp: double (nullable = true)
 |-- real_gdp: double (nullable = true)



In [14]:
# Count the rows of the cleaned GDP dataframe and print the total
clean_total_rows = clean_gdp_df.count()
print(f"Total number of rows in cleaned dataframe: {clean_total_rows}")

Total number of rows in cleaned dataframe: 19032


In [30]:
# Count the distinct geofips values in the cleaned dataframe
clean_unique_geo_fips = (
    clean_gdp_df
    .select("geofips")
    .distinct()
    .count()
)
print(f"Number of unique GeoFIPS in cleaned dataframe: {clean_unique_geo_fips}")

Number of unique GeoFIPS in cleaned dataframe: 3172


# Dropped Data

In [18]:
# Load the rows that were dropped during cleaning: every CSV under
# <bucket>/dropped-data/gdp-data/, with a header row and inferred column types.
# NOTE(review): `spark` is not created in any cell shown here — presumably the
# kernel/session provides it; confirm before running on a fresh kernel.
dropped_gdp_path = f'{gs_path}{dropped_folder}{gdp_folder}*.csv'
dropped_gdp_df = spark.read.options(header=True, inferSchema=True).csv(dropped_gdp_path)

In [19]:
# Count the rows of the dropped dataframe and print the total
dropped_total_rows = dropped_gdp_df.count()
print(f"Total number of rows in dropped dataframe: {dropped_total_rows}")

Total number of rows in dropped dataframe: 18


In [22]:
# Count the distinct GeoFIPS values among the dropped rows
dropped_unique_geo_fips = (
    dropped_gdp_df
    .select("GeoFIPS")
    .distinct()
    .count()
)
print(f"Number of unique GeoFIPS in dropped dataframe: {dropped_unique_geo_fips}")

Number of unique GeoFIPS in dropped dataframe: 6


In [20]:
# Show the rows in the dropped dataframe.
# show() with no arguments prints at most 20 rows and truncates long cell values;
# the recorded row count for this frame is 18, so every row is displayed here.
dropped_gdp_df.show()

+------------+--------------------+------+--------------------+-------+-------+-------+-------+-------+-------+-----------+
|     GeoFIPS|             GeoName|Region|         Description|   2017|   2018|   2019|   2020|   2021|   2022|drop_reason|
+------------+--------------------+------+--------------------+-------+-------+-------+-------+-------+-------+-----------+
|" ""02063"""|Chugach Census Ar...|     8|Real GDP (thousan...|   (NA)|   (NA)|   (NA)|1268010|1330368|1406228|    2017_na|
|" ""02063"""|Chugach Census Ar...|     8|Chain-type quanti...|   (NA)|   (NA)|   (NA)|   (NM)|   (NM)|   (NM)|    2017_na|
|" ""02063"""|Chugach Census Ar...|     8|Current-dollar GD...|   (NA)|   (NA)|   (NA)|1291960|1426984|1635885|    2017_na|
|" ""02066"""|Copper River Cens...|     8|Real GDP (thousan...|   (NA)|   (NA)|   (NA)| 106221| 124946| 128311|    2017_na|
|" ""02066"""|Copper River Cens...|     8|Chain-type quanti...|   (NA)|   (NA)|   (NA)|   (NM)|   (NM)|   (NM)|    2017_na|
|" ""020