In [1]:
# Download Libraries
!pip3 install google-cloud-storage
!pip3 install pyarrow # Apache Arrow

Collecting google-cloud-storage
  Downloading google_cloud_storage-2.16.0-py2.py3-none-any.whl.metadata (6.1 kB)
Collecting google-auth<3.0dev,>=2.26.1 (from google-cloud-storage)
  Downloading google_auth-2.29.0-py2.py3-none-any.whl.metadata (4.7 kB)
Collecting google-api-core<3.0.0dev,>=2.15.0 (from google-cloud-storage)
  Downloading google_api_core-2.19.0-py3-none-any.whl.metadata (2.7 kB)
Collecting google-cloud-core<3.0dev,>=2.3.0 (from google-cloud-storage)
  Downloading google_cloud_core-2.4.1-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting google-resumable-media>=2.6.0 (from google-cloud-storage)
  Downloading google_resumable_media-2.7.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting requests<3.0.0dev,>=2.18.0 (from google-cloud-storage)
  Downloading requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
Collecting google-crc32c<2.0dev,>=1.0 (from google-cloud-storage)
  Downloading google-crc32c-1.5.0.tar.gz (12 kB)
  Installing build dependencies: started
  Installing bui

ERROR: Invalid requirement: '#'


In [5]:
# Import Libraries
import pandas as pd
import numpy as np
import json
import requests
import zipfile
import io
from io import StringIO
import pyarrow as pa
import pyarrow.parquet as pq
from google.cloud import storage
import os
import datetime
#from google.cloud import bigquery
import unicodedata


In [6]:
# Gathering Data
# Read the source CSV straight from the URL into a DataFrame so that its
# structure and first rows can be inspected.

URL = 'https://data.cityofnewyork.us/api/views/vx8i-nprf/rows.csv?accessType=DOWNLOAD'

# Force the columns below to be read as strings (object dtype) instead of letting
# pandas infer a type. The original note singled out column 17 as holding mixed
# data types including NaN; the mapping covers every column listed here.
dtype_mapping = {
    'Exam No': 'object',
    'List No': 'object',
    'List Title Code': 'object',
    'Group No': 'object',
    'List Agency Code': 'object',
    'List Div Code': 'object',
    'Veteran Credit': 'object',
}

# Additional strings that should be parsed as missing values
na_values = ['NaN', '', 'NA', 'nan']

df_raw = pd.read_csv(URL, dtype=dtype_mapping, na_values=na_values, low_memory=False)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df_raw.info()
print('\n')
print(df_raw.shape)
print('\n')
df_raw.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491645 entries, 0 to 491644
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Exam No             491645 non-null  object 
 1   List No             491645 non-null  object 
 2   First Name          491638 non-null  object 
 3   MI                  272377 non-null  object 
 4   Last Name           491637 non-null  object 
 5   Adj. FA             491645 non-null  float64
 6   List Title Code     491645 non-null  object 
 7   List Title Desc     491645 non-null  object 
 8   Group No            491645 non-null  object 
 9   List Agency Code    491645 non-null  object 
 10  List Agency Desc    491645 non-null  object 
 11  List Div Code       0 non-null       object 
 12  Published Date      221816 non-null  object 
 13  Established Date    476876 non-null  object 
 14  Anniversary Date    476876 non-null  object 
 15  Extension Date      222939 non-nul

Unnamed: 0,Exam No,List No,First Name,MI,Last Name,Adj. FA,List Title Code,List Title Desc,Group No,List Agency Code,List Agency Desc,List Div Code,Published Date,Established Date,Anniversary Date,Extension Date,Veteran Credit,Parent Lgy Credit,Sibling Lgy Credit,Residency Credit
0,7001,17897.0,MACDANIEL,,CHARLES,94.0,70310,FIREFIGHTER,0,0,OPEN COMPETITIVE,,06/13/2018,02/27/2019,02/27/2023,02/27/2025,,,,Residency Credit
1,3055,52.0,MARIUSZ,,PRZEZDZIECKI,85.85,92005,CARPENTER,0,0,OPEN COMPETITIVE,,04/10/2024,,,,,,,
2,1111,634.0,TERRENCE,,HAYES,70.0,13652,CERTIFIED IT ADMINISTRATOR (LAN/WAN),0,0,OPEN COMPETITIVE,,05/18/2022,04/19/2023,04/19/2027,,,,,
3,7001,21755.0,DANIEL,,MELENDEZ,93.0,70310,FIREFIGHTER,0,0,OPEN COMPETITIVE,,06/13/2018,02/27/2019,02/27/2023,02/27/2025,,,,Residency Credit
4,7001,29676.0,KAREEM,P,MYERS,89.0,70310,FIREFIGHTER,0,0,OPEN COMPETITIVE,,06/13/2018,02/27/2019,02/27/2023,02/27/2025,,,,


In [22]:
# Storing Data
# Download the CSV, stamp it with a load date, convert it to Parquet and upload
# the Parquet file to Google Cloud Storage.
# Relies on dtype_mapping and na_values defined in the data-gathering cell.

# Path to the service account key file (relative, so it is not tied to one machine)
service_account_key_file = 'service_acc_key.json'

# Optionally, load other configuration settings from the JSON file
with open(service_account_key_file, 'r') as f:
    config = json.load(f)

# Fetch data from the web (CSV file)
url = 'https://data.cityofnewyork.us/api/views/vx8i-nprf/rows.csv?accessType=DOWNLOAD'

# Download the CSV. The (connect, read) timeout keeps the cell from hanging
# indefinitely on a stalled connection.
response = requests.get(url, timeout=(10, 120))

# Stop with a visible error if the request was not successful. exit() is avoided
# because inside a notebook it ends the session instead of reporting the failure.
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch CSV file. (HTTP status {response.status_code})")

try:
    # Read the CSV data into a DataFrame
    df = pd.read_csv(io.StringIO(response.text), dtype=dtype_mapping, na_values=na_values, low_memory=False)

    # Add a timestamp column named "load_date": the local time at which the data was loaded
    df['load_date'] = datetime.datetime.now()

    # Replace periods with underscores in column names. This has to happen before
    # the data is loaded to Google Cloud, because creating tables in BigQuery
    # failed with an error saying field names cannot contain a period (.).
    # regex=False makes '.' a literal period on every pandas version; when the
    # pattern is treated as a regex, '.' matches any character and would turn
    # every column name into underscores.
    df.columns = df.columns.str.replace('.', '_', regex=False)

    # Convert DataFrame to PyArrow Table
    table = pa.Table.from_pandas(df)

    # Write PyArrow Table to Parquet format
    parquet_file_name = 'ny_civil_service_exam.parquet'  # name for the Parquet file
    pq.write_table(table, parquet_file_name)

    # Upload Parquet file to Google Cloud Storage
    bucket_name = 'cis4400_hw1_kyt'
    blob_name = parquet_file_name

    client = storage.Client.from_service_account_json(service_account_key_file)
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.upload_from_filename(parquet_file_name)

    print(f"Parquet file '{parquet_file_name}' uploaded to {bucket_name}/{blob_name} in Google Cloud Storage.")
except Exception as e:
    print(f"Error occurred: {e}")
    # Re-raise so the failure stops the run; otherwise later cells would silently
    # work with a missing or stale df / parquet_file_name.
    raise

Parquet file 'ny_civil_service_exam.parquet' uploaded to cis4400_hw1_kyt/ny_civil_service_exam.parquet in Google Cloud Storage.


In [23]:
# Display DataFrame with added timestamp column
print(df.head())

# Verify Parquet file contents: read the local Parquet file back into pandas
parquet_table = pq.read_table(parquet_file_name)
parquet_df = parquet_table.to_pandas()
print(parquet_df.head())

# Check the round trip instead of relying on eyeballing the two previews:
# the file read back must have the same dimensions and column names as df.
assert parquet_df.shape == df.shape, (
    f"Parquet round trip changed the shape: {df.shape} -> {parquet_df.shape}"
)
assert list(parquet_df.columns) == list(df.columns), (
    "Parquet round trip changed the column names"
)

  Exam No    List No First Name   MI     Last Name  Adj_ FA List Title Code  \
0    7001  17897.000  MACDANIEL  NaN       CHARLES    94.00           70310   
1    3055     52.000    MARIUSZ  NaN  PRZEZDZIECKI    85.85           92005   
2    1111    634.000   TERRENCE  NaN         HAYES    70.00           13652   
3    7001  21755.000     DANIEL  NaN      MELENDEZ    93.00           70310   
4    7001  29676.000     KAREEM    P         MYERS    89.00           70310   

                        List Title Desc Group No List Agency Code  ...  \
0                           FIREFIGHTER      000              000  ...   
1                             CARPENTER      000              000  ...   
2  CERTIFIED IT ADMINISTRATOR (LAN/WAN)      000              000  ...   
3                           FIREFIGHTER      000              000  ...   
4                           FIREFIGHTER      000              000  ...   

  List Div Code Published Date Established Date Anniversary Date  \
0           

In [22]:


# Create a Cloud Storage client for the project using the default credentials
# (no key file is passed here).
# Author's note: without initializing default credentials first, it was not
# possible to create any tables in BigQuery.
client = storage.Client(project="avian-silicon-418821")

# Collect every bucket visible to this client into a list
buckets = [*client.list_buckets()]

# Print one bucket name per line; the expected bucket name from Google Cloud is shown
for bucket in buckets:
    print(bucket.name)


cis4400_hw1_kyt


In [32]:
# Read Parquet file into DataFrame
#parquet_file_path = 'gs://cis4400_hw1_kyt/ny_civil_service_exam.parquet'
#df = pd.read_parquet(parquet_file_path)

# Print DataFrame contents
#print("DataFrame Contents:")
#print(df)
# Just for testing: the lines above are intentionally left commented out.

DataFrame Contents:
        Exam No  List No   First Name    MI   Last Name  Adj_ FA  \
0          9618   1214.0       GERMAN     A        SOSA    86.67   
1          7001  24935.0       DARREN     L       PAYNE    91.00   
2          7001  11653.0      STEPHEN  None      MANFRE    96.00   
3          2060  15669.0       ELIJAH     T  RICHARDSON    95.71   
4          6601  16886.0          JAY     M        AMES    88.75   
...         ...      ...          ...   ...         ...      ...   
491814      162    640.0        KARIM  None      NUGENT    94.00   
491815      320   3127.0      NICOLAS     C        PUMA    93.61   
491816     2027   4724.0       DILCIA  None       TAPIA    82.44   
491817     2095    291.0  MD ABUBAKAR  None      SIDDIK    70.00   
491818     2545     53.0         LUIS     M       GOMEZ    70.60   

        List Title Code                                 List Title Desc  \
0                 91203                                    BUS OPERATOR   
1            