In [1]:
# Download Libraries
!pip3 install google-cloud-storage
!pip3 install pyarrow # Apache Arrow

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import json
import requests
import zipfile
import io
from io import StringIO
import pyarrow as pa
import pyarrow.parquet as pq
from google.cloud import storage
import os
import datetime
from google.cloud import bigquery
import unicodedata




In [3]:
#Gathering Data
#Extracting from datasource to view the head 

URL = 'https://data.cityofnewyork.us/api/views/vx8i-nprf/rows.csv?accessType=DOWNLOAD'

# Define the data type for column 17 because it contains mixed data types including NaN
dtype_mapping = {'Exam No': 'object', 'List No': 'object', 'List Title Code': 'object', 'Group No': 'object', 'List Agency Code': 'object', 'List Div Code': 'object', 'Veteran Credit': 'object'}


# Specify NaN values to be considered as missing values
na_values = ['NaN', '', 'NA', 'nan']

df_raw = pd.read_csv(URL, dtype=dtype_mapping, na_values=na_values, low_memory=False)

print(df_raw.info())
print('\n')
print(df_raw.shape)
print('\n')
df_raw.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491644 entries, 0 to 491643
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Exam No             491644 non-null  object 
 1   List No             491644 non-null  object 
 2   First Name          491637 non-null  object 
 3   MI                  272355 non-null  object 
 4   Last Name           491636 non-null  object 
 5   Adj. FA             491644 non-null  float64
 6   List Title Code     491644 non-null  object 
 7   List Title Desc     491644 non-null  object 
 8   Group No            491644 non-null  object 
 9   List Agency Code    491644 non-null  object 
 10  List Agency Desc    491644 non-null  object 
 11  List Div Code       0 non-null       object 
 12  Published Date      221816 non-null  object 
 13  Established Date    476875 non-null  object 
 14  Anniversary Date    476875 non-null  object 
 15  Extension Date      222939 non-nul

Unnamed: 0,Exam No,List No,First Name,MI,Last Name,Adj. FA,List Title Code,List Title Desc,Group No,List Agency Code,List Agency Desc,List Div Code,Published Date,Established Date,Anniversary Date,Extension Date,Veteran Credit,Parent Lgy Credit,Sibling Lgy Credit,Residency Credit
0,3528,1138.0,OMAR,,AGUILAR,72.37,70260,LIEUTENANT (POLICE),0,56,POLICE DEPARTMENT,,,12/20/2023,12/20/2027,,,,,
1,6601,24529.0,FRANK,A,GRIFFITH,83.75,91207,CONDUCTOR,0,0,OPEN COMPETITIVE,,,02/14/2018,02/14/2022,02/14/2025,,,,
2,4502,474.0,HENRY,,JABLONSKY,81.03,70392,FIRE MARSHAL (UNIFORMED),0,57,FIRE DEPARTMENT,,,08/06/2014,08/06/2018,08/06/2024,,,,
3,1101,268.0,ANUC,,VELA,80.0,20210,ASSISTANT CIVIL ENGINEER,0,0,OPEN COMPETITIVE,,02/24/2021,09/22/2021,09/22/2025,,,,,
4,2554,119.0,FRANCINE,,WILLIAMS,81.94,82011,SUPERVISOR OF HOUSING CARETAKERS,0,996,N.Y.C. HOUSING AUTHORITY,,,04/19/2023,04/19/2027,,,,,


In [6]:
#Storing Data
# Specify the path to your service account key file
service_account_key_file = '/Users/karmayangchentenzin/Downloads/Service_Key_HW.json'

# Optionally, load other configuration settings from the JSON file
with open(service_account_key_file, 'r') as f:
    config = json.load(f)

# Fetch data from the web (CSV file)
url = 'https://data.cityofnewyork.us/api/views/vx8i-nprf/rows.csv?accessType=DOWNLOAD'

# Read the contents of the CSV file
response = requests.get(url)

# Check if the request was successful
if response.status_code != 200:
    print("Failed to fetch CSV file.")
    exit()

# Read the CSV data into a DataFrame
try: 
    df = pd.read_csv(io.StringIO(response.text), dtype=dtype_mapping, na_values=na_values, low_memory=False)

    # Add timestamp column
    df['load_date'] = datetime.datetime.now() #Date the data is stored. A new column named "load_data" is created.

    # Replace periods with underscores in column names #This had to be done before the original data was loaded from the source to GC because when creating tables in BigQuery, an error message was shown saying that field names can not contain a period (.)
    df.columns = df.columns.str.replace('.', '_')

    # Convert DataFrame to PyArrow Table
    table = pa.Table.from_pandas(df)

    # Write PyArrow Table to Parquet format
    parquet_file_name = 'ny_civil_service_exam.parquet'  # name for the Parquet file
    pq.write_table(table, parquet_file_name)

    # Upload Parquet file to Google Cloud Storage
    bucket_name = 'cis4400_hw1_kyt'
    blob_name = parquet_file_name

    client = storage.Client.from_service_account_json(service_account_key_file)
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.upload_from_filename(parquet_file_name)

    print(f"Parquet file '{parquet_file_name}' uploaded to {bucket_name}/{blob_name} in Google Cloud Storage.")
except Exception as e:
    print(f"Error occurred: {e}")

Error occurred: ('Connection aborted.', timeout('The write operation timed out'))


In [14]:
# Display DataFrame with added timestamp column
print(df.head())

# Verify Parquet file contents
parquet_table = pq.read_table(parquet_file_name)
parquet_df = parquet_table.to_pandas()
print(parquet_df.head())

   Exam No  List No First Name   MI   Last Name  Adj_ FA  List Title Code  \
0     9618   1214.0     GERMAN    A        SOSA    86.67            91203   
1     7001  24935.0     DARREN    L       PAYNE    91.00            70310   
2     7001  11653.0    STEPHEN  NaN      MANFRE    96.00            70310   
3     2060  15669.0     ELIJAH    T  RICHARDSON    95.71            70112   
4     6601  16886.0        JAY    M        AMES    88.75            91207   

     List Title Desc  Group No  List Agency Code  ... List Div Code  \
0       BUS OPERATOR         0                 0  ...           NaN   
1        FIREFIGHTER         0                 0  ...           NaN   
2        FIREFIGHTER         0                 0  ...           NaN   
3  SANITATION WORKER         0                 0  ...           NaN   
4          CONDUCTOR         0                 0  ...           NaN   

   Published Date Established Date Anniversary Date Extension Date  \
0             NaN       09/08/2021      

In [22]:


# Initialize a client using the default credentials #Without initializing default credentials first, was unable to create any tables in BigQuery.
client = storage.Client(project="avian-silicon-418821")

# List buckets
buckets = list(client.list_buckets())

# Print bucket names
for bucket in buckets:
    print(bucket.name)  #The correct buckt name from GC is printed.


cis4400_hw1_kyt


In [32]:
# Read Parquet file into DataFrame
#parquet_file_path = 'gs://cis4400_hw1_kyt/ny_civil_service_exam.parquet'
#df = pd.read_parquet(parquet_file_path)

# Print DataFrame contents
#print("DataFrame Contents:")
#print(df)
#Just for testing

DataFrame Contents:
        Exam No  List No   First Name    MI   Last Name  Adj_ FA  \
0          9618   1214.0       GERMAN     A        SOSA    86.67   
1          7001  24935.0       DARREN     L       PAYNE    91.00   
2          7001  11653.0      STEPHEN  None      MANFRE    96.00   
3          2060  15669.0       ELIJAH     T  RICHARDSON    95.71   
4          6601  16886.0          JAY     M        AMES    88.75   
...         ...      ...          ...   ...         ...      ...   
491814      162    640.0        KARIM  None      NUGENT    94.00   
491815      320   3127.0      NICOLAS     C        PUMA    93.61   
491816     2027   4724.0       DILCIA  None       TAPIA    82.44   
491817     2095    291.0  MD ABUBAKAR  None      SIDDIK    70.00   
491818     2545     53.0         LUIS     M       GOMEZ    70.60   

        List Title Code                                 List Title Desc  \
0                 91203                                    BUS OPERATOR   
1            