# Big Data File Formats

## Parquet

### Convert To Parquet (Pandas)

In [4]:
import pandas as pd

# Read the equities CSV into a DataFrame.
df = pd.read_csv("files/equities.csv")

# Plain-text preview of the first rows (head() defaults to five).
print(df.head(n=5))

# Persist the same frame as a Parquet file alongside the CSV.
df.to_parquet(path="files/equities.parquet")

      symbol                              name  \
0  000002.SZ             China Vanke Co., Ltd.   
1  000004.SZ                               two   
2  000005.SZ     Shenzhen Fountain Corporation   
3  000006.SZ  Shenzhen Zhenye (Group) Co.,Ltd.   
4  000007.SZ     Shenzhen Quanxinhao Co., Ltd.   

                                             summary currency  \
0  China Vanke Co., Ltd., together with its subsi...      CNY   
1  two is a blank check company. The company was ...      CNY   
2  Shenzhen Fountain Corporation engages in real ...      CNY   
3  Shenzhen Zhenye (Group) Co.,Ltd. engages in th...      CNY   
4  Shenzhen Quanxinhao Co., Ltd. owns and operate...      CNY   

                   sector          industry_group  \
0             Real Estate             Real Estate   
1              Financials  Diversified Financials   
2             Real Estate             Real Estate   
3             Real Estate             Real Estate   
4  Consumer Discretionary       Consumer Se

### Convert to Parquet (PyArrow)

In [6]:
import pyarrow.csv as pv
import pyarrow.parquet as pq

# CSV input consumed by the Arrow CSV reader below.
file_location = "files/equities.csv"

# Parse the CSV into an Arrow table, then write that table out as Parquet.
table = pv.read_csv(file_location)
pq.write_table(table, where="files/equities-2.parquet")


### Read Metadata

In [7]:
from pyarrow import parquet as pq

# Open the Parquet file written by the PyArrow conversion cell.
file_info = pq.ParquetFile("files/equities-2.parquet")

# Emit a banner, the file-level metadata, a blank separator line and the
# schema -- one item per line, exactly as four consecutive print() calls would.
print(
    "--- METADATA ---",
    file_info.metadata,
    "",
    file_info.schema,
    sep="\n",
)

--- METADATA ---
<pyarrow._parquet.FileMetaData object at 0x10db3ae80>
  created_by: parquet-cpp-arrow version 14.0.1
  num_columns: 20
  num_rows: 158647
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 4318

<pyarrow._parquet.ParquetSchema object at 0x12647f240>
required group field_id=-1 schema {
  optional binary field_id=-1 symbol (String);
  optional binary field_id=-1 name (String);
  optional binary field_id=-1 summary (String);
  optional binary field_id=-1 currency (String);
  optional binary field_id=-1 sector (String);
  optional binary field_id=-1 industry_group (String);
  optional binary field_id=-1 industry (String);
  optional binary field_id=-1 exchange (String);
  optional binary field_id=-1 market (String);
  optional binary field_id=-1 country (String);
  optional binary field_id=-1 state (String);
  optional binary field_id=-1 city (String);
  optional binary field_id=-1 zipcode (String);
  optional binary field_id=-1 website (String);
  optional binar

### Parking Violations - Create Parquet

In [19]:
import pandas as pd

# Force these columns to be read as strings. The integer keys are presumably
# positional indexes of mixed-type columns -- TODO confirm against the CSV.
dtype_dict = dict.fromkeys((18, 29, 38, 40, 41, 42), str)

# Load the CSV, parsing "Issue Date" into datetimes while reading.
df = pd.read_csv(
    "files/Parking_Violations_Issued_-_Fiscal_Year_2015.csv",
    parse_dates=["Issue Date"],
    dtype=dtype_dict,
)

# Plain-text preview of the first rows, followed by the per-column dtypes.
print(df.head(), df.dtypes, sep="\n")

# Write the parsed frame out as Parquet.
df.to_parquet("files/Parking_Violations_Issued_-_Fiscal_Year_2015.parquet")

# Last expression of the cell: rendered by the notebook as its output.
df

   Summons Number  Plate ID Registration State Plate Type Issue Date  \
0      8002531292   EPC5238                 NY        PAS 2014-10-01   
1      8015318440    5298MD                 NY        COM 2015-03-06   
2      7611181981   FYW2775                 NY        PAS 2014-07-28   
3      7445908067   GWE1987                 NY        PAS 2015-04-13   
4      7037692864  T671196C                 NY        PAS 2015-05-19   

   Violation Code Vehicle Body Type Vehicle Make Issuing Agency  Street Code1  \
0              21              SUBN        CHEVR              T         20390   
1              14               VAN        FRUEH              T         27790   
2              46              SUBN        SUBAR              T          8130   
3              19              4DSD        LEXUS              T         59990   
4              19              4DSD        CHRYS              T         36090   

   ...  Hydrant Violation  Double Parking Violation Latitude  Longitude  \
0  ..

Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,...,Hydrant Violation,Double Parking Violation,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
0,8002531292,EPC5238,NY,PAS,2014-10-01,21,SUBN,CHEVR,T,20390,...,,,,,,,,,,
1,8015318440,5298MD,NY,COM,2015-03-06,14,VAN,FRUEH,T,27790,...,,,,,,,,,,
2,7611181981,FYW2775,NY,PAS,2014-07-28,46,SUBN,SUBAR,T,8130,...,,,,,,,,,,
3,7445908067,GWE1987,NY,PAS,2015-04-13,19,4DSD,LEXUS,T,59990,...,,,,,,,,,,
4,7037692864,T671196C,NY,PAS,2015-05-19,19,4DSD,CHRYS,T,36090,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11809228,8035170326,VCT8217,VA,PAS,2015-05-19,14,4DSD,HONDA,T,72830,...,,,,,,,,,,
11809229,8005276357,GHK4864,NY,PAS,2014-10-20,31,SUBN,ME/BE,T,34210,...,,,,,,,,,,
11809230,7155525273,T616724C,NY,OMT,2014-09-25,31,4DSD,LINCO,T,10610,...,,,,,,,,,,
11809231,8044311063,97720MD,NY,COM,2015-06-03,51,DUMP,PETER,T,10540,...,,,,,,,,,,


### Parking Violations - Query Parquet

In [28]:
import pandas as pd
from pyarrow import parquet as pq

file_location = "files/Parking_Violations_Issued_-_Fiscal_Year_2015.parquet"
file_info = pq.ParquetFile(file_location)

# Show the file-level metadata and the column schema before querying.
print(file_info.metadata)
print(file_info.schema)

# Load only the two columns the aggregation needs. No row filter is passed to
# the reader here, so the date restriction is applied in pandas afterwards.
df = pd.read_parquet(file_location, columns=["Issue Date", "Vehicle Make"])

print(df.head())

### GET TOTAL VIOLATIONS BY VEHICLE MAKE FOR MONTH OF JANUARY ###

# Minimum number of tickets a make must have to be kept in the result.
# Named so the threshold and the comment describing it cannot drift apart.
MIN_TICKET_COUNT = 1000

# Filter for January 2015 (half-open interval: Jan 1 inclusive, Feb 1 exclusive)
jan_2015 = df[(df["Issue Date"] >= "2015-01-01") & (df["Issue Date"] < "2015-02-01")]

# Group by 'Vehicle Make' and count tickets
ticket_counts = jan_2015.groupby("Vehicle Make").size()

# Drop makes with fewer than MIN_TICKET_COUNT tickets (assuming this is bogus data)
ticket_counts = ticket_counts[ticket_counts >= MIN_TICKET_COUNT]

# The result is a Series with 'Vehicle Make' as the index and ticket counts as the values
ticket_counts


<pyarrow._parquet.FileMetaData object at 0x1263057b0>
  created_by: parquet-cpp-arrow version 14.0.1
  num_columns: 51
  num_rows: 11809233
  num_row_groups: 12
  format_version: 2.6
  serialized_size: 84344
<pyarrow._parquet.ParquetSchema object at 0x3e42e3b40>
required group field_id=-1 schema {
  optional int64 field_id=-1 Summons Number;
  optional binary field_id=-1 Plate ID (String);
  optional binary field_id=-1 Registration State (String);
  optional binary field_id=-1 Plate Type (String);
  optional int64 field_id=-1 Issue Date (Timestamp(isAdjustedToUTC=false, timeUnit=nanoseconds, is_from_converted_type=false, force_set_converted_type=false));
  optional int64 field_id=-1 Violation Code;
  optional binary field_id=-1 Vehicle Body Type (String);
  optional binary field_id=-1 Vehicle Make (String);
  optional binary field_id=-1 Issuing Agency (String);
  optional int64 field_id=-1 Street Code1;
  optional int64 field_id=-1 Street Code2;
  optional int64 field_id=-1 Street Code

Vehicle Make
ACURA     26068
AUDI      16892
BMW       45932
BUICK      5620
CADIL     12310
CHEVR    103546
CHRYS     25271
DODGE     47542
FIAT       1636
FORD     176314
FREIG      1523
FRUEH     48399
GMC       40340
HINO      11135
HONDA    130974
HYUND     32168
INFIN     20387
INTER     40393
ISUZU     16045
JAGUA      2186
JEEP      36725
KENWO      3991
KIA       11859
LEXUS     30743
LINCO     17828
MACK       3799
MAZDA     14098
ME/BE     47436
MERCU     10533
MI/F       1239
MINI       3025
MITSU     16302
NISSA    107729
NS/OT     19613
OLDSM      1086
PETER      4486
PONTI      3719
PORSC      2876
ROVER      8316
SAAB       1753
SATUR      3478
SMART      2038
STERL      1391
SUBAR     16251
SUZUK      1175
TOYOT    145664
UD         1889
UTIL       1361
VOLKS     25371
VOLVO     10697
WORK       1209
WORKH      6243
dtype: int64

### Parking Violations - Query Parquet (PyArrow)

In [29]:
from datetime import datetime

import pandas as pd

file_location = "files/Parking_Violations_Issued_-_Fiscal_Year_2015.parquet"

# Half-open window [start_dt, end_dt) covering January 2015.
start_dt = datetime(2015, 1, 1)
end_dt = datetime(2015, 2, 1)

# Minimum number of tickets a make must have to be kept in the result.
# Named so the threshold and the comment describing it cannot drift apart.
MIN_TICKET_COUNT = 1000

# Read just the two needed columns and hand the date predicate to the reader
# through `filters`, so no separate pandas-side date filter is needed below.
df = pd.read_parquet(
    file_location,
    engine="pyarrow",
    columns=["Issue Date", "Vehicle Make"],
    filters=[[("Issue Date", ">=", start_dt), ("Issue Date", "<", end_dt)]],
)

print(df.head())

### GET TOTAL VIOLATIONS BY VEHICLE MAKE FOR MONTH OF JANUARY ###

# Group by 'Vehicle Make' and count tickets
ticket_counts = df.groupby("Vehicle Make").size()

# Drop makes with fewer than MIN_TICKET_COUNT tickets (assuming this is bogus data)
ticket_counts = ticket_counts[ticket_counts >= MIN_TICKET_COUNT]

# The result is a Series with 'Vehicle Make' as the index and ticket counts as the values
ticket_counts


  Issue Date Vehicle Make
0 2015-01-20        LEXUS
1 2015-01-20        LEXUS
2 2015-01-21        TOYOT
3 2015-01-24        NISSA
4 2015-01-24        NISSA


Vehicle Make
ACURA     26068
AUDI      16892
BMW       45932
BUICK      5620
CADIL     12310
CHEVR    103546
CHRYS     25271
DODGE     47542
FIAT       1636
FORD     176314
FREIG      1523
FRUEH     48399
GMC       40340
HINO      11135
HONDA    130974
HYUND     32168
INFIN     20387
INTER     40393
ISUZU     16045
JAGUA      2186
JEEP      36725
KENWO      3991
KIA       11859
LEXUS     30743
LINCO     17828
MACK       3799
MAZDA     14098
ME/BE     47436
MERCU     10533
MI/F       1239
MINI       3025
MITSU     16302
NISSA    107729
NS/OT     19613
OLDSM      1086
PETER      4486
PONTI      3719
PORSC      2876
ROVER      8316
SAAB       1753
SATUR      3478
SMART      2038
STERL      1391
SUBAR     16251
SUZUK      1175
TOYOT    145664
UD         1889
UTIL       1361
VOLKS     25371
VOLVO     10697
WORK       1209
WORKH      6243
dtype: int64

## Apache ORC

### Convert to ORC Using Pandas

In [16]:
import pandas as pd

# Read the equities CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="files/equities.csv")

# Plain-text preview of the first rows (head() defaults to five).
print(df.head(n=5))

# Persist the same frame as an ORC file alongside the CSV.
df.to_orc(path="files/equities.orc")

# Last expression of the cell: rendered by the notebook as its output.
df


      symbol                              name  \
0  000002.SZ             China Vanke Co., Ltd.   
1  000004.SZ                               two   
2  000005.SZ     Shenzhen Fountain Corporation   
3  000006.SZ  Shenzhen Zhenye (Group) Co.,Ltd.   
4  000007.SZ     Shenzhen Quanxinhao Co., Ltd.   

                                             summary currency  \
0  China Vanke Co., Ltd., together with its subsi...      CNY   
1  two is a blank check company. The company was ...      CNY   
2  Shenzhen Fountain Corporation engages in real ...      CNY   
3  Shenzhen Zhenye (Group) Co.,Ltd. engages in th...      CNY   
4  Shenzhen Quanxinhao Co., Ltd. owns and operate...      CNY   

                   sector          industry_group  \
0             Real Estate             Real Estate   
1              Financials  Diversified Financials   
2             Real Estate             Real Estate   
3             Real Estate             Real Estate   
4  Consumer Discretionary       Consumer Se

Unnamed: 0,symbol,name,summary,currency,sector,industry_group,industry,exchange,market,country,state,city,zipcode,website,market_cap,isin,cusip,figi,composite_figi,shareclass_figi
0,000002.SZ,"China Vanke Co., Ltd.","China Vanke Co., Ltd., together with its subsi...",CNY,Real Estate,Real Estate,Real Estate Management & Development,SHZ,Shenzhen Stock Exchange,China,,Shenzhen,518083,http://www.vanke.com,Large Cap,CNE100001SR9,,,,
1,000004.SZ,two,two is a blank check company. The company was ...,CNY,Financials,Diversified Financials,Diversified Financial Services,SHZ,Shenzhen Stock Exchange,United States,CA,San Francisco,94129,http://www.sz000004.cn,Micro Cap,,,,,
2,000005.SZ,Shenzhen Fountain Corporation,Shenzhen Fountain Corporation engages in real ...,CNY,Real Estate,Real Estate,Real Estate Management & Development,SHZ,Shenzhen Stock Exchange,China,,Shenzhen,518001,http://www.fountain.com.cn,Small Cap,CNE0000001L7,,,,
3,000006.SZ,"Shenzhen Zhenye (Group) Co.,Ltd.","Shenzhen Zhenye (Group) Co.,Ltd. engages in th...",CNY,Real Estate,Real Estate,Real Estate Management & Development,SHZ,Shenzhen Stock Exchange,China,,Shenzhen,518008,http://www.zhenye.com,Small Cap,CNE000000164,,,,
4,000007.SZ,"Shenzhen Quanxinhao Co., Ltd.","Shenzhen Quanxinhao Co., Ltd. owns and operate...",CNY,Consumer Discretionary,Consumer Services,"Hotels, Restaurants & Leisure",SHZ,Shenzhen Stock Exchange,China,,Shenzhen,518031,http://www.sz000007.com,Micro Cap,CNE0000000P0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158642,ZZMS.SG,Commerzbank AG,Commerzbank AG provides banking and capital ma...,EUR,Financials,Banks,Banks,STU,Stuttgart Stock Exchange,Germany,,Frankfurt am Main,60311,http://www.commerzbank.com,Mid Cap,,,,,
158643,ZZVA.BE,,,EUR,,,,BER,Berlin Stock Exchange,,,,,,,,,,,
158644,ZZVA.DU,,,EUR,,,,DUS,Dusseldorf Stock Exchange,,,,,,,,,,,
158645,ZZZ.TO,Sleep Country Canada Holdings Inc.,"Sleep Country Canada Holdings Inc., together w...",CAD,Consumer Discretionary,Retailing,Specialty Retail,TOR,TSX Toronto Exchange,Canada,ON,Brampton,L6T 4N8,http://www.sleepcountry.ca,Small Cap,,,,,


### Read ORC Using Pandas

In [30]:
import pandas as pd

# Read back only the four listed columns from the ORC file.
df = pd.read_orc(
    path="files/equities.orc",
    columns=["symbol", "name", "currency", "exchange"],
)

# Last expression of the cell: rendered by the notebook as its output.
df

Unnamed: 0,symbol,name,currency,exchange
0,000002.SZ,"China Vanke Co., Ltd.",CNY,SHZ
1,000004.SZ,two,CNY,SHZ
2,000005.SZ,Shenzhen Fountain Corporation,CNY,SHZ
3,000006.SZ,"Shenzhen Zhenye (Group) Co.,Ltd.",CNY,SHZ
4,000007.SZ,"Shenzhen Quanxinhao Co., Ltd.",CNY,SHZ
...,...,...,...,...
158642,ZZMS.SG,Commerzbank AG,EUR,STU
158643,ZZVA.BE,,EUR,BER
158644,ZZVA.DU,,EUR,DUS
158645,ZZZ.TO,Sleep Country Canada Holdings Inc.,CAD,TOR


## Avro

### Convert to Avro

In [23]:
import json

from fastavro import writer, parse_schema

# Load the Avro schema definition (a JSON document) and parse it for fastavro.
# Text files are opened explicitly as UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open("avro/countries-schema.avsc", "r", encoding="utf-8") as schema_file:
    countries_schema = json.load(schema_file)
parsed_schema = parse_schema(countries_schema)

# Load the country records that will be serialized.
with open("files/countries-worldbank.json", "r", encoding="utf-8") as countries_file:
    countries_data = json.load(countries_file)

# Write the records to an Avro file using the parsed schema (binary mode).
with open("files/countries-worldbank.avro", "wb") as countries_avro:
    writer(countries_avro, parsed_schema, countries_data)



### Read Avro File

In [24]:
from fastavro import reader

# Open the Avro file in binary mode and print every record it yields.
with open("files/countries-worldbank.avro", mode="rb") as f:
    records = reader(f)
    for country in records:
        print(country)


{'id': 'ABW', 'iso2Code': 'AW', 'name': 'Aruba', 'region': {'id': 'LCN', 'iso2code': 'ZJ', 'value': 'Latin America & Caribbean '}, 'adminregion': {'id': '', 'iso2code': '', 'value': ''}, 'incomeLevel': {'id': 'HIC', 'iso2code': 'XD', 'value': 'High income'}, 'lendingType': {'id': 'LNX', 'iso2code': 'XX', 'value': 'Not classified'}, 'capitalCity': 'Oranjestad', 'longitude': '-70.0167', 'latitude': '12.5167'}
{'id': 'AFE', 'iso2Code': 'ZH', 'name': 'Africa Eastern and Southern', 'region': {'id': 'NA', 'iso2code': 'NA', 'value': 'Aggregates'}, 'adminregion': {'id': '', 'iso2code': '', 'value': ''}, 'incomeLevel': {'id': 'NA', 'iso2code': 'NA', 'value': 'Aggregates'}, 'lendingType': {'id': '', 'iso2code': '', 'value': 'Aggregates'}, 'capitalCity': '', 'longitude': '', 'latitude': ''}
{'id': 'AFG', 'iso2Code': 'AF', 'name': 'Afghanistan', 'region': {'id': 'SAS', 'iso2code': '8S', 'value': 'South Asia'}, 'adminregion': {'id': 'SAS', 'iso2code': '8S', 'value': 'South Asia'}, 'incomeLevel': {'

### Validate Avro

In [32]:
import json

from fastavro.validation import validate_many

# Load the schema and the records to validate. Text files are opened explicitly
# as UTF-8 so decoding does not depend on the platform's default locale encoding.
with open("avro/countries-schema.avsc", "r", encoding="utf-8") as schema_file:
    countries_schema = json.load(schema_file)

with open("files/countries-worldbank.json", "r", encoding="utf-8") as countries_file:
    countries_data = json.load(countries_file)

# Validate the real records against the schema.
validate_many(countries_data, countries_schema)

# test schema with invalid data: these records carry only "id" plus keys
# ("b", "c") that do not appear among the fields seen in the real records
invalid_data = [{"id": "1", "b": 2, "c": 3}, {"id": "2", "b": 2, "c": 3}]

# Intentionally raises fastavro._validate_common.ValidationError -- the error
# message is the point of this demonstration, so it is not caught here.
validate_many(invalid_data, countries_schema)


ValidationError: [
  "Field(com.example.Country.iso2Code) is None expected string",
  "Field(com.example.Country.iso2Code) is None expected string"
]