In [1]:
import pyspark
from pyspark.sql.session import SparkSession

In [2]:
# Create (or reuse) a SparkSession running in local mode with the
# "local[*]" master, registered under the app name "parquet_example".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("parquet_example")
    .getOrCreate()
)

In [3]:
# Bare expression: the notebook renders the SparkContext behind `spark`
# as this cell's output.
spark.sparkContext

In [4]:
# Load the vendor-payments CSV. The first row is treated as the header and
# Spark infers each column's type from the data.
df = spark.read.csv(
    'Vermont_Vendor_Payments.csv',
    header='true',
    inferSchema=True,
)

In [5]:
from pyspark.sql.functions import col, to_date
from pyspark.sql.types import DoubleType

In [6]:
# Normalise the raw frame before writing it out:
#   * cast Amount to a double,
#   * parse "Quarter Ending" into a real date column,
#   * replace the spaces in four column names with underscores
#     (presumably so the names are safe for the Parquet writer -- TODO confirm).
#
# NOTE: this cell reassigns `df` and renames columns it reads, so running it
# a second time without re-running the CSV load will fail on the missing
# "Quarter Ending" column.
df = (
    df
    .withColumn("Amount", col("Amount").cast(DoubleType()))
    # Canonical four-letter year pattern; source values are expected to look
    # like 09/30/2009.
    .withColumn("Quarter Ending", to_date(col("Quarter Ending"), "MM/dd/yyyy"))
    .withColumnRenamed("Quarter Ending", "Quarter_Ending")
    .withColumnRenamed("Vendor Number", "Vendor_Number")
    .withColumnRenamed("DeptID Description", "DeptID_Description")
    .withColumnRenamed("Fund Description", "Fund_Description")
)

In [7]:
# Print the column names and types of `df` to check the cast, the date
# parsing and the renames applied above.
df.printSchema()

root
 |-- Quarter_Ending: date (nullable = true)
 |-- Department: string (nullable = true)
 |-- UnitNo: integer (nullable = true)
 |-- Vendor_Number: string (nullable = true)
 |-- Vendor: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- DeptID_Description: string (nullable = true)
 |-- DeptID: string (nullable = true)
 |-- Amount: double (nullable = true)
 |-- Account: string (nullable = true)
 |-- AcctNo: string (nullable = true)
 |-- Fund_Description: string (nullable = true)
 |-- Fund: string (nullable = true)



In [8]:
# Report the dimensions of the Spark frame: rows via count(), columns via
# the length of the column-name list.
row_total = df.count()
column_total = len(df.columns)
print('The total number of rows is:', row_total, '\nThe total number of columns is:', column_total)

The total number of rows is: 1680170 
The total number of columns is: 14


In [9]:
# Collapse the frame to a single partition and write it as Parquet,
# replacing any existing output at the same path.
(
    df.repartition(1)
    .write
    .mode('overwrite')
    .parquet('Vermont_Vendor_Payments.parquet')
)

In [10]:
# Read the Parquet output back into a new Spark DataFrame.
df_parquet = spark.read.parquet(
    'Vermont_Vendor_Payments.parquet'
)

In [11]:
# Print the schema of the frame read back from Parquet so it can be compared
# with the schema of `df` printed earlier.
df_parquet.printSchema()

root
 |-- Quarter_Ending: date (nullable = true)
 |-- Department: string (nullable = true)
 |-- UnitNo: integer (nullable = true)
 |-- Vendor_Number: string (nullable = true)
 |-- Vendor: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- DeptID_Description: string (nullable = true)
 |-- DeptID: string (nullable = true)
 |-- Amount: double (nullable = true)
 |-- Account: string (nullable = true)
 |-- AcctNo: string (nullable = true)
 |-- Fund_Description: string (nullable = true)
 |-- Fund: string (nullable = true)



In [12]:
# Report the dimensions of the frame read back from Parquet.
parquet_row_total = df_parquet.count()
parquet_column_total = len(df_parquet.columns)
print('The total number of rows is:', parquet_row_total, '\nThe total number of columns is:', parquet_column_total)

The total number of rows is: 1680170 
The total number of columns is: 14


In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Load the same Parquet path into a pandas DataFrame, using pyarrow as the
# reader engine.
parquet_pd = pd.read_parquet(
    'Vermont_Vendor_Payments.parquet',
    engine='pyarrow',
)

In [3]:
# Show the first rows of the pandas frame as this cell's output.
parquet_pd.head()

Unnamed: 0,Quarter_Ending,Department,UnitNo,Vendor_Number,Vendor,City,State,DeptID_Description,DeptID,Amount,Account,AcctNo,Fund_Description,Fund
0,2009-09-30,Environmental Conservation,6140.0,276016,1st Run Computer Services Inc,,NY,WQD - Waterbury,6140040206,930.0,Rep&Maint-Info Tech Hardware,513000,Environmental Permit Fund,21295
1,2009-09-30,Environmental Conservation,6140.0,276016,1st Run Computer Services Inc,,NY,Water Supply Division - Wtby,6140040406,930.0,Rep&Maint-Info Tech Hardware,513000,Environmental Permit Fund,21295
2,2009-09-30,Vermont Veterans' Home,3300.0,284121,210 Innovations LLC,,CT,MAINTENANCE,3300010300,24.0,Freight & Express Mail,517300,Vermont Medicaid,21782
3,2009-09-30,Vermont Veterans' Home,3300.0,284121,210 Innovations LLC,,CT,MAINTENANCE,3300010300,420.0,Building Maintenance Supplies,520200,Vermont Medicaid,21782
4,2009-09-30,Corrections,3480.0,207719,21st Century Cellular,,PA,Brattleboro P&P,3480004630,270.8,Telecom-Wireless Phone Service,516659,General Fund,10000


In [4]:
# Report the dimensions of the pandas frame, taken from its (rows, columns)
# shape tuple.
pd_row_total, pd_column_total = parquet_pd.shape
print('The total number of rows is:', pd_row_total, '\nThe total number of columns is:', pd_column_total)

The total number of rows is: 1680170 
The total number of columns is: 14
