In [0]:
from pyspark.sql import *
from pyspark.sql.types import IntegerType, StringType, DoubleType, StructField, StructType

In [0]:
path = 'dbfs:/FileStore/sample_data/countries.csv'

countries = (
    spark.read
 .format('csv')
 .option('header','true')
 .option('inferschema', 'true') # Infer the schema initially when exploring the data, but define it in production pipelines
 .load(path)
)

In [0]:
countries.dtypes

[('COUNTRY_ID', 'int'),
 ('NAME', 'string'),
 ('NATIONALITY', 'string'),
 ('COUNTRY_CODE', 'string'),
 ('ISO_ALPHA2', 'string'),
 ('CAPITAL', 'string'),
 ('POPULATION', 'int'),
 ('AREA_KM2', 'double'),
 ('REGION_ID', 'int'),
 ('SUB_REGION_ID', 'int'),
 ('INTERMEDIATE_REGION_ID', 'int'),
 ('ORGANIZATION_REGION_ID', 'int')]

In [0]:

countries_schema = StructType([
        StructField('COUNTRY_ID', IntegerType(), False),
        StructField('NAME', StringType(), False),
        StructField('NATIONALITY', StringType(),False),
        StructField('COUNTRY_CODE', StringType(),False),
        StructField('ISO_ALPHA2', StringType(),False),
        StructField("CAPITAL", StringType(), False),
        StructField("POPULATION", DoubleType(), False),
        StructField("AREA_KM2", IntegerType(), False),
        StructField("REGION_ID", IntegerType(), True),
        StructField("SUB_REGION_ID", IntegerType(), True),
         StructField("INTERMEDIATE_REGION_ID", IntegerType(), True),
        StructField("ORGANIZATION_REGION_ID", IntegerType(), True) 
        ])

In [0]:
countries = (
spark.read
 .format('csv')
 .option('header','true')
 .schema(countries_schema)
 .load(path)
)

In [0]:
display(countries.limit(5))

COUNTRY_ID,NAME,NATIONALITY,COUNTRY_CODE,ISO_ALPHA2,CAPITAL,POPULATION,AREA_KM2,REGION_ID,SUB_REGION_ID,INTERMEDIATE_REGION_ID,ORGANIZATION_REGION_ID
1,Afghanistan,Afghan,AFG,AF,Kabul,38041754.0,652230,30,30,,30
2,Albania,Albanian,ALB,AL,Tirana,2880917.0,28748,20,70,,20
3,Algeria,Algerian,DZA,DZ,Algiers,43053054.0,2381741,50,40,,20
4,American Samoa,American Samoan,ASM,AS,Pago Pago,55312.0,199,40,20,,30
5,Andorra,Andorran,AND,AD,Andorra la Vella,77142.0,468,20,70,,20


In [0]:
%sql
CREATE DATABASE IF NOT EXISTS locations_db;



In [0]:
spark.catalog.setCurrentDatabase("locations_db")

In [0]:
countries.write.mode('overwrite').saveAsTable('countries')

In [0]:
%sql
DESCRIBE EXTENDED locations_db.countries

col_name,data_type,comment
COUNTRY_ID,int,
NAME,string,
NATIONALITY,string,
COUNTRY_CODE,string,
ISO_ALPHA2,string,
CAPITAL,string,
POPULATION,double,
AREA_KM2,int,
REGION_ID,int,
SUB_REGION_ID,int,
