In [0]:
from pyspark.sql import *
from pyspark.sql.types import IntegerType, StringType, DoubleType, StructField, StructType

In [0]:
path = 'dbfs:/FileStore/sample_data/countries.csv'

countries = (
    spark.read
 .format('csv')
 .option('header','true')
 .option('inferschema', 'true') # Infer the schema initially when exploring the data, but define it in production pipelines
 .load(path)
)

In [0]:
display(countries)

COUNTRY_ID,NAME,NATIONALITY,COUNTRY_CODE,ISO_ALPHA2,CAPITAL,POPULATION,AREA_KM2,REGION_ID,SUB_REGION_ID,INTERMEDIATE_REGION_ID,ORGANIZATION_REGION_ID
1,Afghanistan,Afghan,AFG,AF,Kabul,38041754,652230.0,30,30.0,,30
2,Albania,Albanian,ALB,AL,Tirana,2880917,28748.0,20,70.0,,20
3,Algeria,Algerian,DZA,DZ,Algiers,43053054,2381741.0,50,40.0,,20
4,American Samoa,American Samoan,ASM,AS,Pago Pago,55312,199.0,40,20.0,,30
5,Andorra,Andorran,AND,AD,Andorra la Vella,77142,468.0,20,70.0,,20
6,Angola,Angolan,AGO,AO,Luanda,31825295,1246700.0,50,160.0,80.0,20
7,Anguilla,Anguillan,AIA,AI,The Valley,14869,91.0,10,10.0,60.0,40
8,Antarctica,Antarctic,ATA,AQ,McMurdo Station,1106,14200000.0,40,,,30
9,Antigua and Barbuda,Antiguan or Barbudan,ATG,AG,St. John's,97118,442.0,10,10.0,60.0,40
10,Argentina,Argentine,ARG,AR,Buenos Aires,44780677,2780400.0,10,10.0,40.0,40


In [0]:
countries.dtypes

[('COUNTRY_ID', 'int'),
 ('NAME', 'string'),
 ('NATIONALITY', 'string'),
 ('COUNTRY_CODE', 'string'),
 ('ISO_ALPHA2', 'string'),
 ('CAPITAL', 'string'),
 ('POPULATION', 'int'),
 ('AREA_KM2', 'double'),
 ('REGION_ID', 'int'),
 ('SUB_REGION_ID', 'int'),
 ('INTERMEDIATE_REGION_ID', 'int'),
 ('ORGANIZATION_REGION_ID', 'int')]

In [0]:

countries_schema = StructType([
        StructField('COUNTRY_ID', IntegerType(), False),
        StructField('NAME', StringType(), False),
        StructField('NATIONALITY', StringType(),False),
        StructField('COUNTRY_CODE', StringType(),False),
        StructField('ISO_ALPHA2', StringType(),False),
        StructField("CAPITAL", StringType(), False),
        StructField("POPULATION", DoubleType(), False),
        StructField("AREA_KM2", IntegerType(), False),
        StructField("REGION_ID", IntegerType(), True),
        StructField("SUB_REGION_ID", IntegerType(), True),
         StructField("INTERMEDIATE_REGION_ID", IntegerType(), True),
        StructField("ORGANIZATION_REGION_ID", IntegerType(), True) 
        ])

In [0]:
countries = (
spark.read
 .format('csv')
 .option('header','true')
 .schema(countries_schema)
 .load(path)
)

In [0]:
display(countries)

COUNTRY_ID,NAME,NATIONALITY,COUNTRY_CODE,ISO_ALPHA2,CAPITAL,POPULATION,AREA_KM2,REGION_ID,SUB_REGION_ID,INTERMEDIATE_REGION_ID,ORGANIZATION_REGION_ID
1,Afghanistan,Afghan,AFG,AF,Kabul,38041754.0,652230.0,30,30.0,,30
2,Albania,Albanian,ALB,AL,Tirana,2880917.0,28748.0,20,70.0,,20
3,Algeria,Algerian,DZA,DZ,Algiers,43053054.0,2381741.0,50,40.0,,20
4,American Samoa,American Samoan,ASM,AS,Pago Pago,55312.0,199.0,40,20.0,,30
5,Andorra,Andorran,AND,AD,Andorra la Vella,77142.0,468.0,20,70.0,,20
6,Angola,Angolan,AGO,AO,Luanda,31825295.0,1246700.0,50,160.0,80.0,20
7,Anguilla,Anguillan,AIA,AI,The Valley,14869.0,91.0,10,10.0,60.0,40
8,Antarctica,Antarctic,ATA,AQ,McMurdo Station,1106.0,14200000.0,40,,,30
9,Antigua and Barbuda,Antiguan or Barbudan,ATG,AG,St. John's,97118.0,442.0,10,10.0,60.0,40
10,Argentina,Argentine,ARG,AR,Buenos Aires,44780677.0,2780400.0,10,10.0,40.0,40


In [0]:
%sql
CREATE DATABASE IF NOT EXISTS locations_db;



In [0]:
spark.catalog.setCurrentDatabase("locations_db")

In [0]:
countries.write.mode('overwrite').saveAsTable('countries')

In [0]:
%sql
DESCRIBE EXTENDED locations_db.countries

col_name,data_type,comment
COUNTRY_ID,int,
NAME,string,
NATIONALITY,string,
COUNTRY_CODE,string,
ISO_ALPHA2,string,
CAPITAL,string,
POPULATION,double,
AREA_KM2,int,
REGION_ID,int,
SUB_REGION_ID,int,
