NB: pandas cannot be used from a local IDE to load data through databricks-connect (pandas works only in a cloud notebook, within the Databricks workspace). P.S. The reason we load the data into pandas first, rather than directly into a Spark DataFrame, is the difference in data distribution. A DataFrame created from pandas is split into the default number of partitions, whereas Spark reading the file directly will load up to 128 MB into a single partition.
In our case, due to the cluster type selected, the default number of partitions = 4 (the number of cores).

In [0]:
import pandas as pd

In [0]:
# Load the CSV with pandas first and then convert it to a Spark DataFrame, so the
# data is distributed across the default number of partitions (4 here, because the
# cluster has 4 cores). Otherwise Spark will load up to 128 MB into a single partition.

insurance_pandas_df = pd.read_csv("/dbfs/FileStore/datasets/insurance.csv")
insurance_df = spark.createDataFrame(insurance_pandas_df)

In [0]:
# Check the number of partitions; expected to be 4 (the number of cores on the cluster)

insurance_df.rdd.getNumPartitions()

In [0]:
# show() prints the first 20 rows by default, selected from a single partition
insurance_df.show()

In [0]:
# Take a look at the job description for this cell: count() includes a shuffle and writes to disk
insurance_df.count()

In [0]:
# select() is a narrow transformation; display() then renders the selected columns.
# NOTE(review): the recorded output for this cell has more than 5 rows, so the
# argument 5 may not act as a row limit -- confirm, and use .limit(5) if a cap is intended.
insurance_df.select('age', 'sex', 'smoker').display(5)

age,sex,smoker
19,female,yes
18,male,no
28,male,no
33,male,no
32,male,no
31,female,no
46,female,no
37,female,no
37,male,no
60,female,no


In [0]:
# describe() produces summary statistics (count, mean, stddev, min, max).
# The operation goes from 4 partitions to 1 (writes to disk).
insurance_df.describe().display()

summary,age,sex,bmi,children,smoker,region,charges,insuranceclaim
count,1338.0,1338,1338.0,1338.0,1338,1338,1338.0,1338
mean,39.20702541106129,,30.66339686098655,1.0949177877429,,,13270.422265141257,
stddev,14.049960379216154,,6.098186911679014,1.205492739781914,,,12110.011236694005,
min,18.0,female,15.96,0.0,no,northeast,1121.8739,No
max,64.0,male,53.13,5.0,yes,southwest,63770.42801,Yes


In [0]:
# select() is narrow, but distinct() is a wide transformation: rows have to be
# shuffled between partitions so that duplicate values can be eliminated.
insurance_df.select('sex').distinct().show()

In [0]:
# Cross-tabulation is a wide transformation: it counts smoker values per sex.
(
    insurance_df
    .crosstab('sex', 'smoker')
    .select('sex_smoker', 'yes', 'no')
    .display()
)

sex_smoker,yes,no
male,159,517
female,115,547


In [0]:
# filter() behaves like a WHERE clause in SQL. It is a narrow transformation:
# every row is checked on its own, so no data has to move between partitions.
(
    insurance_df
    .select('age', 'sex', 'bmi', 'region', 'charges')
    .filter(insurance_df.region == 'southwest')
    .limit(10)
    .display()
)

age,sex,bmi,region,charges
19,female,27.9,southwest,16884.924
23,male,34.4,southwest,1826.843
19,male,24.6,southwest,1837.237
56,male,40.3,southwest,10602.385
30,male,35.3,southwest,36837.467
30,female,32.4,southwest,4149.736
31,male,36.3,southwest,38711.0
22,male,35.6,southwest,35585.576
19,female,28.6,southwest,4687.797
28,male,36.4,southwest,51194.55914


In [0]:
# where() is an alias for filter(): the two are interchangeable and return the same result.
(
    insurance_df
    .select('age', 'sex', 'bmi', 'region', 'charges')
    .where(insurance_df.age > 50)
    .limit(10)
    .display()
)

age,sex,bmi,region,charges
60,female,25.84,northwest,28923.13692
62,female,26.29,southeast,27808.7251
56,female,39.82,southeast,11090.7178
52,female,30.78,northeast,10797.3362
56,male,40.3,southwest,10602.385
60,female,36.005,northeast,13228.84695
59,female,27.72,southeast,14001.1338
63,female,23.085,northeast,14451.83515
55,female,32.775,northwest,12268.63225
63,male,28.31,northwest,13770.0979


In [0]:
# Combine conditions with & (each one wrapped in its own parentheses), then
# sort the matching rows by charges in ascending order.
(
    insurance_df
    .select('age', 'sex', 'bmi', 'region', 'charges')
    .where((insurance_df.age > 50) & (insurance_df.sex == 'female'))
    .orderBy('charges')
    .display()
)

age,sex,bmi,region,charges
51,female,20.6,southwest,9264.797
51,female,34.1,southeast,9283.562
52,female,31.2,southwest,9625.92
52,female,37.4,southwest,9634.538
51,female,18.05,northwest,9644.2525
51,female,21.56,southeast,9855.1314
51,female,25.8,southwest,9861.025
51,female,33.915,northeast,9866.30485
51,female,34.2,southwest,9872.701
51,female,40.66,northeast,9875.6804


In [0]:
# isin([...]) keeps the rows whose region is one of the listed values;
# withColumnRenamed() then exposes the 'sex' column as 'gender'.
(
    insurance_df
    .select('age', 'sex', 'bmi', 'region', 'charges')
    .where(insurance_df.region.isin(['southwest', 'southeast']))
    .withColumnRenamed('sex', 'gender')
    .orderBy('charges')
    .display()
)

age,gender,bmi,region,charges
18,male,23.21,southeast,1121.8739
18,male,30.14,southeast,1131.5066
18,male,33.33,southeast,1135.9407
18,male,33.66,southeast,1136.3994
18,male,34.1,southeast,1137.011
18,male,34.43,southeast,1137.4697
18,male,37.29,southeast,1141.4451
18,male,41.14,southeast,1146.7966
18,male,43.01,southeast,1149.3959
18,male,53.13,southeast,1163.4627
