In [46]:
from pyspark.sql import SparkSession

In [47]:
# building pyspark session
spark = SparkSession.builder.appName('Dataframe').getOrCreate()

In [48]:
spark

In [49]:
# reading a dataframe -- use show method to check the values
df_pyspark = spark.read.option('header','true').csv('data1.csv',inferSchema = True)

In [50]:
# getting the schema
df_pyspark.printSchema()

root
 |-- description: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- level: integer (nullable = true)
 |-- size: string (nullable = true)
 |-- line_code: string (nullable = true)
 |-- value: integer (nullable = true)



In [51]:
# another way
df_pyspark = spark.read.csv('data1.csv',header = True,inferSchema = True)
df_pyspark.show(5)

+--------------------+--------------------+-----+---------------+---------+-----+
|         description|            industry|level|           size|line_code|value|
+--------------------+--------------------+-----+---------------+---------+-----+
|Business main cus...|               total|    0| 6�19 employees| C0300.01|15639|
|Business main cus...|               total|    0|20�49 employees| C0300.01| 2943|
|Business main cus...|               total|    0|50�99 employees| C0300.01|  639|
|Business main cus...|               total|    0| 100+ employees| C0300.01|  555|
|Business main cus...|Agriculture, fore...|    1|          total| C0300.01|  348|
+--------------------+--------------------+-----+---------------+---------+-----+
only showing top 5 rows



In [52]:
type(df_pyspark) # Dataframe is a datastructure

pyspark.sql.dataframe.DataFrame

In [53]:
df_pyspark.columns

['description', 'industry', 'level', 'size', 'line_code', 'value']

In [54]:
df_pyspark.head(3)

[Row(description='Business main customer: individuals or households', industry='total', level=0, size='6�19 employees', line_code='C0300.01', value=15639),
 Row(description='Business main customer: individuals or households', industry='total', level=0, size='20�49 employees', line_code='C0300.01', value=2943),
 Row(description='Business main customer: individuals or households', industry='total', level=0, size='50�99 employees', line_code='C0300.01', value=639)]

In [55]:
# getting a selected column
df_pyspark.select('description')

DataFrame[description: string]

In [56]:
df_pyspark.select('description').show(5)

+--------------------+
|         description|
+--------------------+
|Business main cus...|
|Business main cus...|
|Business main cus...|
|Business main cus...|
|Business main cus...|
+--------------------+
only showing top 5 rows



In [57]:
# selecting multiple columns 
df_pyspark.select(['description','industry'])

DataFrame[description: string, industry: string]

In [58]:
df_pyspark.select(['description','industry']).show(5)

+--------------------+--------------------+
|         description|            industry|
+--------------------+--------------------+
|Business main cus...|               total|
|Business main cus...|               total|
|Business main cus...|               total|
|Business main cus...|               total|
|Business main cus...|Agriculture, fore...|
+--------------------+--------------------+
only showing top 5 rows



In [59]:
# column object is returned
df_pyspark['description']

Column<'description'>

In [60]:
# getting the datatypes of the columns 
df_pyspark.dtypes

[('description', 'string'),
 ('industry', 'string'),
 ('level', 'int'),
 ('size', 'string'),
 ('line_code', 'string'),
 ('value', 'int')]

In [61]:
# describe
df_pyspark.describe()

DataFrame[summary: string, description: string, industry: string, level: string, size: string, line_code: string, value: string]

In [62]:
df_pyspark.describe().show()

+-------+--------------------+--------------------+------------------+--------------+---------+-----------------+
|summary|         description|            industry|             level|          size|line_code|            value|
+-------+--------------------+--------------------+------------------+--------------+---------+-----------------+
|  count|               14758|               14758|             14758|         14758|    14758|            14758|
|   mean|                NULL|                NULL|1.4042553191489362|          NULL|     NULL|669.3437457650089|
| stddev|                NULL|                NULL|0.6735203422442279|          NULL|     NULL| 2443.43706710439|
|    min|Are answers appli...|Accommodation & f...|                 0|100+ employees| C0300.01|                0|
|    max|Who reviews and s...|               total|                 2|         total| C3307.04|            44877|
+-------+--------------------+--------------------+------------------+--------------+---

In [65]:
# Adding columns in dataframe 
df_pyspark = df_pyspark.withColumn('level+2',df_pyspark['level']+2)

In [66]:
df_pyspark.select(['level','level+2']).show(5)

+-----+-------+
|level|level+2|
+-----+-------+
|    0|      2|
|    0|      2|
|    0|      2|
|    0|      2|
|    1|      3|
+-----+-------+
only showing top 5 rows



In [71]:
# dropping columns
df_pyspark.drop('level+2').show(5)

+--------------------+--------------------+-----+---------------+---------+-----+
|         description|            industry|level|           size|line_code|value|
+--------------------+--------------------+-----+---------------+---------+-----+
|Business main cus...|               total|    0| 6�19 employees| C0300.01|15639|
|Business main cus...|               total|    0|20�49 employees| C0300.01| 2943|
|Business main cus...|               total|    0|50�99 employees| C0300.01|  639|
|Business main cus...|               total|    0| 100+ employees| C0300.01|  555|
|Business main cus...|Agriculture, fore...|    1|          total| C0300.01|  348|
+--------------------+--------------------+-----+---------------+---------+-----+
only showing top 5 rows



In [72]:
## renaming a column
df_pyspark.withColumnRenamed('industry','paisa').show()

+--------------------+--------------------+-----+---------------+---------+-----+-------+
|         description|               paisa|level|           size|line_code|value|level+2|
+--------------------+--------------------+-----+---------------+---------+-----+-------+
|Business main cus...|               total|    0| 6�19 employees| C0300.01|15639|      2|
|Business main cus...|               total|    0|20�49 employees| C0300.01| 2943|      2|
|Business main cus...|               total|    0|50�99 employees| C0300.01|  639|      2|
|Business main cus...|               total|    0| 100+ employees| C0300.01|  555|      2|
|Business main cus...|Agriculture, fore...|    1|          total| C0300.01|  348|      3|
|Business main cus...|         Agriculture|    2|          total| C0300.01|  177|      4|
|Business main cus...|  Commercial fishing|    2|          total| C0300.01|    3|      4|
|Business main cus...|  Forestry & logging|    2|          total| C0300.01|   12|      4|
|Business 

In [73]:
df_pyspark.show(5)

+--------------------+--------------------+-----+---------------+---------+-----+-------+
|         description|            industry|level|           size|line_code|value|level+2|
+--------------------+--------------------+-----+---------------+---------+-----+-------+
|Business main cus...|               total|    0| 6�19 employees| C0300.01|15639|      2|
|Business main cus...|               total|    0|20�49 employees| C0300.01| 2943|      2|
|Business main cus...|               total|    0|50�99 employees| C0300.01|  639|      2|
|Business main cus...|               total|    0| 100+ employees| C0300.01|  555|      2|
|Business main cus...|Agriculture, fore...|    1|          total| C0300.01|  348|      3|
+--------------------+--------------------+-----+---------------+---------+-----+-------+
only showing top 5 rows



### Pyspark Handling Missing Values
- Dropping columns
- Dropping Rows
- Various parameters in dropping functionalities
- Handling missing values by mean