# Importing libs and reading csv

In [4]:
from pyspark.sql import SparkSession

spark=SparkSession.builder.appName('pyspark_test').getOrCreate()

df = spark.read.option('header','true').csv('ggplot2_diamonds.csv',inferSchema=True)

# Printing Options

In [5]:
df.printSchema()

root
 |-- carat: string (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: string (nullable = true)
 |-- table: double (nullable = true)
 |-- price: string (nullable = true)
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)



In [6]:
df.head(5)

[Row(carat='0.23', cut='Ideal', color='NA', clarity='SI2', depth='61.5', table=55.0, price='326', x=3.95, y=3.98, z=2.43),
 Row(carat='0.21', cut='Premium', color='E', clarity='SI1', depth='59.8', table=61.0, price='NA', x=3.89, y=3.84, z=2.31),
 Row(carat='0.23', cut='Good', color='E', clarity='VS1', depth='56.9', table=65.0, price='327', x=4.05, y=4.07, z=2.31),
 Row(carat='0.29', cut='Premium', color='I', clarity='VS2', depth='62.4', table=58.0, price='334', x=4.2, y=4.23, z=2.63),
 Row(carat='0.31', cut='Good', color='J', clarity='SI2', depth='NA', table=58.0, price='335', x=4.34, y=4.35, z=2.75)]

In [7]:
df.show(10)

+-----+---------+-----+-------+-----+-----+-----+----+----+----+
|carat|      cut|color|clarity|depth|table|price|   x|   y|   z|
+-----+---------+-----+-------+-----+-----+-----+----+----+----+
| 0.23|    Ideal|   NA|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
| 0.21|  Premium|    E|    SI1| 59.8| 61.0|   NA|3.89|3.84|2.31|
| 0.23|     Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
| 0.29|  Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|
| 0.31|     Good|    J|    SI2|   NA| 58.0|  335|4.34|4.35|2.75|
| 0.24|Very Good|    J|   VVS2| 62.8| 57.0|  336|3.94|3.96|2.48|
| 0.24|Very Good|    I|   VVS1| 62.3| 57.0|  336|3.95|3.98|2.47|
|   NA|Very Good|    H|    SI1| 61.9| 55.0|  337|4.07|4.11|2.53|
| 0.22|     Fair|    E|     NA| 65.1| 61.0|  337|3.87|3.78|2.49|
| 0.23|Very Good|    H|    VS1| 59.4| 61.0|  338| 4.0|4.05|2.39|
+-----+---------+-----+-------+-----+-----+-----+----+----+----+
only showing top 10 rows



In [8]:
df.describe().show()



+-------+-------------------+---------+-----+-------+------------------+------------------+------------------+------------------+------------------+------------------+
|summary|              carat|      cut|color|clarity|             depth|             table|             price|                 x|                 y|                 z|
+-------+-------------------+---------+-----+-------+------------------+------------------+------------------+------------------+------------------+------------------+
|  count|              53940|    53940|53940|  53940|             53940|             53940|             53940|             53940|             53940|             53940|
|   mean|  0.797949720981092|     null| null|   null| 61.74937614712838| 57.45718390804603|3932.8665900368937| 5.731157211716609| 5.734525954764462|3.5387337782723316|
| stddev|0.47400997915312215|     null| null|   null|1.4326190412759938|2.2344905628213247|3989.4464914438868|1.1217607467924915|1.1421346741235616|0.7056988469



# Operating with columns

In [9]:
df['carat']

Column<'carat'>

## Add new column

In [13]:
df = df.withColumn('table_doubled',2 * df['table'])
df.show(5)

+-----+-------+-----+-------+-----+-----+-----+----+----+----+-------------+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|table_doubled|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+-------------+
| 0.23|  Ideal|   NA|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|        110.0|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|   NA|3.89|3.84|2.31|        122.0|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|        130.0|
| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|        116.0|
| 0.31|   Good|    J|    SI2|   NA| 58.0|  335|4.34|4.35|2.75|        116.0|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+-------------+
only showing top 5 rows



## Drop column

In [14]:
df = df.drop('table_doubled')
df.show(5)

+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
| 0.23|  Ideal|   NA|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|   NA|3.89|3.84|2.31|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|
| 0.31|   Good|    J|    SI2|   NA| 58.0|  335|4.34|4.35|2.75|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
only showing top 5 rows



## Rename column

In [15]:
df.withColumnRenamed('x','asdasd').show()

+-----+---------+-----+-------+-----+-----+-----+------+----+----+
|carat|      cut|color|clarity|depth|table|price|asdasd|   y|   z|
+-----+---------+-----+-------+-----+-----+-----+------+----+----+
| 0.23|    Ideal|   NA|    SI2| 61.5| 55.0|  326|  3.95|3.98|2.43|
| 0.21|  Premium|    E|    SI1| 59.8| 61.0|   NA|  3.89|3.84|2.31|
| 0.23|     Good|    E|    VS1| 56.9| 65.0|  327|  4.05|4.07|2.31|
| 0.29|  Premium|    I|    VS2| 62.4| 58.0|  334|   4.2|4.23|2.63|
| 0.31|     Good|    J|    SI2|   NA| 58.0|  335|  4.34|4.35|2.75|
| 0.24|Very Good|    J|   VVS2| 62.8| 57.0|  336|  3.94|3.96|2.48|
| 0.24|Very Good|    I|   VVS1| 62.3| 57.0|  336|  3.95|3.98|2.47|
|   NA|Very Good|    H|    SI1| 61.9| 55.0|  337|  4.07|4.11|2.53|
| 0.22|     Fair|    E|     NA| 65.1| 61.0|  337|  3.87|3.78|2.49|
| 0.23|Very Good|    H|    VS1| 59.4| 61.0|  338|   4.0|4.05|2.39|
|  0.3|     Good|    J|    SI1|   64| 55.0|  339|  4.25|4.28|2.73|
| 0.23|    Ideal|    J|    VS1| 62.8| 56.0|  340|  3.93| 3.9|2

## Casting column types

### Option 01: use python builtin datatypes

In [16]:
df.withColumn("carat",df["carat"].cast('float')).printSchema()

root
 |-- carat: float (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: string (nullable = true)
 |-- table: double (nullable = true)
 |-- price: string (nullable = true)
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)



### Option 02: use pyspark datatypes (preferable)

In [19]:
from pyspark.sql.types import StringType, DateType, FloatType

float_cols = ['carat','depth','table','price']

for col in float_cols:
    df = df.withColumn(col,df[col].cast(StringType()))

df.printSchema()

root
 |-- carat: string (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: string (nullable = true)
 |-- table: string (nullable = true)
 |-- price: string (nullable = true)
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)



# Removing NAs

In [3]:
df.printSchema()

root
 |-- carat: string (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: string (nullable = true)
 |-- table: double (nullable = true)
 |-- price: string (nullable = true)
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)



In [None]:
df.na.drop().show()