In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Build a Spark session named 'Dataframe', reusing an existing one if present.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [5]:
# Display the session object as the cell output to confirm it exists.
spark

In [6]:
# Load the CSV, treating its first line as the column names.
# No schema inference is requested here.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('Experience.csv')
)
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|Harrison| 24|         2|
|  Edward| 27|         2|
| Charlie| 25|         4|
|    Luka| 25|         4|
| Kirstin| 51|        30|
|   Harry| 55|        34|
+--------+---+----------+



In [7]:
# Check the schema: with only the header option set, every column
# (including Age and Experience) is reported as a string.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [8]:
# Re-read the same file, this time asking Spark to infer column types.
df_pyspark = spark.read.csv(
    'Experience.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|Harrison| 24|         2|
|  Edward| 27|         2|
| Charlie| 25|         4|
|    Luka| 25|         4|
| Kirstin| 51|        30|
|   Harry| 55|        34|
+--------+---+----------+



In [10]:
# Check the schema again: after reading with inferSchema=True,
# Age and Experience are reported as integers instead of strings.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [11]:
# Confirm the object's class (pyspark.sql.dataframe.DataFrame).
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [13]:
# Fetch the first 3 rows; the result is a list of Row objects, not a table.
df_pyspark.head(3)

[Row(Name='Harrison', Age=24, Experience=2),
 Row(Name='Edward', Age=27, Experience=2),
 Row(Name='Charlie', Age=25, Experience=4)]

In [14]:
# Print the DataFrame contents as a text table.
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|Harrison| 24|         2|
|  Edward| 27|         2|
| Charlie| 25|         4|
|    Luka| 25|         4|
| Kirstin| 51|        30|
|   Harry| 55|        34|
+--------+---+----------+



In [17]:
# Project only the Name and Experience columns and print them.
df_pyspark.select('Name', 'Experience').show()

+--------+----------+
|    Name|Experience|
+--------+----------+
|Harrison|         2|
|  Edward|         2|
| Charlie|         4|
|    Luka|         4|
| Kirstin|        30|
|   Harry|        34|
+--------+----------+



In [20]:
# Indexing by name yields a Column object (Column<'Name'>), not the values;
# use select(...).show() to see the data.
df_pyspark['Name']

Column<'Name'>

In [22]:
# List each column with its data type as (name, type) pairs.
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [23]:
# Summary statistics per column: count, mean, stddev, min and max.
# For the string column Name, mean and stddev come back as null.
df_pyspark.describe().show()


+-------+-------+------------------+------------------+
|summary|   Name|               Age|        Experience|
+-------+-------+------------------+------------------+
|  count|      6|                 6|                 6|
|   mean|   null|              34.5|12.666666666666666|
| stddev|   null|14.418737808837498|15.055453054181621|
|    min|Charlie|                24|                 2|
|    max|   Luka|                55|                34|
+-------+-------+------------------+------------------+



In [27]:
# Derive a new column by adding 2 to every value in 'Experience'.
df_pyspark = df_pyspark.withColumn(
    'Experience After 2 years',
    df_pyspark['Experience'] + 2,
)
df_pyspark.show()

In [28]:
# Print the DataFrame, which now includes 'Experience After 2 years'.
df_pyspark.show()

+--------+---+----------+------------------------+
|    Name|Age|Experience|Experience After 2 years|
+--------+---+----------+------------------------+
|Harrison| 24|         2|                       4|
|  Edward| 27|         2|                       4|
| Charlie| 25|         4|                       6|
|    Luka| 25|         4|                       6|
| Kirstin| 51|        30|                      32|
|   Harry| 55|        34|                      36|
+--------+---+----------+------------------------+



In [30]:
# Drop the derived column.
# The name is spelled exactly as it was created ('Experience After 2 years').
# A differently-cased name only matches because Spark resolves column names
# case-insensitively by default; with spark.sql.caseSensitive=true, drop()
# would silently do nothing, since it is a no-op for unknown columns.
df_pyspark = df_pyspark.drop('Experience After 2 years')
# show must be called: a bare `df_pyspark.show` only references the method
# and prints no table.
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|Harrison| 24|         2|
|  Edward| 27|         2|
| Charlie| 25|         4|
|    Luka| 25|         4|
| Kirstin| 51|        30|
|   Harry| 55|        34|
+--------+---+----------+



In [32]:
# Relabel the 'Name' column as 'New Name'; the row values are unchanged.
df_pyspark = df_pyspark.withColumnRenamed(
    'Name',
    'New Name',
)
df_pyspark.show()

+--------+---+----------+
|New Name|Age|Experience|
+--------+---+----------+
|Harrison| 24|         2|
|  Edward| 27|         2|
| Charlie| 25|         4|
|    Luka| 25|         4|
| Kirstin| 51|        30|
|   Harry| 55|        34|
+--------+---+----------+

