In [1]:
import findspark
# Tell findspark where the local Spark installation lives; this must run
# before pyspark is imported below.
# NOTE(review): '/usr/local/spark' is machine-specific — confirm it matches the target environment.
findspark.init('/usr/local/spark')
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Build the SparkSession used by every cell below; getOrCreate() hands back
# an already-running session if one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("DataFrame")
    .getOrCreate()
)

In [4]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

# Read the dataset

In [9]:
# Read the employee CSV: the first line is used as the header and column
# types are inferred from the data rather than defaulting to strings.
header_aware_reader = spark.read.option("header", "true")
df_pyspark = header_aware_reader.csv(
    "/home/hduser/Downloads/sharedfolder/emp.csv",
    inferSchema="true",
)
df_pyspark.show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
| Rohan| 30|         6|
|  Jaya| 25|         2|
| Kirti| 32|         7|
|Amrita| 28|         4|
| Mohit| 25|         5|
|  Jaya| 29|         2|
+------+---+----------+



# Check schema

In [10]:
# Print each column's name, inferred type and nullability as a tree.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



# Another way to read the file

In [12]:
# Same file, read in a single call: header and schema inference are passed
# as keyword arguments to csv() instead of through .option().
df_spark = spark.read.csv(
    "/home/hduser/Downloads/sharedfolder/emp.csv",
    header=True,
    inferSchema=True,
)
df_spark.show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
| Rohan| 30|         6|
|  Jaya| 25|         2|
| Kirti| 32|         7|
|Amrita| 28|         4|
| Mohit| 25|         5|
|  Jaya| 29|         2|
+------+---+----------+



In [13]:
# Check the schema of the frame produced by the keyword-argument read
# (df_spark), confirming it has the same Name/Age/Experience columns and
# types as the frame read via .option().
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [14]:
# Show the Python class of the object returned by spark.read.csv.
type(df_spark)

pyspark.sql.dataframe.DataFrame

# Select columns

In [15]:
# List the column names as plain Python strings.
df_spark.columns

['Name', 'Age', 'Experience']

In [16]:
# Fetch the first four rows; the result is a Python list of Row objects,
# not a formatted table like show().
df_spark.head(4)

[Row(Name='Rohan', Age=30, Experience=6),
 Row(Name='Jaya', Age=25, Experience=2),
 Row(Name='Kirti', Age=32, Experience=7),
 Row(Name='Amrita', Age=28, Experience=4)]

In [17]:
# Print the full frame as a text table for reference before selecting columns.
df_pyspark.show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
| Rohan| 30|         6|
|  Jaya| 25|         2|
| Kirti| 32|         7|
|Amrita| 28|         4|
| Mohit| 25|         5|
|  Jaya| 29|         2|
+------+---+----------+



In [18]:
# Project a single column by name; select() yields a DataFrame, so show() works on it.
df_pyspark.select("Name").show()

+------+
|  Name|
+------+
| Rohan|
|  Jaya|
| Kirti|
|Amrita|
| Mohit|
|  Jaya|
+------+



In [19]:
# Project two columns at once; the names are passed as separate arguments.
df_pyspark.select("Name", "Experience").show()

+------+----------+
|  Name|Experience|
+------+----------+
| Rohan|         6|
|  Jaya|         2|
| Kirti|         7|
|Amrita|         4|
| Mohit|         5|
|  Jaya|         2|
+------+----------+



In [21]:
# The selects above did not modify df_pyspark: it still has all three columns.
df_pyspark.show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
| Rohan| 30|         6|
|  Jaya| 25|         2|
| Kirti| 32|         7|
|Amrita| 28|         4|
| Mohit| 25|         5|
|  Jaya| 29|         2|
+------+---+----------+



In [22]:
# Indexing with a column name returns a Column object (a reference to the
# column), not its values; use select(...).show() to see the data.
df_pyspark['Name']

Column<b'Name'>

# Check data types

In [24]:
# List (column name, type name) pairs for every column.
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

# Summary statistics with the describe method

In [26]:
# Summary statistics per column: count, mean, stddev, min and max.
# For the string column, mean/stddev are null and min/max are lexicographic.
df_pyspark.describe().show()

+-------+------+------------------+-----------------+
|summary|  Name|               Age|       Experience|
+-------+------+------------------+-----------------+
|  count|     6|                 6|                6|
|   mean|  null|28.166666666666668|4.333333333333333|
| stddev|  null| 2.786873995477131|2.065591117977289|
|    min|Amrita|                25|                2|
|    max| Rohan|                32|                7|
+-------+------+------------------+-----------------+



# Adding Columns


In [31]:
# Add a derived column holding Experience + 2 and rebind df_pyspark to the
# resulting frame.
df_pyspark = df_pyspark.withColumn(
    "Experience after 2 years",
    df_pyspark["Experience"] + 2,
)

In [32]:
# Display the frame to confirm the new "Experience after 2 years" column.
df_pyspark.show()

+------+---+----------+------------------------+
|  Name|Age|Experience|Experience after 2 years|
+------+---+----------+------------------------+
| Rohan| 30|         6|                       8|
|  Jaya| 25|         2|                       4|
| Kirti| 32|         7|                       9|
|Amrita| 28|         4|                       6|
| Mohit| 25|         5|                       7|
|  Jaya| 29|         2|                       4|
+------+---+----------+------------------------+



# Drop the column


In [33]:
# Drop the derived column from the frame that actually contains it.
# "Experience after 2 years" was added to df_pyspark; df_spark never had it,
# so dropping it there would silently do nothing.
# drop() returns a new DataFrame; the result is only displayed and is not
# assigned back, so df_pyspark itself keeps the column.
df_pyspark.drop("Experience after 2 years").show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
| Rohan| 30|         6|
|  Jaya| 25|         2|
| Kirti| 32|         7|
|Amrita| 28|         4|
| Mohit| 25|         5|
|  Jaya| 29|         2|
+------+---+----------+



# Rename the column

In [35]:
# Rename "Name" to "NewName" in the displayed result only; the renamed frame
# is not assigned, so df_pyspark keeps its original column name.
df_pyspark.withColumnRenamed("Name","NewName").show()

+-------+---+----------+------------------------+
|NewName|Age|Experience|Experience after 2 years|
+-------+---+----------+------------------------+
|  Rohan| 30|         6|                       8|
|   Jaya| 25|         2|                       4|
|  Kirti| 32|         7|                       9|
| Amrita| 28|         4|                       6|
|  Mohit| 25|         5|                       7|
|   Jaya| 29|         2|                       4|
+-------+---+----------+------------------------+



# Dropping Rows