# PySpark Tutorial

In [66]:
# Import packages
from pyspark.sql import SparkSession

In [67]:
# Build (or reuse) a SparkSession whose application name is "Dataframe"
spark = (
    SparkSession.builder
    .appName("Dataframe")
    .getOrCreate()
)

In [68]:
# Evaluate the session as the cell's last expression so the notebook displays it
spark

In [69]:
# Read the dataset, using the first row of the CSV file as the header.
# No schema inference is requested here, so the data types of the columns
# are not inferred; all columns will be treated as strings.
df_pyspark = (
    spark.read
    .option("header", "true")
    .csv("text2.csv")
)
df_pyspark

DataFrame[Name: string, Age: string, Experience: string]

In [70]:
# Print the rows of the DataFrame as a text table
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|sudanshu| 30|         8|
|   sunny| 29|         4|
+--------+---+----------+



In [71]:
# Check the schema: without schema inference every column is a string
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [72]:
# Read the dataset again, this time with inferSchema=True, which tells
# PySpark to automatically infer the data type of each column in the DataFrame.
df_pyspark = (
    spark.read
    .option("header", "true")
    .csv("text2.csv", inferSchema=True)
)
df_pyspark

DataFrame[Name: string, Age: int, Experience: int]

In [73]:
# Print the rows again; the displayed values look the same as before schema inference
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|sudanshu| 30|         8|
|   sunny| 29|         4|
+--------+---+----------+



In [74]:
# Check the schema: with inferSchema=True, Age and Experience are now integers
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [101]:
# Read the dataset, passing both settings as keyword arguments to csv():
#   header=True      -> the first row of the CSV file contains the column names
#   inferSchema=True -> infer the data type of each column
# The header is set only once here; combining .option("header", ...) with
# header=True would set the same reader option twice.
df_pyspark = spark.read.csv("text2.csv", header=True, inferSchema=True)
df_pyspark

DataFrame[Name: string, Age: int, Experience: int]

In [102]:
# Print the rows of the DataFrame that was just read
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|sudanshu| 30|         8|
|   sunny| 29|         4|
+--------+---+----------+



In [77]:
# Check the schema of the DataFrame
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [78]:
# Show the Python type of the DataFrame object
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [79]:
# List the column names
df_pyspark.columns

['Name', 'Age', 'Experience']

In [80]:
# Get the first 3 rows; head(3) returns them as a list of Row objects
df_pyspark.head(3)

[Row(Name='Krish', Age=31, Experience=10),
 Row(Name='sudanshu', Age=30, Experience=8),
 Row(Name='sunny', Age=29, Experience=4)]

In [81]:
# Print the rows as a text table (compare with the list of Row objects from head)
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|sudanshu| 30|         8|
|   sunny| 29|         4|
+--------+---+----------+



In [82]:
# Selecting a single column yields a new DataFrame containing only that column
df_pyspark.select("Name")

DataFrame[Name: string]

In [83]:
# Confirm that the result of select() is itself a DataFrame
type(df_pyspark.select("Name"))

pyspark.sql.dataframe.DataFrame

In [84]:
# Select several columns at once and print the resulting rows
df_pyspark.select("Name", "Experience").show()

+--------+----------+
|    Name|Experience|
+--------+----------+
|   Krish|        10|
|sudanshu|         8|
|   sunny|         4|
+--------+----------+



In [85]:
# Show summary statistics (count, mean, stddev, min, max) for each column
df_pyspark.describe().show()

+-------+-----+----+-----------------+
|summary| Name| Age|       Experience|
+-------+-----+----+-----------------+
|  count|    3|   3|                3|
|   mean| NULL|30.0|7.333333333333333|
| stddev| NULL| 1.0|3.055050463303893|
|    min|Krish|  29|                4|
|    max|sunny|  31|               10|
+-------+-----+----+-----------------+



In [86]:
# Add a derived column to the DataFrame: each row's Experience plus 2
df_pyspark = df_pyspark.withColumn(
    "Experience After 2 year",
    df_pyspark["Experience"] + 2,
)
df_pyspark.show()

+--------+---+----------+-----------------------+
|    Name|Age|Experience|Experience After 2 year|
+--------+---+----------+-----------------------+
|   Krish| 31|        10|                     12|
|sudanshu| 30|         8|                     10|
|   sunny| 29|         4|                      6|
+--------+---+----------+-----------------------+



In [88]:
# Drop the 'Experience After 2 year' column; drop() returns a new DataFrame,
# so the result is assigned back to df_pyspark
df_pyspark = df_pyspark.drop('Experience After 2 year')

In [89]:
# Print the rows to confirm only Name, Age and Experience remain
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|sudanshu| 30|         8|
|   sunny| 29|         4|
+--------+---+----------+



In [90]:
# Rename the "Name" column to "New Name" and print the result.
# The renamed DataFrame is not assigned, so df_pyspark keeps its original column name.
df_pyspark.withColumnRenamed("Name",'New Name').show()

+--------+---+----------+
|New Name|Age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|sudanshu| 30|         8|
|   sunny| 29|         4|
+--------+---+----------+



In [92]:
####################################################

##### Test 

In [96]:
# Print the current contents of df_pyspark
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|sudanshu| 30|         8|
|   sunny| 29|         4|
+--------+---+----------+



In [95]:
# withColumn with an existing column name replaces that column: show Age increased by 3.
# The result is not assigned, so df_pyspark itself is left unchanged.
df_pyspark.withColumn("Age",df_pyspark['Age']+3).show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|   Krish| 34|        10|
|sudanshu| 33|         8|
|   sunny| 32|         4|
+--------+---+----------+

