In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) a Spark session whose application name is "dataframe".
spark2 = SparkSession.builder.appName("dataframe").getOrCreate()

In [3]:
# Bare expression so the notebook displays the session object.
spark2

#### reading the CSV file into a DataFrame

In [33]:
# Load test.csv: the first row supplies the column names, and column types
# are inferred from the data instead of defaulting to string.
df_pyspark = spark2.read.csv('test.csv', header=True, inferSchema=True)

In [34]:
# The bare repr lists only column names and types; no rows are displayed.
df_pyspark

DataFrame[year: int, country: string, captain: string]

In [35]:
# show() prints the rows as a text table.
df_pyspark.show()

+----+-------+-------+
|year|country|captain|
+----+-------+-------+
|2003|    Aus|  Ricky|
|2007|    Aus|  Ricky|
|2011|    Ind|  Dhoni|
|2015|    Aus| Clarke|
|2019|    Eng| Morgan|
|2023|    Aus|    Pat|
+----+-------+-------+



#### checking the datatypes of the dataframe

In [36]:
# Print each column's name, data type and nullability as a tree.
df_pyspark.printSchema()

root
 |-- year: integer (nullable = true)
 |-- country: string (nullable = true)
 |-- captain: string (nullable = true)



In [37]:
# Confirm the reader returned a PySpark DataFrame (not a pandas one).
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [38]:
df_pyspark.columns # column names as a plain Python list

['year', 'country', 'captain']

#### viewing top/bottom records

In [42]:
df_pyspark.head(4) # first n rows, returned as a list of Row objects

[Row(year=2003, country='Aus', captain='Ricky'),
 Row(year=2007, country='Aus', captain='Ricky'),
 Row(year=2011, country='Ind', captain='Dhoni'),
 Row(year=2015, country='Aus', captain='Clarke')]

In [43]:
df_pyspark.tail(2) # last n rows, returned as a list of Row objects

[Row(year=2019, country='Eng', captain='Morgan'),
 Row(year=2023, country='Aus', captain='Pat')]

#### Selecting specific columns

In [44]:
df_pyspark.select('year') #returns a new DataFrame; the repr shows only its schema

DataFrame[year: int]

In [45]:
df_pyspark.select('year').show() #print the values of the selected column

+----+
|year|
+----+
|2003|
|2007|
|2011|
|2015|
|2019|
|2023|
+----+



In [46]:
# Project two columns; the result is a new DataFrame whose repr shows only the schema.
df_pyspark.select('year', 'captain')

DataFrame[year: int, captain: string]

In [47]:
# Project two columns and print their rows as a table.
df_pyspark.select('year', 'captain').show()

+----+-------+
|year|captain|
+----+-------+
|2003|  Ricky|
|2007|  Ricky|
|2011|  Dhoni|
|2015| Clarke|
|2019| Morgan|
|2023|    Pat|
+----+-------+



In [15]:
#bracket indexing looks like pandas, but behaves differently
df_pyspark['year'] ## --> returns a Column expression object, not the column's values

Column<'year'>

In [16]:
#same attribute name as pandas; here it is a list of (column name, type name) tuples
df_pyspark.dtypes

[('year', 'int'), ('country', 'string'), ('captain', 'string')]

#### describe in pyspark

In [48]:
# describe() returns a DataFrame of summary statistics; every column in it is typed string.
df_pyspark.describe()

DataFrame[summary: string, year: string, country: string, captain: string]

In [49]:
# Print count, mean, stddev, min and max; mean/stddev are NULL for the string columns.
df_pyspark.describe().show()

+-------+-----------------+-------+-------+
|summary|             year|country|captain|
+-------+-----------------+-------+-------+
|  count|                6|      6|      6|
|   mean|           2013.0|   NULL|   NULL|
| stddev|7.483314773547883|   NULL|   NULL|
|    min|             2003|    Aus| Clarke|
|    max|             2023|    Ind|  Ricky|
+-------+-----------------+-------+-------+



#### adding columns to dataframes

In [50]:
#adding columns to dataframe
#NOTE: the result is not assigned here, so df_pyspark itself is unchanged by this cell

df_pyspark.withColumn('year added',df_pyspark['year']+5)

DataFrame[year: int, country: string, captain: string, year added: int]

In [51]:
# Rebind df_pyspark to the returned DataFrame so the new 'year added' column (year + 5) is kept.
df_pyspark = df_pyspark.withColumn('year added',df_pyspark['year']+5)

In [52]:
# Check that 'year added' now appears alongside the original columns.
df_pyspark.show()

+----+-------+-------+----------+
|year|country|captain|year added|
+----+-------+-------+----------+
|2003|    Aus|  Ricky|      2008|
|2007|    Aus|  Ricky|      2012|
|2011|    Ind|  Dhoni|      2016|
|2015|    Aus| Clarke|      2020|
|2019|    Eng| Morgan|      2024|
|2023|    Aus|    Pat|      2028|
+----+-------+-------+----------+



#### dropping column(s) in pyspark

In [53]:
#drop the 'year added' column (done in the next cell)

In [54]:
# drop() returns a DataFrame without 'year added'; rebind to keep the result.
df_pyspark = df_pyspark.drop('year added')

In [55]:
# Check that only year, country and captain remain.
df_pyspark.show()

+----+-------+-------+
|year|country|captain|
+----+-------+-------+
|2003|    Aus|  Ricky|
|2007|    Aus|  Ricky|
|2011|    Ind|  Dhoni|
|2015|    Aus| Clarke|
|2019|    Eng| Morgan|
|2023|    Aus|    Pat|
+----+-------+-------+



In [56]:
# The original 'year' column is still available after the drop.
df_pyspark.select('year').show()

+----+
|year|
+----+
|2003|
|2007|
|2011|
|2015|
|2019|
|2023|
+----+



#### Renaming columns

In [59]:
# Rename 'year' to 'Year-New' and rebind df_pyspark to the returned DataFrame.
# NOTE: after this cell, earlier cells that reference 'year' will fail if re-run
# without reloading the data first.
df_pyspark = df_pyspark.withColumnRenamed("year","Year-New")

In [60]:
# Check that the first column header is now 'Year-New'.
df_pyspark.show()

+--------+-------+-------+
|Year-New|country|captain|
+--------+-------+-------+
|    2003|    Aus|  Ricky|
|    2007|    Aus|  Ricky|
|    2011|    Ind|  Dhoni|
|    2015|    Aus| Clarke|
|    2019|    Eng| Morgan|
|    2023|    Aus|    Pat|
+--------+-------+-------+

