In [1]:
! python --version

Python 3.8.12


In [2]:
!pip install pyspark

Collecting pyspark
  Using cached pyspark-3.2.1-py2.py3-none-any.whl
Collecting py4j==0.10.9.3
  Using cached py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


## Pyspark Introduction

In [2]:
import pyspark

In [5]:
!pip install pandas

Collecting pandas
  Downloading pandas-1.4.0-cp38-cp38-win_amd64.whl (10.6 MB)
Collecting numpy>=1.18.5
  Downloading numpy-1.22.2-cp38-cp38-win_amd64.whl (14.7 MB)
Collecting pytz>=2020.1
  Downloading pytz-2021.3-py2.py3-none-any.whl (503 kB)
Installing collected packages: pytz, numpy, pandas
Successfully installed numpy-1.22.2 pandas-1.4.0 pytz-2021.3


In [12]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.0.9-py2.py3-none-any.whl (242 kB)
Collecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.9


In [3]:
import pandas as pd

In [17]:
pandas_df = pd.read_csv('test1.csv')
pandas_df

Unnamed: 0,name;age
0,Jasmin;27
1,Marcus;29
2,Vanessa;25
3,Christine;51


In [5]:
from pyspark.sql import SparkSession

In [6]:
spark = SparkSession.builder.appName('Practise').getOrCreate()

In [7]:
spark

In [30]:
df_pyspark = spark.read.csv('test1.csv')

In [40]:
df_pyspark.printSchema()

root
 |-- name;age: string (nullable = true)



In [32]:
df_pyspark.show()

+------------+
|         _c0|
+------------+
|    name;age|
|   Jasmin;27|
|   Marcus;29|
|  Vanessa;25|
|Christine;51|
+------------+



make first row column names, ; as seperator so we don't mix up columns

In [41]:
df_pyspark = spark.read.csv('test1.csv', header=True, sep=";")

In [44]:
df_pyspark.show()

+---------+---+
|     name|age|
+---------+---+
|   Jasmin| 27|
|   Marcus| 29|
|  Vanessa| 25|
|Christine| 51|
+---------+---+



In [42]:
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [43]:
df_pyspark.head(1)

[Row(name='Jasmin', age='27')]

In [39]:
df_pyspark.printSchema()

root
 |-- name;age: string (nullable = true)



## PySpark DataFrames Part 1
### reading dataset, checking the datatypes of the column (Schema), selecting columns and indexing, check describe option similar to pandas, adding columns, dropping columns

let's see different ways to read in data

In [50]:
spark

In [70]:
# read dataset
df1 = spark.read.csv('test1.csv', header=True, sep=";", inferSchema=True)

In [71]:
# Check schema
df1.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Alter: integer (nullable = true)
 |-- Skills: integer (nullable = true)



Why is it reading in my ints as strings? --> add the inferSchema option above in the read function!

In [72]:
df1 = spark.read.csv('test1.csv', header=True, sep=";", inferSchema=True)
df1.show()

+-------+-----+------+
|   Name|Alter|Skills|
+-------+-----+------+
| Marcus|   29|   100|
|Vanessa|   25|    90|
| Jasmin|   27|    80|
+-------+-----+------+



In [73]:
type(df1)

pyspark.sql.dataframe.DataFrame

In [75]:
#select certain columns
df1.select(['Name', 'Skills']).show()

+-------+------+
|   Name|Skills|
+-------+------+
| Marcus|   100|
|Vanessa|    90|
| Jasmin|    80|
+-------+------+



In [76]:
#show datatypes of all columns
df1.dtypes

[('Name', 'string'), ('Alter', 'int'), ('Skills', 'int')]

In [78]:
#check the describe options - gives out statistics about my dataframe
df1.describe().show()

+-------+-------+-----+------+
|summary|   Name|Alter|Skills|
+-------+-------+-----+------+
|  count|      3|    3|     3|
|   mean|   null| 27.0|  90.0|
| stddev|   null|  2.0|  10.0|
|    min| Jasmin|   25|    80|
|    max|Vanessa|   29|   100|
+-------+-------+-----+------+



In [82]:
#adding columns --> add new column and add 2 two values of old column Skills
df1.withColumn('Skills After Two Years', df1['Skills']+2).show()

+-------+-----+------+----------------------+
|   Name|Alter|Skills|Skills After Two Years|
+-------+-----+------+----------------------+
| Marcus|   29|   100|                   102|
|Vanessa|   25|    90|                    92|
| Jasmin|   27|    80|                    82|
+-------+-----+------+----------------------+

