# Installation et imports

In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import pandas as pd
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col

from pyspark.ml.regression import LinearRegression
from pyspark.mllib.evaluation import RegressionMetrics

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator

# Download datasets

In [None]:
!gdown https://drive.google.com/uc?id=1sp0G0pLgqn7hk7v6Sx2Il3Tg_nqZuSG4 -O dataset.zip

Downloading...
From: https://drive.google.com/uc?id=15rYqEualW8Z4nnzCjd3kecrBWcMu3YYF
To: /content/dataset.zip
  0% 0.00/830k [00:00<?, ?B/s]100% 830k/830k [00:00<00:00, 115MB/s]


In [None]:
!unzip dataset.zip

Archive:  dataset.zip
   creating: dataset/
  inflating: dataset/iris.csv        
  inflating: __MACOSX/dataset/._iris.csv  
  inflating: dataset/titanic.csv     
  inflating: __MACOSX/dataset/._titanic.csv  
  inflating: dataset/wine.csv        
  inflating: __MACOSX/dataset/._wine.csv  
  inflating: dataset/house.csv       
  inflating: __MACOSX/dataset/._house.csv  
  inflating: dataset/diabetes.csv    
  inflating: __MACOSX/dataset/._diabetes.csv  


In [None]:
ls ./dataset/

diabetes.csv  house.csv  iris.csv  titanic.csv  wine.csv


# Manipulation de Spark avec PySpark

In [None]:
# Create the Spark session used by the rest of the notebook, or reuse the
# one that already exists. Master is "local[2]", application name "prise_en_main".
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("prise_en_main")
    .getOrCreate()
)

In [None]:
# Load the iris dataset with Spark's built-in CSV reader. The former
# "com.databricks.spark.csv" format name comes from the pre-2.0 external package
# and is only kept by Spark as a backward-compatible alias.
# header=True takes the column names from the first line of the file;
# inferSchema=True asks Spark to infer each column's type from the data.
df = spark.read.csv("./dataset/iris.csv", header=True, inferSchema=True)

### Show the dataframe
Use the function `show`

### Print the schema of the dataframe
Use the function `printSchema`

### Select some columns
Use the function `select`

### Rename some columns
Use the function `withColumnRenamed("actual_name", "new_name")`
For example, rename the column "class" to "irisClass".

### Get the distinct values of a column
Use the function `select` and the function `distinct`

### Insert a new column
Use the function `withColumn`
For example, add a new column that encodes the Iris classes as a numeric variable.
The column will be named `IrisNum` and we want the following classification:
- Iris-virginica => 0
- Iris-setosa => 1
- Iris-versicolor => 2

You will need to use operators `when` and `otherwise`.

### Count the number of rows of a dataframe
Use the function `count`

### Change the type of a column
Transformations on a column are typically done with `withColumn`.
Inside the `withColumn` call you can transform the column using existing functions or functions you have created yourself.

**For example**: Use the function `cast` in order to change the type of the column "sepalLength" to string type.

# Exercice : pratique

1. Show the dataframe
2. Print the schema of the dataframe
3. Select the columns `Survived` and `Name`
4. Rename the column `PassengerId` to `Id`
5. Get the distinct values of the column `Embarked`
6. Count how many missing (NaN / null) values there are in the column `Embarked`
7. Filter the rows with missing (NaN / null) values in the column `Embarked`
8. Drop all rows with missing (NaN / null) values

In [None]:
# Load the Titanic dataset with Spark's built-in CSV reader. The former
# "com.databricks.spark.csv" format name comes from the pre-2.0 external package
# and is only kept by Spark as a backward-compatible alias.
# header=True takes the column names from the first line of the file;
# inferSchema=True asks Spark to infer each column's type from the data.
# NOTE: this rebinds `df`, which previously held the iris dataframe.
df = spark.read.csv("./dataset/titanic.csv", header=True, inferSchema=True)