In [2]:
from pyspark import SparkContext
sc = SparkContext("local","Intro")
# Spark Context allows you to work with the low level features of Spark and the core functions ( RDD Operations )

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SparkIntro").getOrCreate()
# Spark Session is for high level opertions. it allows you to work with structured data in the form of dataframes or Datasets

<hr>

In [4]:
data = [1,2,3,4,5,6,7,8,9,0]
rdd1 = sc.parallelize(data)
# Convert a python listo into an RDD in SPARK

In [13]:
# Rdd transformations  - Keep in mind that they are considered lazy
# All RDD transformations create a new RDD (They are inmutable Objects ) 
squared_rdd = rdd1.map(lambda x : x ** 2 )
even_rdd = rdd1.filter(lambda x : x%2 ==0)

In [17]:
# Rdd Actions - Collect, count, reduce ... The actions return a result to the prgram or write data to external source
# RDD Actions return results to the program
collected_data = squared_rdd.collect()
num_elements = even_rdd.count()

<hr>

In [31]:
# Creating a Dataframe 
from pyspark.sql.types import StructType, StructField, StringType,IntegerType
schema = StructType(
    [
        StructField("Name",StringType(),True),
        StructField("Age",IntegerType(),True),
        StructField("Salary",IntegerType(),True)
    ]
)
data = [("Alejandro",28,13500000),("Sara",30,4500000),("Mauro",35,13500000 )]
df1 = spark.createDataFrame(data,schema)
df1.show()

+---------+---+--------+
|     Name|Age|  Salary|
+---------+---+--------+
|Alejandro| 28|13500000|
|     Sara| 30| 4500000|
|    Mauro| 35|13500000|
+---------+---+--------+



In [4]:
# Reading Csv Files
data_file = "/home/jovyan/work/Jupyter_Jobs/data_training/car data.csv"
df2 = spark.read.csv(data_file,header=True,inferSchema =True)
df2.printSchema()
df2.show(5)

root
 |-- Car_Name: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Selling_Price: double (nullable = true)
 |-- Present_Price: double (nullable = true)
 |-- Kms_Driven: integer (nullable = true)
 |-- Fuel_Type: string (nullable = true)
 |-- Seller_Type: string (nullable = true)
 |-- Transmission: string (nullable = true)
 |-- Owner: integer (nullable = true)

+--------+----+-------------+-------------+----------+---------+-----------+------------+-----+
|Car_Name|Year|Selling_Price|Present_Price|Kms_Driven|Fuel_Type|Seller_Type|Transmission|Owner|
+--------+----+-------------+-------------+----------+---------+-----------+------------+-----+
|    ritz|2014|         3.35|         5.59|     27000|   Petrol|     Dealer|      Manual|    0|
|     sx4|2013|         4.75|         9.54|     43000|   Diesel|     Dealer|      Manual|    0|
|    ciaz|2017|         7.25|         9.85|      6900|   Petrol|     Dealer|      Manual|    0|
| wagon r|2011|         2.85|         4.15

In [41]:
#Reading_json_file
path_json_file = "/home/jovyan/work/Jupyter_Jobs/data/iris.json"
df_json = spark.read.json(path_json_file)
df_json.show(3)

+---------------+-----------+----------+-----------+----------+-------+
|_corrupt_record|petalLength|petalWidth|sepalLength|sepalWidth|species|
+---------------+-----------+----------+-----------+----------+-------+
|              [|       NULL|      NULL|       NULL|      NULL|   NULL|
|           NULL|        1.4|       0.2|        5.1|       3.5| setosa|
|           NULL|        1.4|       0.2|        4.9|       3.0| setosa|
+---------------+-----------+----------+-----------+----------+-------+
only showing top 3 rows



In [59]:
#Cleaning data with Missing Values
data_with_missing = [("A",None,10000), ("C",12,None),("B",3,14500),("A",3,9400)]
df_missing = spark.createDataFrame(data_with_missing, ["Name","Age","Status"])
df_missing.show()
#We're gonna fill the age column with the mean
mean_age = df_missing.select("Age").agg({"Age":"avg"}).collect()[0][0]
df_clean = df_missing.na.fill(mean_age,subset =["Age"])
df_clean.show()
#let's fill the other column with the max value
df_clean = df_clean.na.fill(
    df_clean.select("Status").agg({"Status":"max"}).collect()[0][0],
    subset = ["Status"]
)
df_clean.show()

+----+----+------+
|Name| Age|Status|
+----+----+------+
|   A|NULL| 10000|
|   C|  12|  NULL|
|   B|   3| 14500|
|   A|   3|  9400|
+----+----+------+

+----+---+------+
|Name|Age|Status|
+----+---+------+
|   A|  6| 10000|
|   C| 12|  NULL|
|   B|  3| 14500|
|   A|  3|  9400|
+----+---+------+

+----+---+------+
|Name|Age|Status|
+----+---+------+
|   A|  6| 10000|
|   C| 12| 14500|
|   B|  3| 14500|
|   A|  3|  9400|
+----+---+------+



In [69]:
#Feature Scaling
from pyspark.ml.feature import MinMaxScaler, StandardScaler, VectorAssembler

# First We will create a new column called features using VectorAssembler with the values From Age and Status
assembler = VectorAssembler(
    inputCols = ["Age","Status"],
    outputCol="Features"
)
data_for_scaling = assembler.transform(df_clean)
data_for_scaling.show()
# Create a Min Max Scaler object
scaler_min_max = MinMaxScaler(inputCol = "Features",outputCol = "Scaled_Features")
# Apply the min max scaler
scaled_min_max = scaler_min_max.fit(data_for_scaling).transform(data_for_scaling)
# Show the result
scaled_min_max.show()

+----+---+------+--------------+
|Name|Age|Status|      Features|
+----+---+------+--------------+
|   A|  6| 10000| [6.0,10000.0]|
|   C| 12| 14500|[12.0,14500.0]|
|   B|  3| 14500| [3.0,14500.0]|
|   A|  3|  9400|  [3.0,9400.0]|
+----+---+------+--------------+

+----+---+------+--------------+--------------------+
|Name|Age|Status|      Features|     Scaled_Features|
+----+---+------+--------------+--------------------+
|   A|  6| 10000| [6.0,10000.0]|[0.33333333333333...|
|   C| 12| 14500|[12.0,14500.0]|           [1.0,1.0]|
|   B|  3| 14500| [3.0,14500.0]|           [0.0,1.0]|
|   A|  3|  9400|  [3.0,9400.0]|           (2,[],[])|
+----+---+------+--------------+--------------------+



In [75]:
standard_scaler = StandardScaler(
    inputCol="Features",
    outputCol="Scaled_Features",
    withStd =True,
    withMean =True)
scaled_standard = standard_scaler.fit(data_for_scaling).transform(data_for_scaling)
print(scaled_standard.select("Scaled_Features").collect()[0])
scaled_standard.show()


Row(Scaled_Features=DenseVector([0.0, -0.7548]))
+----+---+------+--------------+--------------------+
|Name|Age|Status|      Features|     Scaled_Features|
+----+---+------+--------------+--------------------+
|   A|  6| 10000| [6.0,10000.0]|[0.0,-0.754829412...|
|   C| 12| 14500|[12.0,14500.0]|[1.41421356237309...|
|   B|  3| 14500| [3.0,14500.0]|[-0.7071067811865...|
|   A|  3|  9400|  [3.0,9400.0]|[-0.7071067811865...|
+----+---+------+--------------+--------------------+

