In [2]:
#gambis local
import os
os.environ['JAVA_HOME']='C:/Program Files (x86)/Java/jre1.8.0_421'

import findspark
findspark.init()

from pyspark.context import SparkContext
from pyspark.sql import SparkSession

sc=SparkContext.getOrCreate()
spark=SparkSession.builder.master('local[*]').getOrCreate()

# basics
- pyspark dataframe
- reading it
- checking schema
- selecting and indexing
- describing it similar to pandas
- adding, renaming and dropping columns

In [5]:
import pandas as pd
pd.read_csv('./data.csv')

Unnamed: 0,name,age
0,krish,31
1,sudansh,30
2,sunny,29


In [64]:
spark_df=spark.read.csv('data.csv', header=True, inferSchema=True)

In [21]:
spark_df.head(10)
spark_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [22]:
spark_df.show()

+-------+---+
|   name|age|
+-------+---+
|  krish| 31|
|sudansh| 30|
|  sunny| 29|
+-------+---+



In [29]:
spark_df.select(['name', 'age']).show()

+-------+---+
|   name|age|
+-------+---+
|  krish| 31|
|sudansh| 30|
|  sunny| 29|
+-------+---+



In [30]:
type(spark_df['name'])

pyspark.sql.column.Column

In [31]:
spark_df.dtypes

[('name', 'string'), ('age', 'int')]

In [34]:
spark_df.describe().show()

+-------+-----+----+
|summary| name| age|
+-------+-----+----+
|  count|    3|   3|
|   mean| null|30.0|
| stddev| null| 1.0|
|    min|krish|  29|
|    max|sunny|  31|
+-------+-----+----+



In [46]:
spark_df=spark_df.withColumn('Age after 2 years', spark_df['age']+2)
spark_df=spark_df.drop('Age after 2 years')

In [50]:
spark_df.withColumnRenamed('name', 'new name').show()

+--------+---+
|new name|age|
+--------+---+
|   krish| 31|
| sudansh| 30|
|   sunny| 29|
+--------+---+



# handling missing values
- dropping columns and rows
- handling missing values by mean, median and mode

In [55]:
spark_df.show()

+-------+----+
|   name| age|
+-------+----+
|  krish|  31|
|sudansh|  30|
|  sunny|  29|
|   paul|  29|
| harsha|  21|
|   null|  34|
|   null|  30|
| mahesh|null|
+-------+----+



In [54]:
spark_df.na.drop().show()

+-------+---+
|   name|age|
+-------+---+
|  krish| 31|
|sudansh| 30|
|  sunny| 29|
|   paul| 29|
| harsha| 21|
+-------+---+



In [56]:
spark_df.na.drop(how='all').show()

+-------+----+
|   name| age|
+-------+----+
|  krish|  31|
|sudansh|  30|
|  sunny|  29|
|   paul|  29|
| harsha|  21|
|   null|  34|
|   null|  30|
| mahesh|null|
+-------+----+



In [65]:
from pyspark.sql.functions import floor
spark_df=spark_df.withColumn('fake xp', floor(spark_df['age']/2))

In [66]:
spark_df.show()

+-------+----+-------+
|   name| age|fake xp|
+-------+----+-------+
|  krish|  31|     15|
|sudansh|  30|     15|
|  sunny|  29|     14|
|   paul|  29|     14|
| harsha|  21|     10|
|   null|  34|     17|
|   null|  30|     15|
| mahesh|null|   null|
+-------+----+-------+



In [70]:
spark_df.na.drop(how='any', thresh=2).show() #if there's at least 2 non-null values (threash)

+-------+---+-------+
|   name|age|fake xp|
+-------+---+-------+
|  krish| 31|     15|
|sudansh| 30|     15|
|  sunny| 29|     14|
|   paul| 29|     14|
| harsha| 21|     10|
|   null| 34|     17|
|   null| 30|     15|
+-------+---+-------+



In [71]:
spark_df.na.drop(how='any', subset=['name']).show()

+-------+----+-------+
|   name| age|fake xp|
+-------+----+-------+
|  krish|  31|     15|
|sudansh|  30|     15|
|  sunny|  29|     14|
|   paul|  29|     14|
| harsha|  21|     10|
| mahesh|null|   null|
+-------+----+-------+



In [80]:
spark_df.na.fill('missing', ['age']).show() #?

+-------+----+-------+
|   name| age|fake xp|
+-------+----+-------+
|  krish|  31|     15|
|sudansh|  30|     15|
|  sunny|  29|     14|
|   paul|  29|     14|
| harsha|  21|     10|
|   null|  34|     17|
|   null|  30|     15|
| mahesh|null|   null|
+-------+----+-------+



In [85]:
from pyspark.ml.feature import Imputer

imp=Imputer(
    inputCols=['age', 'fake xp'],
    outputCols=[f'{col}_imputed' for col in ['age', 'fake xp']]
).setStrategy('mean') #mean, median, mode

In [87]:
spark_df.describe().show()

+-------+------+------------------+------------------+
|summary|  name|               age|           fake xp|
+-------+------+------------------+------------------+
|  count|     6|                 7|                 7|
|   mean|  null|29.142857142857142|14.285714285714286|
| stddev|  null|3.9761191895520196| 2.138089935299395|
|    min|harsha|                21|                10|
|    max| sunny|                34|                17|
+-------+------+------------------+------------------+



In [86]:
imp.fit(spark_df).transform(spark_df).show()

+-------+----+-------+-----------+---------------+
|   name| age|fake xp|age_imputed|fake xp_imputed|
+-------+----+-------+-----------+---------------+
|  krish|  31|     15|         31|             15|
|sudansh|  30|     15|         30|             15|
|  sunny|  29|     14|         29|             14|
|   paul|  29|     14|         29|             14|
| harsha|  21|     10|         21|             10|
|   null|  34|     17|         34|             17|
|   null|  30|     15|         30|             15|
| mahesh|null|   null|         29|             14|
+-------+----+-------+-----------+---------------+



# dataframes filter ops
- filter
- `&`, `|`, `==` e `~`

In [118]:
import random
spark.createDataFrame(
    [(random.randint(2000, 10000),) for n in range(0, 7)], 'f int, a string').show()

ValueError: Length of object (1) does not match with length of fields (2)

In [113]:
[(random.randint(2000, 10000),) for n in range(0, 7)]

[(9342,), (2017,), (6340,), (2163,), (3149,), (5963,), (3303,)]