In [3]:
from pyspark.sql import SparkSession

spark=SparkSession.builder.appName('Practise').getOrCreate()
spark

In [65]:
%%time
df_pyspark=spark.read.csv('test1.csv', sep=';', header=True, inferSchema=True) # inferSchema=True -> adjusts dtype in each column (by default all dtypes are string)

# or 
# df_pyspark=spark.read.option('header', 'true').csv('test1.csv').show()

df_pyspark.show()

+------+----+----+------+
|  name| age| exp|salary|
+------+----+----+------+
|  adam|  20|   5|  1000|
|   ewa|  21|   3|  2000|
|marcin|  30|  14|  3000|
|   jan|  23|null|  4000|
|  null|null|  13|  5000|
| kasia|null|null|  null|
|   ala|null|  34|  6000|
+------+----+----+------+

Wall time: 303 ms


In [66]:
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [67]:
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- exp: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [68]:
df_pyspark.head(2)

[Row(name='adam', age=20, exp=5, salary=1000),
 Row(name='ewa', age=21, exp=3, salary=2000)]

In [69]:
# selecting particular columns:
df_pyspark.select(['Name', 'exp']).show()

+------+----+
|  Name| exp|
+------+----+
|  adam|   5|
|   ewa|   3|
|marcin|  14|
|   jan|null|
|  null|  13|
| kasia|null|
|   ala|  34|
+------+----+



In [70]:
df_pyspark.describe().show()

+-------+------+-----------------+------------------+------------------+
|summary|  name|              age|               exp|            salary|
+-------+------+-----------------+------------------+------------------+
|  count|     6|                4|                 5|                 6|
|   mean|  null|             23.5|              13.8|            3500.0|
| stddev|  null|4.509249752822894|12.275992831539126|1870.8286933869706|
|    min|  adam|               20|                 3|              1000|
|    max|marcin|               30|                34|              6000|
+-------+------+-----------------+------------------+------------------+



In [71]:
# adding columns in df:
df_pyspark=df_pyspark.withColumn('Add _2_years', df_pyspark['age']+2)
df_pyspark.show()

+------+----+----+------+------------+
|  name| age| exp|salary|Add _2_years|
+------+----+----+------+------------+
|  adam|  20|   5|  1000|          22|
|   ewa|  21|   3|  2000|          23|
|marcin|  30|  14|  3000|          32|
|   jan|  23|null|  4000|          25|
|  null|null|  13|  5000|        null|
| kasia|null|null|  null|        null|
|   ala|null|  34|  6000|        null|
+------+----+----+------+------------+



In [72]:
# drop the columns:
df_pyspark=df_pyspark.drop('Add _2_years')
df_pyspark.show()

+------+----+----+------+
|  name| age| exp|salary|
+------+----+----+------+
|  adam|  20|   5|  1000|
|   ewa|  21|   3|  2000|
|marcin|  30|  14|  3000|
|   jan|  23|null|  4000|
|  null|null|  13|  5000|
| kasia|null|null|  null|
|   ala|null|  34|  6000|
+------+----+----+------+



In [73]:
# rename thecolumns:
df_pyspark=df_pyspark.withColumnRenamed('Name', 'New name')
df_pyspark.show()

+--------+----+----+------+
|New name| age| exp|salary|
+--------+----+----+------+
|    adam|  20|   5|  1000|
|     ewa|  21|   3|  2000|
|  marcin|  30|  14|  3000|
|     jan|  23|null|  4000|
|    null|null|  13|  5000|
|   kasia|null|null|  null|
|     ala|null|  34|  6000|
+--------+----+----+------+



In [82]:
# drop null values:
# na.drop(how='all' -> removes row only if all items are null/ how='any' -> default (empty parenthesis),removes row if any of the item is null)
dfnat=df_pyspark.na.drop(how='any', thresh=3) #thresh => at least how many not null values must be present
dfnat.show()

+--------+----+----+------+
|New name| age| exp|salary|
+--------+----+----+------+
|    adam|  20|   5|  1000|
|     ewa|  21|   3|  2000|
|  marcin|  30|  14|  3000|
|     jan|  23|null|  4000|
|     ala|null|  34|  6000|
+--------+----+----+------+



In [84]:
dfnas=df_pyspark.na.drop(how='any', subset=['exp']) #subset=> removes rows only where null value is in exp column
dfnas.show()

+--------+----+---+------+
|New name| age|exp|salary|
+--------+----+---+------+
|    adam|  20|  5|  1000|
|     ewa|  21|  3|  2000|
|  marcin|  30| 14|  3000|
|    null|null| 13|  5000|
|     ala|null| 34|  6000|
+--------+----+---+------+



In [91]:
dff=df_pyspark.na.fill("missing_val", subset='age')
dff.show()

+--------+----+----+------+
|New name| age| exp|salary|
+--------+----+----+------+
|    adam|  20|   5|  1000|
|     ewa|  21|   3|  2000|
|  marcin|  30|  14|  3000|
|     jan|  23|null|  4000|
|    null|null|  13|  5000|
|   kasia|null|null|  null|
|     ala|null|  34|  6000|
+--------+----+----+------+



In [99]:
df_pyspark.fillna({'New name':"missing",'age':0, 'exp':0, 'salary':0}).show()

+--------+---+---+------+
|New name|age|exp|salary|
+--------+---+---+------+
|    adam| 20|  5|  1000|
|     ewa| 21|  3|  2000|
|  marcin| 30| 14|  3000|
|     jan| 23|  0|  4000|
| missing|  0| 13|  5000|
|   kasia|  0|  0|     0|
|     ala|  0| 34|  6000|
+--------+---+---+------+



In [103]:
# imputer -> filling missing values with e.g. mean, median
from pyspark.ml.feature import Imputer

imputer=Imputer(inputCols=['age', 'exp', 'salary'],
               outputCols=[f'{c}_inputer' for c in ["age", "exp", "salary"]]).setStrategy('mean')

imputer.fit(df_pyspark).transform(df_pyspark).show()

+--------+----+----+------+-----------+-----------+--------------+
|New name| age| exp|salary|age_inputer|exp_inputer|salary_inputer|
+--------+----+----+------+-----------+-----------+--------------+
|    adam|  20|   5|  1000|         20|          5|          1000|
|     ewa|  21|   3|  2000|         21|          3|          2000|
|  marcin|  30|  14|  3000|         30|         14|          3000|
|     jan|  23|null|  4000|         23|         13|          4000|
|    null|null|  13|  5000|         23|         13|          5000|
|   kasia|null|null|  null|         23|         13|          3500|
|     ala|null|  34|  6000|         23|         34|          6000|
+--------+----+----+------+-----------+-----------+--------------+



In [109]:
# filter values:
df_pyspark.filter('salary>=4000').show()
# or
df_pyspark.filter( df_pyspark['salary']>=4000).show()

+--------+----+----+------+
|New name| age| exp|salary|
+--------+----+----+------+
|     jan|  23|null|  4000|
|    null|null|  13|  5000|
|     ala|null|  34|  6000|
+--------+----+----+------+

+--------+----+----+------+
|New name| age| exp|salary|
+--------+----+----+------+
|     jan|  23|null|  4000|
|    null|null|  13|  5000|
|     ala|null|  34|  6000|
+--------+----+----+------+



In [106]:
df_pyspark.filter('salary>=4000').select(['New name', 'exp']).show()

+--------+----+
|New name| exp|
+--------+----+
|     jan|null|
|    null|  13|
|     ala|  34|
+--------+----+



In [119]:
# multiple conditions:
df_pyspark.filter((df_pyspark['salary']>=4000) & (df_pyspark['salary']<6000)).show()

+--------+----+----+------+
|New name| age| exp|salary|
+--------+----+----+------+
|     jan|  23|null|  4000|
|    null|null|  13|  5000|
+--------+----+----+------+



In [121]:
# groupby:
df_pyspark.groupby('New name').max().show()

+--------+--------+--------+-----------+
|New name|max(age)|max(exp)|max(salary)|
+--------+--------+--------+-----------+
|    adam|      20|       5|       1000|
|    null|    null|      13|       5000|
|     ala|    null|      34|       6000|
|     ewa|      21|       3|       2000|
|  marcin|      30|      14|       3000|
|   kasia|    null|    null|       null|
|     jan|      23|    null|       4000|
+--------+--------+--------+-----------+



In [122]:
df_pyspark.groupby('New name').count().show()

+--------+-----+
|New name|count|
+--------+-----+
|    adam|    1|
|    null|    1|
|     ala|    1|
|     ewa|    1|
|  marcin|    1|
|   kasia|    1|
|     jan|    1|
+--------+-----+



In [123]:
# ML - simple example - salary prediction:
df_pyspark.columns

['New name', 'age', 'exp', 'salary']

In [132]:
# independent features are grouped:
from pyspark.ml.feature import VectorAssembler

featureassembler=VectorAssembler(inputCols=['age', 'exp'], outputCol='Independent Features', handleInvalid='skip') # omits null
                                                                                                                    #values
indf=featureassembler.transform(df_pyspark)
indf.show()

+--------+---+---+------+--------------------+
|New name|age|exp|salary|Independent Features|
+--------+---+---+------+--------------------+
|    adam| 20|  5|  1000|          [20.0,5.0]|
|     ewa| 21|  3|  2000|          [21.0,3.0]|
|  marcin| 30| 14|  3000|         [30.0,14.0]|
+--------+---+---+------+--------------------+



In [133]:
final=indf.select(['salary', 'Independent Features'])
final.show()

+------+--------------------+
|salary|Independent Features|
+------+--------------------+
|  1000|          [20.0,5.0]|
|  2000|          [21.0,3.0]|
|  3000|         [30.0,14.0]|
+------+--------------------+



In [136]:
from pyspark.ml.regression import LinearRegression

train, test=final.randomSplit([0.75, 0.25])
train.show()

+------+--------------------+
|salary|Independent Features|
+------+--------------------+
|  1000|          [20.0,5.0]|
|  3000|         [30.0,14.0]|
+------+--------------------+



In [137]:
model=LinearRegression(featuresCol='Independent Features', labelCol='salary')
model.fit(train)

LinearRegressionModel: uid=LinearRegression_26179adbaccb, numFeatures=2

In [140]:
prediction=model.evaluate(test)
prediction.show()

AttributeError: 'LinearRegression' object has no attribute 'evaluate'

In [13]:
spark.read.option