#import pyspark

In [575]:
#!pip install pyspark
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName('Practice').getOrCreate()

In [576]:
data = spark.read.csv('DATA.csv')
data.show()

+-------+---+------+----------+
|    _c0|_c1|   _c2|       _c3|
+-------+---+------+----------+
|   Name|Age|Income|Experience|
|   John| 23| 50000|        23|
|   Mary| 42| 67000|         4|
|    Eva| 34| 90000|         2|
|Ishmael| 53|100000|        53|
|  Locus| 45| 80000|         5|
|Kenneth| 36| 90000|         7|
|    Ava| 38|110000|         2|
| Nicole| 34|170000|        10|
|    Lee| 35|200000|         2|
|  Ethan| 22|140000|         4|
| Olivia| 27|100000|         1|
|  Lucas| 27| 50000|         1|
| Sophia| 29| 30000|         2|
+-------+---+------+----------+



In [577]:
data = spark.read.csv('DATA.csv', header = 'True')

In [578]:
data.show()

+-------+---+------+----------+
|   Name|Age|Income|Experience|
+-------+---+------+----------+
|   John| 23| 50000|        23|
|   Mary| 42| 67000|         4|
|    Eva| 34| 90000|         2|
|Ishmael| 53|100000|        53|
|  Locus| 45| 80000|         5|
|Kenneth| 36| 90000|         7|
|    Ava| 38|110000|         2|
| Nicole| 34|170000|        10|
|    Lee| 35|200000|         2|
|  Ethan| 22|140000|         4|
| Olivia| 27|100000|         1|
|  Lucas| 27| 50000|         1|
| Sophia| 29| 30000|         2|
+-------+---+------+----------+



In [579]:
type(spark)

pyspark.sql.session.SparkSession

In [580]:
data.columns

['Name', 'Age', 'Income', 'Experience']

In [581]:
data.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Income: string (nullable = true)
 |-- Experience: string (nullable = true)



In [582]:
data.head(5)

[Row(Name='John', Age='23', Income='50000', Experience='23'),
 Row(Name='Mary', Age='42', Income='67000', Experience='4'),
 Row(Name='Eva', Age='34', Income='90000', Experience='2'),
 Row(Name='Ishmael', Age='53', Income='100000', Experience='53'),
 Row(Name='Locus', Age='45', Income='80000', Experience='5')]

In [583]:
  data.tail(5)

[Row(Name='Lee', Age='35', Income='200000', Experience='2'),
 Row(Name='Ethan', Age='22', Income='140000', Experience='4'),
 Row(Name='Olivia', Age='27', Income='100000', Experience='1'),
 Row(Name='Lucas', Age='27', Income='50000', Experience='1'),
 Row(Name='Sophia', Age='29', Income='30000', Experience='2')]

In [584]:
data.select(['Name', 'age'])

DataFrame[Name: string, age: string]

In [585]:
data.select(['Name', 'age']).show()

+-------+---+
|   Name|age|
+-------+---+
|   John| 23|
|   Mary| 42|
|    Eva| 34|
|Ishmael| 53|
|  Locus| 45|
|Kenneth| 36|
|    Ava| 38|
| Nicole| 34|
|    Lee| 35|
|  Ethan| 22|
| Olivia| 27|
|  Lucas| 27|
| Sophia| 29|
+-------+---+



In [586]:
data['Name']

Column<'Name'>

In [587]:
data.dtypes

[('Name', 'string'),
 ('Age', 'string'),
 ('Income', 'string'),
 ('Experience', 'string')]

In [588]:
data.describe().show()

+-------+------+-----------------+------------------+------------------+
|summary|  Name|              Age|            Income|        Experience|
+-------+------+-----------------+------------------+------------------+
|  count|    13|               13|                13|                13|
|   mean|  NULL|34.23076923076923| 98230.76923076923| 8.923076923076923|
| stddev|  NULL|8.917715011461231|48432.347740867444|14.511268830702678|
|    min|   Ava|               22|            100000|                 1|
|    max|Sophia|               53|             90000|                 7|
+-------+------+-----------------+------------------+------------------+



In [589]:
### Adding Columns in data frame
data = data.withColumn('New Col', data['age']+2)
data.show()

+-------+---+------+----------+-------+
|   Name|Age|Income|Experience|New Col|
+-------+---+------+----------+-------+
|   John| 23| 50000|        23|   25.0|
|   Mary| 42| 67000|         4|   44.0|
|    Eva| 34| 90000|         2|   36.0|
|Ishmael| 53|100000|        53|   55.0|
|  Locus| 45| 80000|         5|   47.0|
|Kenneth| 36| 90000|         7|   38.0|
|    Ava| 38|110000|         2|   40.0|
| Nicole| 34|170000|        10|   36.0|
|    Lee| 35|200000|         2|   37.0|
|  Ethan| 22|140000|         4|   24.0|
| Olivia| 27|100000|         1|   29.0|
|  Lucas| 27| 50000|         1|   29.0|
| Sophia| 29| 30000|         2|   31.0|
+-------+---+------+----------+-------+



In [590]:
data = data.drop('New Col')        #data = data.drop('[New Col]') doesn't work
data.show()

+-------+---+------+----------+
|   Name|Age|Income|Experience|
+-------+---+------+----------+
|   John| 23| 50000|        23|
|   Mary| 42| 67000|         4|
|    Eva| 34| 90000|         2|
|Ishmael| 53|100000|        53|
|  Locus| 45| 80000|         5|
|Kenneth| 36| 90000|         7|
|    Ava| 38|110000|         2|
| Nicole| 34|170000|        10|
|    Lee| 35|200000|         2|
|  Ethan| 22|140000|         4|
| Olivia| 27|100000|         1|
|  Lucas| 27| 50000|         1|
| Sophia| 29| 30000|         2|
+-------+---+------+----------+



In [591]:
data = data.withColumnRenamed('age', 'Age')
data.show()

+-------+---+------+----------+
|   Name|Age|Income|Experience|
+-------+---+------+----------+
|   John| 23| 50000|        23|
|   Mary| 42| 67000|         4|
|    Eva| 34| 90000|         2|
|Ishmael| 53|100000|        53|
|  Locus| 45| 80000|         5|
|Kenneth| 36| 90000|         7|
|    Ava| 38|110000|         2|
| Nicole| 34|170000|        10|
|    Lee| 35|200000|         2|
|  Ethan| 22|140000|         4|
| Olivia| 27|100000|         1|
|  Lucas| 27| 50000|         1|
| Sophia| 29| 30000|         2|
+-------+---+------+----------+



In [592]:
data.na.drop().show()

+-------+---+------+----------+
|   Name|Age|Income|Experience|
+-------+---+------+----------+
|   John| 23| 50000|        23|
|   Mary| 42| 67000|         4|
|    Eva| 34| 90000|         2|
|Ishmael| 53|100000|        53|
|  Locus| 45| 80000|         5|
|Kenneth| 36| 90000|         7|
|    Ava| 38|110000|         2|
| Nicole| 34|170000|        10|
|    Lee| 35|200000|         2|
|  Ethan| 22|140000|         4|
| Olivia| 27|100000|         1|
|  Lucas| 27| 50000|         1|
| Sophia| 29| 30000|         2|
+-------+---+------+----------+



In [593]:
data.na.drop(how="any").show()

+-------+---+------+----------+
|   Name|Age|Income|Experience|
+-------+---+------+----------+
|   John| 23| 50000|        23|
|   Mary| 42| 67000|         4|
|    Eva| 34| 90000|         2|
|Ishmael| 53|100000|        53|
|  Locus| 45| 80000|         5|
|Kenneth| 36| 90000|         7|
|    Ava| 38|110000|         2|
| Nicole| 34|170000|        10|
|    Lee| 35|200000|         2|
|  Ethan| 22|140000|         4|
| Olivia| 27|100000|         1|
|  Lucas| 27| 50000|         1|
| Sophia| 29| 30000|         2|
+-------+---+------+----------+



In [594]:
data.show()
data.na.drop(how="any", subset="Experience").show()

+-------+---+------+----------+
|   Name|Age|Income|Experience|
+-------+---+------+----------+
|   John| 23| 50000|        23|
|   Mary| 42| 67000|         4|
|    Eva| 34| 90000|         2|
|Ishmael| 53|100000|        53|
|  Locus| 45| 80000|         5|
|Kenneth| 36| 90000|         7|
|    Ava| 38|110000|         2|
| Nicole| 34|170000|        10|
|    Lee| 35|200000|         2|
|  Ethan| 22|140000|         4|
| Olivia| 27|100000|         1|
|  Lucas| 27| 50000|         1|
| Sophia| 29| 30000|         2|
+-------+---+------+----------+

+-------+---+------+----------+
|   Name|Age|Income|Experience|
+-------+---+------+----------+
|   John| 23| 50000|        23|
|   Mary| 42| 67000|         4|
|    Eva| 34| 90000|         2|
|Ishmael| 53|100000|        53|
|  Locus| 45| 80000|         5|
|Kenneth| 36| 90000|         7|
|    Ava| 38|110000|         2|
| Nicole| 34|170000|        10|
|    Lee| 35|200000|         2|
|  Ethan| 22|140000|         4|
| Olivia| 27|100000|         1|
|  Luca

In [595]:
data.na.fill('FILLED CELL', subset=['Experience', 'Income']).show()

+-------+---+------+----------+
|   Name|Age|Income|Experience|
+-------+---+------+----------+
|   John| 23| 50000|        23|
|   Mary| 42| 67000|         4|
|    Eva| 34| 90000|         2|
|Ishmael| 53|100000|        53|
|  Locus| 45| 80000|         5|
|Kenneth| 36| 90000|         7|
|    Ava| 38|110000|         2|
| Nicole| 34|170000|        10|
|    Lee| 35|200000|         2|
|  Ethan| 22|140000|         4|
| Olivia| 27|100000|         1|
|  Lucas| 27| 50000|         1|
| Sophia| 29| 30000|         2|
+-------+---+------+----------+



In [596]:
from pyspark.ml.feature import Imputer

data = data.withColumn('New Age', data.Age.cast('double'))
data = data.withColumn('New Experience', data.Experience.cast('double'))
data = data.withColumn('New Income', data.Experience.cast('double'))

imputer = Imputer(
    inputCols=['New Age', 'New Experience', 'New Income'],
    outputCols=["{}_imputed".format(c) for c in ['New Age', 'New Experience', 'New Income']]
    ).setStrategy("mean")

In [597]:
# Add imputation cols to df
imputer.fit(data).transform(data).show()

+-------+---+------+----------+-------+--------------+----------+---------------+----------------------+------------------+
|   Name|Age|Income|Experience|New Age|New Experience|New Income|New Age_imputed|New Experience_imputed|New Income_imputed|
+-------+---+------+----------+-------+--------------+----------+---------------+----------------------+------------------+
|   John| 23| 50000|        23|   23.0|          23.0|      23.0|           23.0|                  23.0|              23.0|
|   Mary| 42| 67000|         4|   42.0|           4.0|       4.0|           42.0|                   4.0|               4.0|
|    Eva| 34| 90000|         2|   34.0|           2.0|       2.0|           34.0|                   2.0|               2.0|
|Ishmael| 53|100000|        53|   53.0|          53.0|      53.0|           53.0|                  53.0|              53.0|
|  Locus| 45| 80000|         5|   45.0|           5.0|       5.0|           45.0|                   5.0|               5.0|
|Kenneth

In [598]:
data.dtypes

[('Name', 'string'),
 ('Age', 'string'),
 ('Income', 'string'),
 ('Experience', 'string'),
 ('New Age', 'double'),
 ('New Experience', 'double'),
 ('New Income', 'double')]

In [599]:
data.filter("Age<30").show()

+------+---+------+----------+-------+--------------+----------+
|  Name|Age|Income|Experience|New Age|New Experience|New Income|
+------+---+------+----------+-------+--------------+----------+
|  John| 23| 50000|        23|   23.0|          23.0|      23.0|
| Ethan| 22|140000|         4|   22.0|           4.0|       4.0|
|Olivia| 27|100000|         1|   27.0|           1.0|       1.0|
| Lucas| 27| 50000|         1|   27.0|           1.0|       1.0|
|Sophia| 29| 30000|         2|   29.0|           2.0|       2.0|
+------+---+------+----------+-------+--------------+----------+



In [600]:
data.filter('Age<30').select(['Name', 'Age']).show()

+------+---+
|  Name|Age|
+------+---+
|  John| 23|
| Ethan| 22|
|Olivia| 27|
| Lucas| 27|
|Sophia| 29|
+------+---+



In [601]:
data.filter((data['Age']<=35) | (data['Experience']<=5)).show()

+------+---+------+----------+-------+--------------+----------+
|  Name|Age|Income|Experience|New Age|New Experience|New Income|
+------+---+------+----------+-------+--------------+----------+
|  John| 23| 50000|        23|   23.0|          23.0|      23.0|
|  Mary| 42| 67000|         4|   42.0|           4.0|       4.0|
|   Eva| 34| 90000|         2|   34.0|           2.0|       2.0|
| Locus| 45| 80000|         5|   45.0|           5.0|       5.0|
|   Ava| 38|110000|         2|   38.0|           2.0|       2.0|
|Nicole| 34|170000|        10|   34.0|          10.0|      10.0|
|   Lee| 35|200000|         2|   35.0|           2.0|       2.0|
| Ethan| 22|140000|         4|   22.0|           4.0|       4.0|
|Olivia| 27|100000|         1|   27.0|           1.0|       1.0|
| Lucas| 27| 50000|         1|   27.0|           1.0|       1.0|
|Sophia| 29| 30000|         2|   29.0|           2.0|       2.0|
+------+---+------+----------+-------+--------------+----------+



In [602]:
data.filter((data['Age']<35) & (data['Experience']<=5)).show()

+------+---+------+----------+-------+--------------+----------+
|  Name|Age|Income|Experience|New Age|New Experience|New Income|
+------+---+------+----------+-------+--------------+----------+
|   Eva| 34| 90000|         2|   34.0|           2.0|       2.0|
| Ethan| 22|140000|         4|   22.0|           4.0|       4.0|
|Olivia| 27|100000|         1|   27.0|           1.0|       1.0|
| Lucas| 27| 50000|         1|   27.0|           1.0|       1.0|
|Sophia| 29| 30000|         2|   29.0|           2.0|       2.0|
+------+---+------+----------+-------+--------------+----------+



In [603]:
data.filter(~(data['Age']>34)).show()

+------+---+------+----------+-------+--------------+----------+
|  Name|Age|Income|Experience|New Age|New Experience|New Income|
+------+---+------+----------+-------+--------------+----------+
|  John| 23| 50000|        23|   23.0|          23.0|      23.0|
|   Eva| 34| 90000|         2|   34.0|           2.0|       2.0|
|Nicole| 34|170000|        10|   34.0|          10.0|      10.0|
| Ethan| 22|140000|         4|   22.0|           4.0|       4.0|
|Olivia| 27|100000|         1|   27.0|           1.0|       1.0|
| Lucas| 27| 50000|         1|   27.0|           1.0|       1.0|
|Sophia| 29| 30000|         2|   29.0|           2.0|       2.0|
+------+---+------+----------+-------+--------------+----------+



In [604]:
data.groupBy('Age').max().show()

+---+------------+-------------------+---------------+
|Age|max(New Age)|max(New Experience)|max(New Income)|
+---+------------+-------------------+---------------+
| 29|        29.0|                2.0|            2.0|
| 42|        42.0|                4.0|            4.0|
| 34|        34.0|               10.0|           10.0|
| 22|        22.0|                4.0|            4.0|
| 35|        35.0|                2.0|            2.0|
| 27|        27.0|                1.0|            1.0|
| 23|        23.0|               23.0|           23.0|
| 38|        38.0|                2.0|            2.0|
| 53|        53.0|               53.0|           53.0|
| 36|        36.0|                7.0|            7.0|
| 45|        45.0|                5.0|            5.0|
+---+------------+-------------------+---------------+



In [605]:
data.groupBy(['Name', 'Age']).mean().show()

+-------+---+------------+-------------------+---------------+
|   Name|Age|avg(New Age)|avg(New Experience)|avg(New Income)|
+-------+---+------------+-------------------+---------------+
|   John| 23|        23.0|               23.0|           23.0|
|Kenneth| 36|        36.0|                7.0|            7.0|
|Ishmael| 53|        53.0|               53.0|           53.0|
|    Ava| 38|        38.0|                2.0|            2.0|
| Olivia| 27|        27.0|                1.0|            1.0|
|    Lee| 35|        35.0|                2.0|            2.0|
|  Ethan| 22|        22.0|                4.0|            4.0|
|  Lucas| 27|        27.0|                1.0|            1.0|
|  Locus| 45|        45.0|                5.0|            5.0|
| Sophia| 29|        29.0|                2.0|            2.0|
|    Eva| 34|        34.0|                2.0|            2.0|
|   Mary| 42|        42.0|                4.0|            4.0|
| Nicole| 34|        34.0|               10.0|         

In [606]:
data.groupBy(['Name', 'Age']).count().show()

+-------+---+-----+
|   Name|Age|count|
+-------+---+-----+
|   John| 23|    1|
|Kenneth| 36|    1|
|Ishmael| 53|    1|
|    Ava| 38|    1|
| Olivia| 27|    1|
|    Lee| 35|    1|
|  Ethan| 22|    1|
|  Lucas| 27|    1|
|  Locus| 45|    1|
| Sophia| 29|    1|
|    Eva| 34|    1|
|   Mary| 42|    1|
| Nicole| 34|    1|
+-------+---+-----+



In [607]:
data.groupBy('Age').count().show()

+---+-----+
|Age|count|
+---+-----+
| 29|    1|
| 42|    1|
| 34|    2|
| 22|    1|
| 35|    1|
| 27|    2|
| 23|    1|
| 38|    1|
| 53|    1|
| 36|    1|
| 45|    1|
+---+-----+



In [608]:
data.agg({"Age": "max"}).show()

+--------+
|max(Age)|
+--------+
|      53|
+--------+



In [609]:
data.sort('Age').show()

+-------+---+------+----------+-------+--------------+----------+
|   Name|Age|Income|Experience|New Age|New Experience|New Income|
+-------+---+------+----------+-------+--------------+----------+
|  Ethan| 22|140000|         4|   22.0|           4.0|       4.0|
|   John| 23| 50000|        23|   23.0|          23.0|      23.0|
| Olivia| 27|100000|         1|   27.0|           1.0|       1.0|
|  Lucas| 27| 50000|         1|   27.0|           1.0|       1.0|
| Sophia| 29| 30000|         2|   29.0|           2.0|       2.0|
|    Eva| 34| 90000|         2|   34.0|           2.0|       2.0|
| Nicole| 34|170000|        10|   34.0|          10.0|      10.0|
|    Lee| 35|200000|         2|   35.0|           2.0|       2.0|
|Kenneth| 36| 90000|         7|   36.0|           7.0|       7.0|
|    Ava| 38|110000|         2|   38.0|           2.0|       2.0|
|   Mary| 42| 67000|         4|   42.0|           4.0|       4.0|
|  Locus| 45| 80000|         5|   45.0|           5.0|       5.0|
|Ishmael| 

In [610]:
data.sort(['Age', 'Name']).show()

+-------+---+------+----------+-------+--------------+----------+
|   Name|Age|Income|Experience|New Age|New Experience|New Income|
+-------+---+------+----------+-------+--------------+----------+
|  Ethan| 22|140000|         4|   22.0|           4.0|       4.0|
|   John| 23| 50000|        23|   23.0|          23.0|      23.0|
|  Lucas| 27| 50000|         1|   27.0|           1.0|       1.0|
| Olivia| 27|100000|         1|   27.0|           1.0|       1.0|
| Sophia| 29| 30000|         2|   29.0|           2.0|       2.0|
|    Eva| 34| 90000|         2|   34.0|           2.0|       2.0|
| Nicole| 34|170000|        10|   34.0|          10.0|      10.0|
|    Lee| 35|200000|         2|   35.0|           2.0|       2.0|
|Kenneth| 36| 90000|         7|   36.0|           7.0|       7.0|
|    Ava| 38|110000|         2|   38.0|           2.0|       2.0|
|   Mary| 42| 67000|         4|   42.0|           4.0|       4.0|
|  Locus| 45| 80000|         5|   45.0|           5.0|       5.0|
|Ishmael| 

In [611]:
from pyspark.ml.feature import VectorAssembler

featureassembler=VectorAssembler(inputCols=["New Age","New Experience"],outputCol="Independent Features")

In [612]:
data=featureassembler.transform(data)

In [613]:
data.show()

+-------+---+------+----------+-------+--------------+----------+--------------------+
|   Name|Age|Income|Experience|New Age|New Experience|New Income|Independent Features|
+-------+---+------+----------+-------+--------------+----------+--------------------+
|   John| 23| 50000|        23|   23.0|          23.0|      23.0|         [23.0,23.0]|
|   Mary| 42| 67000|         4|   42.0|           4.0|       4.0|          [42.0,4.0]|
|    Eva| 34| 90000|         2|   34.0|           2.0|       2.0|          [34.0,2.0]|
|Ishmael| 53|100000|        53|   53.0|          53.0|      53.0|         [53.0,53.0]|
|  Locus| 45| 80000|         5|   45.0|           5.0|       5.0|          [45.0,5.0]|
|Kenneth| 36| 90000|         7|   36.0|           7.0|       7.0|          [36.0,7.0]|
|    Ava| 38|110000|         2|   38.0|           2.0|       2.0|          [38.0,2.0]|
| Nicole| 34|170000|        10|   34.0|          10.0|      10.0|         [34.0,10.0]|
|    Lee| 35|200000|         2|   35.0|    

In [614]:
from pyspark.ml.regression import LinearRegression

In [615]:
train_data,test_data=data.na.drop().randomSplit([0.75,0.25])

In [616]:
reg = LinearRegression(featuresCol = "Independent Features", labelCol='New Income')

In [617]:
reg = reg.fit(train_data)

In [618]:
reg.coefficients

DenseVector([0.0, 1.0])

In [619]:
reg.intercept

0.0

In [620]:
### Prediction
pred=reg.evaluate(test_data)

In [622]:
pred.predictions.show()

+------+---+------+----------+-------+--------------+----------+--------------------+------------------+
|  Name|Age|Income|Experience|New Age|New Experience|New Income|Independent Features|        prediction|
+------+---+------+----------+-------+--------------+----------+--------------------+------------------+
|   Lee| 35|200000|         2|   35.0|           2.0|       2.0|          [35.0,2.0]|2.0000000000000018|
|  Mary| 42| 67000|         4|   42.0|           4.0|       4.0|          [42.0,4.0]| 4.000000000000002|
|Nicole| 34|170000|        10|   34.0|          10.0|      10.0|         [34.0,10.0]|              10.0|
|Olivia| 27|100000|         1|   27.0|           1.0|       1.0|          [27.0,1.0]|1.0000000000000013|
|Sophia| 29| 30000|         2|   29.0|           2.0|       2.0|          [29.0,2.0]|2.0000000000000013|
+------+---+------+----------+-------+--------------+----------+--------------------+------------------+



In [624]:
pred.meanAbsoluteError,pred.meanSquaredError, pred.r2

(1.2434497875801752e-15, 1.97215226305253e-30, 1.0)