#Import pyspark and create a spark session

In [191]:
#!pip install pyspark
from pyspark.sql import SparkSession

In [192]:
# Create a Spark session
spark = SparkSession.builder.appName('Practice').getOrCreate()

In [193]:
# Read the data
data = spark.read.csv('DATA.csv')
data.show()

+-------+-----+---------+----------+
|    _c0|  _c1|      _c2|       _c3|
+-------+-----+---------+----------+
|   Name|  Age|   Income|Experience|
|   John|23.00| 50000.00|      3.00|
|   Mary|42.00| 67000.00|      8.00|
|    Eva|34.00| 90000.00|      6.00|
|Ishmael|53.00|100000.00|     10.00|
|  Locus|45.00| 80000.00|      9.00|
|Kenneth|36.00| 90000.00|      6.00|
|    Ava|38.00|110000.00|      7.00|
| Nicole|34.00|170000.00|      4.00|
|    Lee|35.00|200000.00|      5.00|
|  Ethan|22.00|140000.00|      1.00|
| Olivia|27.00|100000.00|      2.00|
|  Lucas|27.00| 50000.00|      2.00|
| Sophia|29.00| 30000.00|      4.00|
+-------+-----+---------+----------+



In [194]:
# Reads a CSV file ('DATA.csv') into a Spark DataFrame named 'data' with the first row as headers
data = spark.read.csv('DATA.csv', header = 'True')

In [195]:
data.show()

+-------+-----+---------+----------+
|   Name|  Age|   Income|Experience|
+-------+-----+---------+----------+
|   John|23.00| 50000.00|      3.00|
|   Mary|42.00| 67000.00|      8.00|
|    Eva|34.00| 90000.00|      6.00|
|Ishmael|53.00|100000.00|     10.00|
|  Locus|45.00| 80000.00|      9.00|
|Kenneth|36.00| 90000.00|      6.00|
|    Ava|38.00|110000.00|      7.00|
| Nicole|34.00|170000.00|      4.00|
|    Lee|35.00|200000.00|      5.00|
|  Ethan|22.00|140000.00|      1.00|
| Olivia|27.00|100000.00|      2.00|
|  Lucas|27.00| 50000.00|      2.00|
| Sophia|29.00| 30000.00|      4.00|
+-------+-----+---------+----------+



In [196]:
# Determines the type of the 'spark' variable, indicating the class or type of the Spark session or context
type(spark)

pyspark.sql.session.SparkSession

In [197]:
# Convert spark data frame to panda one
df_pandas = data.toPandas()
df_pandas

Unnamed: 0,Name,Age,Income,Experience
0,John,23.0,50000.0,3.0
1,Mary,42.0,67000.0,8.0
2,Eva,34.0,90000.0,6.0
3,Ishmael,53.0,100000.0,10.0
4,Locus,45.0,80000.0,9.0
5,Kenneth,36.0,90000.0,6.0
6,Ava,38.0,110000.0,7.0
7,Nicole,34.0,170000.0,4.0
8,Lee,35.0,200000.0,5.0
9,Ethan,22.0,140000.0,1.0


In [198]:
data.columns

['Name', 'Age', 'Income', 'Experience']

In [199]:
data.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Income: string (nullable = true)
 |-- Experience: string (nullable = true)



In [200]:
# Display the first 5 rows
data.head(5)

[Row(Name='John', Age='23.00', Income='50000.00', Experience='3.00'),
 Row(Name='Mary', Age='42.00', Income='67000.00', Experience='8.00'),
 Row(Name='Eva', Age='34.00', Income='90000.00', Experience='6.00'),
 Row(Name='Ishmael', Age='53.00', Income='100000.00', Experience='10.00'),
 Row(Name='Locus', Age='45.00', Income='80000.00', Experience='9.00')]

In [201]:
# Display the last 5 rows
data.tail(5)

[Row(Name='Lee', Age='35.00', Income='200000.00', Experience='5.00'),
 Row(Name='Ethan', Age='22.00', Income='140000.00', Experience='1.00'),
 Row(Name='Olivia', Age='27.00', Income='100000.00', Experience='2.00'),
 Row(Name='Lucas', Age='27.00', Income='50000.00', Experience='2.00'),
 Row(Name='Sophia', Age='29.00', Income='30000.00', Experience='4.00')]

In [202]:
# Select the 'Name' and 'age' columns
data.select(['Name', 'age'])

DataFrame[Name: string, age: string]

In [203]:
data.select(['Name', 'age']).show()

+-------+-----+
|   Name|  age|
+-------+-----+
|   John|23.00|
|   Mary|42.00|
|    Eva|34.00|
|Ishmael|53.00|
|  Locus|45.00|
|Kenneth|36.00|
|    Ava|38.00|
| Nicole|34.00|
|    Lee|35.00|
|  Ethan|22.00|
| Olivia|27.00|
|  Lucas|27.00|
| Sophia|29.00|
+-------+-----+



In [204]:
data['Name']

Column<'Name'>

In [205]:
data.dtypes

[('Name', 'string'),
 ('Age', 'string'),
 ('Income', 'string'),
 ('Experience', 'string')]

In [206]:
# Show descriptive statistics
data.describe().show()

+-------+------+-----------------+------------------+------------------+
|summary|  Name|              Age|            Income|        Experience|
+-------+------+-----------------+------------------+------------------+
|  count|    13|               13|                13|                13|
|   mean|  NULL|34.23076923076923| 98230.76923076923| 5.153846153846154|
| stddev|  NULL|8.917715011461231|48432.347740867444|2.8238907511373337|
|    min|   Ava|            22.00|         100000.00|              1.00|
|    max|Sophia|            53.00|          90000.00|              9.00|
+-------+------+-----------------+------------------+------------------+



In [207]:
# Adding columns in data frame
data = data.withColumn('New Col', data['age']+2)
data.show()

+-------+-----+---------+----------+-------+
|   Name|  Age|   Income|Experience|New Col|
+-------+-----+---------+----------+-------+
|   John|23.00| 50000.00|      3.00|   25.0|
|   Mary|42.00| 67000.00|      8.00|   44.0|
|    Eva|34.00| 90000.00|      6.00|   36.0|
|Ishmael|53.00|100000.00|     10.00|   55.0|
|  Locus|45.00| 80000.00|      9.00|   47.0|
|Kenneth|36.00| 90000.00|      6.00|   38.0|
|    Ava|38.00|110000.00|      7.00|   40.0|
| Nicole|34.00|170000.00|      4.00|   36.0|
|    Lee|35.00|200000.00|      5.00|   37.0|
|  Ethan|22.00|140000.00|      1.00|   24.0|
| Olivia|27.00|100000.00|      2.00|   29.0|
|  Lucas|27.00| 50000.00|      2.00|   29.0|
| Sophia|29.00| 30000.00|      4.00|   31.0|
+-------+-----+---------+----------+-------+



In [208]:
# Remove a column
data = data.drop('New Col')        #data = data.drop(['New Col']) doesn't work
data.show()

+-------+-----+---------+----------+
|   Name|  Age|   Income|Experience|
+-------+-----+---------+----------+
|   John|23.00| 50000.00|      3.00|
|   Mary|42.00| 67000.00|      8.00|
|    Eva|34.00| 90000.00|      6.00|
|Ishmael|53.00|100000.00|     10.00|
|  Locus|45.00| 80000.00|      9.00|
|Kenneth|36.00| 90000.00|      6.00|
|    Ava|38.00|110000.00|      7.00|
| Nicole|34.00|170000.00|      4.00|
|    Lee|35.00|200000.00|      5.00|
|  Ethan|22.00|140000.00|      1.00|
| Olivia|27.00|100000.00|      2.00|
|  Lucas|27.00| 50000.00|      2.00|
| Sophia|29.00| 30000.00|      4.00|
+-------+-----+---------+----------+



In [209]:
# Rename a column
data = data.withColumnRenamed('age', 'Age')
data.show()

+-------+-----+---------+----------+
|   Name|  Age|   Income|Experience|
+-------+-----+---------+----------+
|   John|23.00| 50000.00|      3.00|
|   Mary|42.00| 67000.00|      8.00|
|    Eva|34.00| 90000.00|      6.00|
|Ishmael|53.00|100000.00|     10.00|
|  Locus|45.00| 80000.00|      9.00|
|Kenneth|36.00| 90000.00|      6.00|
|    Ava|38.00|110000.00|      7.00|
| Nicole|34.00|170000.00|      4.00|
|    Lee|35.00|200000.00|      5.00|
|  Ethan|22.00|140000.00|      1.00|
| Olivia|27.00|100000.00|      2.00|
|  Lucas|27.00| 50000.00|      2.00|
| Sophia|29.00| 30000.00|      4.00|
+-------+-----+---------+----------+



In [210]:
# Dropping rows with missing values
data.na.drop().show()

+-------+-----+---------+----------+
|   Name|  Age|   Income|Experience|
+-------+-----+---------+----------+
|   John|23.00| 50000.00|      3.00|
|   Mary|42.00| 67000.00|      8.00|
|    Eva|34.00| 90000.00|      6.00|
|Ishmael|53.00|100000.00|     10.00|
|  Locus|45.00| 80000.00|      9.00|
|Kenneth|36.00| 90000.00|      6.00|
|    Ava|38.00|110000.00|      7.00|
| Nicole|34.00|170000.00|      4.00|
|    Lee|35.00|200000.00|      5.00|
|  Ethan|22.00|140000.00|      1.00|
| Olivia|27.00|100000.00|      2.00|
|  Lucas|27.00| 50000.00|      2.00|
| Sophia|29.00| 30000.00|      4.00|
+-------+-----+---------+----------+



In [211]:
# Dropping rows with any missing values
data.na.drop(how="any").show()

+-------+-----+---------+----------+
|   Name|  Age|   Income|Experience|
+-------+-----+---------+----------+
|   John|23.00| 50000.00|      3.00|
|   Mary|42.00| 67000.00|      8.00|
|    Eva|34.00| 90000.00|      6.00|
|Ishmael|53.00|100000.00|     10.00|
|  Locus|45.00| 80000.00|      9.00|
|Kenneth|36.00| 90000.00|      6.00|
|    Ava|38.00|110000.00|      7.00|
| Nicole|34.00|170000.00|      4.00|
|    Lee|35.00|200000.00|      5.00|
|  Ethan|22.00|140000.00|      1.00|
| Olivia|27.00|100000.00|      2.00|
|  Lucas|27.00| 50000.00|      2.00|
| Sophia|29.00| 30000.00|      4.00|
+-------+-----+---------+----------+



In [212]:
# dropping rows with any missing values in the 'Experience' column
data.show()
data.na.drop(how="any", subset="Experience").show()

+-------+-----+---------+----------+
|   Name|  Age|   Income|Experience|
+-------+-----+---------+----------+
|   John|23.00| 50000.00|      3.00|
|   Mary|42.00| 67000.00|      8.00|
|    Eva|34.00| 90000.00|      6.00|
|Ishmael|53.00|100000.00|     10.00|
|  Locus|45.00| 80000.00|      9.00|
|Kenneth|36.00| 90000.00|      6.00|
|    Ava|38.00|110000.00|      7.00|
| Nicole|34.00|170000.00|      4.00|
|    Lee|35.00|200000.00|      5.00|
|  Ethan|22.00|140000.00|      1.00|
| Olivia|27.00|100000.00|      2.00|
|  Lucas|27.00| 50000.00|      2.00|
| Sophia|29.00| 30000.00|      4.00|
+-------+-----+---------+----------+

+-------+-----+---------+----------+
|   Name|  Age|   Income|Experience|
+-------+-----+---------+----------+
|   John|23.00| 50000.00|      3.00|
|   Mary|42.00| 67000.00|      8.00|
|    Eva|34.00| 90000.00|      6.00|
|Ishmael|53.00|100000.00|     10.00|
|  Locus|45.00| 80000.00|      9.00|
|Kenneth|36.00| 90000.00|      6.00|
|    Ava|38.00|110000.00|      7.00|


In [213]:
# Filling missing values
data.na.fill('FILLED CELL', subset=['Experience', 'Income']).show()

+-------+-----+---------+----------+
|   Name|  Age|   Income|Experience|
+-------+-----+---------+----------+
|   John|23.00| 50000.00|      3.00|
|   Mary|42.00| 67000.00|      8.00|
|    Eva|34.00| 90000.00|      6.00|
|Ishmael|53.00|100000.00|     10.00|
|  Locus|45.00| 80000.00|      9.00|
|Kenneth|36.00| 90000.00|      6.00|
|    Ava|38.00|110000.00|      7.00|
| Nicole|34.00|170000.00|      4.00|
|    Lee|35.00|200000.00|      5.00|
|  Ethan|22.00|140000.00|      1.00|
| Olivia|27.00|100000.00|      2.00|
|  Lucas|27.00| 50000.00|      2.00|
| Sophia|29.00| 30000.00|      4.00|
+-------+-----+---------+----------+



In [214]:
from pyspark.ml.feature import Imputer

# Converts 'Age', 'Experience', and 'Income' columns to double and creates new columns 'New Age', 'New Experience', 'New Income'
data = data.withColumn('New Age', data.Age.cast('double'))
data = data.withColumn('New Experience', data.Experience.cast('double'))
data = data.withColumn('New Income', data.Experience.cast('double'))

# Initializes an Imputer with input and output columns, and sets the strategy to 'mean'
imputer = Imputer(
    inputCols=['New Age', 'New Experience', 'New Income'],
    outputCols=["{}_imputed".format(c) for c in ['New Age', 'New Experience', 'New Income']]
    ).setStrategy("mean")

In [215]:
# Add imputation cols to df
imputer.fit(data).transform(data).show()

+-------+-----+---------+----------+-------+--------------+----------+---------------+----------------------+------------------+
|   Name|  Age|   Income|Experience|New Age|New Experience|New Income|New Age_imputed|New Experience_imputed|New Income_imputed|
+-------+-----+---------+----------+-------+--------------+----------+---------------+----------------------+------------------+
|   John|23.00| 50000.00|      3.00|   23.0|           3.0|       3.0|           23.0|                   3.0|               3.0|
|   Mary|42.00| 67000.00|      8.00|   42.0|           8.0|       8.0|           42.0|                   8.0|               8.0|
|    Eva|34.00| 90000.00|      6.00|   34.0|           6.0|       6.0|           34.0|                   6.0|               6.0|
|Ishmael|53.00|100000.00|     10.00|   53.0|          10.0|      10.0|           53.0|                  10.0|              10.0|
|  Locus|45.00| 80000.00|      9.00|   45.0|           9.0|       9.0|           45.0|           

In [216]:
data.dtypes

[('Name', 'string'),
 ('Age', 'string'),
 ('Income', 'string'),
 ('Experience', 'string'),
 ('New Age', 'double'),
 ('New Experience', 'double'),
 ('New Income', 'double')]

# Filtering Data

In [217]:
data.filter("Age<30").show()

+------+-----+---------+----------+-------+--------------+----------+
|  Name|  Age|   Income|Experience|New Age|New Experience|New Income|
+------+-----+---------+----------+-------+--------------+----------+
|  John|23.00| 50000.00|      3.00|   23.0|           3.0|       3.0|
| Ethan|22.00|140000.00|      1.00|   22.0|           1.0|       1.0|
|Olivia|27.00|100000.00|      2.00|   27.0|           2.0|       2.0|
| Lucas|27.00| 50000.00|      2.00|   27.0|           2.0|       2.0|
|Sophia|29.00| 30000.00|      4.00|   29.0|           4.0|       4.0|
+------+-----+---------+----------+-------+--------------+----------+



In [218]:
data.filter('Age<30').select(['Name', 'Age']).show()

+------+-----+
|  Name|  Age|
+------+-----+
|  John|23.00|
| Ethan|22.00|
|Olivia|27.00|
| Lucas|27.00|
|Sophia|29.00|
+------+-----+



In [219]:
data.filter((data['Age']<=35) | (data['Experience']<=5)).show()

+------+-----+---------+----------+-------+--------------+----------+
|  Name|  Age|   Income|Experience|New Age|New Experience|New Income|
+------+-----+---------+----------+-------+--------------+----------+
|  John|23.00| 50000.00|      3.00|   23.0|           3.0|       3.0|
|   Eva|34.00| 90000.00|      6.00|   34.0|           6.0|       6.0|
|Nicole|34.00|170000.00|      4.00|   34.0|           4.0|       4.0|
|   Lee|35.00|200000.00|      5.00|   35.0|           5.0|       5.0|
| Ethan|22.00|140000.00|      1.00|   22.0|           1.0|       1.0|
|Olivia|27.00|100000.00|      2.00|   27.0|           2.0|       2.0|
| Lucas|27.00| 50000.00|      2.00|   27.0|           2.0|       2.0|
|Sophia|29.00| 30000.00|      4.00|   29.0|           4.0|       4.0|
+------+-----+---------+----------+-------+--------------+----------+



In [220]:
data.filter((data['Age']<35) & (data['Experience']<=5)).show()

+------+-----+---------+----------+-------+--------------+----------+
|  Name|  Age|   Income|Experience|New Age|New Experience|New Income|
+------+-----+---------+----------+-------+--------------+----------+
|  John|23.00| 50000.00|      3.00|   23.0|           3.0|       3.0|
|Nicole|34.00|170000.00|      4.00|   34.0|           4.0|       4.0|
| Ethan|22.00|140000.00|      1.00|   22.0|           1.0|       1.0|
|Olivia|27.00|100000.00|      2.00|   27.0|           2.0|       2.0|
| Lucas|27.00| 50000.00|      2.00|   27.0|           2.0|       2.0|
|Sophia|29.00| 30000.00|      4.00|   29.0|           4.0|       4.0|
+------+-----+---------+----------+-------+--------------+----------+



In [221]:
data.filter(~(data['Age']>34)).show()

+------+-----+---------+----------+-------+--------------+----------+
|  Name|  Age|   Income|Experience|New Age|New Experience|New Income|
+------+-----+---------+----------+-------+--------------+----------+
|  John|23.00| 50000.00|      3.00|   23.0|           3.0|       3.0|
|   Eva|34.00| 90000.00|      6.00|   34.0|           6.0|       6.0|
|Nicole|34.00|170000.00|      4.00|   34.0|           4.0|       4.0|
| Ethan|22.00|140000.00|      1.00|   22.0|           1.0|       1.0|
|Olivia|27.00|100000.00|      2.00|   27.0|           2.0|       2.0|
| Lucas|27.00| 50000.00|      2.00|   27.0|           2.0|       2.0|
|Sophia|29.00| 30000.00|      4.00|   29.0|           4.0|       4.0|
+------+-----+---------+----------+-------+--------------+----------+



In [222]:
data.groupBy('Age').max().show()

+-----+------------+-------------------+---------------+
|  Age|max(New Age)|max(New Experience)|max(New Income)|
+-----+------------+-------------------+---------------+
|38.00|        38.0|                7.0|            7.0|
|42.00|        42.0|                8.0|            8.0|
|29.00|        29.0|                4.0|            4.0|
|22.00|        22.0|                1.0|            1.0|
|27.00|        27.0|                2.0|            2.0|
|35.00|        35.0|                5.0|            5.0|
|36.00|        36.0|                6.0|            6.0|
|45.00|        45.0|                9.0|            9.0|
|34.00|        34.0|                6.0|            6.0|
|53.00|        53.0|               10.0|           10.0|
|23.00|        23.0|                3.0|            3.0|
+-----+------------+-------------------+---------------+



In [223]:
data.groupBy(['Name', 'Age']).mean().show()

+-------+-----+------------+-------------------+---------------+
|   Name|  Age|avg(New Age)|avg(New Experience)|avg(New Income)|
+-------+-----+------------+-------------------+---------------+
|    Ava|38.00|        38.0|                7.0|            7.0|
|  Lucas|27.00|        27.0|                2.0|            2.0|
|    Lee|35.00|        35.0|                5.0|            5.0|
|   John|23.00|        23.0|                3.0|            3.0|
| Olivia|27.00|        27.0|                2.0|            2.0|
|    Eva|34.00|        34.0|                6.0|            6.0|
|   Mary|42.00|        42.0|                8.0|            8.0|
|Ishmael|53.00|        53.0|               10.0|           10.0|
|  Ethan|22.00|        22.0|                1.0|            1.0|
|Kenneth|36.00|        36.0|                6.0|            6.0|
| Nicole|34.00|        34.0|                4.0|            4.0|
| Sophia|29.00|        29.0|                4.0|            4.0|
|  Locus|45.00|        45

In [224]:
data.groupBy(['Name', 'Age']).count().show()

+-------+-----+-----+
|   Name|  Age|count|
+-------+-----+-----+
|    Ava|38.00|    1|
|  Lucas|27.00|    1|
|    Lee|35.00|    1|
|   John|23.00|    1|
| Olivia|27.00|    1|
|    Eva|34.00|    1|
|   Mary|42.00|    1|
|Ishmael|53.00|    1|
|  Ethan|22.00|    1|
|Kenneth|36.00|    1|
| Nicole|34.00|    1|
| Sophia|29.00|    1|
|  Locus|45.00|    1|
+-------+-----+-----+



In [225]:
data.groupBy('Age').count().show()

+-----+-----+
|  Age|count|
+-----+-----+
|38.00|    1|
|42.00|    1|
|29.00|    1|
|22.00|    1|
|27.00|    2|
|35.00|    1|
|36.00|    1|
|45.00|    1|
|34.00|    2|
|53.00|    1|
|23.00|    1|
+-----+-----+



In [226]:
data.agg({"Age": "max"}).show()

+--------+
|max(Age)|
+--------+
|   53.00|
+--------+



In [227]:
data.sort('Age').show()

+-------+-----+---------+----------+-------+--------------+----------+
|   Name|  Age|   Income|Experience|New Age|New Experience|New Income|
+-------+-----+---------+----------+-------+--------------+----------+
|  Ethan|22.00|140000.00|      1.00|   22.0|           1.0|       1.0|
|   John|23.00| 50000.00|      3.00|   23.0|           3.0|       3.0|
| Olivia|27.00|100000.00|      2.00|   27.0|           2.0|       2.0|
|  Lucas|27.00| 50000.00|      2.00|   27.0|           2.0|       2.0|
| Sophia|29.00| 30000.00|      4.00|   29.0|           4.0|       4.0|
|    Eva|34.00| 90000.00|      6.00|   34.0|           6.0|       6.0|
| Nicole|34.00|170000.00|      4.00|   34.0|           4.0|       4.0|
|    Lee|35.00|200000.00|      5.00|   35.0|           5.0|       5.0|
|Kenneth|36.00| 90000.00|      6.00|   36.0|           6.0|       6.0|
|    Ava|38.00|110000.00|      7.00|   38.0|           7.0|       7.0|
|   Mary|42.00| 67000.00|      8.00|   42.0|           8.0|       8.0|
|  Loc

In [228]:
data.sort(['Age', 'Name']).show()

+-------+-----+---------+----------+-------+--------------+----------+
|   Name|  Age|   Income|Experience|New Age|New Experience|New Income|
+-------+-----+---------+----------+-------+--------------+----------+
|  Ethan|22.00|140000.00|      1.00|   22.0|           1.0|       1.0|
|   John|23.00| 50000.00|      3.00|   23.0|           3.0|       3.0|
|  Lucas|27.00| 50000.00|      2.00|   27.0|           2.0|       2.0|
| Olivia|27.00|100000.00|      2.00|   27.0|           2.0|       2.0|
| Sophia|29.00| 30000.00|      4.00|   29.0|           4.0|       4.0|
|    Eva|34.00| 90000.00|      6.00|   34.0|           6.0|       6.0|
| Nicole|34.00|170000.00|      4.00|   34.0|           4.0|       4.0|
|    Lee|35.00|200000.00|      5.00|   35.0|           5.0|       5.0|
|Kenneth|36.00| 90000.00|      6.00|   36.0|           6.0|       6.0|
|    Ava|38.00|110000.00|      7.00|   38.0|           7.0|       7.0|
|   Mary|42.00| 67000.00|      8.00|   42.0|           8.0|       8.0|
|  Loc

# Machine Learning

In [229]:
from pyspark.ml.feature import VectorAssembler

featureassembler=VectorAssembler(inputCols=["New Age","New Experience"],outputCol="Independent Features")

In [230]:
data=featureassembler.transform(data)

In [231]:
data.show()

+-------+-----+---------+----------+-------+--------------+----------+--------------------+
|   Name|  Age|   Income|Experience|New Age|New Experience|New Income|Independent Features|
+-------+-----+---------+----------+-------+--------------+----------+--------------------+
|   John|23.00| 50000.00|      3.00|   23.0|           3.0|       3.0|          [23.0,3.0]|
|   Mary|42.00| 67000.00|      8.00|   42.0|           8.0|       8.0|          [42.0,8.0]|
|    Eva|34.00| 90000.00|      6.00|   34.0|           6.0|       6.0|          [34.0,6.0]|
|Ishmael|53.00|100000.00|     10.00|   53.0|          10.0|      10.0|         [53.0,10.0]|
|  Locus|45.00| 80000.00|      9.00|   45.0|           9.0|       9.0|          [45.0,9.0]|
|Kenneth|36.00| 90000.00|      6.00|   36.0|           6.0|       6.0|          [36.0,6.0]|
|    Ava|38.00|110000.00|      7.00|   38.0|           7.0|       7.0|          [38.0,7.0]|
| Nicole|34.00|170000.00|      4.00|   34.0|           4.0|       4.0|          

In [232]:
from pyspark.ml.regression import LinearRegression

In [233]:
train_data,test_data=data.na.drop().randomSplit([0.75,0.25])

In [234]:
reg = LinearRegression(featuresCol = "Independent Features", labelCol='New Income')

In [235]:
reg = reg.fit(train_data)

In [236]:
reg.coefficients

DenseVector([-0.0, 1.0])

In [237]:
reg.intercept

1.2465905049208366e-13

In [238]:
### Prediction
pred=reg.evaluate(test_data)

In [239]:
pred.predictions.show()

+------+-----+---------+----------+-------+--------------+----------+--------------------+------------------+
|  Name|  Age|   Income|Experience|New Age|New Experience|New Income|Independent Features|        prediction|
+------+-----+---------+----------+-------+--------------+----------+--------------------+------------------+
|  John|23.00| 50000.00|      3.00|   23.0|           3.0|       3.0|          [23.0,3.0]|3.0000000000000355|
|   Lee|35.00|200000.00|      5.00|   35.0|           5.0|       5.0|          [35.0,5.0]| 4.999999999999996|
|Nicole|34.00|170000.00|      4.00|   34.0|           4.0|       4.0|          [34.0,4.0]|3.9999999999999853|
|Sophia|29.00| 30000.00|      4.00|   29.0|           4.0|       4.0|          [29.0,4.0]| 4.000000000000015|
+------+-----+---------+----------+-------+--------------+----------+--------------------+------------------+



In [240]:
pred.meanAbsoluteError,pred.meanSquaredError, pred.r2

(1.7430501486614958e-14, 4.311617885098592e-28, 1.0)