In [1]:
# Import findspark
import findspark

# Initialize findspark with the path of the local Spark installation.
# The path is a raw string so the Windows backslash stays literal: in a normal
# string "\s" is an invalid escape sequence, which newer Python versions warn
# about and may eventually reject.
findspark.init(r"D:\spark")

In [2]:
# Import SparkSession, SparkContext and the pyspark package
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark

In [3]:
# Import SparkSession
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession used by the rest of the notebook
spark = (
    SparkSession.builder
    .master("local")
    .appName("Linear Regression Model I")
    .config("spark.executor.memory", "1gb")
    .getOrCreate()
)

# Keep a handle on the session's underlying SparkContext
sc = spark.sparkContext

In [4]:
from pyspark.sql.types import *

In [5]:
# Describe the DataFrame schema: each add() call declares one column
# as (column name, type name)
schema = (
    StructType()
    .add("id", "integer")
    .add("name", "string")
    .add("qualification", "string")
    .add("age", "integer")
    .add("gender", "string")
)

In [6]:
# Sample rows, one tuple per person: (id, name, qualification, age, gender)
data = [
    (1, "John", "B.A.", 20, "Male"),
    (2, "Martha", "B.Com.", 20, "Female"),
    (3, "Mona", "B.Com.", 21, "Female"),
    (4, "Harish", "B.Sc.", 22, "Male"),
    (5, "Jonny", "B.A.", 22, "Male"),
    (6, "Maria", "B.A.", 23, "Female"),
    (7, "Monalisa", "B.A.", 21, "Female"),
]

In [7]:
# Build the DataFrame from the sample rows, applying the explicit schema
df = spark.createDataFrame(data=data, schema=schema)

# List the column names of the new DataFrame
df.columns

['id', 'name', 'qualification', 'age', 'gender']

In [8]:
# Display the rows of the DataFrame as a text table
df.show()

+---+--------+-------------+---+------+
| id|    name|qualification|age|gender|
+---+--------+-------------+---+------+
|  1|    John|         B.A.| 20|  Male|
|  2|  Martha|       B.Com.| 20|Female|
|  3|    Mona|       B.Com.| 21|Female|
|  4|  Harish|        B.Sc.| 22|  Male|
|  5|   Jonny|         B.A.| 22|  Male|
|  6|   Maria|         B.A.| 23|Female|
|  7|Monalisa|         B.A.| 21|Female|
+---+--------+-------------+---+------+



In [12]:
# Display only the "name" column
df.select('name').show()

+--------+
|    name|
+--------+
|    John|
|  Martha|
|    Mona|
|  Harish|
|   Jonny|
|   Maria|
|Monalisa|
+--------+



In [13]:
# Count the rows for each age and display the counts, highest age first
(
    df.groupby("age")
    .count()
    .sort("age", ascending=False)
    .show()
)

+---+-----+
|age|count|
+---+-----+
| 23|    1|
| 22|    2|
| 21|    2|
| 20|    2|
+---+-----+



In [9]:
#import required library
from pyspark.ml.feature import StringIndexer

In [10]:
# Map each distinct qualification string to a numeric index
qualification_indexer = StringIndexer(
    inputCol="qualification",
    outputCol="qualificationIndex",
)

# fit() learns the label-to-index mapping from df; transform() then returns
# df with the extra qualificationIndex column
df1 = qualification_indexer.fit(df).transform(df)
df1.show()

+---+--------+-------------+---+------+------------------+
| id|    name|qualification|age|gender|qualificationIndex|
+---+--------+-------------+---+------+------------------+
|  1|    John|         B.A.| 20|  Male|               0.0|
|  2|  Martha|       B.Com.| 20|Female|               1.0|
|  3|    Mona|       B.Com.| 21|Female|               1.0|
|  4|  Harish|        B.Sc.| 22|  Male|               2.0|
|  5|   Jonny|         B.A.| 22|  Male|               0.0|
|  6|   Maria|         B.A.| 23|Female|               0.0|
|  7|Monalisa|         B.A.| 21|Female|               0.0|
+---+--------+-------------+---+------+------------------+



In [11]:
# Map each distinct gender string to a numeric index
gender_indexer = StringIndexer(
    inputCol="gender",
    outputCol="genderIndex",
)

# fit() learns the label-to-index mapping from df; transform() then returns
# df with the extra genderIndex column
df2 = gender_indexer.fit(df).transform(df)
df2.show()

+---+--------+-------------+---+------+-----------+
| id|    name|qualification|age|gender|genderIndex|
+---+--------+-------------+---+------+-----------+
|  1|    John|         B.A.| 20|  Male|        1.0|
|  2|  Martha|       B.Com.| 20|Female|        0.0|
|  3|    Mona|       B.Com.| 21|Female|        0.0|
|  4|  Harish|        B.Sc.| 22|  Male|        1.0|
|  5|   Jonny|         B.A.| 22|  Male|        1.0|
|  6|   Maria|         B.A.| 23|Female|        0.0|
|  7|Monalisa|         B.A.| 21|Female|        0.0|
+---+--------+-------------+---+------+-----------+



In [14]:
#import module
from pyspark.ml import Pipeline

In [15]:
# Rebuild the schema and the DataFrame so the pipeline below starts from the
# original five columns
schema = (
    StructType()
    .add("id", "integer")
    .add("name", "string")
    .add("qualification", "string")
    .add("age", "integer")
    .add("gender", "string")
)
df = spark.createDataFrame(data=data, schema=schema)

In [16]:
# One StringIndexer per categorical column
qualification_indexer = StringIndexer(
    inputCol="qualification",
    outputCol="qualificationIndex",
)
gender_indexer = StringIndexer(
    inputCol="gender",
    outputCol="genderIndex",
)

# Chain both indexers so a single fit/transform applies them in order
pipeline = Pipeline(stages=[qualification_indexer, gender_indexer])

# NOTE: despite its name, `model` holds the DataFrame returned by transform(),
# not the fitted pipeline object returned by fit()
model = pipeline.fit(df).transform(df)
model.show()

+---+--------+-------------+---+------+------------------+-----------+
| id|    name|qualification|age|gender|qualificationIndex|genderIndex|
+---+--------+-------------+---+------+------------------+-----------+
|  1|    John|         B.A.| 20|  Male|               0.0|        1.0|
|  2|  Martha|       B.Com.| 20|Female|               1.0|        0.0|
|  3|    Mona|       B.Com.| 21|Female|               1.0|        0.0|
|  4|  Harish|        B.Sc.| 22|  Male|               2.0|        1.0|
|  5|   Jonny|         B.A.| 22|  Male|               0.0|        1.0|
|  6|   Maria|         B.A.| 23|Female|               0.0|        0.0|
|  7|Monalisa|         B.A.| 21|Female|               0.0|        0.0|
+---+--------+-------------+---+------+------------------+-----------+



In [17]:
# Optional cleanup, left disabled here: uncomment the line below to stop the
# SparkSession once the notebook is finished.
#spark.stop()