<a href="https://colab.research.google.com/github/kelvinfoo123/Projects-using-Spark/blob/main/Stroke_analysis_using_Pyspark_(Beginner).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark 
import pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 47 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 64.3 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=2835f6ae07c0abb298617554a262de08eb7f655e724c5e38a440ed762dcc0892
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [3]:
from pyspark.sql import SparkSession

# Configure a local-mode session ("local[*]" master) and then start it, or
# reuse the session that already exists in this kernel.
session_builder = SparkSession.builder.master("local[*]")
spark = session_builder.getOrCreate()

In [4]:
# Read the stroke dataset: the first row holds the column names and the
# column types are inferred from the file contents.
csv_reader = spark.read.options(header = True, inferSchema = True)
data = csv_reader.csv("stroke.csv")

# Preview the first five rows.
data.show(n = 5)

+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|   id|gender| age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
| 9046|  Male|67.0|           0|            1|         Yes|      Private|         Urban|           228.69|36.6|formerly smoked|     1|
|51676|Female|61.0|           0|            0|         Yes|Self-employed|         Rural|           202.21| N/A|   never smoked|     1|
|31112|  Male|80.0|           0|            1|         Yes|      Private|         Rural|           105.92|32.5|   never smoked|     1|
|60182|Female|49.0|           0|            0|         Yes|      Private|         Urban|           171.23|34.4|         smokes|     1|
| 1665|Female|79.0|           1|            0|         

In [6]:
# Report the dataset dimensions as (number of rows, number of columns).
row_total = data.count()
column_total = len(data.columns)
print("The shape of the data is ", (row_total, column_total))

The shape of the data is  (5110, 12)


In [7]:
# Show the inferred column names and types.
# Note that bmi is inferred as a string column (see the printed schema); the
# preview of the raw data shows "N/A" entries in that column.
data.printSchema()

root
 |-- id: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- ever_married: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- Residence_type: string (nullable = true)
 |-- avg_glucose_level: double (nullable = true)
 |-- bmi: string (nullable = true)
 |-- smoking_status: string (nullable = true)
 |-- stroke: integer (nullable = true)



In [8]:
# Convert age, bmi and average glucose level to integer columns.
# NOTE(review): as the later previews show, the cast drops the fractional part
# (e.g. 228.69 -> 228, 36.6 -> 36), and bmi entries that are not numeric
# ("N/A") become null, which is where the missing bmi values come from.
for column_name in ('age', 'bmi', 'avg_glucose_level'):
    data = data.withColumn(column_name, data[column_name].cast('integer'))

In [9]:
# Summary statistics for every column of the dataset
summary_stats = data.describe()
summary_stats.show()

+-------+-----------------+------+------------------+------------------+-------------------+------------+---------+--------------+------------------+------------------+--------------+-------------------+
|summary|               id|gender|               age|      hypertension|      heart_disease|ever_married|work_type|Residence_type| avg_glucose_level|               bmi|smoking_status|             stroke|
+-------+-----------------+------+------------------+------------------+-------------------+------------+---------+--------------+------------------+------------------+--------------+-------------------+
|  count|             5110|  5110|              5110|              5110|               5110|        5110|     5110|          5110|              5110|              4909|          5110|               5110|
|   mean|36517.82935420744|  null| 43.21526418786693|0.0974559686888454|0.05401174168297456|        null|     null|          null|105.65831702544031|28.448563862293746|          null| 

In [10]:
# Restrict the summary statistics to age and average glucose level
focus_columns = ['age', 'avg_glucose_level']
focus_stats = data.describe(focus_columns)
focus_stats.show()

+-------+------------------+------------------+
|summary|               age| avg_glucose_level|
+-------+------------------+------------------+
|  count|              5110|              5110|
|   mean| 43.21526418786693|105.65831702544031|
| stddev|22.633865752854746|45.275290309509536|
|    min|                 0|                55|
|    max|                82|               271|
+-------+------------------+------------------+



## **Data Analysis**

In [11]:
# Number of people in each stroke class (1 = stroke, 0 = no stroke)
stroke_counts = data.groupBy('stroke').count()
stroke_counts.show()

+------+-----+
|stroke|count|
+------+-----+
|     1|  249|
|     0| 4861|
+------+-----+



In [12]:
# Average age of people with and without stroke
# The grouping key is spelled exactly like the schema column ('stroke'), the
# same way the other cells refer to it, so the cell does not depend on Spark
# resolving column names case-insensitively.
data.groupby('stroke').agg({'age': 'mean'}).show()

+------+-----------------+
|Stroke|         avg(age)|
+------+-----------------+
|     1|67.72690763052209|
|     0|41.95967907837893|
+------+-----------------+



In [13]:
# Min age of people with and without stroke
# The grouping key is spelled exactly like the schema column ('stroke'), the
# same way the other cells refer to it, so the cell does not depend on Spark
# resolving column names case-insensitively.
data.groupby('stroke').agg({'age': 'min'}).show()

+------+--------+
|Stroke|min(age)|
+------+--------+
|     1|       1|
|     0|       0|
+------+--------+



In [14]:
# Check for missing values: for every column, count the rows whose value is
# NaN or null.

from pyspark.sql.functions import isnan, when, count, col

missing_count_exprs = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in data.columns
]
data.select(missing_count_exprs).show()

+---+------+---+------------+-------------+------------+---------+--------------+-----------------+---+--------------+------+
| id|gender|age|hypertension|heart_disease|ever_married|work_type|Residence_type|avg_glucose_level|bmi|smoking_status|stroke|
+---+------+---+------------+-------------+------------+---------+--------------+-----------------+---+--------------+------+
|  0|     0|  0|           0|            0|           0|        0|             0|                0|201|             0|     0|
+---+------+---+------------+-------------+------------+---------+--------------+-----------------+---+--------------+------+



In [15]:
# Preparation for filling the missing bmi values:
# inspect the median (50%) and the mean of bmi to choose a fill value.
bmi_only = data.select('bmi')
bmi_only.summary('50%', 'mean').show()

+-------+------------------+
|summary|               bmi|
+-------+------------------+
|    50%|                28|
|   mean|28.448563862293746|
+-------+------------------+



In [16]:
# Replace the missing bmi values with 28, the median reported by the bmi
# summary (the mean, about 28.45, has the same integer part).
bmi_fill_value = 28
data = data.fillna({'bmi': bmi_fill_value})

In [17]:
# Remove redundant features: keep every column except the 'id' identifier.
kept_columns = [name for name in data.columns if name != 'id']
data = data.select(kept_columns)

# Preview the first five rows of the reduced dataset.
data.show(n = 5)

+------+---+------------+-------------+------------+-------------+--------------+-----------------+---+---------------+------+
|gender|age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level|bmi| smoking_status|stroke|
+------+---+------------+-------------+------------+-------------+--------------+-----------------+---+---------------+------+
|  Male| 67|           0|            1|         Yes|      Private|         Urban|              228| 36|formerly smoked|     1|
|Female| 61|           0|            0|         Yes|Self-employed|         Rural|              202| 28|   never smoked|     1|
|  Male| 80|           0|            1|         Yes|      Private|         Rural|              105| 32|   never smoked|     1|
|Female| 49|           0|            0|         Yes|      Private|         Urban|              171| 34|         smokes|     1|
|Female| 79|           1|            0|         Yes|Self-employed|         Rural|              174| 24|   never

In [19]:
# Mean of all numeric variables wrt stroke
# Every non-string column is treated as numeric; one grouped table is shown
# per column.
# The loop variables are deliberately NOT named `col`: a `for col in ...`
# statement leaves its variable in the notebook namespace, which would
# overwrite the `col` function imported from pyspark.sql.functions and break
# any cell that calls `col(...)` afterwards (including a re-run of the
# missing-value check).
num_cols = [name for name, dtype in data.dtypes if dtype != 'string']

for numeric_column in [name.lower() for name in num_cols]:
  data.groupby('stroke').agg({numeric_column: 'mean'}).show()

+------+-----------------+
|stroke|         avg(age)|
+------+-----------------+
|     1|67.72690763052209|
|     0|41.95967907837893|
+------+-----------------+

+------+-------------------+
|stroke|  avg(hypertension)|
+------+-------------------+
|     1|0.26506024096385544|
|     0|0.08887060275663444|
+------+-------------------+

+------+-------------------+
|stroke| avg(heart_disease)|
+------+-------------------+
|     1|0.18875502008032127|
|     0|0.04710964822053076|
+------+-------------------+

+------+----------------------+
|stroke|avg(avg_glucose_level)|
+------+----------------------+
|     1|     132.0441767068273|
|     0|     104.3067270109031|
+------+----------------------+

+------+-----------------+
|stroke|         avg(bmi)|
+------+-----------------+
|     1|29.70281124497992|
|     0|28.36576836041967|
+------+-----------------+

+------+-----------+
|stroke|avg(stroke)|
+------+-----------+
|     1|        1.0|
|     0|        0.0|
+------+-----------+



In [24]:
# Categorize the age column into three contiguous bins:
#   age < 30        -> 'Adult'
#   30 <= age < 40  -> 'Middle age'
#   age >= 40       -> 'Old age'
# The second condition only needs the upper bound, because rows with
# age < 30 are already captured by the first branch. Writing it this way keeps
# age == 30 in 'Middle age'; a strict `age > 30` test would let it fall through
# to 'Old age'.
# NOTE(review): this overwrites the numeric 'age' column with text labels, so
# the cell should not be run twice on the same `data` -- reload the data before
# re-running.
data = data.withColumn(
    'age',
    when(data['age'] < 30, 'Adult')
    .when(data['age'] < 40, "Middle age")
    .otherwise('Old age'),
)

data.show(5)

+------+-------+------------+-------------+------------+-------------+--------------+-----------------+---+---------------+------+
|gender|    age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level|bmi| smoking_status|stroke|
+------+-------+------------+-------------+------------+-------------+--------------+-----------------+---+---------------+------+
|  Male|Old age|           0|            1|         Yes|      Private|         Urban|              228| 36|formerly smoked|     1|
|Female|Old age|           0|            0|         Yes|Self-employed|         Rural|              202| 28|   never smoked|     1|
|  Male|Old age|           0|            1|         Yes|      Private|         Rural|              105| 32|   never smoked|     1|
|Female|Old age|           0|            0|         Yes|      Private|         Urban|              171| 34|         smokes|     1|
|Female|Old age|           1|            0|         Yes|Self-employed|         Rura