In [None]:
# Visualizing and testing p-value for variable correlation

In [None]:
# import required libraries
import pandas as pd                # statistic calc
import matplotlib.pyplot as plt    # visualizations
from scipy.stats import ttest_ind  # hypoth testing

In [None]:
# Read the raw stroke CSV into a dataframe.
# NOTE(review): this is an absolute Google Drive path; it presumably only
# resolves inside a Colab session with Drive mounted - confirm before sharing.
f = "/content/drive/MyDrive/Data1501/Datasets/stroke.csv"
raw_csv = pd.read_csv(f)  # read_csv already returns a DataFrame

# Work on an explicit copy so `raw_csv` stays untouched by later edits;
# re-wrapping with pd.DataFrame(...) would not guarantee independent data.
df = raw_csv.copy()
df.head(10)  # check top 10 rows

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1192,Female,31,0,0,No,Govt_job,Rural,70.66,27.2,never smoked,0
1,77,Female,13,0,0,No,children,Rural,85.81,18.6,Unknown,0
2,59200,Male,18,0,0,No,Private,Urban,60.56,33.0,never smoked,0
3,24905,Female,65,0,0,Yes,Private,Urban,205.77,46.0,formerly smoked,1
4,24257,Male,4,0,0,No,children,Rural,90.42,16.2,Unknown,0
5,57210,Female,28,0,0,Yes,Private,Rural,131.8,30.3,never smoked,0
6,61103,Female,64,1,0,Yes,Self-employed,Urban,190.92,31.4,never smoked,0
7,6480,Male,62,0,0,No,Govt_job,Urban,93.55,31.7,never smoked,0
8,62983,Female,26,0,0,Yes,Private,Urban,138.02,20.3,smokes,0
9,50784,Male,63,0,0,Yes,Private,Rural,228.56,27.4,never smoked,1


In [None]:
# use groupby() to index avg_glucose_level by stroke outcome:
# the group key is the "stroke" column and the values collected in each
# group are that group's avg_glucose_level entries
grouped = df.groupby("stroke")["avg_glucose_level"]
grouped  # check that a SeriesGroupBy object is returned

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7932fabee9b0>

In [None]:
# collapse every group in `grouped` to a single value: its mean
group_mean = grouped.agg("mean")
group_mean  # display the per-group means

In [None]:
# horizontal boxplot: distribution of avg glucose level for each stroke group
fig, ax = plt.subplots()  # explicit figure/axes instead of implicit pyplot state
df.boxplot(column="avg_glucose_level", by="stroke", vert=False, ax=ax)

# pandas adds its own "Boxplot grouped by stroke" figure heading when `by`
# is used; clear it so only the descriptive title below is shown
fig.suptitle("")

# add detail to title to help users understand the graph
ax.set_title("Distribution of Average Glucose Levels Across Strokes")
# vert=False lays the boxes out horizontally, so the stroke groups sit on
# the y-axis and the glucose values run along the x-axis
ax.set_ylabel("0: NO STROKE \n 1: HAD STROKE")  # label y-axis (groups)
ax.set_xlabel("Average Glucose Level")          # label x-axis (values)
ax.set_xticks(range(25, 300, 25))
plt.show()                                      # show without obj info

In [None]:
# BOXPLOT SUMMARY:
  # The plot shows that significantly more strokes were observed for
  # higher glucose levels. However, there are several outliers for
  # sample members that had higher glucose levels but did not report
  # having had a stroke.

# WHAT IS A BOXPLOT?
  # The box itself represents the location of the middle 50% of data.
  # Ie, it shows where the median is as well as the 1st and 3rd quartile.
  # The whisker to the left of (below) the box extends from the 1st
    # quartile to the smallest value that is not flagged as an outlier.
  # The whisker to the right of (above) the box extends from the 3rd
    # quartile to the largest value that is not flagged as an outlier.
  # Any 'o' beyond the end of a whisker is an outlier: by default, a value
    # more than 1.5x the interquartile range away from the edge of the box.

# WHAT DOES THIS BOXPLOT IMPLY?
  # The top box (1) shows the distribution of average glucose levels
  # for sample members with a history of stroke(s).

  # The bottom box (0) shows the same distribution for sample members who
  # had not had a stroke.

  # The whiskers being longer to the right of both boxes shows that
  # when values fall higher than the 3rd quartile, they tend to fall
  # much higher. This is compared to values lower than the 1st quartile
  # tending to fall much closer to the boundary. This implies that
  # when people do have higher than average glucose levels, they
  # tend to have much higher than average.

  # For the top box (stroke reported), the upper part of the box (from
  # the median to the 3rd quartile) is much longer. This shows that,
  # among respondents who reported having had a stroke, the glucose
  # levels that were higher than the median value were spread over a
  # much wider range of high values. This suggests that higher average
  # glucose levels are associated with whether or not a person had a
  # stroke.

  # The large number of outliers present for the lower box (no stroke)
  # shows that many people had average glucose levels even higher than
  # the end of the upper whisker but had never had a stroke.

  # These two observations lead to the inference that having an average
  # glucose level that is higher than average is a strong indicator for
  # the presence of a stroke, but they do not show that the two are
  # causally related.

In [None]:
# FORMULATE HYPOTHESES:
  # Null Hypothesis:
    # There is no significant difference between the avg glucose levels
    # for those who had a stroke vs. the avg glucose levels for those who
    # have not had a stroke.
  # Alternate Hypothesis:
    # There is significant difference in the avg glucose levels for people
    # who had a stroke vs. the avg glucose levels for people who had not
    # had a stroke.

In [None]:
# TEST HYPOTHESES:
# Split the whole dataframe into one group per value of "stroke"
group_DF = df.groupby("stroke")

# narrow the groups down to the glucose column, then pull out each outcome
glucose_by_outcome = group_DF["avg_glucose_level"]
stroke_gluc = glucose_by_outcome.get_group(1)    # people who reported a stroke
nostroke_gluc = glucose_by_outcome.get_group(0)  # people who hadn't had a stroke

# compare the two sample means to test the population means; ttest_ind
# defaults to equal_var=True, i.e. the pooled-variance two-sample t-test
ttest_ind(stroke_gluc, nostroke_gluc)

TtestResult(statistic=6.342704697599813, pvalue=3.2548797729425533e-10, df=1134.0)

In [None]:
# INTERPRET p-VALUE:
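  # The observed p-value (about 3.3e-10) is far below the usual 0.05
  # significance level: a difference in sample means this large would be
  # very unlikely to occur by chance if the null hypothesis were true.
  # Therefore, we reject the null hypothesis in favor of the alternate
  # hypothesis.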

# WHY:
  # This means that the pattern seen in the sample, in which average
  # glucose levels tended to be higher for respondents who had
  # a stroke, is likely to also hold for the population from which
  # the sample was taken. Thus, it can be concluded that the same
  # significance would hold for the population: people who had
  # a stroke tend to have higher average glucose levels. This is
  # consistent with people who have higher average glucose levels
  # having an increased risk of stroke compared to people who have
  # lower average glucose levels, although the t-test by itself
  # does not establish causation.