use tick marks around words to make words look like `code`

look into beautiful soup on Python

## Packages used in this notebook

In [1]:
import pandas as pd
from scipy import stats

## Import data

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/juanklopper/TutorialData/main/heart_failure.csv')

In [3]:
df.head()
# ejection_fraction has to do with the heart's ability to pump blood

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,hypertension,platelets,serum_creatinine,serum_sodium,sex,smoking,time,death
0,75.0,No,582,No,20,Yes,265000.0,1.9,130,Male,No,4,Yes
1,55.0,No,7861,No,38,No,263358.03,1.1,136,Male,No,6,Yes
2,65.0,No,146,No,20,No,162000.0,1.3,129,Male,Yes,7,Yes
3,50.0,Yes,111,No,20,No,210000.0,1.9,137,Male,No,7,Yes
4,65.0,Yes,160,Yes,20,No,327000.0,2.7,116,Female,No,8,Yes


In [4]:
df.shape

(299, 13)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    object 
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    object 
 4   ejection_fraction         299 non-null    int64  
 5   hypertension              299 non-null    object 
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    object 
 10  smoking                   299 non-null    object 
 11  time                      299 non-null    int64  
 12  death                     299 non-null    object 
dtypes: float64(3), int64(4), object(6)
memory usage: 30.5+ KB


In [None]:
# List the columns in df
df.columns

## Exploring categorical variables

In [6]:
# Determine the unique classes for the anaemia column
# The possible values that can go in the column are also known as the sample space
df['anaemia'].unique()

array(['No', 'Yes'], dtype=object)

In [7]:
# Determine the value counts of the classes in the aneamia column
# note that you can use jsut df here because anaemia is ONE WORD
df['anaemia'].value_counts()

anaemia
No     170
Yes    129
Name: count, dtype: int64

In [9]:
# use the normalize parameter to get the proportions
df.anaemia.value_counts(normalize=True)

anaemia
No     0.568562
Yes    0.431438
Name: proportion, dtype: float64

In [10]:
df['diabetes'].value_counts(normalize=True)

diabetes
No     0.58194
Yes    0.41806
Name: proportion, dtype: float64

In [11]:
# use groupby to get comparative statistics
df.groupby('diabetes').anaemia.value_counts()

diabetes  anaemia
No        No         98
          Yes        76
Yes       No         72
          Yes        53
Name: count, dtype: int64

In [12]:
# Express the proportions as percentages
df.groupby('diabetes').anaemia.value_counts(normalize=True) * 100

diabetes  anaemia
No        No         56.321839
          Yes        43.678161
Yes       No         57.600000
          Yes        42.400000
Name: proportion, dtype: float64

In [13]:
# create contingency table aka table of observed data
# crosstab is a function in pandas
pd.crosstab(df.diabetes, df.anaemia)

anaemia,No,Yes
diabetes,Unnamed: 1_level_1,Unnamed: 2_level_1
No,98,76
Yes,72,53


In [14]:
pd.crosstab(df.diabetes, df.anaemia, normalize=True)

anaemia,No,Yes
diabetes,Unnamed: 1_level_1,Unnamed: 2_level_1
No,0.327759,0.254181
Yes,0.240803,0.177258


## Exploring numerical variables
* measures of central tendency (mean, median, mode) 
* measures of dispersion (range, variance, stddev, interquartile range)

In [15]:
df.age.mean()

60.83389297658862

In [16]:
# determine the range of the age column
# note the syntax for max and min is df.column_name.max() and df.column_name.min()
df.age.max() - df.age.min()

55.0

Make sure ddof (degrees of freedom) EQUALS 1! when using std and var!!

In [17]:
# pandas does not have a built-in function for IQR, but the scipy library does
stats.iqr(df.age)

19.0

Correlation between numerical variables

In [18]:
# calculate the correlation between age and ejection_fraction variables
df[['age', 'ejection_fraction']].corr()

Unnamed: 0,age,ejection_fraction
age,1.0,0.060098
ejection_fraction,0.060098,1.0


In [19]:
# describe the age column 
df.age.describe()

count    299.000000
mean      60.833893
std       11.894809
min       40.000000
25%       51.000000
50%       60.000000
75%       70.000000
max       95.000000
Name: age, dtype: float64

In [20]:
df.groupby('death').age.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
death,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,203.0,58.761906,10.63789,40.0,50.0,60.0,65.0,90.0
Yes,96.0,65.215281,13.214556,42.0,55.0,65.0,75.0,95.0


# Quiz

In [3]:
# Question 1
import pandas as pd

In [4]:
# Question 2
# you use the read_csv function to read a csv file into a pandas dataframe
# general syntax: df = pd.read_csv('insert_link')
# example
df = pd.read_csv('https://raw.githubusercontent.com/juanklopper/TutorialData/main/heart_failure.csv')

In [5]:
# 3. How do you display the first 5 rows of a DataFrame in pandas?
df.head() # use the head function to display the first five rows of the dataframe

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,hypertension,platelets,serum_creatinine,serum_sodium,sex,smoking,time,death
0,75.0,No,582,No,20,Yes,265000.0,1.9,130,Male,No,4,Yes
1,55.0,No,7861,No,38,No,263358.03,1.1,136,Male,No,6,Yes
2,65.0,No,146,No,20,No,162000.0,1.3,129,Male,Yes,7,Yes
3,50.0,Yes,111,No,20,No,210000.0,1.9,137,Male,No,7,Yes
4,65.0,Yes,160,Yes,20,No,327000.0,2.7,116,Female,No,8,Yes


In [8]:
# 4. How do you calculate the mean of a column named `Age` in a DataFrame named `df`?
df.Age.mean()

60.83389297658862

In [None]:
# 5. How do you calculate the median of a column named `Salary` in a DataFrame named `df`?
df.Salary.median()

In [None]:
# 6. How do you calculate the standard deviation of a column named `Score` in a DataFrame named `df`?
df.Score.std(ddof=1)

In [None]:
# 7. How do you find the number of missing values in each column of a DataFrame named `df`?
df.info() 
# this will give you the number of non-null values in each column
# you can subtract the non-null values from the total number of values to get the number of missing values

In [None]:
# 8. How do you calculate the correlation between two columns, `Age` and `Salary`, in a DataFrame named `df`?
df[['Age', 'Salary']].corr()
#or
df.Age.corr(df['Salary'])

In [None]:
# 9. How do you select a subset of a DataFrame `df` where the column `Age` is greater than 30?
df[df.Age > 30]

In [None]:
# 10. How do you calculate the range (maximum - minimum) of a column named `Score` in a DataFrame named `df`?
df.Score.max() - df.Score.min()

In [None]:
# 11. How do you group a DataFrame `df` by a column named `Department` and calculate the mean of `Salary` within each group?
df.groupby('Department').Salary.mean()

In [None]:
# 12. How do you group a DataFrame `df` by two columns, `Department` and `Job Title`, and count the number of rows within each group?
df.groupby(['Department', 'Job Title']).size()

In [None]:
# 13. How do you use the groupby method to find the maximum `Age` in each `Department` in a DataFrame `df`?
df.groupby('Department').Age.max()

In [None]:
# 14. How do you create a cross-tabulation table that shows the frequency count of `Department` (rows) and `Job Title` (columns) in a DataFrame `df`?
pd.crosstab(df.Department, df['Job Title'])


In [10]:
# 15. How do you create a cross-tabulation table that shows the mean `Salary` for each combination of `Department` (rows) and `Job Title` (columns) in a DataFrame `df`?
pd.crosstab(df.Department, df['Job Title'], values=df.Salary, aggfunc='mean')

AttributeError: 'DataFrame' object has no attribute 'Department'