## Import Libraries

In [4]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from pandas import DataFrame
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA

## Read Data Set from CSV and Verify

In [5]:
# Load the raw medical data set into the working frame; the path is
# resolved relative to the notebook's current working directory.
source_csv = 'medical_raw_data.csv'
med_df = pd.read_csv(source_csv)

FileNotFoundError: [Errno 2] No such file or directory: 'medical_raw_data.csv'

## Exploration of Data: 

### What is the Shape of the DataFrame (Rows & Columns):

In [None]:
# Dimensions of the frame as a (row count, column count) tuple.
med_df.shape

### DataFrame Inspection: View Information (Data Types, Index, Column Count, Memory Usage): 

In [None]:
# Print the full per-column summary (dtype and non-null count for every
# column), together with the index range and memory usage.
med_df.info(verbose=True)

### Inspect Start and End of data:

In [None]:
# Preview the first rows of the frame (head() defaults to five rows).
med_df.head()

In [None]:
# Preview the last rows of the frame (tail() defaults to five rows).
med_df.tail()

## Detect Missing Data: 

#### Check Whether Each Column Contains Missing Values (True if NA, Null, NaN):

In [None]:
# One boolean per column: True when that column holds at least one missing value.
med_df.isnull().any()

#### How Many NaNs

In [None]:
# Number of missing values in each column.
med_df.isnull().sum()

#### List Only Columns Containing At Least One Null

In [None]:
# Use the per-column null flags as a column mask so only the columns that
# contain at least one missing value are displayed.
med_df.loc[:, med_df.isnull().any()]

#### Any Rows with All Nulls?

In [None]:
# True if at least one row is missing a value in every column.
med_df.isnull().all(axis=1).any()

In [None]:
# Show the rows (if any) in which every value is missing.
med_df[med_df.isnull().all(axis=1)]

### Fill Missing Values and Verify

In [None]:
# Columns being imputed; each numeric one is filled with its own column mean.
cols_with_nulls = ['Children', 'Age', 'Income', 'Soft_drink', 'Overweight', 'Anxiety', 'Initial_days']
# Compute the means over just these columns with numeric_only=True. Calling
# med_df.mean() on the whole frame raises TypeError on pandas >= 2.0 as soon
# as the frame holds a non-numeric column. Any non-numeric column among the
# targets gets no mean and is therefore left untouched by fillna.
column_means = med_df[cols_with_nulls].mean(numeric_only=True)
med_df[cols_with_nulls] = med_df[cols_with_nulls].fillna(column_means)

In [None]:
# Flag every row that still has at least one missing value, then show those rows.
rows_to_fill = med_df.isna().any(axis=1)
med_df.loc[rows_to_fill]

#### Fill Missing Data With Mean

#### Backfill Soft Drink Series

In [None]:
# Back-fill the remaining gaps in Soft_drink with the next valid value below.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` argument
# is deprecated in recent pandas releases. A missing value in the final
# row(s) has nothing after it to copy from and stays NaN.
med_df[['Soft_drink']] = med_df[['Soft_drink']].bfill()

#### Verify Missing Values Filled

In [None]:
# Re-run the null-column filter; a result with no columns means nothing is missing.
med_df.loc[:, med_df.isnull().any()]

#### Verify Nulls are Filled (Different Way)

In [None]:
# Summarize only the imputed columns so their non-null counts can be
# compared with the frame's row count.
med_df[['Children', 'Age', 'Income', 'Soft_drink', 'Overweight', 'Anxiety', 'Initial_days']].info()

In [None]:
# Summary of the whole frame after filling (dtypes and non-null counts).
med_df.info()

### Delete Redundant Column

In [None]:
# Remove the 'Unnamed: 0' column in place (presumably a row index saved into
# the CSV by an earlier export -- confirm), then preview the result.
med_df.drop('Unnamed: 0', axis=1, inplace=True)
med_df.head()

### Using Boxplots to Help Detect Outliers

In [None]:
# Box plot of the frame's numeric columns on shared axes for a first look at outliers.
med_df.boxplot()

In [None]:
# Isolate the Children column, draw its box plot and show summary statistics.
Children_bp = med_df['Children']
Children_bp.plot(kind='box')
Children_bp.describe()

In [None]:
# Box-plot fences for Children: values outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are treated as outliers.
q1 = Children_bp.quantile(.25)
q3 = Children_bp.quantile(.75)
# interquartile range
iqr = q3 - q1
# whisker positions
pmin = q1 - 1.5 * iqr
pmax = q3 + 1.5 * iqr
# True where the value lies inside the fences (between() is False for NaN).
children_in_range = Children_bp.between(pmin, pmax)
# Three candidate treatments that keep the row and only replace the outlying
# value: with the mean, with the mode, or with NaN.
children_mean = Children_bp.where(children_in_range, Children_bp.mean())
# Series.mode() returns a Series (ties are possible). Passing it straight to
# where() aligns it on the index, so nearly every outlier would become NaN
# instead of the mode. Take the first mode as a scalar.
children_mode = Children_bp.where(children_in_range, Children_bp.mode().iloc[0])
new_children = Children_bp.where(children_in_range)

In [None]:
# Side-by-side view of the original Children values and each outlier treatment.
children_variants = {
    'before': Children_bp,
    'NaN': new_children,
    'mean': children_mean,
    'mode': children_mode,
}
compare = pd.DataFrame(children_variants)
compare.plot(kind='box')
compare.describe()

In [None]:
# Isolate the Age column, draw its box plot and show summary statistics.
Age_bp = med_df['Age']
Age_bp.plot(kind='box')
Age_bp.describe()

In [None]:
# Box-plot fences for Age: values outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are treated as outliers.
q1 = Age_bp.quantile(.25)
q3 = Age_bp.quantile(.75)
# interquartile range
iqr = q3 - q1
# whisker positions
pmin = q1 - 1.5 * iqr
pmax = q3 + 1.5 * iqr
# True where the value lies inside the fences (between() is False for NaN).
age_in_range = Age_bp.between(pmin, pmax)
# Three candidate treatments that keep the row and only replace the outlying
# value: with the mean, with the mode, or with NaN.
age_mean = Age_bp.where(age_in_range, Age_bp.mean())
# Series.mode() returns a Series; used directly, where() would align it on
# the index and turn nearly every outlier into NaN. Use the first mode as a
# scalar instead.
age_mode = Age_bp.where(age_in_range, Age_bp.mode().iloc[0])
new_age = Age_bp.where(age_in_range)

In [None]:
# Side-by-side view of the original Age values and each outlier treatment.
age_variants = {
    'before': Age_bp,
    'NaN': new_age,
    'mean': age_mean,
    'mode': age_mode,
}
compare = pd.DataFrame(age_variants)
compare.plot(kind='box')
compare.describe()

In [None]:
# Isolate the Income column, draw its box plot and show summary statistics.
Income_bp = med_df['Income']
Income_bp.plot(kind='box')
Income_bp.describe()

In [None]:
# Box-plot fences for Income: values outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are treated as outliers.
q1 = Income_bp.quantile(.25)
q3 = Income_bp.quantile(.75)
# interquartile range
iqr = q3 - q1
# whisker positions
pmin = q1 - 1.5 * iqr
pmax = q3 + 1.5 * iqr
# True where the value lies inside the fences (between() is False for NaN).
income_in_range = Income_bp.between(pmin, pmax)
# Three candidate treatments that keep the row and only replace the outlying
# value: with the mean, with the mode, or with NaN.
income_mean = Income_bp.where(income_in_range, Income_bp.mean())
# Series.mode() returns a Series; used directly, where() would align it on
# the index and turn nearly every outlier into NaN. Use the first mode as a
# scalar instead.
income_mode = Income_bp.where(income_in_range, Income_bp.mode().iloc[0])
new_income = Income_bp.where(income_in_range)

In [None]:
# Side-by-side view of the original Income values and each outlier treatment.
income_variants = {
    'before': Income_bp,
    'mean': income_mean,
    'mode': income_mode,
    'NaN': new_income,
}
compare = pd.DataFrame(income_variants)
compare.plot(kind='box')
compare.describe()

In [None]:
# Isolate the Initial_days column, draw its box plot and show summary statistics.
Initial_days_bp = med_df['Initial_days']
Initial_days_bp.plot(kind='box')
Initial_days_bp.describe()

In [None]:
# Box-plot fences for Initial_days: values outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are treated as outliers.
q1 = Initial_days_bp.quantile(.25)
q3 = Initial_days_bp.quantile(.75)
# interquartile range
iqr = q3 - q1
# whisker positions
pmin = q1 - 1.5 * iqr
pmax = q3 + 1.5 * iqr
# True where the value lies inside the fences (between() is False for NaN).
initial_days_in_range = Initial_days_bp.between(pmin, pmax)
# Three candidate treatments that keep the row and only replace the outlying
# value: with the mean, with the mode, or with NaN.
initial_days_mean = Initial_days_bp.where(initial_days_in_range, Initial_days_bp.mean())
# Series.mode() returns a Series; used directly, where() would align it on
# the index and turn nearly every outlier into NaN. Use the first mode as a
# scalar instead.
initial_days_mode = Initial_days_bp.where(initial_days_in_range, Initial_days_bp.mode().iloc[0])
new_initial_days = Initial_days_bp.where(initial_days_in_range)

In [None]:
# Side-by-side view of the original Initial_days values and each outlier treatment.
initial_days_variants = {
    'before': Initial_days_bp,
    'mean': initial_days_mean,
    'mode': initial_days_mode,
    'NaN': new_initial_days,
}
compare = pd.DataFrame(initial_days_variants)
compare.plot(kind='box')
compare.describe()

#### Save Changes Permanently

In [None]:
# Pair each current column with its mean-treated version so the effect of
# the outlier replacement can be compared column by column.
current_vs_treated = {
    'Children': med_df['Children'],
    'C_mean': children_mean,
    'Age': med_df['Age'],
    'A_mean': age_mean,
    'Income': med_df['Income'],
    'I_mean': income_mean,
    'Initial_days': med_df['Initial_days'],
    'ID_mean': initial_days_mean,
}
compare = pd.DataFrame(current_vs_treated)
compare.plot(kind='box')
compare.describe()

In [None]:
# Mean of Children as currently stored in the frame, for comparison after the replacement.
med_df['Children'].mean()

In [None]:
# Overwrite the column with the version whose out-of-fence values were replaced by the mean.
med_df['Children'] = children_mean

In [None]:
# Mean of Children again, to see how the stored column changed.
med_df['Children'].mean()

In [None]:
# Display the mean of the stored Children column.
# A discarded `med_df['Children'].where(...)` call used to precede this line.
# where() returns a new Series, so its result was never stored, and it
# reused pmin/pmax as last set by the Initial_days cell (when the notebook
# is run top to bottom) rather than the Children fences. It had no effect
# and has been removed.
med_df['Children'].mean()

In [None]:
# Store the mean-treated version of each of the four columns back into the frame.
treated_series = (
    ('Children', children_mean),
    ('Age', age_mean),
    ('Income', income_mean),
    ('Initial_days', initial_days_mean),
)
for column_name, treated in treated_series:
    med_df[column_name] = treated

In [None]:
# Box plot and summary statistics for the four treated columns.
treated_columns = ['Children', 'Age', 'Income', 'Initial_days']
med_df[treated_columns].boxplot()
med_df[treated_columns].describe()

In [None]:
# Re-read Income from the frame, then plot and summarize it again.
Income_bp = med_df['Income']
Income_bp.plot(kind='box')
Income_bp.describe()

In [None]:
# True if any row is an exact duplicate of an earlier row.
med_df.duplicated().any()

#### Verify Range and Series Counts All Match

In [None]:
# Compare each column's non-null count with the index range reported at the top.
med_df.info()

## Converting Data Types

In [None]:
# Data types of the two columns before conversion.
med_df[['Overweight', 'Anxiety']].info(verbose=True)

In [None]:
# Re-type the two columns as generic Python objects.
flag_columns = ['Overweight', 'Anxiety']
med_df[flag_columns] = med_df[flag_columns].astype('object')

In [None]:
# Data types of the two columns after conversion.
med_df[['Overweight', 'Anxiety']].info(verbose=True)

## Fixing Indices

#### Check if UID has any duplicates:

In [None]:
# Rows whose UID repeats an earlier row's UID; an empty result means UID is unique.
med_df[med_df.duplicated('UID')]

In [None]:
# Promote UID to the row index in place (the column is removed from the data,
# which is set_index's default), then preview the result.
med_df.set_index(keys='UID', inplace=True)
med_df.head()

#### Rename Some Column Headers for Consistency

In [None]:
# Old header -> new header. CamelCase names become Snake_case, and the
# generic Item1..Item8 headers get descriptive names.
header_map = {
    "CaseOrder": "Case_order",
    "HighBlood": "High_blood",
    "BackPain": "Back_pain",
    "TotalCharge": "Total_charge",
    "Item1": "Timely_admission",
    "Item2": "Timely_treatment",
    "Item3": "Timely_visits",
    "Item4": "Reliability",
    "Item5": "Options",
    "Item6": "Hours_of_treatment",
    "Item7": "Courteous_staff",
    "Item8": "Active_listening_evidence_from_dr",
}
med_df.rename(columns=header_map, inplace=True)
med_df.head()

## PCA: Use Only Series with Int and Float Data Types

In [None]:
# Narrow the frame to the columns that go into PCA. From here on med_df
# refers to this subset only.
pca_columns = [
    'Zip', 'Lat', 'Lng', 'Population', 'Children', 'Age', 'Income',
    'VitD_levels', 'Doc_visits', 'Full_meals_eaten', 'VitD_supp',
    'Initial_days', 'Total_charge', 'Additional_charges',
    'Timely_admission', 'Timely_treatment', 'Timely_visits', 'Reliability',
    'Options', 'Hours_of_treatment', 'Courteous_staff',
    'Active_listening_evidence_from_dr',
]
med_df = med_df[pca_columns]

In [None]:
# Preview the first rows of the narrowed frame.
med_df.head()

### Verify Data Set Contains Only Numeric Columns and No NaNs

In [None]:
# Check that every remaining column has a numeric dtype and a full non-null count.
med_df.info()

### Normalize the Data

In [None]:
# Standardize each column to z-scores: subtract the column mean, then divide
# by the column's sample standard deviation (pandas std() uses ddof=1).
column_means = med_df.mean()
column_stds = med_df.std()
med_df_normalized = med_df.sub(column_means).div(column_stds)

#### Import

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

### Choose Number of Components to Extract

In [None]:
# Request as many components as there are input columns, so nothing is
# discarded at this stage.
n_features = med_df.shape[1]
pca = PCA(n_components=n_features)

### Call PCA Application and Create New Data Set of Components

In [None]:
# Fit PCA on the standardized data and project every observation onto the
# principal components.
pca.fit(med_df_normalized)
# Build the PC1..PCn labels from the fitted component count instead of a
# hard-coded list of 22 names, so this cell keeps working if the column
# selection changes.
pc_labels = [f'PC{i}' for i in range(1, pca.n_components_ + 1)]
# Reuse the input's index so each score row stays tied to its observation
# (pca.transform returns a bare array).
med_df_pca = pd.DataFrame(pca.transform(med_df_normalized),
                          columns=pc_labels,
                          index=med_df_normalized.index)

### Create Scree Plot

In [None]:
# Scree plot: share of total variance explained by each component, in
# component order (x positions are 0-based).
fig, ax = plt.subplots()
ax.plot(pca.explained_variance_ratio_)
ax.set_xlabel('number of components')
ax.set_ylabel('explained variance')
plt.show()

In [None]:
# Covariance matrix of the standardized data. The columns were scaled with
# the sample standard deviation (ddof=1), so the matching divisor is n - 1.
# Dividing by n shrinks every eigenvalue by (n - 1) / n and biases the
# comparison against the eigenvalue = 1 cut-off.
n_obs = med_df_normalized.shape[0]
cov_matrix = np.dot(med_df_normalized.T, med_df_normalized) / (n_obs - 1)
# Eigenvalue of each principal axis v, computed as v' * C * v.
eigenvalues = [np.dot(eigenvector, np.dot(cov_matrix, eigenvector)) for eigenvector in pca.components_]

In [None]:
# Eigenvalue per component; x positions are 0-based list indices.
plt.plot(eigenvalues)
plt.xlabel('number of components')
plt.ylabel('eigenvalue')
# Horizontal reference line at eigenvalue = 1. It spans every plotted
# component; the right end is taken from the data instead of a hard-coded 21.
threshold_x = [0, len(eigenvalues) - 1]
threshold_y = [1, 1]
# Vertical marker at x = 7. Because the axis is 0-based, this is the position
# of the eighth eigenvalue.
cutoff_x = [7, 7]
cutoff_y = [0, 3]
plt.plot(threshold_x, threshold_y, 'r--', cutoff_x, cutoff_y, 'g--')
plt.show()

#### Display PCA Loadings

In [None]:
# Loadings table: one row per input column, one column per principal
# component. The labels are derived from the number of fitted components
# rather than a hard-coded list of 22 names.
pc_labels = [f'PC{i}' for i in range(1, pca.components_.shape[0] + 1)]
loadings = pd.DataFrame(pca.components_.T,
            columns=pc_labels,
            index=med_df.columns)

loadings

### Principal Components to Keep

#### Extract 7 Principal Components

In [None]:
# Second PCA model restricted to the number of components chosen to keep.
n_components_to_keep = 7
pca_keep = PCA(n_components=n_components_to_keep)

### Fit PCA and Create Data Set of the Retained Components

In [None]:
# Fit the reduced PCA model and project the standardized data onto the
# retained components.
pca_keep.fit(med_df_normalized)
# Derive the PC labels from the fitted component count so they always match
# the model, and reuse the input's index so each score row stays tied to its
# observation.
kept_labels = [f'PC{i}' for i in range(1, pca_keep.n_components_ + 1)]
med_df_pca_keep = pd.DataFrame(pca_keep.transform(med_df_normalized),
                               columns=kept_labels,
                               index=med_df_normalized.index)

In [None]:
# Loadings of the retained components: rows are input columns, columns are PCs.
kept_pc_names = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7']
loadings = pd.DataFrame(
    pca_keep.components_.T,
    index=med_df.columns,
    columns=kept_pc_names,
)

# Shade each cell on a light-green gradient so large loadings stand out.
cm = sns.light_palette("green", as_cmap=True)
gradient_cells = pd.IndexSlice[:, kept_pc_names]
s = loadings.style.background_gradient(cmap=cm, subset=gradient_cells)
s

#### Export Cleaned Data

In [None]:
# Write the current frame, including its index, to a CSV file.
# NOTE(review): when the notebook is run top to bottom, med_df has already
# been narrowed to the PCA column subset, so only those columns are
# exported. Confirm this is the intended "cleaned" output.
cleaned_csv = 'medical_raw_cleaned.csv'
med_df.to_csv(cleaned_csv)