# Week 2 - Exploratory Data Analysis (EDA) Exercise

We will start by importing the requisite libraries:
* *Pandas* for Data Loading and Exploration
* *Matplotlib, Seaborn* for Visualization.

In [0]:
# import requisite libraries 
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
# Load the data from the web (raw CSV hosted on GitHub)
url = 'https://raw.githubusercontent.com/hantswilliams/AHI_506_Research/master/Week2/final_contracts_encounters.csv'
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0.
# `on_bad_lines='warn'` is the replacement with the same effect as the old
# `error_bad_lines=False` default: malformed rows are skipped and a warning is emitted.
try:
    df = pd.read_csv(url, on_bad_lines='warn')
except TypeError:
    # pandas < 1.3 does not accept `on_bad_lines`; fall back to the legacy flag.
    df = pd.read_csv(url, error_bad_lines=False)

In [0]:
# Let's look at the first 5 records
df.head(5)

In [0]:
# Let's look at the last 5 records
df.tail(5)

In [0]:
# Let's look at a random selection of 5 records.
# A fixed random_state makes the same rows appear on every Restart & Run All;
# change or remove it to draw a different sample.
df.sample(5, random_state=42)

In [0]:
# Let's look at the shape of our data: (number of rows, number of features/columns)
df.shape

In [0]:
# Let's look at the features of the data: column names, non-null counts and dtypes
df.info()

# VARIABLE DISTRIBUTION

In [0]:
# Let's look at the variable distributions: one histogram per numeric column.
# Set a large figsize if you have more than 9 variables so each panel stays readable.
df.hist(figsize=(30,30))
# tight_layout keeps neighbouring subplot titles and tick labels from overlapping
plt.tight_layout()
plt.show()

In [0]:
# Let's now look at some summary statistics (count, mean, std, min, quartiles, max)
df.describe()

In [0]:
# Let's now store the summary statistics in a new dataframe so we can reuse them
df_2 = df.describe()
df_2

In [0]:
# Let's see if there are any duplicate rows: counts rows that exactly repeat an earlier row
df.duplicated().sum()

# NULL VALUES

In [0]:
# Let's now check for NULL values: the fraction (0-1) of missing entries per feature.
# Features with no missing values are excluded; the rest are sorted ascending.
n_rows = len(df)
null = df.isna().sum() / n_rows
null.loc[null > 0].sort_values()

# OUTLIERS 

In [0]:
# Outliers for CONTINUOUS DATA
# First pull a couple of continuous variables (age, active membership days) into their own frame,
# and keep their column names handy for the plotting loop.
continuous = df[['contracts_age', 'membership_activedays_todate']]
continuous_labels = continuous.columns.tolist()

In [0]:
# Now let's draw one boxplot per continuous variable on a shared grid of subplots.
n_cols = 4
# Use integer division: on Python 3 `/` yields a float (e.g. 2/3+3 = 3.67), and
# plt.subplot requires an integer number of rows.
n_rows = len(continuous_labels) // 3 + 3
plt.figure(figsize=(15,30))
for i, var in enumerate(continuous_labels, start=1):  # subplot positions are 1-based
    plt.subplot(n_rows, n_cols, i)
    # boxplot does not skip NaN, so missing values would produce an empty box;
    # drop them first. whis=5 extends the whiskers to 5x the IQR, so only
    # extreme points are drawn as outliers.
    plt.boxplot(continuous[var].dropna(), whis=5)
    plt.title(var)
plt.tight_layout()
plt.show()

# NEW FEATURES 

## Let's create a NEW FEATURE - medical encounters grouped into buckets // encounters_total_med_human

In [0]:
# Let's create a FUNCTION that dictates how the grouping should look/work
def myencounters(row):
    """Bucket a row's number of doctor visits into a human-readable label.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        Must provide the key 'encounters_total_number_of_doctors_visits'.

    Returns
    -------
    str or None
        '1 Medical Encounter' for exactly 1 visit,
        '2 to 3 Medical Encounters' for more than 1 and up to 3 visits,
        '4 or more Medical Encounters' for more than 3 visits,
        None when no bucket matches (e.g. 0 or missing/NaN).
    """
    visits = row['encounters_total_number_of_doctors_visits']
    if visits == 1:
        return '1 Medical Encounter'
    if 1 < visits <= 3:
        # This branch covers 2 and 3 visits; the previous label
        # ('1 to 2 Medical Encounters') did not match the condition.
        return '2 to 3 Medical Encounters'
    if visits > 3:
        return '4 or more Medical Encounters'
    # No bucket matched: callers are expected to fill this in themselves.
    return None

# Label every row, then treat rows that matched no bucket (function returned None)
# as having zero medical encounters.
encounter_labels = df.apply(myencounters, axis=1)
df['encounters_total_med_human'] = encounter_labels.fillna('0 Medical Encounters')


In [0]:
## Lets create a NEW FEATURE - Completed PSIs // psi_completed_count_group

In [0]:
def psicompletion(row):
    """Bucket a row's completed-PSI count into a human-readable label.

    Reads 'psi_count_numberical' from ``row`` and returns
    '1 PSI completed', '2 PSIs completed' or '3 or more PSIs completed';
    returns None when the count matches none of these (e.g. 0 or NaN).
    """
    completed = row['psi_count_numberical']
    if completed == 1:
        return '1 PSI completed'
    if completed == 2:
        return '2 PSIs completed'
    if completed >= 3:
        return '3 or more PSIs completed'
    return None

# Label every row with its PSI-completion bucket; rows matching no bucket stay missing.
df['psi_completed_count_group'] = df.apply(psicompletion, axis=1)

In [0]:
# How do we check that both of these are now part of our dataset? Let's list our columns.
list(df)
# We should see that the last columns of our dataframe are the newly created features