In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import pearsonr

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

import re
import math

# import dependencies
import ipywidgets as widgets
from IPython.display import display, HTML

# Part 1: Grabbing Data

In [3]:
def f(s):
    """Convert a rating label of one target variable to its integer score.

    The labels are matched exactly, trailing punctuation included:
    'Excellent,' -> 5, 'Very good,' -> 4, 'Good,' -> 3, 'Fair, or' -> 2,
    'Poor?' -> 1. Any other value is handed back unchanged.
    """
    label_scores = (
        ('Excellent,', 5),
        ('Very good,', 4),
        ('Good,', 3),
        ('Fair, or', 2),
        ('Poor?', 1),
    )
    for label, score in label_scores:
        if s == label:
            return score
    # Not one of the known labels: pass the value through untouched.
    return s

def create_pca(train_df, target_df, test_size=1/7.0, random_state=0, explained_variance=.95):
    """Split the data, standardise the features and project them with PCA.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Feature columns; its column names label the PCA loadings.
    target_df : pandas.Series or pandas.DataFrame
        Target values, row-aligned with ``train_df``.
    test_size : float, optional
        Passed to ``train_test_split`` (default 1/7, the previous fixed value).
    random_state : int, optional
        Seed for the train/test split (default 0, the previous fixed value).
    explained_variance : float, optional
        Passed to ``PCA`` as ``n_components``; the default .95 keeps enough
        components to capture 95% of the variance.

    Returns
    -------
    tuple
        ``(pca_train, pca_test, y_train, y_test, pca_df, [pca, scaler])``
        where ``pca_df`` holds one row per principal component and one column
        per input feature, and the final list carries the fitted PCA and
        scaler objects for later reuse.
    """
    # train_test_split returns new objects, so the inputs need no defensive copy.
    x_train, x_test, y_train, y_test = train_test_split(
        train_df, target_df, test_size=test_size, random_state=random_state)

    # Fit the scaler on the training rows only, then apply it to both splits.
    scaler = StandardScaler()
    scaler.fit(x_train)
    train = scaler.transform(x_train)
    test = scaler.transform(x_test)

    # Fit PCA on the scaled training rows only.
    pca = PCA(n_components=explained_variance)
    pca.fit(train)

    pca_train = pca.transform(train)
    pca_test = pca.transform(test)

    # Loadings table used by callers to find the least important variables.
    pca_df = pd.DataFrame(pca.components_, columns=train_df.columns)

    return pca_train, pca_test, y_train, y_test, pca_df, [pca, scaler]

def create_rf(pca_train, pca_test, y_train, n_estimators =100, random_state=None):
    """Fit a random forest on the training projection and predict the test projection.

    Parameters
    ----------
    pca_train : array-like
        Training features (as returned by ``create_pca``).
    pca_test : array-like
        Test features to predict.
    y_train : array-like
        Training labels aligned with ``pca_train``.
    n_estimators : int, optional
        Number of trees in the forest (default 100).
    random_state : int or None, optional
        Seed for the forest. The default ``None`` keeps the previous
        unseeded behaviour; pass an int for reproducible models.

    Returns
    -------
    tuple
        ``(rf, y_pred)``: the fitted classifier and its predictions for
        ``pca_test``.
    """
    # Random forest classifier (not a Gaussian model, despite the old comment).
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

    # Train on the training split, then predict the held-out split.
    rf.fit(pca_train, y_train)
    y_pred = rf.predict(pca_test)

    return rf, y_pred
def group_age(x):
    """Bucket an age value into an age-group label.

    Returns one of '17-', '18-29', '30-39', '40-49', '50-59', '60-69',
    '70-79' or '80+' for a usable age. The fractional part of the age is
    truncated before bucketing.

    Special cases:
    * NaN (and other non-finite numbers) -> the integer 0, the existing
      marker for a missing age.
    * Values that cannot be read as a number at all -> '80+'
      (presumably the top-coded text label; confirm against the data).

    Numeric strings such as '25' or '25.0' are parsed as numbers.
    """
    try:
        age = float(x)
    except (TypeError, ValueError, OverflowError):
        # Not numeric: keep the historical fallback bucket.
        return '80+'

    # NaN / infinity carry no usable age; report them as missing.
    if math.isnan(age) or math.isinf(age):
        return 0

    v = int(age)
    # (exclusive upper bound, label) pairs in ascending order.
    age_bands = (
        (18, '17-'),
        (30, '18-29'),
        (40, '30-39'),
        (50, '40-49'),
        (60, '50-59'),
        (70, '60-69'),
        (80, '70-79'),
    )
    for upper_bound, label in age_bands:
        if v < upper_bound:
            return label
    return '80+'

def group_edu(x):
    """Collapse an education-level label into a coarse category.

    Returns
    -------
    'Other'        for float input (the missing-value marker) and "Don't Know"
    'GED'          when the label mentions GED
    'High School'  for "no diploma" labels and for grades 9 and above
    'College'      for "More than high school"
    'HS Grad'      for high-school-graduate labels
    'Grade School' for grades below 9 and for labels whose number is not at
                   the start of the text (e.g. "Less than 9th grade")
    the input      unchanged when nothing matches

    The whole grade number is parsed, so "10th grade" and "11th grade" are
    classified as 'High School'.
    """
    # Missing values arrive as float NaN; isinstance also covers numpy floats.
    if isinstance(x, float):
        return 'Other'

    if re.search('GED', x):
        return 'GED'
    if re.search('No Diploma|no diploma', x):
        return 'High School'
    if re.search('More than high school', x):
        return 'College'
    if re.search('Don\'t Know', x):
        return 'Other'
    if re.search('High school graduate|High School Graduate', x):
        return 'HS Grad'

    grade_match = re.search(r'\d+', x)
    if grade_match is None:
        return x

    # A number that is not at the start of the label belongs to a
    # "Less than Nth grade" style answer.
    if grade_match.start() > 0:
        return 'Grade School'

    grade = int(grade_match.group())
    return 'High School' if grade >= 9 else 'Grade School'

def group_pir(x):
    """Normalise one value of the INDFMPIR column.

    * NaN -> 'Other'
    * text containing 'greater' -> '5+'
    * anything else (other text, or an actual number) -> returned unchanged

    Only NaN is treated as missing: a real float value is passed through
    instead of being turned into 'Other', and non-string values no longer
    reach ``re.search`` (which raised TypeError for them).
    """
    if isinstance(x, float) and math.isnan(x):
        return 'Other'
    if not isinstance(x, str):
        # Genuine numeric value (or other non-text): nothing to recode.
        return x
    if re.search('greater', x):
        return '5+'
    return x

#raw_data_filled['RIDAGEYR'] = raw_data_filled['RIDAGEYR'].apply(lambda x: group_age(x) )
#raw_data_filled['DMDEDUC3'] = raw_data_filled['DMDEDUC3'].apply(lambda x: group_edu(x) )
#raw_data_filled['INDFMPIR'] = raw_data_filled['INDFMPIR'].apply(lambda x: group_pir(x) )

In [4]:
##creating Train_df1 and target_df
#grabbing dataset
# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
raw_data_df = pd.read_csv('/Users/kevin/Desktop/OMSA/CSE6242/Project/Project Visual/all_nhanes_filtered.csv')

#replace don't know with NULL
raw_data_df  = raw_data_df.replace(re.compile('(Don\'t know|Refused)'), float('NaN'))

#specifying target columns
target_cols = ['MCQ220','MCQ160E','MCQ160F','HUQ010' ]

#Getting non cat columns: everything pandas did not load as object dtype
non_cat_cols = [c for c in raw_data_df.columns if raw_data_df[c].dtype != object]

#put this after grabbing non-cat cols to avoid grabbing target variables
# NOTE(review): the 7 / 9 replacements are applied to EVERY column, so any
# genuine measurement equal to 7 or 9 is also turned into NaN (and later 0
# in subset_df) -- confirm this is intended or restrict it to coded columns.
raw_data_df = raw_data_df.replace("Yes", 1).replace("No", 2).replace(9,float('NaN')).replace(7,float('NaN')) \
                        .replace('9',float('NaN')).replace('7',float('NaN'))#.drop(['SEQN'], axis =1 )

raw_data_df['RIDAGEYR'] = raw_data_df['RIDAGEYR'].apply(group_age)
raw_data_df['DMDEDUC3'] = raw_data_df['DMDEDUC3'].apply(group_edu)
raw_data_df['INDFMPIR'] = raw_data_df['INDFMPIR'].apply(group_pir)

#keeping only non cat columns and filling in answers
# Column selection already yields a new frame, so no whole-frame copy is needed first.
subset_df = raw_data_df[non_cat_cols].fillna(0)

df_dict= {}#this is a dict which holds the dfs to train on for each target variable
#for each target variable, remove rows in entire df whenever there is a NULL or Don't Know
for col in target_cols:
    t_c = raw_data_df[col]
    if col == 'HUQ010':
        t_c = t_c.apply(f)  # rating labels -> 1..5

    df = pd.concat([subset_df, t_c], axis = 1) #merging dataset and target variable
    # Explicit copy: the next line assigns into the filtered frame, which
    # otherwise triggers SettingWithCopyWarning.
    df = df[df[col].notna()].copy()
    df[col] = df[col].astype(int)
    df_dict[col] = df



  raw_data_df = pd.read_csv('/Users/kevin/Desktop/OMSA/CSE6242/Project/Project Visual/all_nhanes_filtered.csv')


In [15]:
# Number of rows (with a non-null Year_Start) for each MCQ160F answer value.
raw_data_df.groupby('MCQ160F')[['Year_Start']].count()

Unnamed: 0_level_0,Year_Start
MCQ160F,Unnamed: 1_level_1
1,2120
2,51022
1,78
2,1806


# Part 2: Training RF Models Based on Non-Null Data

In [182]:
# this is a dict of all of the correlations for each target variable:
# for every target, the target's column of that frame's correlation matrix.
corr_dict = {target: df_dict[target].corr()[target] for target in target_cols}

In [183]:
#find the top 25 variables per target variable, highest positive or negative correlations
N_TOP_VARS = 25

ranked_frames = []
for col in target_cols:
    ranked = (
        corr_dict[col]
        .drop(labels=col)   # drop the target's correlation with itself by name, not by position
        .sort_values(ascending = False, key = abs)
        .head(N_TOP_VARS)   # taken after the drop, so exactly N_TOP_VARS variables remain
        .reset_index()
        .rename(columns = {'index': str(col) + 'var'})
    )
    ranked_frames.append(ranked)

# One concat instead of growing the frame inside the loop.
top_25 = pd.concat(ranked_frames, axis = 1) if ranked_frames else pd.DataFrame()


In [184]:
### PCA LOOP STARTS HERE
pca_dict = {} #dict that holds all of the trained PCA models for each target variable

# A loading whose magnitude is below this counts as a negligible contribution.
LOADING_THRESHOLD = .1

for col in target_cols:
    var_names = top_25[str(col) + 'var'] #getting top 25 variables for each target variable

    train = df_dict[col][var_names]
    target = df_dict[col][col]

    #create PCA to evaluate the least important variables
    _, _, _, _, pca_df, _ = create_pca(train,target)

    #For each variable, count the principal components in which its loading is negligible.
    # The sign of a principal component is arbitrary, so the magnitude is compared;
    # a strongly negative loading is NOT a weak one.
    weak_counts = (pca_df.abs() < LOADING_THRESHOLD).sum()
    max_count = weak_counts.max()

    #Grab the top offending variables and remove them from names
    weakest = set(weak_counts.index[weak_counts == max_count])
    keep_names = [name for name in var_names if name not in weakest]
    if not keep_names:
        # Every variable tied for "weakest": dropping them all would leave
        # nothing to train on, so keep the full set instead.
        keep_names = list(var_names)

    #run PCA again with reduced variables
    train = df_dict[col][keep_names]
    pca_train, pca_test, y_train, y_test, pca_df, pca_model = create_pca(train,target)

    pca_dict[col] = {'pca_train': pca_train, 'pca_test': pca_test,
                    'y_train': y_train, 'y_test': y_test,
                    'pca_df': pca_df,'keep_names': keep_names,
                    'model': pca_model}

### RF MODEL LOOP STARTS HERE
rf_dict = {} #dict that holds all of the RF models + pca_dict entry for each target variable

for k,v in pca_dict.items():
    model, y_pred = create_rf(v['pca_train'], v['pca_test'], v['y_train'], n_estimators = 100)
    rf_dict[k] = {'model': model, 'y_pred':y_pred, 'pca':v}

# Hold-out accuracy of each target's model.
for k,v in rf_dict.items():
    print(str(k) + " Accuracy:",metrics.accuracy_score(v['pca']['y_test'], v['y_pred']))




KeyboardInterrupt: 

# Part 3: Fill in Missing Data Using RF Models

In [None]:
def rf_predict(rf_model, pca_model, row):
    """Predict with ``rf_model`` after applying the stored preprocessing.

    ``pca_model`` is the two-item sequence ``[pca, scaler]``. ``row`` is
    first passed through ``scaler.transform``, then ``pca.transform``, and
    the result is given to ``rf_model.predict``, whose output is returned.
    """
    scaled = pca_model[1].transform(row)
    projected = pca_model[0].transform(scaled)
    return rf_model.predict(projected)

In [None]:
# Fill the missing answers of each target with its random-forest prediction.
# All missing rows of a target are predicted in ONE batched call; calling
# predict() once per row is what made the earlier loop take ~25 minutes.

filled_dict= {}#this is a dict which holds the dfs with filled in values for null

for col in target_cols:
    t_c = raw_data_df[col]
    if col == 'HUQ010':
        t_c = t_c.apply(f)  # rating labels -> 1..5

    keep_names = rf_dict[col]['pca']['keep_names']
    #merging dataset and target variable (features first, target last)
    # "Yes"/"No" were already recoded when raw_data_df was prepared, so no
    # further replace is applied here.
    df = pd.concat([subset_df[keep_names], t_c ], axis = 1)

    missing = df[col].isna()
    if missing.any():  # transforming an empty selection would raise
        df.loc[missing, col] = rf_predict(rf_dict[col]['model'],
                                          rf_dict[col]['pca']['model'],
                                          df.loc[missing, keep_names])

    # Raises if a non-numeric answer is still present, instead of printing
    # it and carrying on.
    df[col] = df[col].astype(int)
    filled_dict[col] = df

In [None]:
### Add back Year column to each dataframe in filled_dict
year_df = raw_data_df[['Year_Start']].copy()

##dict holding all DFs to be graphed: per target, the year joined with the
##filled answers, and the year + answer columns of the training frame.
graph_dict = {
    target: {
        'filled': pd.concat([year_df, frame[target]], axis = 1),
        'unfilled': df_dict[target][['Year_Start', target]],
    }
    for target, frame in filled_dict.items()
}

In [None]:
# Show which distinct answer values each filled target column contains.
for target in target_cols:
    distinct_answers = graph_dict[target]['filled'][target].unique()
    print(target, distinct_answers)

MCQ220 [2 1]
MCQ160E [2 1]
MCQ160F [2 1]
HUQ010 [5 4 3 2 1]


In [None]:
#Getting Summary Statistics for each columns. May not end up using this
summed_dict = {}

for col in target_cols:
    df_filled = graph_dict[col]['filled'].groupby(['Year_Start', col]).size().to_frame().reset_index()
    df_filled = df_filled.rename(columns = {col:'answer',0:'count'})
    df_sum = graph_dict[col]['filled'].groupby(['Year_Start']).count().reset_index()
    df_sum = df_sum.rename(columns = {col:'total_count'})
    df_filled = df_filled.merge(df_sum, how = 'inner', on = 'Year_Start')
    df_filled['pct'] = df_filled['count']/df_filled['total_count']

    df_unfilled = graph_dict[col]['unfilled'].groupby(['Year_Start', col]).size().to_frame().reset_index()
    df_unfilled = df_unfilled.rename(columns = {col:'answer',0:'count'})
    df_sum = graph_dict[col]['unfilled'].groupby(['Year_Start']).count().reset_index()
    df_sum = df_sum.rename(columns = {col:'total_count'})
    df_unfilled = df_unfilled.merge(df_sum, how = 'inner', on = 'Year_Start')
    df_unfilled['pct'] = df_unfilled['count']/df_unfilled['total_count']

    summed_dict[col] = {'filled': df_filled, 'unfilled' : df_unfilled}


In [None]:
# Pivot every summary table so that Year_Start becomes the index and the
# answer values become columns.
pivot_dict = {
    target: {
        variant: summed_dict[target][variant].pivot(columns = 'answer', index = 'Year_Start')
        for variant in ('filled', 'unfilled')
    }
    for target in target_cols
}
    

In [None]:
###Save dataframe to pickle file
# try:
#     import pickle as pickle
# except ImportError:  # Python 3.x
#     import pickle

# with open('rf_dict.p', 'wb') as fp:
#     pickle.dump(rf_dict, fp, protocol=pickle.HIGHEST_PROTOCOL)

###Load pickle
# with open('pivot_data.p', 'rb') as fp:
#     data = pickle.load(fp)

In [None]:
# One row per column of the raw data, joined with its description (if any).
# `columns` must be an ordered list: a set literal is unordered and is
# rejected by recent pandas versions.
df = pd.DataFrame(raw_data_df.columns, columns = ['Variable'])
# NOTE(review): absolute, user-specific paths -- consider a configurable data directory.
var_desc_df = pd.read_csv('/Users/kevin/Desktop/OMSA/CSE6242/Project/Project Visual/VariableDescriptions.csv')
# Left join keeps variables that have no description entry.
desc_df = df.merge(var_desc_df, how = "left", on = 'Variable')

desc_df.to_csv('/Users/kevin/Desktop/OMSA/CSE6242/Project/Project Visual/DescriptionsDF.csv')

In [264]:
#Creating a copy of raw data df that has the filled in values for our target variables
# Each target ends up twice: '<target>_unfilled' (original answers) and
# '<target>_filled' (answers with model predictions for the gaps).
filled_columns = {target + '_filled': filled_dict[target][target] for target in target_cols}
unfilled_names = {target: target + '_unfilled' for target in target_cols}

raw_data_filled = raw_data_df.assign(**filled_columns).rename(columns = unfilled_names)


In [265]:
# Write raw_data_filled to CSV.
# NOTE(review): absolute, user-specific path -- consider a configurable output directory.
raw_data_filled.to_csv('/Users/kevin/Desktop/OMSA/CSE6242/Project/Project Visual/raw_data_filled.csv')

#Finish group_edu, then start graphing by demo and by age and education

In [None]:
#Code Below is from experimenting and prior data explorations

In [None]:
# def plot_chart(col, year):
#     # Create figure and axis #1
#     fig, (ax1, ax3) = plt.subplots(1,2, figsize=(15, 5))

#     # x axis values and ticks
#     year_list = list(pivot_dict[col]['filled'].index)
#     idx = year_list.index(year) 
#     xtick = year_list[idx:]
#     xpct = year_list[idx:]
#     #for all other variables
#     xbarl = [i-.2 for i in year_list[idx:] ] 
#     xbarr = [i+.2 for i in year_list[idx:] ] 
#     #for H variable
#     hb1 = [i-.4 for i in year_list[idx:] ] 
#     hb2 = [i-.2 for i in year_list[idx:] ] 
#     hb3 = [i for i in year_list[idx:] ] 
#     hb4 = [i+.2 for i in year_list[idx:] ] 
#     hb5 = [i+.4 for i in year_list[idx:] ] 

#     #set titles
#     ax1.set_title('Predicted_Answers' + " " + col)
#     ax3.set_title('Non-Null_Answers' + " " + col)

#     ax3.set_axisbelow(True)

#     ### LEFT PLOT
#     # plot line chart on axis #1
#     if col != 'HUQ010':
#         ax1.plot(xpct, pivot_dict[col]['filled']['pct'][1].loc[year:], alpha=.75, color='orange') 
#         ax1.plot(xpct, pivot_dict[col]['filled']['pct'][2].loc[year:], alpha=.75, color='blue') 
#     ax1.set_ylabel('%')
#     ax1.set_ylim(0, 1.5)
#     ax1.set_xticks(xtick)

#     # set up twin(shared) axis
#     ax2 = ax1.twinx()

#     # plot bar chart on axis #2
#     if col == 'HUQ010':
    
#         ax2.bar(hb1, pivot_dict['HUQ010']['filled']['count'].loc[year:,:].reset_index(drop=True)[1], width=0.25, alpha=0.75, color='red', zorder = 2)
#         ax2.bar(hb2, pivot_dict['HUQ010']['filled']['count'].loc[year:,:].reset_index(drop=True)[2], width=0.25, alpha=0.75, color='blue', zorder = 2)
#         ax2.bar(hb3, pivot_dict['HUQ010']['filled']['count'].loc[year:,:].reset_index(drop=True)[3], width=0.25, alpha=0.75, color='green', zorder = 2)
#         ax2.bar(hb4, pivot_dict['HUQ010']['filled']['count'].loc[year:,:].reset_index(drop=True)[4], width=0.25, alpha=0.75, color='orange', zorder = 2)
#         ax2.bar(hb5, pivot_dict['HUQ010']['filled']['count'].loc[year:,:].reset_index(drop=True)[5], width=0.25, alpha=0.75, color='brown', zorder = 2)
        
#         ax2.legend(['1','2','3','4','5'], loc="upper right")
#     else:
#         ax1.legend(['Pct No', 'Pct Yes'], loc="upper left")
#         ax2.legend(['No', 'Yes'], loc="upper right")
        
#         ax2.bar(xbarl, pivot_dict[col]['filled']['count'][1].loc[year:], width=0.5, alpha=0.75, color='orange', zorder = 2)
#         ax2.bar(xbarr, pivot_dict[col]['filled']['count'][2].loc[year:], width=0.5, alpha=0.75, color='blue', zorder = 2)

#     # ax2.grid(zorder = 0)
#     ax2.set_ylim(0, 15000)
#     ax2.set_ylabel('total_answers')
#     ###

#     ### RIGHT PLOT
#     # plot line chart on axis #3
#     if col != 'HUQ010':
#         ax3.plot(xpct, pivot_dict[col]['unfilled']['pct'][1].loc[year:], alpha=.75, color='orange') 
#         ax3.plot(xpct, pivot_dict[col]['unfilled']['pct'][2].loc[year:], alpha=.75, color='blue')
    
#     ax3.set_ylabel('%')
#     ax3.set_ylim(0, 1.5) 
#     ax3.set_xticks(xtick)


#     # set up twin(shared) axis
#     ax4 = ax3.twinx()

#     # plot bar chart on axis #4
#     if col == 'HUQ010':
#         ax4.bar(hb1, pivot_dict['HUQ010']['unfilled']['count'].loc[year:,:].reset_index(drop=True)[1], width=0.25, alpha=0.75, color='red', zorder = 2)
#         ax4.bar(hb2, pivot_dict['HUQ010']['unfilled']['count'].loc[year:,:].reset_index(drop=True)[2], width=0.25, alpha=0.75, color='blue', zorder = 2)
#         ax4.bar(hb3, pivot_dict['HUQ010']['unfilled']['count'].loc[year:,:].reset_index(drop=True)[3], width=0.25, alpha=0.75, color='green', zorder = 2)
#         ax4.bar(hb4, pivot_dict['HUQ010']['unfilled']['count'].loc[year:,:].reset_index(drop=True)[4], width=0.25, alpha=0.75, color='orange', zorder = 2)
#         ax4.bar(hb5, pivot_dict['HUQ010']['unfilled']['count'].loc[year:,:].reset_index(drop=True)[5], width=0.25, alpha=0.75, color='brown', zorder = 2)

#         ax4.legend(['1','2','3','4','5'], loc="upper right")
#     else:
#         ax3.legend(['Pct No', 'Pct Yes'], loc="upper left")
        
#         ax4.bar(xbarl, pivot_dict[col]['unfilled']['count'][1].loc[year:], width=0.5, alpha=.75, color='orange', zorder = 2)
#         ax4.bar(xbarr, pivot_dict[col]['unfilled']['count'][2].loc[year:], width=0.5, alpha=.75, color='blue', zorder = 2)
#         ax4.legend(['No','Yes'], loc="upper right")

#     # ax4.grid(False, zorder=0) # turn off grid #2
#     ax4.set_ylim(0, 15000)
#     ax4.set_ylabel('total_answers')
#     #legend
    
#     ###

#     #spacing the plots apart
#     fig.tight_layout(pad = 3)
#     plt.show()

In [None]:
# #dropdownlist variables
# years = list(pivot_dict[col]['filled'].index)
# variables = target_cols.copy()
# #years.insert(0,'All')
# years.insert(0,'Choose')
# variables.insert(0,'Choose')

# #creating widget objects
# output = widgets.Output()
# dropdown_variable = widgets.Dropdown(options = variables, description = 'Variable')
# dropdown_year = widgets.Dropdown(options = years, description = 'Year')


# #function for handling event
# def common_filtering(variable = None, year = None):
#     output.clear_output()
#     if variable != 'Choose' and year != 'Choose':
#         with output:
#             plot_chart(variable, year)

# #event handler functions
# def dropdown_variable_eventhandler(change):
#     common_filtering(change.new, dropdown_year.value)

# def dropdown_year_eventhandler(change):
#     common_filtering(dropdown_variable.value, change.new)

# dropdown_variable.observe(dropdown_variable_eventhandler, names='value')
# dropdown_year.observe(dropdown_year_eventhandler, names='value')

# display(dropdown_variable)
# display(dropdown_year)
# display(output)

In [None]:


# #dataset variable descriptions
# var_desc_df = pd.read_csv('/Users/kevin/Desktop/OMSA/CSE6242/Project/Project Visual/VariableDescriptions.csv')


# #creating df with only cat cols and desc
# cat_df = pd.DataFrame(data = cat_cols, columns = ['Variable'])
# cat_df = cat_df.merge(var_desc_df, how = "left", on = 'Variable')
# keep_cat_cols = ['RIAGENDR','HUQ010','MCQ160F','MCQ160E','MCQ160C','MCQ220']


# #creating df with all cols and desc
# col_df = pd.DataFrame(data = data_df.columns, columns = ['Variable'])
# col_df = col_df.merge(var_desc_df, how = "left", on = 'Variable')


In [None]:


# #remove sequence id col
# print('\n')
# for c in target_cols_df.columns:
#     print(c)
#     print(target_cols_df[c].dtype)
#     print(target_cols_df[c].unique())

In [None]:
# # corr, _ = pearsonr(df_final[], data2)
# #getting correlation matrix for all variable
# corr_df = df_final.corr()
# corr_df.head()

# corr_df.to_csv('/Users/kevin/Desktop/OMSA/CSE6242/Project/Project Visual/test.csv')
# cat_vars = ['RIDEXPRG','MCQ010','MCQ160C','MCQ160E','MCQ160F','MCQ220',
#             'VTQ200A','KIQ201','VTQ280A','VTQ280B','VTQ280C','VTQ280D',
#             'VTQ280E','VTQ280F','VTQ280G','VTQ280H','VTQ281A','VTQ281C','VTQ281E']

# d = corr_df.filter(items = cat_vars, axis=0)
# d.head()

In [None]:
# #finding highly correlated variables
# threshold = [.5, -.5] 
# top_pairs = []
# for row_ind, row in corr_df.iterrows():
#     for ind, value in row.items():
#         if (value > threshold[0] or value < threshold[1]) and (row_ind != ind) :
#             a = [row_ind, ind]
#             a.sort()
#             if a not in top_pairs:
#                 top_pairs.append(a)

# print(top_pairs[:10])
# print(len(top_pairs))

In [None]:
# #getting variable descriptions
# top_pairs_df = pd.DataFrame(data = top_pairs, columns = ['p1','p2'])
# merged_df = top_pairs_df.merge(var_desc_df, how='left',left_on='p1',right_on='Variable').merge(var_desc_df, how = 'left', left_on = 'p2',right_on = 'Variable')
# merged_df = merged_df[['p1','p2','SAS_Label_x','SAS_Label_y']].rename(columns = {'SAS_Label_x':'p1_label','SAS_Label_y':'p2_label'})
# #adding correlation
# merged_df['corr'] = merged_df[['p1','p2']].apply(lambda x: corr_df.loc[x.p1,x.p2], axis=1)
# merged_df = merged_df[['p1','p2','corr','p1_label','p2_label']].sort_values(['p1','corr'], ascending= [True, False], ignore_index=True)
# merged_df.head(10)

In [None]:
# merged_df.to_csv('/Users/kevin/Desktop/OMSA/CSE6242/Project/Project Visual/top_pairs.csv')

In [None]:
# #Counting how many times each variable shows up
# pair_count = {}
# for pair in top_pairs:
#     for item in pair:
#         if item not in pair_count.keys():
#             pair_count[item] = 1
#         else:
#             pair_count[item] += 1

# #sorting from greatest to least
# top_pair_count_sorted = sorted(pair_count.items(), key=lambda kv:
#                  (kv[1], kv[0]), reverse = True)

# max_value = max(pair_count.values())
# top_pair_count_sorted[:10]

# #Grabbing the most commonly shown up variables within the highly correlated pairs of 
# top_vars = []
# for pair in top_pair_count_sorted:
#     if pair[1] == max_value:
#         top_vars.append(pair[0])

# top_vars

In [None]:
# top_vars_dict = {var:[] for var in top_vars}

# for pair in pairs:
#     if pair[0] in top_vars:
#         top_vars_dict[pair[0]].append(pair)
#     if pair[1] in top_vars:
#         top_vars_dict[pair[1]].append(pair)

#top_vars_dict

In [None]:
# df_final_top = df_final[top_vars]

# df_final_top.corr()

In [None]:
# pairs_df = pd.DataFrame(data = pairs, columns=['P1','P2'])
# pairs_df.to_csv('/Users/kevin/Desktop/OMSA/CSE6242/Project/Project Visual/high_corr_vars.csv')


#plt.subplot(1, 3, 1) # row 1, col 2 index 1
# plt.scatter(df_final['LBXMCHSI'], df_final['LBXMCVSI'])

# plt.subplot(1, 3, 2) # index 2
# plt.scatter(df_final['LBDSBUSI'], df_final['LBXSBU'])

# plt.subplot(1, 3, 3) # index 2
# plt.scatter(df_final['PHAALCMN'], df_final['PHAANTMN'])