In [30]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Load data
data = pd.read_csv('data.csv')

# Get whether should be verbose run
verbose = False

# Get data info
if (verbose):
    data.info()
    print(f'People with diabetes: {data[data["censor of diabetes at followup(1, Yes; 0, No)"] == 1].shape[0]}')

In [31]:
if (verbose):
    data.columns

In [32]:
clean_data = data.copy()

# Remove all unnecessary columns
clean_data = clean_data.drop(columns=['id', 'site', 'year of followup', 'Diabetes diagnosed during followup（1,Yes）'])
# NOTE: after closer inspection of documentation, the censor is the baseline used by the group to
#    determine if someone has diabetes

# Rename columns to be more accessible
rename_dict = {
    'Age (y)': 'age',
    'Gender(1, male; 2, female)': 'sex',
    'height(cm)': 'height',
    'weight(kg)': 'weight',
    'BMI(kg/m2)': 'bmi',
    'SBP(mmHg)': 'sbp',
    'DBP(mmHg)': 'dbp',
    'FPG (mmol/L)': 'fpg',
    'Cholesterol(mmol/L)': 'chol',
    'Triglyceride(mmol/L)': 'tg',
    'HDL-c(mmol/L)': 'hdlc',
    'LDL(mmol/L)': 'ldl',
    'ALT(U/L)': 'alt',
    'AST(U/L)': 'ast',
    'BUN(mmol/L)': 'bun',
    'CCR(umol/L)': 'ccr',
    'FPG of final visit(mmol/L)': 'fpg_final',
    'censor of diabetes at followup(1, Yes; 0, No)': 'diabetes',
    'smoking status(1,current smoker;2, ever smoker;3,never smoker)': 'smoker',
    'drinking status(1,current drinker;2, ever drinker;3,never drinker)': 'drinker',
    'family histroy of diabetes(1,Yes;0,No)': 'fam_hist'
}
clean_data = clean_data.rename(columns=rename_dict)
if (verbose):
    clean_data.info()
    print(f'People with diabetes: {clean_data[clean_data["diabetes"] == 1].shape[0]}')

In [33]:
# Get only three standard deviations within mean
def rmv3StdNull(dt, col, col_name, verbose=False, null_rem=True, avg='mean', std_rem=True):
    
    # Get mean and standard deviation
    dt_mean = dt[col].mean()
    dt_std = dt[col].std()

    # Print distribution data about column
    if (verbose):
        print('BEFORE:')
        print(f' Range of DBP: {dt[col].min()} - {dt[col].max()}')
        print(f' Mean of DBP: {dt_mean}')
        print(f' Standard deviation of DBP: {dt_std}')
        print(f' Number of nulls: {(dt.loc[dt[col].isna()]).shape[0]}')
        print(f' Min of DBP: {dt[col].min()}')
        print(f' Max of DBP: {dt[col].max()}')
        fig, ax = plt.subplots()
        labels = ['No Diabetes', 'Diabetes']
        colors = ['#E69F00', '#56B4E9', '#009E73', '#0072B2', '#D55E00', 'tomato']
        for i in [0, 1]:
            ax.hist(dt['dbp'][dt['diabetes'] == i], bins = 50, label=labels[i], color=colors[i])
            ax.set_title(f'Distribution of {col_name}, Before Cleaning')
            ax.set_xlabel(f'{col_name}')
            ax.set_ylabel('Count')
            ax.legend()
        print()

    if (null_rem):
        # Since such small amount, just remove nulls
        dt = dt.loc[dt[col].notna()]
    else:
        # Replace all nulls with average
        if (avg=='mean'):
            dt_mean = dt[col].loc[dt[col].notna()].mean()
            dt[col].loc[dt[col].isna()] = dt_mean
        elif (avg=='mode'):
            dt_mode = dt[col].loc[dt[col].notna()].mode()
            dt[col].loc[dt[col].isna()] = dt_mode

    # Remove all data more than 3 standard deviations from the mean, thus not dealing with extreme outliers
    if (std_rem):
        dt = dt.loc[dt[col] < dt_mean + 3 * dt_std]
        dt = dt.loc[dt[col] > dt_mean - 3 * dt_std]

    # Make sure eliminated
    dt_mean = dt[col].mean()
    dt_std = dt[col].std()
    if (verbose):
        print('AFTER:')
        print(f' Range of DBP: {dt[col].min()} - {dt[col].max()}')
        print(f' Mean of DBP: {dt_mean}')
        print(f' Standard deviation of DBP: {dt_std}')
        print(f' Number of nulls: {(dt.loc[dt[col].isna()]).shape[0]}')
        print(f' Min of DBP: {dt[col].min()}')
        print(f' Max of DBP: {dt[col].max()}')
        fig, ax = plt.subplots()
        for i in [0, 1]:
            ax.hist(dt['dbp'][dt['diabetes'] == i], bins = 50, label=labels[i], color=colors[i])
            ax.set_title(f'Distribution of {col_name}, Before Cleaning')
            ax.set_xlabel(f'{col_name}')
            ax.set_ylabel('Count')
            ax.legend()
    
    # Return cleaned data
    return dt
    

In [34]:
# Clean sex
clean_data['sex'] = clean_data['sex'].replace({1: 'male', 2: 'female'})

# Check to see columns are all filled
if (verbose):
    print(f'Sexes in dataset: {clean_data["sex"].unique()}')

In [35]:
# Clean height
if (verbose):
    print(f'Range of heights: {clean_data["height"].min()} - {clean_data["height"].max()}')
    print(f'Number of nulls: {(clean_data.loc[clean_data["height"].isna()]).shape[0]}')

# Since such small amount, just remove nulls
clean_data = clean_data.loc[clean_data["height"].notna()]

# Make sure eliminated
if (verbose):
    print(f'Number of nulls, after cleaning: {(clean_data.loc[clean_data["height"].isna()]).shape[0]}')

In [36]:
# Clean weight
if (verbose):
    print(f'Range of weights: {clean_data["weight"].min()} - {clean_data["weight"].max()}')
    print(f'Number of nulls: {(clean_data.loc[clean_data["weight"].isna()]).shape[0]}')

# Nothing to clean

In [37]:
# Clean BMI
if (verbose):
    print(f'Range of BMI: {clean_data["bmi"].min()} - {clean_data["bmi"].max()}')
    print(f'Number of nulls: {(clean_data.loc[clean_data["bmi"].isna()]).shape[0]}')

# Nothing to clean

In [38]:
# Clean SBP
clean_data = rmv3StdNull(clean_data, 'sbp', 'Systolic Blood Pressure', verbose=verbose)

In [39]:
# Clean DBP
clean_data = rmv3StdNull(clean_data, 'dbp', 'Diastolic Blood Pressure', verbose=verbose)

In [40]:
# Clean FPG
clean_data = rmv3StdNull(clean_data, 'fpg', 'FPG', verbose=verbose, null_rem=False, std_rem=False)

In [41]:
# Clean cholesterol
clean_data = rmv3StdNull(clean_data, 'chol', 'Cholesterol', verbose=verbose, std_rem=False)

In [42]:
# Clean triglyceride
clean_data = rmv3StdNull(clean_data, 'tg', 'Triglyceride', verbose=verbose, std_rem=False)

In [43]:
# Clean HDLC
clean_data = rmv3StdNull(clean_data, 'hdlc', 'High Density Lipoprotein Cholesterol', verbose=verbose, null_rem=False)

In [44]:
# Clean LDL
clean_data = rmv3StdNull(clean_data, 'ldl', 'Low Density Lipoprotein', verbose=verbose, null_rem=False)

In [45]:
# Clean ALT
clean_data = rmv3StdNull(clean_data, 'alt', 'Alanine Transaminase', verbose=verbose, null_rem=False, std_rem=False)

In [46]:
# Clean AST
clean_data = rmv3StdNull(clean_data, 'ast', 'AST', verbose=verbose, null_rem=False, std_rem=False)

In [47]:
# Clean BUN
clean_data = rmv3StdNull(clean_data, 'bun', 'Blood Urea Nitrogen', verbose=verbose, null_rem=False, std_rem=False)

In [48]:
# Clean CCR
clean_data = rmv3StdNull(clean_data, 'ccr', 'Creatine Reduction Ratio', verbose=verbose, null_rem=False, std_rem=False)

In [49]:
# Clean FPG
clean_data = rmv3StdNull(clean_data, 'fpg_final', 'FPG of Final Visit', verbose=verbose, null_rem=False, std_rem=False)

In [50]:
# Get data info
if (verbose):
    clean_data.info()
    print(f'People with diabetes: {clean_data[clean_data["diabetes"] == 1].shape[0]}')

In [51]:
# Clean smoker column
clean_data['smoker'].loc[clean_data['smoker'].isna()] = 0
clean_data['smoker'] = clean_data['smoker'].replace({0: 'no info', 1: 'current smoker', 2: 'former smoker', 3: 'never smoker'})

# Check to see columns are all filled
if (verbose):
    print(f'Smokers in dataset: {clean_data["smoker"].unique()}')

In [52]:
# Clean drinker column
clean_data['drinker'].loc[clean_data['drinker'].isna()] = 0
clean_data['drinker'] = clean_data['drinker'].replace({0: 'no info', 1: 'current drinker', 2: 'former drinker', 3: 'never drinker'})

# Check to see columns are all filled
if (verbose):
    print(f'Drinkers in dataset: {clean_data["drinker"].unique()}')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [53]:
# Clean family history column
clean_data = clean_data.loc[clean_data['fam_hist'].notna()]
clean_data['fam_hist'] = clean_data['fam_hist'].replace({0: 'no family history of diabetes', 1: 'family history of diabetes'})

# Check to see columns are all filled
if (verbose):
    print(f'Family history of diabetes in dataset: {clean_data["fam_hist"].unique()}')

In [54]:
# Clean diabetes column
clean_data = clean_data.loc[clean_data['diabetes'].notna()]
clean_data['diabetes'] = clean_data['diabetes'].replace({0: 'no', 1: 'yes'})

# Check to see columns are all filled
if (verbose):
    print(f'Diabetes diagnoses in dataset: {clean_data["diabetes"].unique()}')

In [55]:
# Get data info
if (verbose):
    clean_data.info()
    print(f'People with diabetes: {clean_data[clean_data["diabetes"] == "yes"].shape[0]}')

In [56]:
rename_dict = {
    'Age (y)': 'age',
    'Gender(1, male; 2, female)': 'sex',
    'height(cm)': 'height',
    'weight(kg)': 'weight',
    'BMI(kg/m2)': 'bmi',
    'SBP(mmHg)': 'sbp',
    'DBP(mmHg)': 'dbp',
    'FPG (mmol/L)': 'fpg',
    'Cholesterol(mmol/L)': 'chol',
    'Triglyceride(mmol/L)': 'tg',
    'HDL-c(mmol/L)': 'hdlc',
    'LDL(mmol/L)': 'ldl',
    'ALT(U/L)': 'alt',
    'AST(U/L)': 'ast',
    'BUN(mmol/L)': 'bun',
    'CCR(umol/L)': 'ccr',
    'FPG of final visit(mmol/L)': 'fpg_final',
    'censor of diabetes at followup(1, Yes; 0, No)': 'diabetes',
    'smoking status(1,current smoker;2, ever smoker;3,never smoker)': 'smoker',
    'drinking status(1,current drinker;2, ever drinker;3,never drinker)': 'drinker',
    'family histroy of diabetes(1,Yes;0,No)': 'fam_hist'
}

# Separate out the categorical and numeric data
numeric_data = clean_data[['age', 'height', 'weight', 'bmi', 'sbp', 'dbp', 'fpg', 'chol',
    'tg', 'hdlc', 'ldl', 'alt', 'ast', 'bun', 'ccr', 'fpg_final']]
categorical_data = clean_data[['sex', 'smoker', 'drinker', 'fam_hist']]

# Create columns for each categorical data column
categorical_data = pd.get_dummies(categorical_data, drop_first=True)

# Standardize numeric data columns
ss = StandardScaler()
ss.fit(numeric_data)
numeric_data[:] = ss.transform(numeric_data)

# Create standardized dataset
standardized = pd.concat([categorical_data, numeric_data, clean_data['diabetes']], axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numeric_data[:] = ss.transform(numeric_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


In [57]:
# Save cleaned data
clean_data.to_csv('data_clean.csv')
standardized.to_csv('data_standardized.csv')