# 2. Data cleaning

### Import Packages

In [67]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-darkgrid')

  plt.style.use('seaborn-darkgrid')


### Load Data

In [68]:
df = pd.read_csv('diabetes.csv')

### Remove Duplicates

In [77]:
df_no_dup = df.copy()
df_no_dup.drop_duplicates() # Remove the duplicates

df_no_dup.shape # Check the remaining number of observations

(768, 9)

### Convert Missing Values to Nan

In [94]:
# Zero's in the following columns Glucose, BloodPressure, SkinThickness, Insulin and BMI are impossible and are actually missing values
df_nan = df_no_dup.copy()
df_nan[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']] = df_nan[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']].replace(0,np.NaN)
df_nan.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


### Imputing Missing Values

In [95]:
df_imputed = df_nan.copy()

# import Imputer 
from sklearn.impute import SimpleImputer

# Create an imputer object that looks for 'Nan' values, then replaces them with the mean value of the feature by columns (axis=0)
# strategy is median because mean results in values to high
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='median')

# Train the imputor on the dataset
mean_imputer = mean_imputer.fit(df_imputed[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']])

# Apply the imputer to the dataset (This imputer can also be used on future datasets)
df_imputed[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']] = mean_imputer.transform(df_imputed[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']].values)
df_imputed.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,125.0,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,125.0,26.6,0.351,31.0,0
2,8.0,183.0,64.0,29.0,125.0,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1


### changing floats and ints to ints64

In [None]:
# These columns are not meant to be floats.

# columns = []

# for col in columns:
#     if not df_imputed[col].empty:
#         df_imputed[col] = df_imputed[col].astype(np.int64)



### 3.4 Log transformation 

In [None]:

# Enkel shom_am omdat dit de meest extreme waarden heeft. 

from scipy.stats import skew


df_imputed['shop_am'] = df_imputed['shop_am'].astype(np.float64) #omzetten naar float

skewed = skew(df_imputed['shop_am'].dropna().astype(float))
if skewed > 0.75:
    df_imputed['shop_am'] = np.log1p(df_imputed['shop_am'])

df_imputed['shop_am'] = df_imputed['shop_am'].astype(np.int64)


##### same for scores #####
df_imputed_scores['shop_am'] = df_imputed_scores['shop_am'].astype(np.float64) #omzetten naar float

skewed = skew(df_imputed_scores['shop_am'].dropna().astype(float))
if skewed > 0.75:
    df_imputed_scores['shop_am'] = np.log1p(df_imputed_scores['shop_am'])

df_imputed_scores['shop_am'] = df_imputed_scores['shop_am'].astype(np.int64)

In [None]:
# showing the log transformed data

df_imputed['shop_am'].head()

0    0
1    0
2    0
3    0
4    7
Name: shop_am, dtype: int64

# 4. Shuffle data

In [None]:
df_shuffle = df_imputed.sample(frac = 1, random_state=123) # shuffle the data

##### same for scores #####
df_shuffle_scores = df_imputed_scores.sample(frac = 1, random_state=123) # shuffle the data

## 5. New cleaned csv

In [None]:
df_shuffle.to_csv('data_cleaned.csv', index=False)
df_shuffle_scores.to_csv('score_cleaned.csv', index=False)

NameError: name 'df_shuffle' is not defined