# Data Preprocessing Steps

1. Reading Data
2. Exploring Data / Data Insight
3. Cleansing Data
4. Outlier Detection and Removal
5. Data Transformation (Normalize Data / Rescale Data)
6. Categorical to Numerical Encoding
7. Dimensionality Reduction (PCA)
8. Handling Imbalanced Data
9. Feature Selection
10. Data Splitting

In [6]:
import matplotlib.pyplot as plt #for data visualizing
import seaborn as sns 
color = sns.color_palette()

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# 1: Reading Data

In [7]:
# Load the echocardiogram dataset from the CSV file in the working directory.
data = pd.read_csv('echocardiogram.csv')

# Preview the first rows of the frame.
data.head()

Unnamed: 0,survival,alive,age,pericardialeffusion,fractionalshortening,epss,lvdd,wallmotion-score,wallmotion-index,mult,name,group,aliveat1
0,11.0,0.0,71.0,0.0,0.26,9.0,4.6,14.0,1.0,1.0,name,1,0.0
1,19.0,0.0,72.0,0.0,0.38,6.0,4.1,14.0,1.7,0.588,name,1,0.0
2,16.0,0.0,55.0,0.0,0.26,4.0,3.42,14.0,1.0,1.0,name,1,0.0
3,57.0,0.0,60.0,0.0,0.253,12.062,4.603,16.0,1.45,0.788,name,1,0.0
4,19.0,1.0,57.0,0.0,0.16,22.0,5.75,18.0,2.25,0.571,name,1,0.0


In [11]:
# Show only the first two rows.
data.head(2)

Unnamed: 0,survival,alive,age,pericardialeffusion,fractionalshortening,epss,lvdd,wallmotion-score,wallmotion-index,mult,name,group,aliveat1
0,11.0,0.0,71.0,0.0,0.26,9.0,4.6,14.0,1.0,1.0,name,1,0.0
1,19.0,0.0,72.0,0.0,0.38,6.0,4.1,14.0,1.7,0.588,name,1,0.0


In [12]:
# Request the first 30 rows for a broader look at the raw values.
data.head(30)

Unnamed: 0,survival,alive,age,pericardialeffusion,fractionalshortening,epss,lvdd,wallmotion-score,wallmotion-index,mult,name,group,aliveat1
0,11.0,0.0,71.0,0.0,0.26,9.0,4.6,14.0,1.0,1.0,name,1,0.0
1,19.0,0.0,72.0,0.0,0.38,6.0,4.1,14.0,1.7,0.588,name,1,0.0
2,16.0,0.0,55.0,0.0,0.26,4.0,3.42,14.0,1.0,1.0,name,1,0.0
3,57.0,0.0,60.0,0.0,0.253,12.062,4.603,16.0,1.45,0.788,name,1,0.0
4,19.0,1.0,57.0,0.0,0.16,22.0,5.75,18.0,2.25,0.571,name,1,0.0
5,26.0,0.0,68.0,0.0,0.26,5.0,4.31,12.0,1.0,0.857,name,1,0.0
6,13.0,0.0,62.0,0.0,0.23,31.0,5.43,22.5,1.875,0.857,name,1,0.0
7,50.0,0.0,60.0,0.0,0.33,8.0,5.25,14.0,1.0,1.0,name,1,0.0
8,19.0,0.0,46.0,0.0,0.34,0.0,5.09,16.0,1.14,1.003,name,1,0.0
9,25.0,0.0,54.0,0.0,0.14,13.0,4.49,15.5,1.19,0.93,name,1,0.0


In [13]:
# Show the last rows of the frame.
data.tail()

Unnamed: 0,survival,alive,age,pericardialeffusion,fractionalshortening,epss,lvdd,wallmotion-score,wallmotion-index,mult,name,group,aliveat1
128,7.5,1.0,64.0,0.0,0.24,12.9,4.72,12.0,1.0,0.857,name,,
129,41.0,0.0,64.0,0.0,0.28,5.4,5.47,11.0,1.1,0.714,name,,
130,36.0,0.0,69.0,0.0,0.2,7.0,5.05,14.5,1.21,0.857,name,,
131,22.0,0.0,57.0,0.0,0.14,16.1,4.36,15.0,1.36,0.786,name,,
132,20.0,0.0,62.0,0.0,0.15,0.0,4.51,15.5,1.409,0.786,name,,


In [14]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(133, 13)

In [16]:
# Request the last 20 rows of the frame.
data.tail(20)

Unnamed: 0,survival,alive,age,pericardialeffusion,fractionalshortening,epss,lvdd,wallmotion-score,wallmotion-index,mult,name,group,aliveat1
113,36.0,0.0,61.0,0.0,0.27,9.0,4.06,12.0,1.0,0.857,name,,
114,25.0,0.0,59.0,1.0,0.4,9.2,5.36,12.0,1.0,0.857,name,,
115,27.0,0.0,57.0,0.0,0.29,9.4,4.77,9.0,1.0,0.64,name,,
116,34.0,0.0,62.0,1.0,0.19,28.9,6.63,19.5,1.95,0.714,name,,
117,37.0,0.0,,0.0,0.26,0.0,4.38,9.0,1.0,0.64,name,,
118,34.0,0.0,54.0,0.0,0.43,9.3,4.79,10.0,1.0,0.714,name,,
119,28.0,1.0,62.0,1.0,0.24,28.6,5.86,21.5,1.95,0.786,name,,
120,28.0,0.0,,0.0,0.23,19.1,5.49,12.0,1.2,0.71,name,,
121,17.0,0.0,64.0,0.0,0.15,6.6,4.17,14.0,1.27,0.786,name,,
122,38.0,0.0,57.0,1.0,0.12,0.0,2.32,16.5,1.375,0.857,name,,


In [19]:
# Draw one random row; the fixed seed makes the displayed row reproducible
# across kernel restarts.
data.sample(random_state=42)

Unnamed: 0,survival,alive,age,pericardialeffusion,fractionalshortening,epss,lvdd,wallmotion-score,wallmotion-index,mult,name,group,aliveat1
131,22.0,0.0,57.0,0.0,0.14,16.1,4.36,15.0,1.36,0.786,name,,


In [20]:
# Draw 30 random rows; the fixed seed makes the displayed sample reproducible
# across kernel restarts.
data.sample(30, random_state=42)

Unnamed: 0,survival,alive,age,pericardialeffusion,fractionalshortening,epss,lvdd,wallmotion-score,wallmotion-index,mult,name,group,aliveat1
32,3.0,1.0,,0.0,,12.0,,6.0,3.0,0.14,name,2.0,
60,38.0,0.0,68.0,0.0,0.29,,4.41,14.0,1.167,0.857,name,2.0,
98,33.0,0.0,59.0,0.0,0.5,9.1,3.42,18.0,1.5,0.857,name,2.0,
31,1.0,1.0,52.0,1.0,0.17,17.2,5.32,14.0,1.17,0.857,name,2.0,
14,0.5,1.0,62.0,0.0,0.12,23.0,5.8,11.67,2.33,0.358,name,1.0,1.0
129,41.0,0.0,64.0,0.0,0.28,5.4,5.47,11.0,1.1,0.714,name,,
8,19.0,0.0,46.0,0.0,0.34,0.0,5.09,16.0,1.14,1.003,name,1.0,0.0
25,29.0,0.0,54.0,0.0,0.3,7.0,3.85,10.0,1.667,0.43,name,2.0,
50,,,,,,,,,,,,,
47,20.0,1.0,59.0,0.0,0.03,21.3,6.29,17.0,1.31,0.928,name,2.0,0.0


In [21]:
# Per-column non-null counts and dtypes; the gaps between the non-null counts
# and the total number of rows reveal which columns contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133 entries, 0 to 132
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   survival              130 non-null    float64
 1   alive                 131 non-null    float64
 2   age                   126 non-null    float64
 3   pericardialeffusion   132 non-null    float64
 4   fractionalshortening  124 non-null    float64
 5   epss                  117 non-null    float64
 6   lvdd                  121 non-null    float64
 7   wallmotion-score      128 non-null    float64
 8   wallmotion-index      130 non-null    float64
 9   mult                  129 non-null    float64
 10  name                  131 non-null    object 
 11  group                 110 non-null    object 
 12  aliveat1              75 non-null     float64
dtypes: float64(11), object(2)
memory usage: 13.6+ KB


In [22]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,survival,alive,age,pericardialeffusion,fractionalshortening,epss,lvdd,wallmotion-score,wallmotion-index,mult,aliveat1
count,130.0,131.0,126.0,132.0,124.0,117.0,121.0,128.0,130.0,129.0,75.0
mean,22.182923,0.328244,62.813722,0.765152,0.216734,12.164769,4.763157,14.438125,1.378,0.786202,0.346667
std,15.858267,0.471377,8.34211,6.697225,0.107513,7.370159,0.810013,5.018566,0.45185,0.225661,0.506534
min,0.03,0.0,35.0,0.0,0.01,0.0,2.32,2.0,1.0,0.14,0.0
25%,7.875,0.0,57.0,0.0,0.15,7.0,4.23,11.0,1.0,0.714,0.0
50%,23.5,0.0,62.0,0.0,0.205,11.0,4.65,14.0,1.216,0.786,0.0
75%,33.0,1.0,67.75,0.0,0.27,16.1,5.3,16.5,1.5075,0.857,1.0
max,57.0,1.0,86.0,77.0,0.61,40.0,6.78,39.0,3.0,2.0,2.0


# 2: Data Cleaning

Handling Missing Values
-
- Imputation: filling missing numeric values with the column mean and missing categorical values with the mode (most frequent value).

In [2]:
import pandas as pd
# Re-read the CSV so the cleaning section starts from the data as stored on disk.
data = pd.read_csv('echocardiogram.csv')


In [3]:
# Number of missing values in each column.
data.isnull().sum()

survival                 3
alive                    2
age                      7
pericardialeffusion      1
fractionalshortening     9
epss                    16
lvdd                    12
wallmotion-score         5
wallmotion-index         3
mult                     4
name                     2
group                   23
aliveat1                58
dtype: int64

In [4]:
import pandas as pd
import numpy as np

# Split the frame by dtype so the numeric part can be imputed on its own.
numeric_cols = data.select_dtypes(include=[np.number])
non_numeric_cols = data.select_dtypes(exclude=[np.number])

# Fill missing numeric values with each column's mean. The result is assigned
# back instead of using inplace=True, so no derived frame is mutated in place.
numeric_cols = numeric_cols.fillna(numeric_cols.mean())

# Recombine; the non-numeric columns are not imputed in this step, so their
# missing values are still reported below.
data = pd.concat([numeric_cols, non_numeric_cols], axis=1)

# Check for any remaining missing values
missing_values = data.isnull().sum()
print(missing_values)


survival                 0
alive                    0
age                      0
pericardialeffusion      0
fractionalshortening     0
epss                     0
lvdd                     0
wallmotion-score         0
wallmotion-index         0
mult                     0
aliveat1                 0
name                     2
group                   23
dtype: int64


In [5]:
import pandas as pd
import numpy as np

# Split the frame by dtype: numeric columns get the mean, the remaining
# (categorical) columns get the mode.
numeric_cols = data.select_dtypes(include=[np.number])
non_numeric_cols = data.select_dtypes(exclude=[np.number])

# Fill missing numeric values with each column's mean.
numeric_cols = numeric_cols.fillna(numeric_cols.mean())

# Fill missing categorical values with each column's most frequent value.
# DataFrame.mode() returns one row per tied mode, so row 0 holds the first mode
# of every column. Filling the whole frame at once avoids the chained
# `frame[col].fillna(..., inplace=True)` pattern, which operates on a temporary
# Series and is not guaranteed to write back into the frame.
column_modes = non_numeric_cols.mode()
if not column_modes.empty:
    non_numeric_cols = non_numeric_cols.fillna(column_modes.iloc[0])

# Combine back with non-numeric columns
data = pd.concat([numeric_cols, non_numeric_cols], axis=1)

# Check for any remaining missing values
missing_values = data.isnull().sum()
print(missing_values)


survival                0
alive                   0
age                     0
pericardialeffusion     0
fractionalshortening    0
epss                    0
lvdd                    0
wallmotion-score        0
wallmotion-index        0
mult                    0
aliveat1                0
name                    0
group                   0
dtype: int64


In [None]:
# Shape after imputation, for comparison with the shape of the raw frame.
data.shape

Removal: Deleting rows with missing values.
-

In [None]:
# Re-read the CSV so the row-removal demo starts from the data as stored on disk.
data = pd.read_csv('echocardiogram.csv')
# Missing-value count per column before any rows are dropped.
data.isnull().sum()

In [None]:
# Shape before dropping rows, as a baseline for the comparison below.
data.shape

In [None]:
# Drop every row that has at least one missing value. The result is assigned
# back rather than mutating the frame in place, which keeps the cell safe to
# re-run and compatible with method chaining.
data = data.dropna()

# Check for any remaining missing values
missing_values = data.isnull().sum()
print(missing_values)


In [None]:
# Shape after dropping incomplete rows; compare with the shape shown before the drop.
data.shape

Removing Duplicates
-

In [None]:
# Re-read the CSV so the duplicate-removal demo starts from the data as stored on disk.
data = pd.read_csv('echocardiogram.csv')
# Shape before removing duplicates.
data.shape


In [None]:
# Remove rows that are exact duplicates of an earlier row. The result is
# assigned back rather than mutating the frame in place.
data = data.drop_duplicates()
# Shape after removing duplicates; compare with the shape shown before.
data.shape

# 3: Outlier Detection and Removal

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Re-read the CSV so the outlier section starts from the data as stored on disk.
data = pd.read_csv('echocardiogram.csv')

# Summary statistics of the numeric columns, including the 25% and 75%
# quartiles that the IQR rule further down is built on.
data.describe()

In [None]:
# Worked example of a lower fence of the form Q1 - 1.5 * IQR
# (presumably with Q1 = 0.25 and IQR = 0.5 as illustrative values).
0.25-1.5*0.5

In [None]:
# Worked example of an upper fence of the form Q3 + 1.5 * IQR
# (presumably with Q3 = 0.75 and IQR = 0.5 as illustrative values).
0.75 + 1.5 * 0.5

In [None]:
# Only numeric columns take part in the outlier rule.
numeric_cols = data.select_dtypes(include=[np.number])

# Per-column quartiles and interquartile range.
Q1 = numeric_cols.quantile(0.25)
Q3 = numeric_cols.quantile(0.75)
IQR = Q3 - Q1

# A value is an outlier when it lies more than 1.5 * IQR below Q1 or above Q3.
# Comparisons against missing values are False, so missing entries never flag a row.
# NOTE(review): for a column whose IQR is 0 both fences collapse onto the
# quartile, so every differing value in that column is flagged -- confirm this
# is intended for indicator-like columns.
below_fence = numeric_cols < (Q1 - 1.5 * IQR)
above_fence = numeric_cols > (Q3 + 1.5 * IQR)
row_has_outlier = (below_fence | above_fence).any(axis=1)

# Keep only the rows in which no numeric column is flagged.
data_cleaned = data[~row_has_outlier]

# Box plot of the numeric columns before outlier removal, drawn in the left
# half of a two-panel layout; the "after" plot is produced in the next cell.
plt.figure(figsize=(20, 6))
ax_before = plt.subplot(1, 2, 1)
numeric_cols.boxplot(ax=ax_before)
ax_before.set_title("Before Outlier Removal")

plt.tight_layout()
plt.show()


In [None]:
# Box plot of the numeric columns that remain after outlier removal, drawn in
# the right half of a two-panel layout.
plt.figure(figsize=(20, 6))
ax_after = plt.subplot(1, 2, 2)
numeric_after = data_cleaned.select_dtypes(include=[np.number])
numeric_after.boxplot(ax=ax_after)
ax_after.set_title("After Outlier Removal")

plt.tight_layout()
plt.show()


In [None]:
# Shape after outlier removal; the row count shows how many rows survived the filter.
data_cleaned.shape

In [None]:
# Preview the first rows of the outlier-filtered frame.
data_cleaned.head()

# 4: Data Transformation


Key Differences
-
**Range of values**

- Normalization: values are scaled to a fixed range, typically [0, 1].
- Standardization: values are rescaled to have a mean of 0 and a standard deviation of 1; the result is not confined to a fixed range.

**Effect on distribution**

- Normalization: compresses or stretches the data to fit within the specified range. Because the bounds come from the observed minimum and maximum, a single extreme value can squeeze the remaining values into a narrow band.
- Standardization: preserves the shape of the original distribution but changes the scale.

**Use cases**

- Normalization: suitable for distance-based algorithms such as k-nearest neighbors, and for neural networks.
- Standardization: suitable for algorithms that benefit from centered, comparably scaled features, such as linear regression and logistic regression (particularly when regularized).


Normalization/Standardization
-
- Normalization
Definition:
Normalization rescales the data to a fixed range, typically [0, 1] or [-1, 1].

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Start from the data as stored on disk.
data = pd.read_csv('echocardiogram.csv')

# Only numeric columns can be rescaled; everything else is carried along unchanged.
numeric_cols = data.select_dtypes(include='number')
non_numeric_cols = data.select_dtypes(exclude='number')

# Min-max scaling returns a plain array, so wrap it in a DataFrame again to
# restore the column names.
scaler = MinMaxScaler()
scaled_numeric_data = scaler.fit_transform(numeric_cols)
scaled_numeric_df = pd.DataFrame(
    data=scaled_numeric_data,
    columns=numeric_cols.columns,
)

# Put the scaled numeric block and the untouched non-numeric block side by side.
scaled_data = pd.concat(
    [scaled_numeric_df, non_numeric_cols.reset_index(drop=True)],
    axis=1,
)

# Report the shape, a blank line and a separator, then preview the result.
print(scaled_data.shape, '', '*' * 60, sep='\n')
scaled_data.head()


Standardization
-
Definition:
Standardization rescales the data so that it has a mean of 0 and a standard deviation of 1.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Start from the data as stored on disk.
data = pd.read_csv('echocardiogram.csv')

# Only numeric columns can be rescaled; everything else is carried along unchanged.
numeric_cols = data.select_dtypes(include='number')
non_numeric_cols = data.select_dtypes(exclude='number')

# Standardization returns a plain array, so wrap it in a DataFrame again to
# restore the column names.
scaler = StandardScaler()
scaled_numeric_data = scaler.fit_transform(numeric_cols)
scaled_numeric_df = pd.DataFrame(
    data=scaled_numeric_data,
    columns=numeric_cols.columns,
)

# Put the scaled numeric block and the untouched non-numeric block side by side.
scaled_data = pd.concat(
    [scaled_numeric_df, non_numeric_cols.reset_index(drop=True)],
    axis=1,
)

# Report the shape, a blank line and a separator, then preview the result.
print(scaled_data.shape, '', '*' * 60, sep='\n')
scaled_data.head()


# 5: One-Hot Encoding


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Re-read the CSV so the encoding section starts from the data as stored on disk.
data = pd.read_csv('echocardiogram.csv')
# Show the first two rows as a reminder of the column layout.
data.head(2)

In [None]:
# Distinct values found in the 'name' column.
data["name"].unique()

In [None]:
# Distinct values found in the 'group' column.
data.group.unique()

In [None]:
# Distinct values found in the 'aliveat1' column.
data.aliveat1.unique()

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_csv('echocardiogram.csv')

# Names of the object-dtype (categorical) columns.
cat_features = [feature for feature in data.columns if data[feature].dtype == 'O']

# One-hot encode the categorical columns themselves. Passing the bare list of
# names (`cat_features`) would encode the column *names* as if they were data,
# so the frame is indexed with that list first.
data1 = pd.get_dummies(data[cat_features])
data1

In [None]:
# Column names, non-null counts and dtypes of the encoded frame.
data1.info()

In [None]:
# The list of column names that were identified as object dtype.
cat_features

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_csv('echocardiogram.csv')

# Names of the object-dtype (categorical) columns.
cat_features = [feature for feature in data.columns if data[feature].dtype == 'O']

# Replace each categorical column with its indicator columns; all other
# columns of `data` are passed through unchanged.
data1 = pd.get_dummies(data, columns=cat_features)

# `data1` already contains every non-categorical column of `data`, so
# concatenating `data` with it again would duplicate those column names.
# The encoded frame is therefore used as-is. The name `scaled_data` is kept
# because later cells refer to it, although no scaling happens in this cell.
scaled_data = data1.copy()

# Check the shape of the encoded data
print(scaled_data.shape)
print()
print('*' * 70)

scaled_data.head()


In [None]:
# Column labels of the frame as loaded from the CSV.
data.columns

In [None]:
# Column labels of the combined frame built in the encoding cell.
scaled_data.columns

In [None]:
# Preview the first rows of the one-hot encoded frame.
data1.head()

# 6: Data Reduction

Dimensionality Reduction
-
PCA (Principal Component Analysis)

In [None]:
# Shape of the frame left in `scaled_data` by the encoding section.
scaled_data.shape

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the dataset
data = pd.read_csv('echocardiogram.csv')

# Fill missing numeric values with the column mean (assigned back rather than
# mutated in place). Object columns are left as they are.
data = data.fillna(data.mean(numeric_only=True))

# Separate categorical and numeric columns
cat_features = [feature for feature in data.columns if data[feature].dtype == 'O']
numeric_features = [feature for feature in data.columns if data[feature].dtype != 'O']

# Convert categorical columns using pd.get_dummies
data = pd.get_dummies(data, columns=cat_features)

# Standardize the numeric columns
scaler = StandardScaler()
data[numeric_features] = scaler.fit_transform(data[numeric_features].values)

# Fit PCA once and reuse the result for both the printout and the plot.
# 15 components are requested, capped at the size of the frame because PCA
# cannot produce more components than min(n_samples, n_features).
# NOTE(review): the frame passed to PCA still contains the 'alive' column --
# confirm whether a label column should take part in the projection.
n_components = min(15, *data.shape)
pca = PCA(n_components=n_components)
data_pca = pca.fit_transform(data)

# Check the shape of the PCA data
print(data_pca.shape)
print(data_pca[:5])  # Print the first 5 rows of the PCA-transformed data

# Side-by-side comparison: two of the input features vs. the first two
# principal components.
fig, (ax_features, ax_pca) = plt.subplots(1, 2, figsize=(14, 6))

# Left: first two numeric features (these values have been standardized above).
ax_features.scatter(data[numeric_features[0]], data[numeric_features[1]], alpha=0.5)
ax_features.set_title('Original Data')
ax_features.set_xlabel(numeric_features[0])
ax_features.set_ylabel(numeric_features[1])

# Right: only the first two of the fitted components are drawn.
ax_pca.scatter(data_pca[:, 0], data_pca[:, 1], alpha=0.5)
ax_pca.set_title('PCA Transformed Data')
ax_pca.set_xlabel('Principal Component 1')
ax_pca.set_ylabel('Principal Component 2')

fig.tight_layout()
plt.show()


In [None]:
# Python type of the object returned by pca.fit_transform.
type(data_pca)

In [None]:
# Number of array dimensions of the PCA output.
data_pca.ndim

In [None]:
# Shape of the PCA output.
data_pca.shape

# 7: Handling Imbalanced Data
    
- Resampling Techniques
- Oversampling

In [None]:
# Re-read the CSV and look at the relative frequency of each 'alive' class
# to see how imbalanced the target is.
data = pd.read_csv('echocardiogram.csv')

data['alive'].value_counts(normalize=True)

In [None]:
# Shape before resampling, as a baseline for the comparison below.
data.shape

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv('echocardiogram.csv')

# A row without a label cannot be assigned to a class, so it is dropped
# instead of receiving a mean-imputed (non-binary) label.
data = data.dropna(subset=['alive']).reset_index(drop=True)

# Split off the target *before* imputing and scaling, so that the label is
# never mean-filled or standardized together with the features. The 0.5
# threshold is applied to the raw label values.
y = (data['alive'] > 0.5).astype(int)
X = data.drop(columns=['alive'])

# Fill missing numeric feature values with the column mean.
X = X.fillna(X.mean(numeric_only=True))

# Separate categorical and numeric feature columns
cat_features = [feature for feature in X.columns if X[feature].dtype == 'O']
numeric_features = [feature for feature in X.columns if X[feature].dtype != 'O']

# Convert categorical columns using pd.get_dummies
X = pd.get_dummies(X, columns=cat_features)

# Standardize the numeric feature columns
scaler = StandardScaler()
X[numeric_features] = scaler.fit_transform(X[numeric_features].values)

print(X.shape, y.shape)

# Oversample with SMOTE; the fixed seed keeps the result reproducible.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine resampled data
data_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['alive'])], axis=1)
data_resampled.head()

In [None]:
# Relative frequency of each 'alive' class after oversampling.
data_resampled['alive'].value_counts(normalize=True)

In [None]:
# Shape after oversampling; compare the row count with the shape before resampling.
data_resampled.shape

Undersampling
-

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv('echocardiogram.csv')

# A row without a label cannot be assigned to a class, so it is dropped
# instead of receiving a mean-imputed (non-binary) label.
data = data.dropna(subset=['alive']).reset_index(drop=True)

# Split off the target *before* imputing and scaling, so that the label is
# never mean-filled or standardized together with the features. The 0.5
# threshold is applied to the raw label values.
y = (data['alive'] > 0.5).astype(int)
X = data.drop(columns=['alive'])

# Fill missing numeric feature values with the column mean.
X = X.fillna(X.mean(numeric_only=True))

# Separate categorical and numeric feature columns
cat_features = [feature for feature in X.columns if X[feature].dtype == 'O']
numeric_features = [feature for feature in X.columns if X[feature].dtype != 'O']

# Convert categorical columns using pd.get_dummies
X = pd.get_dummies(X, columns=cat_features)

# Standardize the numeric feature columns
scaler = StandardScaler()
X[numeric_features] = scaler.fit_transform(X[numeric_features].values)

print(X.shape, y.shape)

# Apply random undersampling (this is not SMOTE: rows of the larger class are
# discarded at random). The fixed seed keeps the selection reproducible.
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Combine resampled data
data_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['alive'])], axis=1)
data_resampled.head()


In [None]:
# Number of rows per 'alive' class after undersampling.
data_resampled.alive.value_counts()

In [None]:
# Shape after undersampling; compare the row count with the shape before resampling.
data_resampled.shape

# Target Encoder

In [None]:
import pandas as pd
from category_encoders import TargetEncoder

# Small self-contained example. It uses its own variable name so that the
# notebook-wide `data` DataFrame is not replaced by a plain dict.
animal_records = {'animal': ['cat', 'dog', 'mouse', 'dog', 'cat'], 'target': [1, 0, 1, 0, 1]}
df = pd.DataFrame(animal_records)

# Fit the encoder on the 'animal' column against 'target' and show the
# encoded column.
target_encoder = TargetEncoder(cols=['animal'])
target_encoded = target_encoder.fit_transform(df['animal'], df['target'])
print(target_encoded)


# 8: Splitting Data

In [None]:
from sklearn.model_selection import train_test_split
data = pd.read_csv('echocardiogram.csv')

# Rows without a label cannot be used for supervised training or evaluation,
# so they are removed before splitting.
data = data.dropna(subset=['alive'])

X = data.drop('alive', axis=1)
y = data['alive']

# 70/30 split with a fixed seed. Stratifying on the label keeps the class
# proportions the same in the train and test parts, which matters for a
# small, imbalanced dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


In [None]:
# Shapes of the four pieces returned by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape