In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [16]:
df = pd.read_csv('Data/Churn_Modelling.csv')

In [17]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  str    
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  str    
 5   Gender           10000 non-null  str    
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), str(3)
memory usage: 1.1 MB


In [4]:
print(df.isnull().sum())

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


# 1. Handling missing values

In [19]:
# Age is loaded as int64, which cannot represent NaN. Cast to float explicitly
# instead of relying on pandas silently upcasting the column on assignment.
df['Age'] = df['Age'].astype('float64')
# .loc slices are label-inclusive, so this blanks rows 0..10 (11 rows) to
# simulate missing data for the imputation examples that follow.
df.loc[0:10, 'Age'] = np.nan

In [20]:
print(df.isnull().sum())

RowNumber           0
CustomerId          0
Surname             0
CreditScore         0
Geography           0
Gender              0
Age                11
Tenure              0
Balance             0
NumOfProducts       0
HasCrCard           0
IsActiveMember      0
EstimatedSalary     0
Exited              0
dtype: int64


### 1.1 Delete Columns

In [None]:
# Discard every column that holds at least one missing value.
updated_df = df.dropna(axis='columns')

In [None]:
updated_df.info()

We deleted the `Age` column completely: `dropna(axis=1)` drops every column that contains at least one null value.

Dropping a column should only be used when most of its values are null.

### 1.2 Delete Rows

In [None]:
# Discard every row that holds at least one missing value.
updated_df = df.dropna(axis='index')

In [None]:
updated_df.info()

### 1.3 Filling the missing values - imputation

In [7]:
mean = df['Age'].mean()

In [8]:
median = df['Age'].median()

In [None]:
# Work on a copy: plain assignment (`update_df = df`) only binds a second name
# to the same frame, so filling Age through it would also overwrite `df`.
update_df = df.copy()
# Mean imputation. Swap in `median` when Age has many outliers, because the
# median is far less sensitive to extreme values than the mean.
update_df['Age'] = update_df['Age'].fillna(mean)

In [11]:
update_df.info()

<class 'pandas.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  str    
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  str    
 5   Gender           10000 non-null  str    
 6   Age              10000 non-null  float64
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(3), int64(8), str(3)
memory usage: 1.1 MB


### 1.4 Forward & Backward Filling - imputation

In [None]:
# Backward fill: each missing Age takes the next valid value further down the
# column. The NaNs injected earlier sit at the top of the frame (rows 0-10), so
# a forward fill (`ffill`) would have no earlier value to copy from.
# NOTE: this overwrites df['Age'] in place.
df['Age'] = df['Age'].bfill()

In [15]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  str    
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  str    
 5   Gender           10000 non-null  str    
 6   Age              10000 non-null  float64
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(3), int64(8), str(3)
memory usage: 1.1 MB


# 2. Feature Scaling

In [24]:
import seaborn as sns

In [25]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  str    
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  str    
 5   Gender           10000 non-null  str    
 6   Age              9989 non-null   float64
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(3), int64(8), str(3)
memory usage: 1.1 MB


In [26]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [27]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,,2,125510.82,1,1,1,79084.1,0


In [28]:
# Keep only the two numeric features used in the scaling examples.
# Label-based selection raises KeyError on a misspelled column name, whereas
# pd.DataFrame(df, columns=[...]) would silently create an all-NaN column.
# .copy() makes new_df independent of df, so later edits do not leak back.
new_df = df[['Age', 'Tenure']].copy()

In [31]:
new_df.head(15)

Unnamed: 0,Age,Tenure
0,,2
1,,1
2,,8
3,,1
4,,2
5,,8
6,,7
7,,4
8,,4
9,,2


In [32]:
# Replace the missing ages with the average of the observed ones
# (Series.mean skips NaN by default).
age_mean = new_df['Age'].mean()
new_df['Age'] = new_df['Age'].fillna(age_mean)

In [33]:
new_df.head(15)

Unnamed: 0,Age,Tenure
0,38.921414,2
1,38.921414,1
2,38.921414,8
3,38.921414,1
4,38.921414,2
5,38.921414,8
6,38.921414,7
7,38.921414,4
8,38.921414,4
9,38.921414,2


### 2.1 Normalization:

In [36]:
# Min-max normalization: rescale each column to the [0, 1] range.
scaler = MinMaxScaler()
# Learn the per-column minimum and maximum first, then apply the rescaling.
scaler.fit(new_df)
normalized_df = scaler.transform(new_df)  # NumPy array, one column per feature
print(normalized_df)

[[0.2827218  0.2       ]
 [0.2827218  0.1       ]
 [0.2827218  0.8       ]
 ...
 [0.24324324 0.7       ]
 [0.32432432 0.3       ]
 [0.13513514 0.4       ]]


### 2.2 Standardization:

In [37]:
# Standardization: centre each column on 0 and scale it to unit variance.
scaler = StandardScaler()
# Learn the per-column mean and standard deviation first, then apply them.
scaler.fit(new_df)
standardized_df = scaler.transform(new_df)  # NumPy array, one column per feature
print(standardized_df)

[[ 0.         -1.04175968]
 [ 0.         -1.38753759]
 [ 0.          1.03290776]
 ...
 [-0.27863284  0.68712986]
 [ 0.29362336 -0.69598177]
 [-1.0416411  -0.35020386]]


# 3. Outliers Treatment

In [38]:
import statistics

In [39]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,,2,125510.82,1,1,1,79084.1,0


In [None]:
def find_anomalies(data, n_std=3):
  """Return the values of ``data`` that lie far from its mean.

  A value is flagged when it falls outside
  ``mean - n_std * stdev`` .. ``mean + n_std * stdev``, where ``stdev`` is the
  sample standard deviation of the usable values.

  Args:
    data: Iterable of numeric values (list, NumPy array, pandas Series,
      generator, ...). NaN entries are ignored.
    n_std: Half-width of the accepted band, expressed in standard deviations.
      Defaults to 3.

  Returns:
    A list of the flagged values, in their original order. The list is empty
    when fewer than two usable values are supplied, because a sample standard
    deviation is undefined in that case.
  """
  # Materialise once so one-shot iterables can be traversed several times.
  # NaN is the only value that is not equal to itself, so `value == value`
  # drops it; a single NaN would otherwise turn the mean and both limits into
  # NaN and nothing would ever be flagged.
  values = [value for value in data if value == value]

  # statistics.stdev raises StatisticsError for fewer than two data points.
  if len(values) < 2:
    return []

  data_mean = statistics.mean(values)
  anomaly_cut_off = statistics.stdev(values) * n_std

  lower_limit = data_mean - anomaly_cut_off
  upper_limit = data_mean + anomaly_cut_off

  return [value for value in values
          if value > upper_limit or value < lower_limit]