# Dealing with **Outliers**

In [16]:
import pandas as pd
import numpy as np

In [17]:
# Toy sample: the consecutive ages 20..30 plus one extreme value (50).
data = pd.DataFrame({'age': [*range(20, 31), 50]})
data

Unnamed: 0,age
0,20
1,21
2,22
3,23
4,24
5,25
6,26
7,27
8,28
9,29


In [9]:
# Standardise every age as a z-score: its distance from the mean measured in
# population standard deviations (ddof=0, the same quantity np.std reports).
ages = data['age']
mean = ages.mean()
std = ages.std(ddof=0)
data['z-score'] = (ages - mean) / std
data

Unnamed: 0,age,z-score
0,20,-0.938954
1,21,-0.806396
2,22,-0.673838
3,23,-0.54128
4,24,-0.408721
5,25,-0.276163
6,26,-0.143605
7,27,-0.011047
8,28,0.121512
9,29,0.25407


In [14]:
# Report the full sample, then the rows flagged as outliers.
# The comparison uses the absolute z-score so that values far BELOW the mean
# are flagged as well as values far above it; a plain `> 3` test is one-sided.
print('-------------------------------------')
print(f'Here is data with outlier:\n {data}')
print('-------------------------------------')
print(f"Here are the outliers based on the z-score threshold, 3:\n {data[data['z-score'].abs() > 3]}")

-------------------------------------
Here is data with outlier:
     age   z-score
0    20 -0.938954
1    21 -0.806396
2    22 -0.673838
3    23 -0.541280
4    24 -0.408721
5    25 -0.276163
6    26 -0.143605
7    27 -0.011047
8    28  0.121512
9    29  0.254070
10   30  0.386628
11   50  3.037793
-------------------------------------
Here are the outliers based on the z-score threshold, 3:
     age   z-score
11   50  3.037793


In [15]:
# Remove outliers: keep only rows whose z-score lies within 3 standard
# deviations of the mean on EITHER side. Filtering on the signed value
# (`<= 3`) would silently keep extreme low values.
data = data[data['z-score'].abs() <= 3]
print(f'Here is data without outliers:\n{data}')

Here is data without outliers:
    age   z-score
0    20 -0.938954
1    21 -0.806396
2    22 -0.673838
3    23 -0.541280
4    24 -0.408721
5    25 -0.276163
6    26 -0.143605
7    27 -0.011047
8    28  0.121512
9    29  0.254070
10   30  0.386628


In [19]:
# Finding outliers using the IQR method.
# Rebuild the sample (this cell starts from fresh data) and compute the first
# and third quartiles. `method=` replaces the `interpolation=` keyword that
# NumPy deprecated in 1.22; 'midpoint' averages the two nearest observations,
# so the results are unchanged.
data = pd.DataFrame({'Age': [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 50]})
Q1 = np.percentile(data['Age'], 25, method='midpoint')
Q3 = np.percentile(data['Age'], 75, method='midpoint')

In [20]:
# Calculate the IQR (interquartile range): the third quartile minus the first
IQR = Q3 - Q1

In [21]:
# Fences for the IQR rule: anything more than 1.5 * IQR outside the
# quartiles is treated as an outlier.
fence_reach = 1.5 * IQR
lower_bound = Q1 - fence_reach
upper_bound = Q3 + fence_reach

In [22]:
# Show the full sample, then the rows that fall outside the IQR fences.
separator = "----------------------------------------"
outside_fences = (data['Age'] < lower_bound) | (data['Age'] > upper_bound)

print(separator)
print(f"Here is the data with outliers:\n {data}")
print(separator)
print(f"Here are the outliers based on the IQR threshold:\n {data[outside_fences]}")
print(separator)

----------------------------------------
Here is the data with outliers:
     Age
0    20
1    21
2    22
3    23
4    24
5    25
6    26
7    27
8    28
9    29
10   30
11   50
----------------------------------------
Here are the outliers based on the IQR threshold:
     Age
11   50
----------------------------------------


In [23]:
# Remove the outliers: keep only the rows whose age lies inside the fences
# (both bounds inclusive).
data = data[data['Age'].between(lower_bound, upper_bound)]

In [24]:
# Display the rows that remain after filtering to the IQR bounds
print(f"Here is the data without outliers:\n {data}")

Here is the data without outliers:
     Age
0    20
1    21
2    22
3    23
4    24
5    25
6    26
7    27
8    28
9    29
10   30


In [25]:
# Find outliers using the k-Means method.
# NOTE: k-means is a clustering algorithm, not an outlier detector; here the
# sample is split into two clusters and one of them is treated as "outliers".
from sklearn.cluster import KMeans

# Sample data
data = [[2, 2], [3, 3], [3, 4], [30, 30], [31, 31], [32, 32]]

# Create a K-means model with two clusters (normal and outlier).
# random_state pins the centroid initialisation so re-runs are reproducible.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)

# Fit the model and get each point's cluster label in a single pass
labels = kmeans.fit_predict(data)

# K-means numbers its clusters arbitrarily, so a hard-coded `label == 1`
# can flip between runs. Choose the outlier cluster from the data instead:
# the cluster with the fewest points, and on a tie (as in this sample) the
# one whose centroid lies farther from the origin, purely so that the choice
# is deterministic.
cluster_sizes = np.bincount(labels, minlength=kmeans.n_clusters)
centroid_norms = np.linalg.norm(kmeans.cluster_centers_, axis=1)
outlier_label = min(
    range(kmeans.n_clusters),
    key=lambda k: (cluster_sizes[k], -centroid_norms[k]),
)

# Identify outliers based on cluster labels
outliers = [point for point, label in zip(data, labels) if label == outlier_label]

# print data
print("Data:", data)
print("Outliers:", outliers)
# Remove outliers
data = [point for point, label in zip(data, labels) if label != outlier_label]
print("Data without outliers:", data)

Data: [[2, 2], [3, 3], [3, 4], [30, 30], [31, 31], [32, 32]]
Outliers: [[30, 30], [31, 31], [32, 32]]
Data without outliers: [[2, 2], [3, 3], [3, 4]]
