<a href="https://colab.research.google.com/github/koleshjr/ALL_MY_TEMPLATES/blob/main/Outlier_Detection_template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Outlier Detection Notebook

#### Load Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import sys
# sys.path.append("/home/pavithra/Pictures/learning/ML/kaggle/")
# sys.path
import kaggle_utils_py

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.impute import KNNImputer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

#### Deal with missing values using the KNN imputer

In [None]:


%%time
data = pd.read_csv('../input/song-popularity-prediction/train.csv', index_col=0)
# as you seen in the EDA notebook this data has lots of missing values. i am gonna fill those valus using KNNimputer.
missing_features = kaggle_utils_py.find_features_with_missing_values(data)
knn_imptr = KNNImputer(n_neighbors=2)
data[missing_features] = knn_imptr.fit_transform(data[missing_features])
data.isna().sum()



#### Outlier Detection using IQR method and Z-score methods
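* The detection method depends on the feature's distribution: the Z-score method for features that follow a normal distribution, the IQR method otherwise.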

In [None]:


# Count the outliers in each continuous feature with the IQR method.
# 'song_duration_ms' is deliberately left out of this list: it follows a
# normal distribution, so it is checked with the z-score method instead.
cotinuous_column_list = [
    'acousticness',
    'danceability',
    'energy',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'audio_valence',
]
continuous_features = data[cotinuous_column_list]
outliers = kaggle_utils_py.find_outliers_iqr_method(continuous_features)
display(outliers)

# col_name is mandatory when a single feature is passed to the z-score helper.
kaggle_utils_py.find_outlier_z_score_method(
    data["song_duration_ms"],
    col_name="song_duration_ms",
)



#### Method 1: Use models that are not sensitive to outliers 
* Tree Based Models

#### Method 2: Trimming/ Dropping them

In [None]:
column = [
    'song_duration_ms',
    'energy',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
]

# With new_feature=True the helper returns a frame carrying an extra
# 'num_of_outliers' column that says how many outliers each row contains.
iqr_result = kaggle_utils_py.find_outliers_iqr_method(data[column], new_feature=True)
df, num_of_outliers = iqr_result

print("Shape of the original data --->",df.shape)
df.head()


In [None]:
# Index labels of every row that contains at least one outlier.
has_outliers = df['num_of_outliers'] > 0
outlier_rows = df.loc[has_outliers].index

# Features: drop the flagged rows, then the helper column itself.
d = df.drop(index=outlier_rows)
X_trimmed = d.drop(columns=['num_of_outliers'])

# Target: drop the same rows so X and y stay aligned.
# NOTE(review): assumes the target is the last column of `data` — confirm.
y_trimmed = data.iloc[:, -1].drop(index=outlier_rows)

print("Shape of the X --->",X_trimmed.shape)
print("Shape of the y --->",y_trimmed.shape)

#### Method 3: Flooring and Capping using IQR

In [None]:
column = [
    'song_duration_ms',
    'energy',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
]

# With return_limits=True the helper hands back two frames holding the
# lower and the upper IQR limit of every feature.
iqr_limits = kaggle_utils_py.find_outliers_iqr_method(data[column], return_limits=True)
lower_limit_df, upper_limit_df = iqr_limits

# Work on a copy so the flooring/capping never touches `data`.
df_flooring_capping = data[column].copy()

# Replace the value with the upper limit if it is greater than that limit.
def check_upper(row, upper_value):
    """Cap a single value at an upper limit.

    Parameters
    ----------
    row : scalar
        The value to check (one element of a Series when used with
        ``Series.apply``).
    upper_value : scalar or array-like
        The upper limit. Either a plain scalar or an array-like whose first
        element is the limit (the notebook passes a one-element NumPy array).

    Returns
    -------
    scalar
        The limit if ``row`` is strictly greater than it, otherwise ``row``
        unchanged. ``row`` is also returned unchanged when ``upper_value``
        is empty, i.e. when there is no limit to compare against.
    """
    # Normalise scalars and arrays to a flat array so both are accepted;
    # indexing a bare scalar with [0] would raise a TypeError.
    limits = np.asarray(upper_value).ravel()
    if limits.size == 0:
        # No limit available: comparing against an empty array is ambiguous.
        return row
    limit = limits[0]
    return limit if row > limit else row

# Replace the value with the lower limit if it is lower than that limit.
def check_lower(row, lower_value):
    """Floor a single value at a lower limit.

    Parameters
    ----------
    row : scalar
        The value to check (one element of a Series when used with
        ``Series.apply``).
    lower_value : scalar or array-like
        The lower limit. Either a plain scalar or an array-like whose first
        element is the limit (the notebook passes a one-element NumPy array).

    Returns
    -------
    scalar
        The limit if ``row`` is strictly lower than it, otherwise ``row``
        unchanged. ``row`` is also returned unchanged when ``lower_value``
        is empty, i.e. when there is no limit to compare against.
    """
    # Normalise scalars and arrays to a flat array so both are accepted;
    # indexing a bare scalar with [0] would raise a TypeError.
    limits = np.asarray(lower_value).ravel()
    if limits.size == 0:
        # No limit available: comparing against an empty array is ambiguous.
        return row
    limit = limits[0]
    return limit if row < limit else row

# a = pd.Series([1,2,3,4,5])
# a = a.apply(check_upper, args=(4,))
# a
for col in column:
    # Scalar IQR limits of this feature (first matching row of each limits frame).
    lower_limit = lower_limit_df.loc[lower_limit_df['Features'] == col, 'lower limit'].iloc[0]
    upper_limit = upper_limit_df.loc[upper_limit_df['Features'] == col, 'upper limit'].iloc[0]
    # Vectorised flooring and capping: values above the upper limit become the
    # upper limit, values below the lower limit become the lower limit. This is
    # the same result as applying check_upper / check_lower to every element,
    # without a Python-level call per row.
    df_flooring_capping[col] = df_flooring_capping[col].clip(lower=lower_limit, upper=upper_limit)
df_flooring_capping.head()

In [None]:


# Compare the z-score outlier report of the original features with the
# report of the floored/capped features.
column = [
    'song_duration_ms',
    'energy',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
]

print("before transformation")
z_outliers_before = kaggle_utils_py.find_outlier_z_score_method(data[column])
display(z_outliers_before)

print("after transformation")
kaggle_utils_py.find_outlier_z_score_method(df_flooring_capping[column]) # looks good



#### Method 4: Treat them as missing values and impute them
* You can use mean or median

#### Method 5: Transformation
* Log transformation
* Box Cox transformation

###### Log transformation: best for right-skewed data

In [None]:
column_right_skewed = ['liveness', 'speechiness', ]

# Build the transformed frame as a brand-new object instead of assigning into
# `data[column_right_skewed]`: writing into that un-copied selection triggers
# pandas' SettingWithCopyWarning. `data` itself is left untouched.
# NOTE(review): np.log yields -inf for 0 and NaN for negative values — this
# assumes both features are strictly positive; confirm against the data.
df = np.log(data[column_right_skewed])

# Plot 'liveness' before and after the transformation.
sns.displot(data['liveness'])
plt.title("Feature before transformation")
sns.despine()
plt.show()

sns.displot(df['liveness'])
plt.title("Feature after transformation")
sns.despine()
plt.show()

###### Check outliers on log-transformed data

In [None]:


# Check the outliers on the log-transformed data.
# A single feature is passed here, so col_name is given explicitly, exactly
# like the 'song_duration_ms' z-score call earlier in this notebook (which
# notes that col_name is mandatory for a single feature).
print("before transformation")
display(kaggle_utils_py.find_outlier_z_score_method(data['liveness'], col_name='liveness'))
print("after transformation")
kaggle_utils_py.find_outlier_z_score_method(df['liveness'], col_name='liveness')



###### Cube root transformation: also good for right-skewed data

In [None]:
column_right_skewed = ['liveness', 'speechiness', ]

# Build the transformed frame as a brand-new object instead of assigning into
# `data[column_right_skewed]`: writing into that un-copied selection triggers
# pandas' SettingWithCopyWarning. `data` itself is left untouched.
# NOTE(review): `** (1/3)` gives NaN for negative values — this assumes both
# features are non-negative; confirm against the data.
df = data[column_right_skewed] ** (1/3)

# Plot 'liveness' before and after the transformation.
sns.displot(data['liveness'])
plt.title("Feature before transformation")
sns.despine()
plt.show()

sns.displot(df['liveness'])
plt.title("Feature after transformation")
sns.despine()
plt.show()

In [2]:
# Check the outliers on the cube-root-transformed data.
# A single feature is passed here, so col_name is given explicitly, exactly
# like the 'song_duration_ms' z-score call earlier in this notebook (which
# notes that col_name is mandatory for a single feature).
print("before transformation")
display(kaggle_utils_py.find_outlier_z_score_method(data['liveness'], col_name='liveness'))
print("after transformation")
kaggle_utils_py.find_outlier_z_score_method(df['liveness'], col_name='liveness')


##### Box Cox transformation

In [None]:


# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.stats` is loaded. The name `scipy` is still bound afterwards.
import scipy.stats

column = ['liveness', 'speechiness', ]

# Explicit copy so that the column assignments below write into an
# independent frame (no SettingWithCopyWarning, `data` left untouched).
df = data[column].copy()

# Fitted lambda of every feature, kept so that none is lost when the loop
# moves on to the next column.
boxcox_lambdas = {}
for col in column:
    # With lmbda=None, boxcox estimates lambda itself and returns it as the
    # second output next to the transformed values.
    # NOTE(review): Box-Cox requires strictly positive input — confirm that
    # both features satisfy this.
    df[col], filterd_lmbda = scipy.stats.boxcox(data[col], lmbda=None)
    boxcox_lambdas[col] = filterd_lmbda

# Plot 'liveness' before and after the transformation.
sns.displot(data['liveness'])
plt.title("Feature before transformation")
sns.despine()
plt.show()

sns.displot(df['liveness'])
plt.title("Feature after transformation")
sns.despine()
plt.show()



In [None]:


# Check the outliers on the Box-Cox-transformed data.
# A single feature is passed here, so col_name is given explicitly, exactly
# like the 'song_duration_ms' z-score call earlier in this notebook (which
# notes that col_name is mandatory for a single feature).
print("before transformation")
display(kaggle_utils_py.find_outlier_z_score_method(data['liveness'], col_name='liveness'))
print("after transformation")
kaggle_utils_py.find_outlier_z_score_method(df['liveness'], col_name='liveness')

