In [9]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import QuantileTransformer


In [10]:
# Load the RFM table and tag each column with the RFM dimension it stands for.
# `rename` returns a new frame here, so nothing is mutated in place.
df = pd.read_csv("rfm_data.csv").rename(
    columns={
        'LAST_TO_END': 'LAST_TO_END (R)',
        'FLIGHT_COUNT': 'FLIGHT_COUNT (F)',
        'SEG_KM_SUM': 'SEG_KM_SUM (M)',
    }
)

# Map every column onto a normal-shaped distribution through its empirical quantiles.
# random_state pins any row subsampling the transformer performs while estimating
# the quantiles, so the scaled values are reproducible after a kernel restart.
quantile_trans = QuantileTransformer(n_quantiles=1000, output_distribution='normal', random_state=42)
# Carry over df's index so scaled rows can be matched back to the raw rows by label.
df_scaled = pd.DataFrame(quantile_trans.fit_transform(df), columns=df.columns, index=df.index)
df_scaled

Unnamed: 0,LAST_TO_END (R),FLIGHT_COUNT (F),SEG_KM_SUM (M)
0,-5.199338,5.199338,5.199338
1,-1.343287,3.235539,5.199338
2,-1.152175,3.196163,5.199338
3,-0.057742,1.125776,5.199338
4,-1.505747,3.357813,5.199338
...,...,...,...
62983,0.754664,-5.199338,-2.489786
62984,-0.106841,-0.465405,-0.205580
62985,-0.673702,-5.199338,-1.383623
62986,1.050924,-5.199338,-0.890172


In [13]:
def cluster_maker(dataframe, n_cluster_lts):
    """Fit K-Means on ``dataframe`` and return a labelled copy of it.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Feature table; every column is passed to K-Means as a feature.
    n_cluster_lts : int
        Number of clusters to fit.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` with two extra trailing columns:
        ``distance_to_centroid`` (distance from each row to its nearest
        centroid) and ``cluster_id`` (the fitted cluster label).

    Notes
    -----
    The input frame is never modified. Writing the helper columns into the
    caller's frame would make a second run fit K-Means on
    ``distance_to_centroid`` and ``cluster_id`` as if they were features.
    """
    kmeans = KMeans(n_clusters=n_cluster_lts, init='k-means++', random_state=42).fit(dataframe)

    clustered = dataframe.copy()
    # transform() yields one distance per centroid; the row-wise minimum is the
    # distance to the centroid the row was assigned to.
    clustered['distance_to_centroid'] = np.min(kmeans.transform(dataframe), axis=1)
    clustered['cluster_id'] = kmeans.labels_

    return clustered

In [None]:
def separate_dataframe(dataframe, n_cluster_lts, sort_column='distance_to_centroid'):
    """Group the rows of ``dataframe`` by cluster, farthest rows first.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding a ``cluster_id`` column and the ``sort_column``.
    n_cluster_lts : int
        Cluster ids ``0 .. n_cluster_lts - 1`` are looked up.
    sort_column : str
        Column each cluster's rows are sorted by, in descending order.

    Returns
    -------
    dict
        ``{cluster_id: [sorted_frame]}`` -- every value is a one-element list
        wrapping that cluster's rows.
    """
    def _sorted_members(cluster):
        members = dataframe.loc[dataframe['cluster_id'] == cluster]
        return members.sort_values(sort_column, ascending=False)

    return {cluster: [_sorted_members(cluster)] for cluster in range(n_cluster_lts)}

In [None]:
def trimmed_outlier(dataframe, n_cluster_lts, trim_percent, distance_column='distance_to_centroid'):
    """Split every cluster into inliers and its farthest ``trim_percent`` % of rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Clustered frame holding ``cluster_id`` and ``distance_column``.
    n_cluster_lts : int
        Number of clusters (ids ``0 .. n_cluster_lts - 1``).
    trim_percent : float
        Share of each cluster, in percent, to set aside as outliers. The row
        count is truncated towards zero, so small clusters may lose no rows.
    distance_column : str
        Column that ranks rows inside a cluster; larger means farther away.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(inliers, outliers)``, each stacked cluster by cluster. Rows keep the
        labels they had in ``dataframe`` so results can be aligned back to the
        source rows by index.
    """
    # Rank by the same column that drives the trimming; otherwise a custom
    # distance_column would be trimmed against an unrelated ordering.
    dataframe_dict = separate_dataframe(
        dataframe=dataframe, n_cluster_lts=n_cluster_lts, sort_column=distance_column
    )

    inlier_parts = []
    outlier_parts = []

    for i in range(n_cluster_lts):
        members = dataframe_dict[i][0]  # sorted farthest-first

        # Positional split: exactly n_trim rows are removed. A value threshold
        # with a strict ">" would drop one row fewer and would fail outright
        # when n_trim is 0 or the cluster is empty.
        n_trim = int(trim_percent / 100 * len(members))
        n_trim = min(max(n_trim, 0), len(members))

        outlier_parts.append(members.iloc[:n_trim])
        inlier_parts.append(members.iloc[n_trim:])

    # No ignore_index: resetting the labels would make inliers and outliers share
    # index values and lose the link to the original rows.
    inliers = pd.concat(inlier_parts)
    outliers = pd.concat(outlier_parts)

    return inliers, outliers

In [None]:
def best_percentage(dataframe, n_cluster_lts, trim_percents=(5, 10, 15, 20, 25, 30)):
    """Pick the trim percentage whose inliers give the highest silhouette score.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Clustered frame with ``distance_to_centroid`` and ``cluster_id``
        columns; all remaining columns are treated as features.
    n_cluster_lts : int
        Number of clusters present in ``dataframe``.
    trim_percents : iterable of float, optional
        Candidate trim percentages to evaluate, in order. On equal scores the
        earlier candidate wins.

    Returns
    -------
    (float, float, pandas.DataFrame)
        ``(best_trim_percent, best_silhouette_score, best_trimmed_df)``.
        ``best_trimmed_df`` is the winning inliers followed by the winning
        outliers, whose ``cluster_id`` is rewritten to ``"<id>-out"``.

    Raises
    ------
    ValueError
        If no candidate leaves at least two clusters among the inliers, so no
        silhouette score can be computed.
    """
    # Silhouette scores can be negative, so "no candidate yet" is tracked with
    # None instead of starting the running best at 0.
    best_silhouette_score = None
    best_trim_percent = None
    best_inliers = None
    best_outliers = None

    for percent in trim_percents:
        inliers, outliers = trimmed_outlier(dataframe=dataframe, n_cluster_lts=n_cluster_lts, trim_percent=percent)

        labels = inliers['cluster_id']
        if labels.nunique() < 2:
            # Scoring needs at least two distinct labels; skip this candidate.
            continue

        # Select by name rather than position so column order cannot leak the
        # helper columns into the score.
        relevant_cols = inliers.drop(columns=['distance_to_centroid', 'cluster_id'])
        score = silhouette_score(relevant_cols, labels)

        if best_silhouette_score is None or score > best_silhouette_score:
            best_silhouette_score = score
            best_trim_percent = percent
            best_inliers = inliers
            best_outliers = outliers

    if best_inliers is None:
        raise ValueError(
            "No trim percentage left at least two clusters among the inliers; "
            "cannot compute a silhouette score."
        )

    # Mark trimmed rows as "<cluster>-out" for any number of clusters.
    # Assumes integer cluster labels, as produced by K-Means.
    best_outliers = best_outliers.copy()
    best_outliers['cluster_id'] = best_outliers['cluster_id'].astype(str) + '-out'

    best_trimmed_df = pd.concat([best_inliers, best_outliers], axis=0)

    return best_trim_percent, best_silhouette_score, best_trimmed_df

In [None]:
def K_MEANS_LTS(dataframe, n_cluster_lts):
    """Run the full pipeline: cluster ``dataframe``, then choose the best trim.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Feature table handed to ``cluster_maker``.
    n_cluster_lts : int
        Number of clusters.

    Returns
    -------
    tuple
        ``(best_trim_percent, best_silhouette_score, best_df)`` exactly as
        produced by ``best_percentage``.
    """
    clustered = cluster_maker(dataframe=dataframe, n_cluster_lts=n_cluster_lts)
    return best_percentage(clustered, n_cluster_lts=n_cluster_lts)

# Cluster on the quantile-scaled features rather than the raw columns: K-Means is
# distance based, so features on very different scales would not contribute
# comparably. A copy is passed so the pipeline can never write helper columns into
# df_scaled, which keeps this cell safe to re-run.
best_percent, best_silhouette_score, best_trimmed_df = K_MEANS_LTS(df_scaled.copy(), 2)
print("Best Percentage: ", best_percent)
print("Best Silhouette Score: ", best_silhouette_score)
print("=================================")
best_trimmed_df

In [None]:
# Rebind instead of dropping in place, and tolerate the column already being gone,
# so re-running this cell does not raise KeyError.
best_trimmed_df = best_trimmed_df.drop(columns='distance_to_centroid', errors='ignore')

In [None]:
import seaborn as sns 

# sns.pairplot is figure-level: it builds its own grid figure, so a preceding
# plt.figure(...) call only leaves an empty extra canvas in the output.
# The trailing semicolon suppresses the PairGrid repr.
sns.pairplot(data=best_trimmed_df, hue='cluster_id');

In [None]:
# Keep only inlier rows. Trimmed rows carry a "<cluster>-out" label, so match on
# the suffix instead of listing "0-out"/"1-out"; this also works for more than two
# clusters. astype(str) is needed because inlier labels are still integers.
is_outlier = best_trimmed_df['cluster_id'].astype(str).str.endswith('-out')
best_trimmed_df_filtered = best_trimmed_df.loc[~is_outlier, :]
best_trimmed_df_filtered

In [None]:
# Display the combined frame: inlier rows followed by the relabelled outlier rows.
best_trimmed_df

In [None]:
# Raw (unscaled) RFM features. The columns were renamed with "(R)/(F)/(M)" suffixes
# right after loading, so they must be selected under those names; the pre-rename
# names match nothing and would leave an empty frame. Explicit label selection
# raises KeyError on a missing column rather than silently returning nothing.
rfm_columns = ['LAST_TO_END (R)', 'FLIGHT_COUNT (F)', 'SEG_KM_SUM (M)']
df_real = df.loc[:, rfm_columns].copy()
df_real

In [None]:
# Display best_trimmed_df again (same frame as shown two cells earlier).
best_trimmed_df

In [None]:
from sklearn.decomposition import PCA
import seaborn as sns 

# Project the selected feature columns onto their first two principal components.
pca = PCA(n_components=2)
components = pca.fit_transform(df_real)

# NOTE(review): the cluster labels are attached by index alignment, not by position.
# This is only correct if best_trimmed_df still carries the same unique row labels
# as df_real -- confirm before trusting the colouring of any PCA plot.
df_pca = pd.DataFrame(components, columns=["PC1", "PC2"]).assign(
    cluster_id=best_trimmed_df['cluster_id']
)
df_pca