# Question 1

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN, KMeans
from sklearn import metrics
import seaborn as sns

In [None]:
# load the Question 1 dataset from the CSV file next to the notebook
df1 = pd.read_csv("./HW3_1_data.csv")

In [None]:
# preview the first rows of the Question 1 data
df1.head()

In [None]:
# keep only the rows without any missing value
df1 = df1.dropna()

In [None]:
# first column -> X1 (x-axis), last column -> y1 (y-axis), each as an (n, 1) column vector
X1, y1 = (df1.iloc[:, position].values.reshape(-1, 1) for position in (0, -1))

In [None]:
# scatter-plot the first column against the last column to see how many clusters there are
plt.scatter(X1, y1)
# visual inspection suggests 7 clusters

In [None]:
# grid search for the DBSCAN `eps` and `min_samples` parameters (kept commented out; the result is hard-coded in the next cell)

# rs = []
# eps = np.arange(0.1, 4, 0.1)
# min_samples = np.arange(2, 20, 1)
#
# best_score = 0
# best_score_eps = 0
# best_score_min_samples = 0
#
# for i in eps:
#     for j in min_samples:
#         try:
#             db = DBSCAN(eps=i, min_samples=j, n_jobs=-1).fit(df1)
#             labels = db.labels_
#             k = metrics.silhouette_score(df1, labels) # to get the current silhouette_score
#             ratio = len(labels[labels[:] == -1]) / len(labels)  # to calculate the ratio of noise and the total point
#             n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)  # obtain the number of clusters
#             rs.append([i, j, k, ratio, n_clusters_])
#
#             if k > best_score:
#                 best_score = k
#                 best_score_eps = i
#                 best_score_min_samples = j
#             else:
#                 db = ''
#         except Exception:
#             db = ''
# rs = pd.DataFrame(rs)
#rs.columns = ['eps', 'min_samples', 'score', 'ratio', 'n_clusters']
# sns.relplot(x="eps", y="min_samples", size='score', data=rs)
# sns.relplot(x="eps", y="min_samples", size='ratio', data=rs)
# print(best_score_eps,best_score_min_samples)
# the best DBSCAN parameters found are eps=1.3 and min_samples=7

In [None]:
# build the DBSCAN model with the eps / min_samples values chosen above
dbscan1 = DBSCAN(eps=1.3, min_samples=7)

In [None]:
# fit DBSCAN on the whole frame and colour every point by its cluster label
# (DBSCAN marks noise points with the label -1)
y_pred1 = dbscan1.fit(df1).labels_
plt.scatter(X1, y1, c=y_pred1)
plt.show()

In [None]:
# cluster the same data with SpectralClustering and plot the assigned labels
from sklearn.cluster import SpectralClustering

# n_clusters=7 is the number of groups read off the raw scatter plot;
# random_state=0 pins the random initialisation so re-running the cell gives the
# same labels (consistent with the KMeans cells, which also use random_state=0)
y_p = SpectralClustering(n_clusters=7, random_state=0).fit_predict(df1)
plt.scatter(X1, y1, c=y_p)
plt.show()

# Question 2

In [None]:
# load the Question 2 dataset from the CSV file next to the notebook
df2 = pd.read_csv("./HW3_2_data.csv")

In [None]:
# preview the first rows of the Question 2 data
df2.head()

In [None]:
# summary statistics of the data
df2.describe()
# the summary shows that the data is biased

### Data processing

In [None]:
# CUST_ID is only an identifier and carries no information for clustering, so remove the column
df2.drop(columns=['CUST_ID'], inplace=True)

In [None]:
# CREDIT_LIMIT has only 1 missing value, so dropping that row makes no practical difference
df2.dropna(subset=['CREDIT_LIMIT'], inplace=True)

In [None]:
# No other column seems related to MINIMUM_PAYMENTS, so fill its missing values with the median.
# The result is assigned back to the column instead of calling fillna(inplace=True) on
# `df2['MINIMUM_PAYMENTS']`: that chained in-place call works on an intermediate object,
# is deprecated in recent pandas and leaves `df2` unchanged under Copy-on-Write.
df2['MINIMUM_PAYMENTS'] = df2['MINIMUM_PAYMENTS'].fillna(df2['MINIMUM_PAYMENTS'].median())

In [None]:
# plot the distribution (KDE) of each column on a two-column grid
# the number of rows is derived from the number of columns, so the grid always has
# enough slots (a fixed 9 x 2 grid fails as soon as there are more than 18 columns)
n_grid_rows = (len(df2.columns) + 1) // 2
plt.figure(figsize=(20, 30))
for i, col in enumerate(df2.columns):
    ax = plt.subplot(n_grid_rows, 2, i + 1)
    sns.kdeplot(df2[col], ax=ax)
    ax.set_xlabel(col)
plt.show()
# many of the columns are skewed, so they have to be handled before clustering

In [None]:
# take the log of the skewed columns so that their distributions are closer to normal
cols = ['BALANCE', 'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE', 'ONEOFF_PURCHASES_FREQUENCY',
        'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'CREDIT_LIMIT', 'PAYMENTS',
        'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT']
# `PURCHASES` is deliberately left out since the result is not good
# np.log1p(x) computes log(1 + x) and stays accurate for values close to zero
# NOTE: this cell is not idempotent -- running it twice applies the transform twice
for col in cols:
    df2[col] = np.log1p(df2[col])

In [None]:
# re-plot the distributions of the log-transformed columns on a two-column grid
# the number of rows follows the length of `cols`, so the grid keeps working if the list changes
n_cols_rows = (len(cols) + 1) // 2
plt.figure(figsize=(15, 25))
for i, col in enumerate(cols):
    ax = plt.subplot(n_cols_rows, 2, i + 1)
    sns.kdeplot(df2[col], ax=ax)
    ax.set_xlabel(col)
plt.show()

In [None]:
# the data has many dimensions, so keep only the most informative directions
from sklearn.decomposition import PCA

# PCA reduces the number of dimensions;
# n_components='mle' lets PCA choose the number of components automatically
# NOTE(review): the columns are not standardised before PCA, so the un-logged `PURCHASES`
# column may dominate the components -- confirm this is intended
pca = PCA(n_components='mle')
X_reduce = pca.fit_transform(df2)

In [None]:
from sklearn.cluster import KMeans

In [None]:
# elbow method: fit k-means for k = 1..9 and plot the inertia of each fit against k
km = [KMeans(n_clusters=k, random_state=0).fit(X_reduce) for k in range(1, 10)]
innertia_m = [fitted.inertia_ for fitted in km]
plt.scatter(range(1, 10), innertia_m)

In [None]:
# the elbow plot suggests k = 3, so the final model uses 3 clusters
kmeans = KMeans(n_clusters=3, random_state=0).fit(X_reduce)
kmeans

In [None]:
df2['LABEL'] = kmeans.labels_

In [None]:
# to recover the data
for col in cols:
    df2[col] = np.exp(df2[col])

In [None]:
# scatter of `oneoff_purchases` against `purchases`, coloured by cluster label
fig, ax = plt.subplots(figsize=(10, 6), dpi=300)
sns.scatterplot(data=df2, x='ONEOFF_PURCHASES', y='PURCHASES', hue='LABEL', palette='Accent', ax=ax)
ax.set_title('Clustered by one-off-purchases and total-purchases')
plt.show()

In [None]:
# scatter of `credit_limit` against `purchases`, coloured by cluster label
fig, ax = plt.subplots(figsize=(10, 6), dpi=300)
sns.scatterplot(data=df2, x='CREDIT_LIMIT', y='PURCHASES', hue='LABEL', palette='Accent', ax=ax)
ax.set_title('Clustered by credit-limit and total-purchases')
plt.show()

In [None]:
# scatter of `oneoff_purchases_frequency` against `purchases`, coloured by cluster label
fig, ax = plt.subplots(figsize=(10, 6), dpi=300)
sns.scatterplot(data=df2, x='ONEOFF_PURCHASES_FREQUENCY', y='PURCHASES', hue='LABEL', palette='Accent', ax=ax)
ax.set_title('Clustered by Oneoff-purchases-frequency and total-purchases')
plt.show()

# we can see that the clustering makes sense