In [None]:
# import package
import pandas as pd
import numpy as np
import sklearn
import sys
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN, MeanShift, estimate_bandwidth
from pyclustering.cluster.clarans import clarans
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import metrics
from sklearn.decomposition import PCA

In [None]:
# Read the housing dataset from the local CSV file into `df`.
housing_csv = "C:/Users/whgod/Desktop/머신러닝/Lab2/housing.csv"
df = pd.read_csv(housing_csv)

In [None]:
df.describe() 
# total_bedrooms has a lower count than the other columns, so it needs to be checked.
# The data also needs to be scaled.

In [None]:
# total_bedrooms is expected to grow together with population.
# Order the rows by population first so that relationship can be inspected.
df_sort = df.sort_values('population')
df_sort

In [None]:
# Plot total_bedrooms against population.
# x and y are passed by keyword: current seaborn releases no longer accept them positionally.
sns.scatterplot(data=df_sort, x='population', y='total_bedrooms')
# The plot shows that the two variables are positively correlated.

In [None]:
# Fill missing values with the previous row's value (forward fill). The rows are sorted by
# population, so each gap is filled from the row with the next-smaller population.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated in recent pandas.
df_clean = df_sort.ffill()
# Confirm how many missing values remain per column.
df_clean.isna().sum()

In [None]:
# Keep the cleaned data under a new name with a fresh 0..n-1 index.
# `drop` is a boolean flag: True discards the old (pre-sort) index instead of adding it as a column.
DF = df_clean.reset_index(drop=True)
DF

In [None]:
def AutoScaleEncode(DataFrame, scalers, encoder, encoding_col) :
    """
    Scale every column except ``encoding_col`` and encode ``encoding_col``, returning one frame.
    --------------------------------
    DataFrame : DataFrame
        Input data. All columns other than ``encoding_col`` are passed to the scaler.
    scalers : The scaler you want to apply
        A scaler class (instantiated with no arguments) or an already constructed scaler
        exposing ``fit_transform``.
    encoder : The encoder you want to apply
        ``OrdinalEncoder`` (class or instance) encodes ``encoding_col`` in place as one column.
        Any other value falls back to ``pd.get_dummies`` (one indicator column per category).
    encoding_col : The columns you want to encode
        Name of the single column to encode.

    Returns
    -------
    DataFrame with the scaled columns followed by the encoded column(s), on the input's index.
    """
    Df_scale = DataFrame.drop([encoding_col], axis=1)
    Df_encode = DataFrame[[encoding_col]]

    scaler = scalers() if isinstance(scalers, type) else scalers
    # Keep the caller's index so the scaled and encoded parts line up row-by-row in concat,
    # even when the input does not carry a default 0..n-1 index.
    scaled = pd.DataFrame(
        scaler.fit_transform(Df_scale),
        columns=Df_scale.columns,
        index=DataFrame.index,
    )

    # Compare types, not values: `encoder == OrdinalEncoder()` is never true for a class or
    # for a distinct instance, which made the ordinal branch unreachable.
    encoder_cls = encoder if isinstance(encoder, type) else type(encoder)
    if issubclass(encoder_cls, OrdinalEncoder):
        enc = encoder() if isinstance(encoder, type) else encoder
        encoded = pd.DataFrame(
            enc.fit_transform(Df_encode),
            columns=Df_encode.columns,
            index=DataFrame.index,
        )
    else:
        encoded = pd.get_dummies(Df_encode)

    return pd.concat([scaled, encoded], axis=1)

In [None]:
# Build the model-ready frame: min-max scale the numeric columns and encode ocean_proximity.
candidate = AutoScaleEncode(
    DataFrame=DF,
    scalers=MinMaxScaler,
    encoder=OneHotEncoder,
    encoding_col='ocean_proximity',
)
candidate.describe()

In [None]:
# Heatmap of the pairwise correlations between all columns of `candidate`.
corr = candidate.corr()

plt.figure(figsize = (20,15))

# Mask the upper triangle (diagonal included) so each pair is drawn only once.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

sns.heatmap(data = corr,
            annot = True,
            mask = mask,
            fmt = '.2f',
            linewidths = 1.,
            cmap = 'RdYlBu_r')
plt.show()

# The latitude and longitude variables are highly correlated.
# NOTE(review): the original note also says "the three variables are highly correlated"
# without naming them - confirm which three from the heatmap.

In [None]:
# Keep only the two room-count columns as the feature frame `X`.
X = candidate.loc[:, ['total_bedrooms', 'total_rooms']]
X

In [None]:
# Reduce the scaled/encoded frame to two principal components.
pca = PCA(n_components=2)
printcipalComponents = pca.fit_transform(candidate)
principalDf = pd.DataFrame(
    printcipalComponents,
    columns=['principal component1', 'principal component2'],
)
principalDf.head()

In [None]:
# Display the components_ attribute of the PCA fitted on `candidate`.
pca.components_

In [None]:
# List the column names of `candidate`, the frame that was passed to the PCA.
candidate.columns

In [None]:
def main():
    """
    Run the clustering parameter search on the global feature frame ``X`` and print the
    label of the best-scoring configuration.

    Side effects: switches NumPy and pandas to print arrays and frames without truncation.
    """
    # Show arrays / frames in full.
    np.set_printoptions(threshold=sys.maxsize)
    pd.set_option("display.max_rows", None, "display.max_columns", None)

    # findBestParams indexes this list by position, so the order of the grids matters:
    # KMeans, DBSCAN, GaussianMixture, CLARANS, MeanShift.
    search_space = [
        {   # KMeans
            'n_clusters' : [2, 4, 6, 8, 10, 12],
            'algorithm' : ['auto', 'full', 'elkan'],
            'init' : ['k-means++', 'random'],
        },
        {   # DBSCAN
            'eps' : [0.5, 1.0],
            'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'leaf_size' : [3, 5, 15, 20],
            'min_samples' : [3, 5, 10, 15],
        },
        {   # GaussianMixture (the key is spelled 'convariance_type' to match findBestParams)
            'n_components' : [2, 4, 6, 8, 10, 12],
            'convariance_type' : ['full', 'tied', 'diag', 'spherical'],
            'init_params' : ['kmeans', 'random'],
        },
        {   # CLARANS
            'number_clusters' : [2, 4, 6, 8, 10, 12],
            'numlocal' : [2, 4, 8, 10],
            'maxneighbor' : [3, 5, 15],
        },
        {   # MeanShift
            'bandwidth' : [0.7, 1.3, 2.576979121414909, 5],
        },
    ]

    features = X
    _, labels, _, _, bestIndex = findBestParams(features, search_space)

    print("best params = " + str(labels[bestIndex]))

# NOTE(review): findBestParams is defined in a later cell, so on a fresh kernel this call
# raises NameError unless that cell has been run first.
main()

In [None]:
# Remove the MHV_cut column if it is present. errors='ignore' keeps this cell re-runnable:
# principalDf is created with only the two principal-component columns, so a plain drop
# raises KeyError on a fresh run.
principalDf = principalDf.drop(columns=['MHV_cut'], errors='ignore')

In [None]:
# Bin median_house_value into three classes (0, 1, 2) split at the 37.5% and 62.5% quantiles.
pd.qcut(
    candidate['median_house_value'],
    q=[0, 0.375, 0.625, 1],
    labels=[0, 1, 2],
)

In [None]:
def _silhouette_or_floor(data, assignments):
    """Return the silhouette score of ``assignments`` on ``data``, or -1 when scoring raises ValueError."""
    try:
        return silhouette_score(data, assignments)
    except ValueError:
        # The labelling could not be scored; give the run the lowest score so the search continues.
        return -1


def findBestParams(data, ClusterParams, threads=16):
    """
    Compare the performance (silhouette score) of different clustering models on the same data set.

    Every parameter combination of every method is fitted, scored and printed.
    ------------------------------------------------------------------------------
    data : DataFrame
        Feature matrix to cluster.
    ClusterParams : list of dict
        Parameter grids, read BY POSITION:
            [0] KMeans          : 'n_clusters', 'algorithm', 'init'
            [1] DBSCAN          : 'eps', 'algorithm', 'leaf_size', 'min_samples'
            [2] GaussianMixture : 'n_components', 'convariance_type' (sic), 'init_params'
            [3] CLARANS         : 'number_clusters', 'numlocal', 'maxneighbor'
            [4] MeanShift       : 'bandwidth'
        Each value is a list of candidate settings, e.g. 'n_clusters' : [2, 4, 6].
    threads : int, default 16
        Passed to MeanShift as ``n_jobs``.

    Returns
    -------
    scores : list
        Score of every run, in execution order (-1 where the score could not be computed).
    labels : list of str
        Human-readable description of every run, parallel to ``scores``.
    bestScore : float
        Highest score seen (-100.0 if nothing was run).
    bestResult : array-like
        Cluster label per sample for the best run (0 if nothing was run).
    bestIndex : int
        Position of the best run in ``scores`` / ``labels``.
    """
    scores = []
    labels = []
    best = {'score': -100.0, 'result': 0, 'index': 0}

    def record(paramLabel, result):
        # Score one run, log it, and update the running best.
        score = _silhouette_or_floor(data, result)
        label = paramLabel + ", score :" + str(score)
        print(label)

        scores.append(score)
        labels.append(label)

        if score > best['score']:
            best['score'] = score
            best['result'] = result
            best['index'] = len(scores) - 1

    kmeans_grid, dbscan_grid, gm_grid, clarans_grid, ms_grid = ClusterParams[:5]

    for n_clusters in kmeans_grid['n_clusters']:
        for algorithm in kmeans_grid['algorithm']:
            for init in kmeans_grid['init']:
                paramLabel = ", method : KMeans, n_cluster : " + str(n_clusters) + ", algorithm : " + str(algorithm) + ", init : " + str(init)

                km = KMeans(n_clusters=n_clusters, algorithm=algorithm, init=init)
                record(paramLabel, km.fit_predict(data))

    for eps in dbscan_grid['eps']:
        for algorithm in dbscan_grid['algorithm']:
            for leaf_size in dbscan_grid['leaf_size']:
                for min_samples in dbscan_grid['min_samples']:
                    paramLabel = ", method : DBSCAN , eps : " + str(eps) + ", algorithm : " + str(algorithm) + ", leaf_Size : " + str(leaf_size) + ", min_samples : " + str(min_samples)

                    db = DBSCAN(eps=eps, algorithm=algorithm, leaf_size=leaf_size, min_samples=min_samples)
                    record(paramLabel, db.fit_predict(data))

    for n_components in gm_grid['n_components']:
        for convariance_type in gm_grid['convariance_type']:
            for init_params in gm_grid['init_params']:
                paramLabel = ", method : GaussianMixture, n_components : " + str(n_components) + ", convariance_type : " + str(convariance_type) + ", init_params : " + str(init_params)

                gm = GaussianMixture(n_components=n_components, covariance_type=convariance_type, init_params=init_params)
                record(paramLabel, gm.fit_predict(data))

    n_samples = len(data)
    for number_clusters in clarans_grid['number_clusters']:
        for numlocal in clarans_grid['numlocal']:
            for maxneighbor in clarans_grid['maxneighbor']:
                paramLabel = ", method : CLARANS, number_clusters : " + str(number_clusters) + ", numlocal : " + str(numlocal) + ", maxneighbor : " + str(maxneighbor)

                cl = clarans(np.array(data), number_clusters, numlocal, maxneighbor)
                cl.process()
                # CLARANS reports clusters as lists of row indices; turn them into one label per
                # row so they can be scored (and returned) like the other methods' results.
                result = np.full(n_samples, -1, dtype=int)
                for cluster_id, members in enumerate(cl.get_clusters()):
                    result[np.asarray(members, dtype=int)] = cluster_id
                record(paramLabel, result)

    # MeanShift is the fifth grid (index 4); index 3 is the CLARANS grid and has no 'bandwidth'.
    for bandwidth in ms_grid['bandwidth']:
        paramLabel = ", method : MeanShift, bandwidth : " + str(bandwidth)

        ms = MeanShift(bandwidth=bandwidth, n_jobs=threads)
        record(paramLabel, ms.fit_predict(data))

    return scores, labels, best['score'], best['result'], best['index']

In [None]:
# Cluster on three standardised features with a 2-component Gaussian mixture, then compare
# the clusters with a median split of the house value on the same axes.
X = df[['latitude', 'total_rooms', 'median_house_value']].copy()  # copy: a column is added below
scaler = StandardScaler()
scaled = scaler.fit_transform(X)
scaled = pd.DataFrame(scaled, columns=X.columns, index=X.index)
# Fit on the standardised features (the scaled frame was previously computed but never used).
label = GaussianMixture(n_components=2,covariance_type='tied',init_params='kmeans').fit_predict(scaled)

# total_bedrooms is not one of the clustering features, so it is read from df for the x-axis.
# It shares X's index, so rows stay aligned with `label`.
# NOTE(review): df is the raw frame, so rows with missing total_bedrooms are not drawn - confirm
# whether the cleaned frame should be used instead.
bedrooms = df['total_bedrooms']

plt.figure(1)
plt.scatter(bedrooms, X['total_rooms'], c=label)
plt.xlabel('total_bedrooms')
plt.ylabel('total_rooms')
plt.title('Gaussian Mixture clustering')

# Split at the median of X's own column so the classes are aligned with X's rows;
# `candidate` was re-sorted and re-indexed, so its rows do not line up with df.
X['MHV_cut'] = pd.qcut(X['median_house_value'], [0, 0.5, 1], labels=[0, 1]).astype(int)

plt.figure(2)
plt.scatter(bedrooms, X['total_rooms'], c=X['MHV_cut'])
plt.xlabel('total_bedrooms')
plt.ylabel('total_rooms')
plt.title('Median_house_value')