In [459]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, Normalizer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from IPython.core.display_functions import display
from sklearn.cluster import KMeans, DBSCAN, MeanShift
from sklearn.mixture import GaussianMixture
from pyclustering.cluster.clarans import clarans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

# Adjusting for Data Frame Output
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [460]:
# Categorical feature need to change numeric
# Encoder
def object_encoder(dataframe, encoder, target_feature):
    if encoder == "LabelEncoder" or encoder == "LabelEncoder()":
        encoder = LabelEncoder()
        dataframe[target_feature] = encoder.fit_transform(dataframe[target_feature])
    elif encoder == "OrdinalEncoder" or encoder == "OrdinalEncoder()":
        encoder = LabelEncoder()
        dataframe[target_feature] = encoder.fit_transform(dataframe[target_feature])
    elif encoder is None:
        return dataframe
    else:
        return None

    return dataframe

In [461]:
# Scaling
def data_scaling(dataframe, scaling):
    if scaling == "StandardScaler" or scaling == "StandardScaler()":
        scaling = StandardScaler()
    elif scaling == "MinMaxScaler" or scaling == "MinMaxScaler()":
        scaling = MinMaxScaler()
    elif scaling == "MaxAbsScaler" or scaling == "MaxAbsScaler()":
        scaling = MaxAbsScaler()
    elif scaling == "RobustScaler" or scaling == "RobustScaler()":
        scaling = RobustScaler()
    elif scaling == "Normalizer" or scaling == "Normalizer()":
        scaling = Normalizer()
    elif scaling is None:
        return dataframe
    else:
        return None

    dataframe = pd.DataFrame(scaling.fit_transform(dataframe), columns=dataframe.columns)
    return dataframe

In [462]:
# K-means
def model_kmeans(dataset, k_list):
    pca = PCA(n_components=2)  # Reduce feature for plotting
    dataset = pd.DataFrame(pca.fit_transform(dataset))
    clusters = k_list  # count of clustering
    inits = [10, 20, 30]

    f, axes = plt.subplots(len(k_list), len(inits))
    f.set_size_inches((20, len(k_list) * 5))  # Set grid size
    plt.subplots_adjust(wspace=0.3, hspace=0.3)  # Set grid margins
    plt.title("K-means")

    for number_k, k in enumerate(clusters):
        for number_init, init in enumerate(inits):
            print("number_k: "+str(number_k))
            print("k: "+str(k))
            kmeans = KMeans(n_clusters=k, n_init=init, random_state=42).fit(dataset)
            result = kmeans.predict(dataset)
            for i in range(0, k):
                axes[number_k, number_init].scatter(result.iloc[:, 0], result.iloc[:, 1], label=i)
            axes[number_k, number_init].legend(loc='upper right')

    return plt

In [463]:
# read Dataset
df_origin = pd.read_csv('dataset/housing_test.csv')

In [464]:
# Check data information
# df_origin.info()
# df_origin.describe()

In [465]:
# Copy dataset
df = df_origin.copy()

# Drop null data
df.dropna(axis=0, inplace=True)

display(df)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
701,-121.97,37.64,32,1283,194.0,485,171,6.0574,431000,<1H OCEAN
954,-121.92,37.64,46,1280,209.0,512,208,5.1406,315600,INLAND
1850,-124.17,41.8,16,2739,480.0,1259,436,3.7557,109400,NEAR OCEAN
8314,-118.32,33.35,27,1675,521.0,744,331,2.1579,450000,ISLAND


In [466]:
# testing
df = object_encoder(df, 'OrdinalEncoder',"ocean_proximity")
df = data_scaling(df, "StandardScaler")

x = df.drop(columns=["median_house_value"])

display(x)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-0.268298,0.08151,0.814663,-1.087735,-1.102357,-1.049169,-1.131015,1.551268,0.707107
1,-0.13098,-0.008226,-0.037891,-0.453721,-0.698904,-0.549711,-0.734631,0.464752,-1.414214
2,-0.104573,-0.008226,1.288304,-0.458441,-0.6058,-0.466978,-0.408716,0.025508,-0.707107
3,-1.292903,1.547202,-1.553543,1.83691,1.076287,1.82195,1.599628,-0.638006,1.414214
4,1.796755,-1.612261,-0.511532,0.162987,1.330773,0.243907,0.674733,-1.403521,0.0


In [467]:
# data = model_kmeans(x, [2, 3])
# data.show()

In [468]:
display(x)
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
result = kmeans.fit_predict(x)

display(result)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-0.268298,0.08151,0.814663,-1.087735,-1.102357,-1.049169,-1.131015,1.551268,0.707107
1,-0.13098,-0.008226,-0.037891,-0.453721,-0.698904,-0.549711,-0.734631,0.464752,-1.414214
2,-0.104573,-0.008226,1.288304,-0.458441,-0.6058,-0.466978,-0.408716,0.025508,-0.707107
3,-1.292903,1.547202,-1.553543,1.83691,1.076287,1.82195,1.599628,-0.638006,1.414214
4,1.796755,-1.612261,-0.511532,0.162987,1.330773,0.243907,0.674733,-1.403521,0.0


AttributeError: 'NoneType' object has no attribute 'split'