In [9]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [10]:
# Read the CSV file
data = pd.read_csv('/content/drive/MyDrive/dataset3.csv')

# Extract the 'Income' column from the dataset
names = data['Nama']
incomes = data['Penghasilan']

# Normalize the income data
normalized_incomes = (incomes - np.mean(incomes)) / np.std(incomes)

# Reshape the normalized incomes to have a minimum of two dimensions
normalized_incomes = normalized_incomes.values.reshape(-1, 1)

# Train the K-means model
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
kmeans.fit(normalized_incomes)

# Get the cluster assignments for each income
cluster_indices = kmeans.predict(normalized_incomes)

# Define the cluster names
cluster_names = {
    1: "High Income",
    0: "Low  Income",
    2: "Medium Income"
}

# Print the cluster assignments for each income with names
#for i, cluster_index in enumerate(cluster_indices):
#    print("Income: {:.2f} - Cluster: {} ({})".format(incomes[i], cluster_index, cluster_names[cluster_index]))

# Find the lowest and highest values in each cluster
cluster_data = pd.DataFrame({'Income': incomes, 'Cluster': cluster_indices})
cluster_stats = cluster_data.groupby('Cluster')['Income'].agg(['min', 'max'])

# Print the lowest and highest values in each cluster
for cluster_index, stats in cluster_stats.iterrows():
    print("Cluster: {} - Lowest: {:.2f}, Highest: {:.2f}".format(cluster_index, stats['min'], stats['max']))

# Count the number of data points in each cluster
cluster_counts = cluster_data['Cluster'].value_counts().sort_index()

# Print the count of data points in each cluster
for cluster_index, count in cluster_counts.items():
    print("Cluster: {} - Count: {}".format(cluster_index, count))


Cluster: 0 - Lowest: 7149976.00, Highest: 9978272.00
Cluster: 1 - Lowest: 1039387.00, Highest: 4093042.00
Cluster: 2 - Lowest: 4153908.00, Highest: 7080183.00
Cluster: 0 - Count: 56
Cluster: 1 - Count: 76
Cluster: 2 - Count: 68




In [11]:
print(normalized_incomes)

print(cluster_indices.reshape(-1, 1))

[[ 5.06755293e-01]
 [-7.33081288e-01]
 [-8.79629488e-01]
 [-1.10837677e+00]
 [ 1.56073806e+00]
 [-2.98501907e-01]
 [-9.19547247e-01]
 [ 4.50965300e-01]
 [ 1.00552348e+00]
 [ 2.36591535e-01]
 [ 9.10041645e-01]
 [-8.71821855e-01]
 [-8.23206003e-01]
 [ 1.38748887e+00]
 [-5.38522004e-01]
 [-6.60252612e-01]
 [-5.29710566e-01]
 [ 3.94454865e-01]
 [ 1.65628394e+00]
 [-4.36744065e-01]
 [ 1.52053344e+00]
 [-1.13667468e+00]
 [-7.45435935e-01]
 [-1.21886561e+00]
 [ 1.55975910e+00]
 [ 1.15500003e+00]
 [-4.16713762e-01]
 [-3.64706807e-02]
 [-1.31405321e+00]
 [ 1.17041112e+00]
 [ 5.41069053e-01]
 [-2.78387372e-01]
 [-7.33061492e-01]
 [-1.57314794e+00]
 [-2.04239698e-01]
 [ 2.05470912e-02]
 [-5.61881245e-01]
 [-1.04397524e+00]
 [-8.27089247e-01]
 [-1.36870393e+00]
 [-6.98807897e-01]
 [ 1.33512796e+00]
 [-1.60531856e-01]
 [ 8.30085466e-02]
 [ 7.91201732e-01]
 [ 5.69422478e-01]
 [-9.54219229e-02]
 [ 1.99212403e-01]
 [ 1.60664565e+00]
 [ 1.52176472e+00]
 [ 4.12561401e-01]
 [-6.33657206e-01]
 [ 1.4679741

In [12]:
# New data for prediction
new_data = np.array([5000000])

# Normalize the new data using the same mean and standard deviation
new_data_normalized = (new_data - np.mean(incomes)) / np.std(incomes)

# Reshape the new data to have a minimum of two dimensions
new_data_normalized = new_data_normalized.reshape(-1, 1)

# Predict the cluster assignments for new data
new_cluster_indices = kmeans.predict(new_data_normalized)

# Print the cluster assignments for new data with names
for i, cluster_index in enumerate(new_cluster_indices):
    print("New Data: {:.0f} - {}".format(new_data[i], cluster_names[cluster_index]))

# Count the total number of data points
total_count = len(data)

# Print the total count of data points
print("Total Data Points: {}".format(total_count))

# plt.scatter(incomes, cluster_indices)
# plt.xlabel('Income')
# plt.ylabel('Cluster')
# plt.title('K-means Clustering of Incomes')
# plt.yticks(list(cluster_names.keys()), list(cluster_names.values()))
#plt.pie(cluster_counts, labels=["high", "low", "medium"])
#plt.show()

New Data: 5000000 - Medium Income
Total Data Points: 200


In [13]:
pip install skl2onnx

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting skl2onnx
  Downloading skl2onnx-1.14.1-py2.py3-none-any.whl (292 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.3/292.3 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting onnx>=1.2.1 (from skl2onnx)
  Downloading onnx-1.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m117.9 MB/s[0m eta [36m0:00:00[0m
Collecting onnxconverter-common>=1.7.0 (from skl2onnx)
  Downloading onnxconverter_common-1.13.0-py2.py3-none-any.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.8/83.8 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: onnx, onnxconverter-common, skl2onnx
Successfully installed onnx-1.14.0 onnxconverter-common-1.13.0 skl2onnx-1.14.1


In [46]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Specify an initial type for the model ( similar to input shape for the model )
initial_type = [ 
    ( 'new_data_normalized' , FloatTensorType( [None,1] ) ) 
]

# Write the ONNX model to disk
converted_model = convert_sklearn( kmeans , initial_types=initial_type )
with open( "sklearn_model.onnx", "wb" ) as f:
    f.write( converted_model.SerializeToString() )

In [14]:
import csv

# Assuming you have the lists: nama, penghasilan, penghasilan_normalized

# Combine the lists using zip()
combined_data = zip(names, incomes, cluster_indices)

# Specify the output CSV file path
csv_file = "combined_data.csv"

# Open the file in write mode and write the data
with open(csv_file, "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Nama", "Penghasilan", "Cluster"])  # Write header
    writer.writerows(combined_data)  # Write the combined data rows

print("Data has been written to", csv_file)

Data has been written to combined_data.csv


In [104]:
# Load the dataset from CSV file
data = pd.read_csv('/content/drive/MyDrive/combined_data.csv')

# Preprocess the data
nama = data['Nama'].values
penghasilan = data['Penghasilan'].values
cluster = data['Cluster'].values

In [105]:
print(penghasilan)
cluster = np.array(cluster)
print(cluster)

[6639459 3445396 3067859 2478561 9354726 4564958 2965023 6495733 7924384
 5943464 7678404 3087973 3213217 8908402 3946619 3633017 3969319 6350151
 9600871 4208819 9251151 2405660 3413568 2193920 9352204 8309465 4260421
 5240002 1948698 8349167 6727858 4616777 3445447 1281219 4807796 5386891
 3886441 2644472 3203213 1807907 3533691 8773510 4920396 5547804 7372249
 6800902 5088132 5847168 9472993 9254323 6396797 3701532 9115748 1443891
 4093042 3666967 2434776 8656548 3668032 3260894 7149976 4656659 8959577
 6858936 2631062 6799323 8723396 1988153 4447216 6150795 3680801 5063399
 4307047 5372804 3432923 6290332 8417446 4803752 4221389 9106579 8416951
 1846501 9328109 2961097 4486040 6728766 4278671 2716309 9978272 1230711
 6199747 7580233 1687118 2033600 4477707 1277829 7872087 1538229 9166855
 3682233 7080183 2628055 8670981 3836014 5741385 9281598 8510922 3892001
 9356167 3643777 5114886 6457724 6351151 2043950 6720865 5573283 2928176
 5360908 5567595 9462041 1487323 4772752 9542091 37

In [106]:
# Normalize penghasilan column
penghasilan_normalized = (penghasilan - penghasilan.min()) / (penghasilan.max() - penghasilan.min())

# Reshape penghasilan_normalized to match expected input shape
penghasilan_normalized = penghasilan_normalized.reshape(-1, 1)

# Define number of clusters
num_clusters = 3

In [107]:
import tensorflow as tf

# Convert penghasilan_normalized to TensorFlow constant
penghasilan_tf = tf.constant(penghasilan_normalized, dtype=tf.float32)

# Create K-Means clustering model using Sequential model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_clusters)
])

In [116]:
# Compile the model
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
cost_fn = tf.keras.losses.MeanSquaredError()
model.compile(optimizer=opt, loss=cost_fn, metrics=['accuracy'])

# Train the model
model.fit(penghasilan_tf, cluster, epochs=50)

# Predict cluster probabilities for the input data
cluster_probs = model.predict(penghasilan_tf)
clusters = np.argmax(cluster_probs, axis=1)

# Print cluster assignments
#for i, clustert in enumerate(clusters):
#   print(f"Data {i+1} belongs to clustert {clustert+1}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [121]:
# Test the model with new data
new_data = np.array([penghasilan.min(), 2000000,4100000, 5000000, 5100000, 6000000, 7200000, 10000000 ])

# Normalize penghasilan column
data_normalized = (new_data - penghasilan.min()) / (penghasilan.max() - penghasilan.min())

# Reshape penghasilan_normalized to match expected input shape
data_normalized =data_normalized.reshape(-1, 1)

# Predict cluster probabilities for the input data
cluster_probs = model.predict(data_normalized)
clusters = np.argmax(cluster_probs, axis=1)

# Print cluster assignments
for i, clustert in enumerate(clusters):
    print(f"Data {i+1} belongs to cluster {clustert+1}")

Data 1 belongs to cluster 3
Data 2 belongs to cluster 3
Data 3 belongs to cluster 2
Data 4 belongs to cluster 2
Data 5 belongs to cluster 2
Data 6 belongs to cluster 2
Data 7 belongs to cluster 1
Data 8 belongs to cluster 1


In [49]:
# New data for prediction
new_data = np.array([5000000])

# Normalize the income data
normalized_incomes = (new_data - np.mean(incomes)) / np.std(incomes)

# Get the cluster assignments for each income
cluster_indices = model.predict(normalized_incomes, batch_size=10)

# Define the cluster names
cluster_names = {
    1: "High Income",
    0: "Low  Income",
    2: "Medium Income"
}

# Print the cluster assignments for each income with names
#for i, cluster_index in enumerate(cluster_indices):
#    print("Income: {:.2f} - Cluster: {} ({})".format(incomes[i], cluster_index, cluster_names[cluster_index]))

# Find the lowest and highest values in each cluster
cluster_data = pd.DataFrame({'Income': incomes, 'Cluster': cluster_indices})
cluster_stats = cluster_data.groupby('Cluster')['Income'].agg(['min', 'max'])

# Print the lowest and highest values in each cluster
for cluster_index, stats in cluster_stats.iterrows():
    print("Cluster: {} - Lowest: {:.2f}, Highest: {:.2f}".format(cluster_index, stats['min'], stats['max']))

# Count the number of data points in each cluster
cluster_counts = cluster_data['Cluster'].value_counts().sort_index()

# Print the count of data points in each cluster
for cluster_index, count in cluster_counts.items():
    print("Cluster: {} - Count: {}".format(cluster_index, count))

ValueError: ignored

In [123]:
# save_model
export_dir = 'saved_model/Kmeanslite'

model.save('saved_model/Kmeans.h5')

tf.saved_model.save(model, export_dir)



In [125]:
# Select mode of optimization
mode = "a" 

if mode == 'Storage':
    optimization = tf.lite.Optimize.OPTIMIZE_FOR_SIZE
elif mode == 'Speed':
    optimization = tf.lite.Optimize.OPTIMIZE_FOR_LATENCY
else:
    optimization = tf.lite.Optimize.DEFAULT

In [128]:
# save_model
converter = tf.lite.TFLiteConverter.from_saved_model(export_dir)

# Set the optimzations
converter.optimizations = [optimization]

# Invoke the converter to finally generate the TFLite model

tflite_model = converter.convert()
with open('model.tflite', 'wb') as f:
  f.write(tflite_model)

In [127]:
import pathlib
tflite_model_file = pathlib.Path('./Kmeans.tflite')
tflite_model_file.write_bytes(tflite_model)

2828