In [1]:
import re
from sklearn.cluster import KMeans


import numpy as np

'''
source: https://stackoverflow.com/questions/28344660/how-to-identify-cluster-labels-in-kmeans-scikit-learn
'''


def load_data():
    datafile = "data/iqr_cleaned_data.csv"

    '''
    names:
    the column names are stored in names,
    numpy documentation says you can reference by name with the usecols variable
    but I couldn't get it to work
    usecols:
    3,4 corresponds to price and item_id
    '''
    price_data = np.genfromtxt(
        datafile,
        delimiter=",",
        names="product_id, seller_id, price, item_id, customer_id, order_timestamp",
        usecols=2,
        skip_header=1
    )
    item_data = np.genfromtxt(
        datafile,
        delimiter=",",
        names="product_id, seller_id, price, item_id, customer_id, order_timestamp",
        usecols=range(3, 4),
        skip_header=1
    )
    '''
    delimiter: csv files use ,
    names: file headers, should be able to access via names but it wasn't working
    usecols: column range
    skip_headers:1 skips first line
    dtype: format type for numpy 'f8' = float64
        This type is required for kmeans.fit()
    deletechars: removes unwanted characters
    '''
    data = np.genfromtxt(
        datafile,
        delimiter=",",
        names="product_id, seller_id, price, item_id, customer_id, order_timestamp",
        usecols=range(2, 4),
        skip_header=1,
        dtype=[np.float64,np.float64],
        deletechars=', '
    )

    return data


def clustering_example():
    """
    Purpose: This function creates clusters from hard coded example data
    Returns: The clusters as labels array and the numpy array of the data as x_2d
    """
    x1 = [[1], [1], [2], [2], [2], [3], [3], [7], [7], [7]]
    x2 = [[1], [1], [2], [2], [2], [3], [3], [7], [7], [7]]

    x_2d = np.concatenate((x1, x2), axis=1)

    kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=300, n_init=10, random_state=0)
    labels = kmeans.fit(x_2d)
    return labels, x_2d


def clustering(data):
    """
    Purpose: This function creates clusters from my projects data
    Returns: The clusters as labels array and the numpy array of the data as x_2d
    """
    numpified = np.concatenate((data[:,0], data[:,1]), axis=1)
    kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=300, n_init=10, random_state=0)
    labels = kmeans.fit(numpified)
    return labels

if __name__ == "__main__":
    data = load_data()
    kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
    #known: data is a dtype <f little-endian single precision float
    #need: float64

    x = np.array(data)

    labels = kmeans.fit(x)
    print(labels)

    #https://stackoverflow.com/questions/61913327/typeerror-cannot-cast-array-data-from-dtypefloat64-to-dtypeu32-accordin
    '''
    https://programmerah.com/how-to-fix-typeerror-cannot-cast-array-data-from-dtypefloat64-to-dtype%EF%BC%9Cu32-21114/
    Both U32 and S32 indicate that your numpy array is a string array, not a number array. Check whether there are strings
    in the dataset. If there are, just delete them. In numpy array, as long as one item is string, the type returned by
    the array is string array.
    If you need to convert numpy to floating-point number, please refer to the code:
    train= train.astype (float)
    train_ target = train_ target.astype (float)
    '''


TypeError: Cannot cast array data from dtype([('price', '<f8'), ('item_id', '<f8')]) to dtype('float64') according to the rule 'unsafe'