# Generate K-Means clustering for patients

In [1]:
import gpudb
import collections
import pandas as pd

### Init Kinetica connection

In [2]:
# Client handle for the Kinetica instance on the loopback address; every
# later cell talks to the database through this object.
_db = gpudb.GPUdb(
    host="127.0.0.1",
    port="9191",
    encoding='BINARY',
)



### Generate K-Means

In [3]:
def kinetica_k_means(_input_table,
                     column_names=('v0', 'v1', 'v2', 'v3', 'v4', 'v5'),
                     k=5,
                     tolerance=0.00001,
                     db=None):
    """Run Kinetica's k-means aggregation on a table and tabulate the clusters.

    Parameters
    ----------
    _input_table : str
        Name of the Kinetica table to cluster.
    column_names : sequence of str, optional
        Columns used as clustering dimensions. Defaults to ``v0`` .. ``v5``.
    k : int, optional
        Number of clusters requested from the server (default 5).
    tolerance : float, optional
        Value forwarded as the ``tolerance`` argument of
        ``aggregate_k_means`` (default 1e-5).
    db : optional
        Connection object exposing ``aggregate_k_means``. When omitted, the
        notebook-level ``_db`` connection is used.

    Returns
    -------
    pandas.DataFrame
        One row per cluster with the columns ``idx`` and ``count`` followed
        by one column per entry of ``column_names``. Row labels are the
        string form of the cluster position.

    Side effects
    ------------
    Prints the total record count and the iteration count reported by the
    server.
    """
    if db is None:
        # Resolved lazily so callers that pass their own connection never
        # need the notebook global to exist.
        db = _db

    # Single source of truth for the dimension names: used both in the
    # request and as the headers of the returned coordinates.
    _columns = list(column_names)

    _result = db.aggregate_k_means(
        table_name=_input_table,
        column_names=_columns,
        tolerance=tolerance,
        k=k)

    print('Total Count: {}'.format(_result['count']))
    print('Iterations: {}'.format(_result['num_iters']))

    _counts_df = pd.DataFrame(_result['counts'], columns=['count'])
    _coord_df = pd.DataFrame(_result['means'], columns=_columns)

    # Put counts and centroid coordinates side by side, then expose the
    # positional index as an explicit ``idx`` column. ``index=str`` keeps the
    # original behaviour of string row labels.
    _means_df = pd.concat([_counts_df, _coord_df], axis=1)
    _means_df = _means_df.reset_index()
    _means_df = _means_df.rename(index=str, columns={"index": "idx"})

    return _means_df

### Create result table (K-Means summary for all patients)

In [4]:
# Run k-means on the 'patient_sv' table and keep the per-cluster summary.
_means_df = kinetica_k_means('patient_sv')
# Bare name on the last line renders the frame as the cell output.
_means_df

Total Count: 19092
Iterations: 10


Unnamed: 0,idx,count,v0,v1,v2,v3,v4,v5
0,0,17255,-0.028446,0.013104,-0.008807,0.028768,-0.00577,-0.007656
1,1,98,-0.556804,-0.08605,-0.193631,0.612112,0.034041,0.760758
2,2,276,-0.811426,-0.653601,0.263892,-0.064115,-0.014068,-0.091419
3,3,138,-0.597142,0.029557,-1.093013,-0.408174,-0.004386,-0.110256
4,4,1325,-0.308073,0.34126,0.114182,-0.050675,0.014922,0.001274


### Save results in table

In [5]:
def save_results(df, res_table, schema='KAISERS_SQL', db=None):
    """Replace a Kinetica table with the contents of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to store. Its first two columns are declared as INT and all
        remaining columns as DOUBLE.
    res_table : str
        Name of the destination table. Any existing table with this name is
        cleared first.
    schema : str, optional
        Passed as the ``collection_name`` option when the table is created.
    db : optional
        Connection object exposing ``clear_table``. When omitted, the
        notebook-level ``_db`` connection is used.

    Side effects
    ------------
    Clears ``res_table``, creates it as a replicated table, inserts every row
    of ``df`` and prints the number of inserted rows.
    """
    if db is None:
        # Resolved lazily so callers that pass their own connection never
        # need the notebook global to exist.
        db = _db

    _int_type = gpudb.GPUdbRecordColumn._ColumnType.INT
    _double_type = gpudb.GPUdbRecordColumn._ColumnType.DOUBLE

    # Column schema: two leading integer columns, then doubles.
    _result_type = [[_col_name, _int_type] for _col_name in df.columns[:2]]
    _result_type += [[_col_name, _double_type] for _col_name in df.columns[2:]]

    db.clear_table(res_table, options={'no_error_if_not_exists': 'true'})

    _result_table = gpudb.GPUdbTable(db=db, _type=_result_type, name=res_table,
                                     options={'collection_name': schema,
                                              'is_replicated': 'true'})

    # Build a real list: on Python 3 ``map`` returns a lazy iterator, so the
    # ``len()`` call below would raise TypeError.
    _insert_rows = [list(_record) for _record in df.to_records(index=False)]

    print('Inserting rows into <{}>: {}'.format(res_table, len(_insert_rows)))
    _result_table.insert_records(_insert_rows)

In [6]:
# Store the all-patient cluster summary in 'kmeans_patient'; save_results
# clears any existing table of that name before inserting.
save_results(_means_df, 'kmeans_patient')

Inserting rows into <kmeans_patient>: 5


# Generate Male K-Means

In [7]:
# Same clustering, run against the 'patient_sv_m' table.
_means_df_m = kinetica_k_means('patient_sv_m')
# Bare name on the last line renders the frame as the cell output.
_means_df_m

Total Count: 8728
Iterations: 10


Unnamed: 0,idx,count,v0,v1,v2,v3,v4,v5
0,0,7707,-0.031765,0.006868,0.007336,-0.032039,-0.010479,-0.013405
1,1,126,-0.918594,-1.01056,-0.229881,0.114066,0.012858,-0.02364
2,2,94,-0.512029,0.039087,0.74671,-0.338759,-0.052265,-0.329027
3,3,445,-0.355494,0.220335,-0.0176,-0.139448,0.207982,0.23573
4,4,356,-0.47765,0.393835,-0.121794,0.205869,-0.197741,-0.113922


In [8]:
# Store the 'patient_sv_m' cluster summary in 'kmeans_patient_m'; save_results
# clears any existing table of that name before inserting.
save_results(_means_df_m, 'kmeans_patient_m')

Inserting rows into <kmeans_patient_m>: 5


# Generate Female K-Means

In [9]:
# Same clustering, run against the 'patient_sv_f' table.
_means_df_f = kinetica_k_means('patient_sv_f')
# Bare name on the last line renders the frame as the cell output.
_means_df_f

Total Count: 10363
Iterations: 10


Unnamed: 0,idx,count,v0,v1,v2,v3,v4,v5
0,0,65,0.709157,0.007023,0.241615,-0.812213,0.660413,0.356604
1,1,166,0.988166,-0.692333,-0.197881,0.118802,-0.115929,-0.058018
2,2,84,0.622788,0.362415,1.124138,0.597634,-0.0538,0.086627
3,3,9438,0.034956,0.023338,0.006913,-0.027356,-0.017043,-0.000986
4,4,610,0.316659,0.412657,-0.235508,0.081849,0.034181,-0.02586


In [10]:
# Store the 'patient_sv_f' cluster summary in 'kmeans_patient_f'; save_results
# clears any existing table of that name before inserting.
save_results(_means_df_f, 'kmeans_patient_f')

Inserting rows into <kmeans_patient_f>: 5
