In [2]:
:sccache 1
:dep ndarray
:dep plotly
:dep polars
:dep rand

extern crate ndarray;
extern crate plotly;
extern crate polars;
extern crate rand;

use ndarray::Array;
use plotly::*;
use plotly::common::Mode;
use polars::prelude::CsvReader;
use polars::prelude::*;
use polars::frame::DataFrame;
use rand::Rng;

sccache: true


In [3]:
// Grab the dataset from local csv file
// Datasets are courtesy of:
// http://archive.ics.uci.edu/ml/datasets/
// https://github.com/milaan9/Clustering-Datasets
let df = CsvReader::from_path("./data/kmeans-dataset.csv").unwrap().finish().unwrap();
println!("{:?}", df);

shape: (2990, 3)
┌──────────┬─────────┬─────────┐
│ x        ┆ y       ┆ cluster │
│ ---      ┆ ---     ┆ ---     │
│ f64      ┆ f64     ┆ i64     │
╞══════════╪═════════╪═════════╡
│ 1.00007  ┆ 40.9378 ┆ 0       │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 0.99736  ┆ 41.1714 ┆ 0       │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 0.134799 ┆ 41.8113 ┆ 0       │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 2.47585  ┆ 41.6346 ┆ 0       │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ ...      ┆ ...     ┆ ...     │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 96.5007  ┆ 67.9212 ┆ 9       │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 96.8852  ┆ 68.1787 ┆ 9       │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 96.834   ┆ 67.9841 ┆ 9       │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 98.4335  ┆ 68.2043 ┆ 9       │
└──────────┴─────────┴─────────┘


In [4]:
let x0:Vec<f64> = df[0].f64()?.into_no_null_iter().collect();
let y0:Vec<f64> = df[1].f64()?.into_no_null_iter().collect();

let trace = Scatter::new(x0.clone(), y0.clone()).mode(Mode::Markers);
let mut plot = Plot::new();
plot.add_trace(trace);

let layout = Layout::new().height(525);
plot.set_layout(layout);

plot.notebook_display();

In [7]:
fn split_data(df: &DataFrame, percent_train:f32)->(DataFrame, DataFrame) {
    // get number of elements to put in training data
    let num_train:usize = (percent_train * df.shape().0 as f32) as usize;
    // get number of elements to put in test data
    let num_test:usize = df.shape().0 - num_train;
    
    // shuffle dataframe
    let mut rng = rand::thread_rng();
    let shuffled = df.sample_frac(1.0, false, true, Some(rng.gen_range(0..u64::MAX))).expect("Error shufflilng df!");
    
    // return dataframes of each
    return (shuffled.slice(0, num_train), shuffled.slice( (num_train-1) as i64, num_test));
}

In [9]:
let (train_data, test_data) = split_data(&df, 0.8);
println!("{:?}", train_data);
println!("{:?}", test_data);

shape: (2392, 3)
┌─────────┬─────────┬─────────┐
│ x       ┆ y       ┆ cluster │
│ ---     ┆ ---     ┆ ---     │
│ f64     ┆ f64     ┆ i64     │
╞═════════╪═════════╪═════════╡
│ 46.3934 ┆ 98.1286 ┆ 8       │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 90.1393 ┆ 24.1829 ┆ 4       │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 46.3795 ┆ 96.0969 ┆ 8       │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 13.7783 ┆ 78.709  ┆ 7       │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ ...     ┆ ...     ┆ ...     │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ -1.1779 ┆ 2.52419 ┆ 5       │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 97.8297 ┆ 67.9631 ┆ 9       │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 17.0008 ┆ 71.9158 ┆ 7       │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 56.4155 ┆ 3.16169 ┆ 1       │
└─────────┴─────────┴─────────┘
shape: (598, 3)
┌─────────┬──────────┬─────────┐
│ x       ┆ y        ┆ cluster │
│ ---     ┆ ---      ┆ ---     │
│ f64     ┆ f64      ┆ i64     │
╞═════════╪══════════╪═════════╡
│ 56.4155 ┆ 3.16169  ┆ 1       │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
