In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

In [3]:
df = pd.read_csv('../data/color_feat_merged.csv')

In [4]:
def bin_column(column): 
    binned = []
    for value in column: 
        if value < 85: 
            binned.append(0)
        elif 85 <= value < 170:
            binned.append(1)
        else: 
            binned.append(2)
    return np.array(binned).reshape(-1,1)

In [5]:
r_binned = bin_column(df['r'].values)
g_binned = bin_column(df['g'].values)
b_binned = bin_column(df['b'].values)

In [6]:
mlss = MultilabelStratifiedShuffleSplit(n_splits=1, train_size=0.9, test_size=0.1, random_state=34567)



In [7]:
for train_idx, test_idx in mlss.split(df, np.hstack([r_binned, g_binned, b_binned])):
    pass

In [8]:
df_train = df.iloc[train_idx].sample(len(train_idx))

In [9]:
df_test = df.iloc[test_idx].sample(len(test_idx))

In [10]:
len(df_test)

640

In [11]:
df_train.to_csv('../data/development_set.csv', index=False)

In [12]:
df_test.to_csv('../data/holdout_set.csv', index=False)

In [13]:
df_test

Unnamed: 0.1,Unnamed: 0,mc_CRY-chi-0-all,mc_CRY-chi-1-all,mc_CRY-chi-2-all,mc_CRY-chi-3-all,mc_CRY-Z-0-all,mc_CRY-Z-1-all,mc_CRY-Z-2-all,mc_CRY-Z-3-all,mc_CRY-I-0-all,...,sum-D_func-alpha-1-all,sum-D_func-alpha-2-all,sum-D_func-alpha-3-all,color_cleaned,refcode,color_string,delta_t_seconds,r,g,b
1812,1812,3.6481,26.281600,19.482000,45.763600,784.0,896.00,672.0,1568.0,1.0,...,-12.000000,-12.000000,-24.00,colorless,WIHDUN,colorless,23.274056,232.0,234.0,244.0
3690,3690,3.5344,29.678933,23.130267,49.751067,729.0,990.00,1188.0,1521.0,1.0,...,-67.200000,-67.200000,-150.00,red,AWAWUR,red,13.378720,250.0,11.0,5.0
1058,1058,1.4400,31.104000,36.864000,80.304000,3969.0,3780.00,7056.0,8820.0,1.0,...,16.200000,-7.800000,-2.00,colorless,OGIYUZ,colorless,23.274056,232.0,234.0,244.0
3156,3156,3.6100,23.864000,42.332000,68.894000,841.0,841.00,2813.0,1972.0,1.0,...,0.000000,0.000000,0.00,blue,MUZRIJ,blue,19.723303,12.0,26.0,224.0
5957,5957,2.9354,13.671100,49.256700,19.670000,3222.5,3966.25,5407.5,5144.0,1.0,...,-386.120000,0.000000,-244.28,brown,SUKKUF,brown,17.611029,91.0,48.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1637,1637,2.7225,21.384000,25.245000,51.117000,900.0,900.00,1080.0,1680.0,1.0,...,-33.600000,-54.000000,-87.60,colorless,ULEPIM,colorless,23.274056,232.0,234.0,244.0
2131,2131,2.8561,23.254400,17.238000,40.492400,2304.0,1536.00,1152.0,2688.0,1.0,...,-24.000000,-24.000000,-48.00,colorless,CUMLUT,colorless,23.274056,232.0,234.0,244.0
4780,4780,1.4400,31.956000,16.740000,42.060000,3969.0,3906.00,5859.0,5166.0,1.0,...,69.000000,48.600000,117.60,yellow,GUKZAO,yellow,19.796456,247.5,242.5,16.0
4664,4664,2.8561,26.679467,26.792133,66.800067,2304.0,1760.00,4608.0,4160.0,1.0,...,-38.493333,-37.448889,-54.40,orange,HUFKUQ,orange,22.024030,246.5,132.0,4.0


In [14]:
from colour.models import RGB_to_HSV

In [15]:
RGB_to_HSV(df_train[['r', 'g', 'b']])

  S = as_float_array(delta / maximum)
  delta_R = (((maximum - R) / 6) + (delta / 2)) / delta
  delta_G = (((maximum - G) / 6) + (delta / 2)) / delta
  delta_B = (((maximum - B) / 6) + (delta / 2)) / delta
  H[np.asarray(H < 0)] += 1
  H[np.asarray(H > 1)] -= 1


array([[  3.27345309e-01,   8.74345550e-01,   1.91000000e+02],
       [  3.27345309e-01,   8.74345550e-01,   1.91000000e+02],
       [  8.33333333e-02,   9.45054945e-01,   9.10000000e+01],
       ..., 
       [  6.55660377e-01,   9.46428571e-01,   2.24000000e+02],
       [  6.55660377e-01,   9.46428571e-01,   2.24000000e+02],
       [  9.99099099e-01,   9.78835979e-01,   1.89000000e+02]])