In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

import numpy as np
import pandas as pd

In [37]:
class WinstonDataFormatter:
    """Build one labelled CSV file per predicate from `labels.csv` and `data.csv`.

    Both input files are read from `source_dir_path` when the object is
    created. `data.csv` must provide the columns `folder`, `R`, `G` and `B`.
    `labels.csv` holds one row per predicate (the row index is used in the
    output file name) and one column per folder listed in `mapping_dict`.
    """

    def __init__(self, source_dir_path):
        """Load the label and data tables found in `source_dir_path`."""

        # Folder id (matched against the `folder` column of data.csv)
        # -> human-readable name.
        self.mapping_dict = {
            'n02058221': 'albatross',
            'n02130308': 'cheetah',
            'n01518878': 'ostrich',
            'n02056570': 'penguin',
            'n02391049': 'zebra'
        }

        self.source_dir_path = source_dir_path

        self.df_labels = self._load_labels_df()
        self.df_data = self._load_data_df()

    def _load_labels_df(self):
        """Read `labels.csv` and key its columns by folder id.

        Missing labels are replaced by -1. The header is resolved as follows:

        * header equals the names in `mapping_dict` (any order): columns are
          renamed by name, so a reordered file cannot be mislabelled;
        * header already equals the folder ids: left untouched;
        * anything else: the folder ids are assigned by position, in
          `mapping_dict` order.

        Raises:
            ValueError: if the positional fallback is needed but the number
                of columns differs from the number of folders.
        """
        file_path = os.path.join(self.source_dir_path, 'labels.csv')
        df_labels = pd.read_csv(file_path, index_col=0).fillna(-1)

        folder_ids = list(self.mapping_dict)
        folder_by_name = {name: folder for folder, name in self.mapping_dict.items()}
        header = set(df_labels.columns)

        if header == set(folder_by_name):
            df_labels = df_labels.rename(columns=folder_by_name)
        elif header != set(folder_ids):
            if len(df_labels.columns) != len(folder_ids):
                raise ValueError(
                    "labels.csv has {} label columns, expected {}".format(
                        len(df_labels.columns), len(folder_ids)))
            # NOTE(review): positional assignment assumes the file's column
            # order matches mapping_dict - confirm against the data source.
            df_labels.columns = folder_ids
        return df_labels

    def _load_data_df(self):
        """Read `data.csv`, using its first column as the index."""
        file_path = os.path.join(self.source_dir_path, 'data.csv')
        df_data = pd.read_csv(file_path, index_col=0)
        return df_data

    def _sample_rows_per_folder(self, df, sample_num, rng):
        """Keep at most `sample_num` randomly chosen rows per mapped folder.

        Rows whose folder is not in `mapping_dict` are all kept and placed
        after the sampled groups, which follow `mapping_dict` order.
        """
        folder_ids = list(self.mapping_dict)
        parts = []
        for folder_name in folder_ids:
            group = df[df['folder'] == folder_name]
            # Only sample when there is something to discard; asking pandas
            # for more rows than exist would raise.
            if len(group) > sample_num:
                group = group.sample(n=sample_num, random_state=rng)
            parts.append(group)
        parts.append(df[~df['folder'].isin(folder_ids)])

        non_empty = [part for part in parts if len(part)]
        if not non_empty:
            return df.iloc[0:0]
        return pd.concat(non_empty)

    def format_data_and_save_as_csv(self,
                                    save_dir_path,
                                    sample_num_per_predicate=None,
                                    random_state=None):
        """Write one `L_<predicate>.csv` per row of the label table.

        Each file holds the `R`, `G`, `B` columns of the data table plus a
        `label` column, where every data row receives the label its folder
        has for that predicate. Rows whose folder is not in `mapping_dict`
        get a missing (NaN) label.

        Args:
            save_dir_path: directory to write into; created if missing.
            sample_num_per_predicate: None to keep every row, or a positive
                integer giving the maximum number of rows kept per folder
                (folders with fewer rows are kept whole).
            random_state: optional seed for the sampling. None uses NumPy's
                global random state.

        Returns:
            The list of written DataFrames, in label-table row order.

        Raises:
            ValueError: if `sample_num_per_predicate` is neither None nor a
                positive integer. Raised before anything is written.
        """
        sample_num = sample_num_per_predicate
        if sample_num is not None:
            is_integer = (isinstance(sample_num, (int, np.integer))
                          and not isinstance(sample_num, bool))
            if not is_integer or sample_num <= 0:
                raise ValueError(
                    "sample_num_per_predicate must be None or a positive "
                    "integer, got {!r}".format(sample_num))

        # One generator for the whole call: results are reproducible for a
        # given seed while each predicate still draws its own sample.
        rng = None if random_state is None else np.random.RandomState(random_state)

        if save_dir_path:
            os.makedirs(save_dir_path, exist_ok=True)

        features = self.df_data.loc[:, ['folder', 'R', 'G', 'B']]
        formatted_data_list = []

        for idx, row in self.df_labels.iterrows():
            df_tmp = features.copy()
            # `row` is indexed by folder id, so it serves as the lookup table.
            df_tmp['label'] = df_tmp['folder'].map(row)

            if sample_num is not None:
                df_tmp = self._sample_rows_per_folder(df_tmp, sample_num, rng)

            df_tmp = df_tmp.drop(columns=['folder'])
            formatted_data_list.append(df_tmp)

            file_name = "L_" + str(idx) + ".csv"
            df_tmp.to_csv(os.path.join(save_dir_path, file_name))

        return formatted_data_list

In [38]:
# Build the formatter; the constructor reads labels.csv and data.csv from the
# current working directory.
data_formatter = WinstonDataFormatter('./')

In [39]:
# Write one L_<row index>.csv per row of the label table into ./winston_tmp.
# sample_num_per_predicate=None keeps every data row (no sampling).
data_formatter.format_data_and_save_as_csv('./winston_tmp',
                                           sample_num_per_predicate=None)

[             R         G         B label
 0     0.486640  0.440085  0.342231  -1.0
 1     0.427333  0.406967  0.376660  -1.0
 2     0.509321  0.512104  0.398580  -1.0
 3     0.581645  0.572609  0.549550  -1.0
 4     0.505475  0.517679  0.422280  -1.0
 ...        ...       ...       ...   ...
 6495  0.618658  0.583356  0.521907   1.0
 6496  0.506803  0.401962  0.288625   1.0
 6497  0.517260  0.509384  0.498286   1.0
 6498  0.470308  0.439706  0.373938   1.0
 6499  0.391544  0.400506  0.424961   1.0
 
 [6500 rows x 4 columns],
              R         G         B label
 0     0.486640  0.440085  0.342231  -1.0
 1     0.427333  0.406967  0.376660  -1.0
 2     0.509321  0.512104  0.398580  -1.0
 3     0.581645  0.572609  0.549550  -1.0
 4     0.505475  0.517679  0.422280  -1.0
 ...        ...       ...       ...   ...
 6495  0.618658  0.583356  0.521907   1.0
 6496  0.506803  0.401962  0.288625   1.0
 6497  0.517260  0.509384  0.498286   1.0
 6498  0.470308  0.439706  0.373938   1.0
 6499 

In [40]:
# Spot check: read back one labelled file from ./winston_tmp
# (presumably the one written for the 'albatross(x)' row of labels.csv).
pd.read_csv('./winston_tmp/L_albatross(x).csv', index_col=0)

Unnamed: 0,R,G,B,label
0,0.486640,0.440085,0.342231,-1.0
1,0.427333,0.406967,0.376660,-1.0
2,0.509321,0.512104,0.398580,-1.0
3,0.581645,0.572609,0.549550,-1.0
4,0.505475,0.517679,0.422280,-1.0
...,...,...,...,...
6495,0.618658,0.583356,0.521907,-1.0
6496,0.506803,0.401962,0.288625,-1.0
6497,0.517260,0.509384,0.498286,-1.0
6498,0.470308,0.439706,0.373938,-1.0


In [42]:
def create_and_save_unsupervised_data(save_dir_path, data_num=10, data_dim=3,
                                      random_state=None):
    """Generate uniform random data in [0, 1) and save it as `U.csv`.

    Args:
        save_dir_path: directory that receives `U.csv`; created if missing.
        data_num: number of rows to generate.
        data_dim: number of columns to generate.
        random_state: optional seed making the generated data reproducible.
            None draws from NumPy's global random state.

    Returns:
        The generated `data_num` x `data_dim` DataFrame (default integer
        index and column labels).
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    arr_U = rng.rand(data_num, data_dim)
    df_U = pd.DataFrame(arr_U)

    # An empty path means "current directory"; os.makedirs('') would raise.
    if save_dir_path:
        os.makedirs(save_dir_path, exist_ok=True)
    df_U.to_csv(os.path.join(save_dir_path, 'U.csv'))
    return df_U

In [43]:
# Generate 20 rows of random data (default data_dim=3 columns) and write them
# to ./winston_tmp/U.csv.
create_and_save_unsupervised_data('./winston_tmp', data_num=20)

Unnamed: 0,0,1,2
0,0.147098,0.482714,0.514674
1,0.454564,0.811699,0.039264
2,0.869785,0.01252,0.921532
3,0.85551,0.547825,0.890392
4,0.225771,0.48192,0.487123
5,0.173014,0.167029,0.173976
6,0.730805,0.277398,0.451279
7,0.47574,0.530275,0.063991
8,0.270844,0.019956,0.527189
9,0.260999,0.399335,0.310795


In [44]:
# Read U.csv back from ./winston_tmp to check the round trip.
pd.read_csv('./winston_tmp/U.csv', index_col=0)

Unnamed: 0,0,1,2
0,0.147098,0.482714,0.514674
1,0.454564,0.811699,0.039264
2,0.869785,0.01252,0.921532
3,0.85551,0.547825,0.890392
4,0.225771,0.48192,0.487123
5,0.173014,0.167029,0.173976
6,0.730805,0.277398,0.451279
7,0.47574,0.530275,0.063991
8,0.270844,0.019956,0.527189
9,0.260999,0.399335,0.310795
