In [None]:
%load_ext autoreload
%autoreload 2
import os, pickle, csv, itertools, shutil, random
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torchvision, torch
from torchvision import transforms
from omegaconf import OmegaConf as oc
def normalize(data):
    """Min-max rescale ``data``: its smallest value maps to 0 and its largest to 1.

    The minimum and maximum are taken over the whole array (no axis argument),
    so every element is shifted and scaled by the same two scalars.
    """
    lowest = np.min(data)
    value_range = np.max(data) - lowest
    return (data - lowest) / value_range

%matplotlib inline
%config InlineBackend.figure_format='retina'

import utils.utils as utils
import evals.embed_evals as evals
import utils.plot_data as plot
import utils.gen_triplets as gen
import pathlib
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Number of samples (rows): used for the distractor matrix, the image-id list,
# the copy loop and the train/valid/test split sizes.
DATASET_SIZE = 2000
# Number of informative feature dimensions (presumably the two blob coordinates — confirm).
INFORMATIVE_F = 2
# Number of random distractor columns appended to X after the informative ones.
DISTRACTOR_F = 2

## informative features

In [None]:
# Draw the informative features: DATASET_SIZE points from two Gaussian blobs,
# each with INFORMATIVE_F coordinates. `y` holds the blob index of every point.
# The sizes come from the config constants instead of a repeated literal so the
# generated data cannot drift out of sync with the rest of the notebook.
X, y = datasets.make_blobs(
    n_samples=DATASET_SIZE,
    n_features=INFORMATIVE_F,
    centers=2,
    cluster_std=2,
)
# Quick, uncoloured look at the first two coordinates of the point cloud.
plt.scatter(X[:, 0], X[:, 1])

In [None]:
file = "blob_2.npz"
# Open the archive once and close it when done. Calling np.load twice re-opened
# the file for each array and left both NpzFile handles unclosed.
with np.load(file) as blob_archive:
    X = blob_archive["X"]
    y = blob_archive["y"]

In [None]:
# Slope, intercept and x-range for a hand-picked separating line; they are only
# used by the disabled sketch just below.
w = -6
b = 3.7
xd = np.array([-5, 5])
# yd = w*xd + b
# plt.plot(xd, yd, 'k', lw=3, ls='--')

# Scatter the first two feature columns, coloured by class label.
first_feature, second_feature = X[:, 0], X[:, 1]
plt.scatter(
    first_feature,
    second_feature,
    c=y,
    s=50,
    cmap="RdBu",
    vmin=-.2,
    vmax=1.2,
    edgecolor="white",
    linewidth=1,
)
# Clamp both axes to the exact data extent so no margin is drawn around the points.
plt.xlim(min(first_feature), max(first_feature))
plt.ylim(min(second_feature), max(second_feature))
plt.show()

## distracting features

In [None]:
# Append DISTRACTOR_F columns of uniform [0, 1) noise to the informative features.
# The archive X is loaded from is later overwritten with the widened X, so a
# reloaded X may already carry its distractor columns. Only append when X still
# has just the informative columns; otherwise a re-run would stack extra columns.
if X.shape[1] == INFORMATIVE_F:
    x3x4 = np.random.random([X.shape[0], DISTRACTOR_F])
    X = np.hstack([X, x3x4])
assert X.shape[1] == INFORMATIVE_F + DISTRACTOR_F, (
    f"expected {INFORMATIVE_F + DISTRACTOR_F} feature columns, got {X.shape[1]}"
)

In [None]:
# Write the current X and y back to `file`, the same path they were loaded from,
# so the archive on disk now holds X including the distractor columns.
# NOTE(review): this overwrites the input archive in place — confirm that is intended.
np.savez(file, X=X, y=y)

## Generate the feature DataFrame

In [None]:
# One name per column of X, in column order (X is expected to have four columns here).
features = [
    "head size",
    "body size",
    "tail size",
    "texture",
]
df = pd.DataFrame(data=X, columns=features)
# Put the class label in front of the feature columns.
df.insert(loc=0, column="label", value=y)

In [None]:
# One image file name per sample: the row number zero-padded to at least three
# digits plus ".png" (000.png ... 099.png, 100.png, ...). The `03d` format spec
# yields the same names as a manual <10 / <100 / else ladder.
img_id = [f"{i:03d}.png" for i in range(DATASET_SIZE)]
# DataFrame.insert raises if the column already exists, so drop any column left
# by a previous run first. This makes the cell safe to re-run.
if "img_id" in df.columns:
    df = df.drop(columns="img_id")
df.insert(0, "img_id", img_id)

In [None]:
df_file = "datasets/wv_3d_blob_2/df.csv"
# to_csv does not create missing parent folders, so create the dataset directory
# first (a no-op when it already exists).
os.makedirs(os.path.dirname(df_file), exist_ok=True)
df.to_csv(df_file, index=False)

## borderless plot

In [None]:
# Read the table back from disk and narrow `features` to the two columns that
# the scatter plot uses.
df = pd.read_csv(df_file)
features = [
    "head size",
    "body size",
]
# Sanity check: the CSV must contain exactly one row per sample.
assert len(df) == DATASET_SIZE

In [None]:
# Marker styling shared with the earlier labelled scatter plot.
marker_style = dict(
    s=50,
    cmap="RdBu",
    vmin=-.2,
    vmax=1.2,
    edgecolor="white",
    linewidth=1,
)
plt.scatter(df[features[0]], df[features[1]], c=df["label"], **marker_style)

# Switch off every tick mark and tick label on both axes.
hidden_tick_parts = dict.fromkeys(
    ["bottom", "top", "labelbottom", "right", "left", "labelleft"], False
)
plt.tick_params(axis='both', which='both', **hidden_tick_parts)

## data splitting

In [None]:
# Root directory of this dataset; the raw images are read from, and the per-class
# and per-split copies are written to, sub-folders of this path.
data_dir = "datasets/wv_3d_blob_2"

In [None]:
raw_data_dir = os.path.join(data_dir, "raw_imgs")
split_dir = os.path.join(data_dir, "data")
# Copy every raw image into <data_dir>/data/<label>/, creating the class folder
# on demand.
for row_idx in range(DATASET_SIZE):
    row = df.iloc[row_idx]
    img_id, label = row["img_id"], row["label"]
    class_dir = pathlib.Path(split_dir, str(label))
    class_dir.mkdir(parents=True, exist_ok=True)
    shutil.copy(os.path.join(raw_data_dir, img_id), class_dir)

In [None]:
# Split the row positions 60 / 20 / 20 into train / valid / test.
# The shuffle uses a fixed seed, so re-running the notebook yields the same
# split. With an unseeded split, a re-run could place an image in a different
# split folder than the copy already on disk, leaving it in two splits at once.
SPLIT_SEED = 0
VALID_FRACTION = 0.2
TEST_FRACTION = 0.2

split_rng = np.random.default_rng(SPLIT_SEED)
shuffled = split_rng.permutation(DATASET_SIZE)
n_valid = int(DATASET_SIZE * VALID_FRACTION)
n_test = int(DATASET_SIZE * TEST_FRACTION)

valid = shuffled[:n_valid]
test = shuffled[n_valid:n_valid + n_test]
# Remaining rows form the training set, kept in ascending order.
train = np.sort(shuffled[n_valid + n_test:])
total = train  # `total` ends up holding the training indices

# The three index sets must partition the dataset with no overlap.
assert len(np.unique(np.concatenate([train, valid, test]))) == DATASET_SIZE

train_df = df.iloc[train]
valid_df = df.iloc[valid]
test_df = df.iloc[test]

In [None]:
# Copy each split's images into <data_dir>/<split>/<label>/.
# The loop variable is deliberately NOT called `df`: reusing that name would
# leave the notebook-level `df` bound to `test_df` after the loop, breaking any
# cell that expects the full table when re-run.
for split, split_df in zip(["train", "valid", "test"], [train_df, valid_df, test_df]):
    for img_id, label in zip(split_df["img_id"], split_df["label"]):
        src = os.path.join(raw_data_dir, img_id)
        dst = os.path.join(data_dir, split, str(label))
        pathlib.Path(dst).mkdir(parents=True, exist_ok=True)
        shutil.copy(src, dst)

## Save the synthetic features of each split as a pickle file

In [None]:
# Re-establish the dataset root and reload the full sample table from disk.
data_dir = "datasets/wv_3d_blob_2"
df = pd.read_csv(f"{data_dir}/df.csv")

In [None]:
# Index the four feature columns by image file name once, instead of scanning
# the whole table with a boolean mask for every file. drop_duplicates keeps the
# first row per img_id, which is the row the per-file `.iloc[0]` lookup picked.
feature_cols = ["head size", "body size", "tail size", "texture"]
features_by_img = df.drop_duplicates("img_id").set_index("img_id")[feature_cols]

for split in ["train", "valid", "test"]:
    # Presumably a 2-D array whose first column holds file paths — TODO confirm
    # against utils.dataset_filenames.
    files = utils.dataset_filenames(os.path.join(data_dir, split))
    # basename instead of splitting on "/" so other path separators also work.
    files = [os.path.basename(x) for x in files[:, 0]]
    # One row of features per file, in the order the files were listed.
    features = features_by_img.loc[files].to_numpy()
    # Context manager so the pickle file is flushed and closed.
    with open(f"{data_dir}/{split}_features.pkl", "wb") as pkl_file:
        pickle.dump(features, pkl_file)