# WHOLE PIPELINE

1. Feature vector 들을 받아온다.
***
2. Feature vector 들을 Normalize 한다.  
    + -1 ~ 1 사이의 값으로 Normalize
    + Normalize factor를 같이 저장한다.
***
2. Feature vector 들을 Clustering 한다.  
    + K-MEANS 알고리즘을 이용해서 군집화
***
3. Clustering 된 Centroids 를 이용해서 Training Dataset 을 만든다.  
    + Centroids의 Index들을 데이터의 Training Label 로써 이용한다.  
    + 이때 만들어진 데이터를 csv format을 이용해서 저장한다.  
***
4. Training Data를 필요에 따라서 Training Data와 Validation Data 로 구분한다.
    + 현재 별도의 Validation은 진행하지 않을 예정
***
5. Trainer 를 통해서 Classifying Model을 학습시킨다.  
    + Trainer 내에서는 Training data를 불러오고 카테고리 개수만큼의 SVM을 학습시킨다.  
    + 이때 Trainer 는 SVM 파라미터를 입력으로 받는다.
    + https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
    + Train Accuracy 를 파악한다.  
    + 모델은 필요한 곳에 저장하도록 한다.
***
6. Predict 는 임의의 Feature에 대해서 해당 카테고리를 return 하도록 한다.
    + Predict를 통해서 해당 centroid 의 값들을 통해서 어떤 특징을 가지고 있는지도 파악하도록 한다
    
***


In [2]:
import csv
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import scipy
import scv  # NOTE(review): probably a typo for `csv` (no use of `scv` is visible) -- confirm, then remove
from sklearn.svm import SVC

In [None]:
def normalizer(des):
    """
    Rescale every feature column linearly so that its values span -1 to 1.

    Each column is shifted by the midpoint of its minimum and maximum and
    then divided by half of its range, so the column minimum maps to -1 and
    the column maximum maps to 1. A column whose values are all equal has
    zero range; it is mapped to 0 instead of dividing by zero.

    :param des: Features to normalize, one row per sample
    :type des: numpy.ndarray. shape: [num_images, dim_feature]
    :return: Normalized features
    :rtype: numpy.ndarray. shape: [num_images, dim_feature]
    """
    des = np.asarray(des, dtype=float)

    # Statistics are taken from the argument itself, per column (axis 0).
    col_max = np.max(des, axis=0)
    col_min = np.min(des, axis=0)
    center = (col_max + col_min) / 2
    half_range = (col_max - col_min) / 2

    # Constant columns have zero range; divide by 1 so they become 0, not NaN.
    half_range = np.where(half_range == 0, 1.0, half_range)

    return (des - center) / half_range

def get_cluster(des, k, thres):
    """
    Cluster the given features with a hand-written K-means loop.

    The features are first passed through ``normalizer``, so the returned
    centroids live in the normalized feature space. The global NumPy random
    generator is seeded with 0, which makes the initial centroids (and any
    re-initialised empty cluster) reproducible between runs.

    Iteration stops once the norm of the change of the whole centroid matrix
    between two consecutive iterations falls below ``thres``.

    :param des: length f features from n unique actors
    :type des: numpy.ndarray. shape: [num_images, dim_feature]
    :param k: number of clusters
    :type k: int
    :param thres: convergence threshold on the centroid movement
    :type thres: float
    :return: k centroids with f features
    :rtype: numpy.ndarray. shape: [num_centroids, dim_feature]
    """

    des = normalizer(des)

    # Fixed seed so repeated runs start from the same initial centroids.
    np.random.seed(0)

    f = np.shape(des)[1]

    centroids = np.array([np.random.rand(f) for _ in range(k)])

    while True:
        prev_centroids = centroids
        centroids = np.zeros((k, f))
        data_length = [0] * k

        # Assignment step: add each feature to the running sum of its
        # nearest previous centroid (the first one wins on ties).
        for feat in des:
            idx = np.argmin(np.linalg.norm(prev_centroids - feat, axis=1))
            data_length[idx] += 1
            centroids[idx] += feat

        # Update step: the mean of the assigned features becomes the new
        # centroid; a cluster that received no feature is re-drawn at random.
        for i, count in enumerate(data_length):
            if count != 0:
                centroids[i] = centroids[i] / count
            else:
                centroids[i] = np.random.rand(f)

        # The convergence test must run once per iteration, inside the loop.
        if np.linalg.norm(centroids - prev_centroids) < thres:
            break

    return centroids

def make_train_set(des, cent):
    """
    Make the training dataset from centroids and descriptors.

    Every descriptor is labelled with the index of its nearest centroid
    (Euclidean distance). The result is written to
    ``train_data/train_set.csv``, one row per descriptor: the feature values
    followed by the label as the last column. The ``train_data`` directory is
    created when missing and an existing file is overwritten.

    :param des: length f features from n unique actors
    :type des: numpy.ndarray. shape: [num_images, dim_feature]
    :param cent: list of k centroids
    :type cent: numpy.ndarray. shape: [num_centroids, dim_feature]
    :return: None
    :rtype: None
    """

    labels = [np.argmin(np.linalg.norm(cent - feat, axis=1)) for feat in des]

    os.makedirs("train_data", exist_ok=True)

    # The context manager closes (and thereby flushes) the file even when a
    # row fails to serialise.
    with open('train_data/train_set.csv', 'w', newline='') as f:
        wr = csv.writer(f)
        for feat, label in zip(des, labels):
            wr.writerow(list(feat) + [label])

    return None

def load_train_data(data_dir):
    """
    Load the training dataset from a CSV file.

    The file is expected to have the layout written by ``make_train_set``:
    one sample per row, the feature values first and the integer label in the
    last column.

    :param data_dir: path of the CSV file holding the training data
    :type data_dir: string
    :return: train features and train labels
    :rtype: Tuple of two numpy.ndarray
    features. shape: [num_images, dim_feature]
    labels (int). shape: [num_images, 1]
    """

    # ndmin=2 keeps a single-row file two-dimensional.
    data = np.loadtxt(data_dir, delimiter=',', ndmin=2)

    features = data[:, :-1]
    # Labels are stored as text by the csv writer; restore them as integers
    # in the documented column-vector shape.
    labels = data[:, -1].astype(int).reshape(-1, 1)

    return features, labels

def train_classifier(features, labels, svm_params):
    """
    Fit a single SVM classifier on the given samples.

    Only the ``'kernel'`` and ``'C'`` entries of ``svm_params`` are read; the
    classifier is always created with ``random_state=0``.

    :param features: Features to train
    :type features: numpy.ndarray. shape: [num_images, dim_feature]
    :param labels: Labels to train
    :type labels: numpy.ndarray. shape: [num_images, 1]
    :param svm_params: SVM parameters
    :type svm_params: dict
    ['C'](float): Regularization parameter
    ['kernel'](str): Kernel specification
    :return: Trained classifier
    :rtype: sklearn.svm.SVC
    """

    kernel = svm_params['kernel']
    regularization = svm_params['C']

    classifier = SVC(kernel=kernel, C=regularization, random_state=0)
    classifier.fit(features, labels)
    return classifier

def Trainer(feat_param, svm_param):
    """
    Train one SVM per category (one-vs-rest) on the stored training set.

    The training data is loaded from ``train_data/train_set.csv``. For every
    distinct label a binary classifier is trained that separates that label
    (target 1) from all the others (target 0). The train accuracy of each
    classifier and their average are printed. The ``model`` directory is
    created when missing, but the models are only returned, not written to
    disk.

    NOTE(review): with a single distinct label the binary target has only one
    class, which scikit-learn's SVC is expected to reject -- confirm the
    training set always holds at least two categories.

    :param feat_param: currently unused; kept for interface compatibility
    :type feat_param: TBD
    :param svm_param: SVM parameters
    :type svm_param: dict
    ['C'](float): Regularization parameter
    ['kernel'](str): Kernel specification
    :return: Trained classifiers, keyed by category label
    :rtype: dict of sklearn.svm.SVC
    """

    if not os.path.isdir("model"):
        os.mkdir("model")

    train_features, train_labels = load_train_data("train_data/train_set.csv")
    # Accept both a flat label vector and a [num_images, 1] column.
    train_labels = np.asarray(train_labels).ravel()
    category = np.unique(train_labels)

    print('Train the classifiers...')
    accuracy = 0
    models = {}

    for class_name in category:
        # One-vs-rest target: 1 for samples of this category, 0 otherwise.
        target_labels = (train_labels == class_name).astype(int)

        models[class_name] = train_classifier(train_features, target_labels, svm_param)
        train_accuracy = models[class_name].score(train_features, target_labels)
        print('{} Classifier train accuracy:  {:.4f}'.format(class_name, train_accuracy))
        accuracy += train_accuracy

    print('Average train accuracy: {:.4f}'.format(accuracy / len(category)))

    return models

def Predict(feat, models, cent):
    """
    Placeholder for label prediction with the trained SVM classifiers.

    Not implemented yet: the arguments are ignored and None is returned.

    :param feat: Feature to classify
    :type feat: TBD
    :param models: Trained model
    :type models: sklearn.svm.SVC
    :param cent: Centroids
    :type cent: numpy.ndarray. shape: [num_centroids, dim_feature]
    :return: Corresponding centroids with label (currently always None)
    :rtype: TBD
    """
    # TODO: implement the prediction and fix the return type.
    return None

In [16]:
import numpy as np

# Scratch cell: build a small 2x2 array and inspect it.
a = np.array([[1, 1], [2, 2]])

# Bare expressions; only meaningful when evaluated interactively.
type(a)

np.shape(a)

# Rebind `a` to a 3x3 sample matrix; the next cell reads this value.
a = np.array([[1,2, 3], [4, 1, 0], [-5, 2, -3]])

In [17]:
# Scratch cell: walk through the -1..1 normalization on the sample matrix `a`
# defined in the previous cell, one intermediate value at a time.
col_max = np.max(a, axis=0)
col_min = np.min(a, axis=0)
half_range = (col_max - col_min)/2
midpoint = (col_max + col_min)/2

print(col_max)
print(col_min)
print(half_range)
print(midpoint)
# Kept as the last expression so the notebook displays the normalized matrix.
(a - midpoint) / half_range

[4 2 3]
[-5  1 -3]
[4.5 0.5 3. ]
[-0.5  1.5  0. ]


array([[ 0.33333333,  1.        ,  1.        ],
       [ 1.        , -1.        ,  0.        ],
       [-1.        ,  1.        , -1.        ]])

In [26]:
# Scratch cell: find which row of `a` is closest to the vector `b`.
a = np.array([
    [1, 2, 3, 4, 5, 6, 7],
    [4, 5, 6, 8, 1, 4, 2],
    [1, 8, 4, 2, 5, 3, 2],
])
b = np.array([1, 2, 3, 4, 6, 6, 7])

# `b` is broadcast against every row of `a`.
print(b - a)

# Euclidean distance from each row of `a` to `b`.
row_distances = np.linalg.norm(a - b, axis=1)
print(row_distances)

# Index of the nearest row.
print(np.argmin(row_distances))

[[ 0  0  0  0  1  0  0]
 [-3 -3 -3 -4  5  2  5]
 [ 0 -6 -1  2  1  3  5]]
[1.         9.8488578  8.71779789]
0


In [36]:
import csv

c = np.array([1, 2, 3])

# Scratch cell: write one row (the first row of `a` from an earlier cell,
# followed by one value of `c`) to check the CSV row layout.
# The context manager closes the file even if writing the row fails.
with open('write.csv', 'w', newline='') as f:
    wr = csv.writer(f)
    wr.writerow(list(a[0]) + [c[0]])