In [20]:
import csv
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import time
import os
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder
from torch import nn
from torch.nn import functional as F
from IPython import display
from enum import Enum

In [2]:
BASE_PATH = './'
HEADER = 'glcm_bytes_'
CSV_SUFFIX = '.csv'
LABEL_FILENAME = 'label.csv'
PROPERTY_ARRAY_SIZE = 256

Properties = Enum('Property', ('contrast', 'homogeneity', 'correlation', 'dissimilarity', 'energy', 'entropy'))

In [3]:
def getPropertyCsvPath(property):
    return BASE_PATH + HEADER + property + CSV_SUFFIX

def handleStrListOutliers(strList):
    if len(strList) != PROPERTY_ARRAY_SIZE:
        strList += ['nan' for i in range(PROPERTY_ARRAY_SIZE - len(strList))]
    for i in range(len(strList)):
        if strList[i] == 'nan': strList[i] = "0"
    return strList

def fromStrListToFloatList(strList):
    return [round(float(i), 4) for i in strList]

def getPropertyStrList(item):
    return handleStrListOutliers(item[2].split(' '))

def regularizeList(list):
    min, max = 0.0, 0.0
    for item in list:
        for i in item:
            if i < min: min = i
            if i > max: max = i
    for i in range(len(list)):
        for j in range(len(list[0])):
            list[i][j] = round((list[i][j] - min) / (max - min), 4) * 100
    return list

def getPropertyList(property):
    path = getPropertyCsvPath(property)
    array = np.array(pd.read_csv(path))
    propertyList = []
    for item in array:
        propertyList.append(fromStrListToFloatList(getPropertyStrList(item)))
    return regularizeList(propertyList)

In [6]:
def combinePropertyLists():
    combinedList = []
    for property, _ in Properties.__members__.items():
        combinedList.append(getPropertyList(property))
    combinedArray = np.array(combinedList)
    return combinedArray

In [10]:
def getLabelCsvPath():
    return BASE_PATH + LABEL_FILENAME

def getLabelArray():
    csv_reader = pd.read_csv(getLabelCsvPath())
    return np.array(csv_reader)

In [15]:
def combineLabelAndProperties(labelArray, combinedPropertyArray):
    print("la: ", labelArray.shape)
    print("ca: ", combinedPropertyArray.shape)
    combinedList = []
    for i in range(len(labelArray)):
        tempList = []
        tempList.append(labelArray[i][0])
        tempList.append(labelArray[i][1])
        tempList.append(combinedPropertyArray[:, i, :])
        combinedList.append(tempList)
    return np.array(combinedList)

def polymerToArray():
    combinedPropertyArray = combinePropertyLists()
    labelArray = getLabelArray()
    return combineLabelAndProperties(labelArray, combinedPropertyArray)

finalArray = polymerToArray()
print(finalArray.shape)

la:  (10868, 2)
ca:  (6, 10868, 256)
(10868, 3)


  return np.array(combinedList)


In [30]:
print(finalArray[0][-1].shape)
print(type(finalArray[1][-1]))
# print(finalArray[1][-1].tolist())

(6, 256)
<class 'numpy.ndarray'>


In [28]:
class MulDataset(Dataset):
    def __init__(self, mulArray):
        self.mulArray = mulArray
        self.samples = []
        self.label_codec = LabelEncoder()
        self._init_dataset()
    
    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        feature, label = self.samples[idx]
        return self.one_hot_sample(feature, int(label))

    def _init_dataset(self):
        for item in self.mulArray:
            list = item[2].tolist()
            label = item[1]
            self.samples.append((list, label))

    def to_one_hot(self, codec, label):
        value_idxs = codec.transform(label)
        return torch.eye(len(codec.classes_))[value_idxs]

    def one_hot_sample(self, feature, label):
        t_feature = torch.tensor(feature, dtype=torch.float).view(6, 16, 16)
        t_label= torch.tensor(label-1, dtype=torch.long)
        return t_feature, t_label

In [29]:
batch_size, lr, num_epochs = 32, 0.1, 200
device = torch.device('cpu')
dataset = MulDataset(finalArray)
train_size = int(len(dataset) * 0.7)
test_size = len(dataset) - train_size
train_set, test_set = torch.utils.data.random_split(dataset, [train_size, test_size])
train_iter = iter(DataLoader(train_set, batch_size=batch_size, shuffle=True))
test_iter = iter(DataLoader(test_set, batch_size=batch_size, shuffle=True))