# **IMPORT LIBRARIES**


In [None]:
# Mount Google Drive at /content/drive so the dataset and output folders used
# below are reachable; force_remount=True re-mounts even if already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim
from torchvision.models import resnet18, resnet34, googlenet
import time
from torch.autograd import Variable
from PIL import Image
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import cv2
import time
from torch.autograd import Variable

In [None]:
# True when PyTorch can see a CUDA device; later cells use this flag to decide
# whether the model is moved to the GPU.
use_gpu = torch.cuda.is_available()

# **DATA PREPARATION**

In [None]:
# Root folders on the mounted Drive: raw dataset inputs and generated outputs.
data_path = '/content/drive/MyDrive/aiforindonesia/project_1/Dataset'
project_path = '/content/drive/MyDrive/aiforindonesia/project_1/output'

# Locations derived from the dataset root.
attribute_path = f'{data_path}/list_attribute.txt'
image_list_path = f'{data_path}/Images'

In [None]:
# Show what is available in the dataset folder.
os.listdir(data_path)

['gender_classification.csv',
 'class_identity.txt',
 'list_attribute.txt',
 'gender_classification.xlsx',
 'model_saved',
 'Images']

In [None]:
# Show what is available in the output folder (the cleaned attribute file is read from here).
os.listdir(project_path)

['cleaned_list_attribute.txt']

In [None]:
# Collect the image file names, skipping any name that contains '('
# (presumably duplicate copies such as "name (1).jpg" -- TODO confirm).
images = [name for name in os.listdir(image_list_path) if '(' not in name]
print('Eg: ', images[:2])
print('Count:', len(images))

Eg:  ['088686.jpg', '030742.jpg']
Count: 5000


In [None]:
# Load the cleaned attribute table; it is whitespace-separated and indexed by
# image file name. A raw string is required for the regex separator: '\s' in a
# plain string literal is an invalid escape sequence and triggers a warning.
attributes = pd.read_csv(project_path + '/cleaned_list_attribute.txt', sep=r'\s+')

# Keep only the rows whose index label is one of the selected image files,
# recode -1 as 0 so the label is binary, and keep just the 'Male' column
# (1 = male, 0 = female) as a Series.
data = attributes.filter(items=images, axis=0).replace(-1, 0)['Male']

In [None]:
# Sanity check: one non-null integer label is expected per selected image.
data.info()

<class 'pandas.core.series.Series'>
Index: 5000 entries, 088686.jpg to 041283.jpg
Series name: Male
Non-Null Count  Dtype
--------------  -----
5000 non-null   int64
dtypes: int64(1)
memory usage: 78.1+ KB


In [None]:
# Build a class-balanced training set: the first 1500 male and the first 1500
# female rows go to training, every remaining row of each class goes to test.
# No shuffling is applied, so the split follows the current row order of `data`.
data_male, data_female = data[data == 1], data[data == 0]

train_data = pd.concat([data_male.iloc[:1500], data_female.iloc[:1500]], axis=0)
test_data = pd.concat([data_male.iloc[1500:], data_female.iloc[1500:]], axis=0)

## **HELPERS**

In [None]:
def dataset_getitem(image_folder_path, data, idx):
  """Load one sample (RGB image + gender label) by positional index.

  Args:
    image_folder_path: Directory that contains the image files.
    data: pandas Series whose index holds image file names and whose values
      are the gender labels.
    idx: Zero-based position of the sample inside ``data``.

  Returns:
    dict with keys ``'image'`` (PIL image in RGB mode) and ``'gender'``
    (the label stored at position ``idx``).

  Raises:
    FileNotFoundError: If the image file does not exist on disk.
    ValueError: If the file exists but OpenCV cannot decode it.
  """
  image_name = data.index[idx]
  gender = data.iloc[idx]

  image_path = os.path.join(image_folder_path, image_name)
  # cv2.imread does not raise on a bad path -- it silently returns None, which
  # would only surface later as an obscure cvtColor error. Fail early instead.
  if not os.path.isfile(image_path):
    raise FileNotFoundError('Image file not found: {}'.format(image_path))

  image = cv2.imread(image_path)
  if image is None:
    raise ValueError('Could not decode image file: {}'.format(image_path))

  # OpenCV decodes to BGR channel order; convert to RGB before wrapping in PIL.
  image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
  image = Image.fromarray(image)

  return {
      'image': image,
      'gender': gender,
  }

# Smoke test: fetch the first training sample to check image loading and label lookup.
dataset_getitem(os.path.join(data_path, "Images"), train_data, 0)

{'image': <PIL.Image.Image image mode=RGB size=178x218>, 'gender': 1}

In [None]:
def get_report(model, test_dataloaders):
    """Run a binary classifier over a loader and collect labels, predictions and timing.

    The model is expected to return one logit per sample; a sample is predicted
    as class 1 when ``sigmoid(logit) > 0.5``.

    Args:
        model: Callable (normally an ``nn.Module``) mapping a batch of inputs
            to logits.
        test_dataloaders: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        Tuple ``(y_true, y_pred, inference_time)``: two 1-D float tensors (on
        the device used for inference) holding the ground-truth labels and the
        0/1 predictions, and the mean forward-pass time per batch in seconds
        (``0.0`` when the loader yields no batches).
    """
    # Run on the device the model lives on, so CPU-only machines work too.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Evaluate in eval mode (dropout / batch-norm statistics, and GoogLeNet's
    # auxiliary outputs) and restore the caller's mode afterwards.
    was_training = getattr(model, 'training', False)
    if hasattr(model, 'eval'):
        model.eval()

    total_time = 0.0
    num_batches = 0
    true_chunks, pred_chunks = [], []

    try:
        with torch.no_grad():  # no autograd graph is needed for inference
            for inputs, labels in test_dataloaders:
                inputs, labels = inputs.to(device), labels.to(device)

                # CUDA kernels run asynchronously; synchronise around the
                # forward pass so the wall-clock measurement is meaningful.
                if device.type == 'cuda':
                    torch.cuda.synchronize(device)
                start_time = time.time()
                outputs = model(inputs)
                if device.type == 'cuda':
                    torch.cuda.synchronize(device)
                total_time += time.time() - start_time
                num_batches += 1

                # reshape(-1) keeps a 1-D result even for a batch of one sample
                # (a bare squeeze() would produce a 0-d tensor that cannot be
                # concatenated).
                preds = (torch.sigmoid(outputs).reshape(-1) > 0.5).float()
                true_chunks.append(labels.float().reshape(-1))
                pred_chunks.append(preds)
    finally:
        if was_training:
            model.train()

    if num_batches == 0:
        empty = torch.empty(0, device=device)
        return empty, empty.clone(), 0.0

    y_true = torch.cat(true_chunks, -1)
    y_pred = torch.cat(pred_chunks, -1)
    inference_time = total_time / num_batches

    return y_true, y_pred, inference_time

# **DATA PRE-PROCESSING**

In [None]:
class GenderDataset(Dataset):
    """Map-style dataset yielding ``(image, gender_label)`` pairs.

    Args:
        data: Labels indexed by image file name; passed through unchanged to
            ``dataset_getitem``.
        image_folder_path: Directory that contains the image files.
        transform: Optional callable applied to the loaded PIL image. When it
            is ``None`` the untransformed image is returned.
    """

    def __init__(self, data, image_folder_path, transform=None):
        self.data = data
        self.image_folder_path = image_folder_path
        self.transform = transform

    def __len__(self):
        """Number of samples (one per entry in ``data``)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for position ``idx``; label is a long tensor."""
        result = dataset_getitem(self.image_folder_path, self.data, idx)
        # Start from the raw image so the name is always bound; previously a
        # dataset built without a transform raised UnboundLocalError here.
        image = result['image']
        if self.transform is not None:
            image = self.transform(image)
        return image, torch.tensor(result['gender'], dtype=torch.long)

In [None]:
# Channel-wise mean / std used for input normalisation (the standard ImageNet
# statistics commonly used with torchvision models).
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]


def _tensor_and_normalize():
    """Final steps shared by both pipelines: PIL image -> normalised tensor."""
    return [transforms.ToTensor(), transforms.Normalize(_NORM_MEAN, _NORM_STD)]


# Pre-processing pipelines keyed by split. Training adds random flip/rotation
# augmentation; both splits resize to 256 and centre-crop to 224.
transform = {
    'train': transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.Resize(256),
        transforms.RandomRotation(45),
        transforms.CenterCrop(224),
        *_tensor_and_normalize(),
    ]),
    'test': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        *_tensor_and_normalize(),
    ]),
}

In [None]:
# Both splits read their images from the same folder.
images_dir = os.path.join(data_path, "Images")

train_set = GenderDataset(train_data, image_folder_path=images_dir, transform=transform['train'])
test_set = GenderDataset(test_data, image_folder_path=images_dir, transform=transform['test'])

# Same batch size / worker count for both loaders; only training is shuffled.
loader_kwargs = dict(batch_size=32, num_workers=2)
train_loader = DataLoader(train_set, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_set, shuffle=False, **loader_kwargs)

print({ 'train': len(train_set), 'test': len(test_set) })

{'train': 3000, 'test': 2000}


In [None]:
# Per-phase lookup tables consumed by the training loop.
# NOTE: the variable is spelled `dataloders` (sic); later cells reference it
# under this exact name, so it is intentionally not renamed here.
dataloders = {"train":train_loader, "test":test_loader}
# Number of samples per phase, used to average the epoch metrics.
dataset_sizes = {"train":len(train_set), "test":len(test_set)}

In [None]:
# Binary cross-entropy computed directly on raw logits (the sigmoid is applied
# inside the loss), so the models below end in a single-logit output layer.
criterion = nn.BCEWithLogitsLoss()

## **MODEL GOOGLENET**

In [None]:
def train_model_gnet(model, dataloaders, dataset_sizes, criterion, optimizer, use_gpu=torch.cuda.is_available(), num_epochs=10):
    """Train a single-logit binary classifier and return it with its best weights.

    Each epoch runs a 'train' phase (with weight updates) followed by a 'test'
    phase (evaluation only). The weights that achieve the highest 'test'
    accuracy are restored into ``model`` before returning.

    NOTE(review): model selection uses the 'test' split, so the reported best
    accuracy is optimistic; a separate validation split would avoid this.

    Args:
        model: Network returning one logit per sample, either as a tensor or
            as an object with a ``.logits`` attribute (as GoogLeNet does in
            training mode when auxiliary classifiers are enabled).
        dataloaders: Mapping with keys ``'train'`` and ``'test'`` to iterables
            of ``(inputs, labels)`` batches.
        dataset_sizes: Mapping with the number of samples for each phase.
        criterion: Loss taking ``(logits, float_targets)``; the running loss
            assumes it returns the batch mean (the ``BCEWithLogitsLoss`` default).
        optimizer: Optimizer updating ``model``'s parameters.
        use_gpu: Move every batch to CUDA when true.
        num_epochs: Number of passes over the training data.

    Returns:
        ``model``, loaded with the best-performing weights.
    """
    def _snapshot_weights(module):
        # state_dict() hands back references to the live tensors, so they must
        # be cloned; otherwise the "best" weights silently follow training.
        return {key: value.detach().clone() for key, value in module.state_dict().items()}

    since = time.time()

    best_model_wts = _snapshot_weights(model)
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and an evaluation phase
        for phase in ['train', 'test']:
            is_train = phase == 'train'
            model.train(is_train)  # training mode for 'train', eval mode otherwise

            running_loss = 0.0
            running_corrects = 0

            # Iterate over the loaders passed in by the caller (not a global).
            for inputs, labels in dataloaders[phase]:
                if use_gpu:
                    inputs = inputs.cuda()
                    labels = labels.cuda()

                # zero the parameter gradients
                optimizer.zero_grad()

                # Track gradients only while training; evaluation needs none.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    # GoogLeNet returns a namedtuple with .logits in training
                    # mode and a plain tensor otherwise; accept both.
                    outputs = getattr(outputs, 'logits', outputs)
                    # reshape(-1) keeps a 1-D tensor even for a batch of one
                    # sample, so logits and targets always have equal shapes.
                    outputs = outputs.reshape(-1)
                    targets = labels.float().reshape(-1)
                    loss = criterion(outputs, targets)

                    # backward + optimize only if in training phase
                    if is_train:
                        loss.backward()
                        optimizer.step()

                # statistics
                preds = (torch.sigmoid(outputs) > 0.5).float()
                # The loss is a batch mean; weight it by the batch size so that
                # dividing by the dataset size yields a true per-sample average.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += (preds == targets).sum().item()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            # keep an independent copy of the best-performing weights
            if phase == 'test' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = _snapshot_weights(model)

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best test Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [None]:
# Build GoogLeNet with torchvision's default arguments (no pretrained weights
# are requested here) and replace its final fully connected layer with a
# single-output layer, i.e. one logit for the binary gender label.
model_gnet = googlenet()
num_ftrs = model_gnet.fc.in_features
model_gnet.fc = nn.Linear(num_ftrs, 1)
# Adam over all model parameters with a learning rate of 1e-4.
optimizer_gnet = optim.Adam(model_gnet.parameters(), lr=0.0001)

# Move the model to the GPU when one is available.
if use_gpu:
    model_gnet = model_gnet.cuda()



In [None]:
# Train GoogLeNet for 10 epochs; the returned model replaces `model_gnet`.
model_gnet = train_model_gnet(model_gnet, dataloders, dataset_sizes, criterion, optimizer_gnet, num_epochs=10)

Epoch 0/9
----------
train Loss: 0.0186 Acc: 0.6823
test Loss: 0.0136 Acc: 0.8000

Epoch 1/9
----------
train Loss: 0.0123 Acc: 0.8217
test Loss: 0.0089 Acc: 0.8755

Epoch 2/9
----------
train Loss: 0.0090 Acc: 0.8773
test Loss: 0.0087 Acc: 0.8860

Epoch 3/9
----------
train Loss: 0.0082 Acc: 0.8877
test Loss: 0.0092 Acc: 0.8795

Epoch 4/9
----------
train Loss: 0.0076 Acc: 0.8947
test Loss: 0.0106 Acc: 0.8480

Epoch 5/9
----------
train Loss: 0.0065 Acc: 0.9083
test Loss: 0.0138 Acc: 0.8495

Epoch 6/9
----------
train Loss: 0.0062 Acc: 0.9150
test Loss: 0.0091 Acc: 0.8940

Epoch 7/9
----------
train Loss: 0.0057 Acc: 0.9260
test Loss: 0.0046 Acc: 0.9385

Epoch 8/9
----------
train Loss: 0.0056 Acc: 0.9300
test Loss: 0.0092 Acc: 0.8885

Epoch 9/9
----------
train Loss: 0.0054 Acc: 0.9273
test Loss: 0.0047 Acc: 0.9410

Training complete in 9m 8s
Best test Acc: 0.941000
