# Loading the relevant libraries

In [1]:
import os
import pandas as pd 
import numpy as np
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import Counter
from torchvision import transforms, utils, datasets
from torch.utils.data import Dataset, DataLoader, random_split, SubsetRandomSampler, WeightedRandomSampler

# Open the file and extract the labels of the imagenet dataset

In [2]:
imagenet_label_list = pd.read_csv(r"/home/junzhin/Project/Summer_project/code/version1.0/imagenet_labels_set.txt", sep = " ",header = None)

In [3]:
imagenet_label_list_dict = dict(zip(imagenet_label_list[2].apply(lambda x: x.lower()), imagenet_label_list[0]))


# Checking the directory of the imagenet and make sure the existence and presence of the labels

In [4]:
your_path =r"/data/gpfs/datasets/Imagenet/" 
file_dir = os.listdir(your_path)

In [5]:
train_filenames = os.listdir(your_path + file_dir[1])
valid_filenames = os.listdir(your_path + file_dir[2])
print(len(train_filenames))
print(len(valid_filenames))

1000
1000


In [6]:
print(set(train_filenames) == set(valid_filenames))

True


In [7]:
imagenet_training_filePath = os.path.join(your_path, file_dir[1])
imagenet_training_filePath = os.path.join(your_path, file_dir[2])


In [8]:

image_net_train_dataset = datasets.ImageFolder(
        imagenet_training_filePath)


# Open the office31 dataset and extract the labels 

In [9]:
office31_path =r"/home/junzhin/Project/datasets/domain_adaptation_images/"
office31_dir = os.listdir(office31_path)
amazon_file_path = os.path.join(office31_path, office31_dir[0]+ "/images/")
dslr_file_path = os.path.join(office31_path, office31_dir[1]+ "/images/")
webcam_file_path = os.path.join(office31_path, office31_dir[2]+ "/images/")
print(office31_dir)

['amazon', 'dslr', 'webcam']


## Checking if three domains have the same set of class labels

In [10]:
amazon =  os.listdir(amazon_file_path)
dslr =  os.listdir(dslr_file_path)
webcam =  os.listdir(webcam_file_path)
 

In [11]:
print(set(amazon) == set(dslr) == set(webcam))

True


## Loading one of the domains of the class labels for further manipulation

In [12]:
office_31_train_dataset = datasets.ImageFolder(
        amazon_file_path)
office_31_train_dataset



Dataset ImageFolder
    Number of datapoints: 2817
    Root location: /home/junzhin/Project/datasets/domain_adaptation_images/amazon/images/

In [13]:
print(dict(Counter(office_31_train_dataset.targets)))
office_31_training_labels = office_31_train_dataset.classes
print(office_31_train_dataset.classes[:20])

{0: 92, 1: 82, 2: 72, 3: 82, 4: 36, 5: 94, 6: 91, 7: 97, 8: 97, 9: 81, 10: 99, 11: 100, 12: 100, 13: 98, 14: 100, 15: 99, 16: 100, 17: 94, 18: 96, 19: 95, 20: 93, 21: 100, 22: 98, 23: 98, 24: 90, 25: 75, 26: 100, 27: 99, 28: 99, 29: 96, 30: 64}
['back_pack', 'bike', 'bike_helmet', 'bookcase', 'bottle', 'calculator', 'desk_chair', 'desk_lamp', 'desktop_computer', 'file_cabinet', 'headphones', 'keyboard', 'laptop_computer', 'letter_tray', 'mobile_phone', 'monitor', 'mouse', 'mug', 'paper_notebook', 'pen']


In [14]:
selected_index = [ office_31_train_dataset.class_to_idx[i] for i in office_31_train_dataset.classes[:20]]
print(selected_index)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


In [15]:
trainset_1 = torch.utils.data.Subset(office_31_train_dataset, selected_index)
train_loader = torch.utils.data.DataLoader(trainset_1, batch_size=1, shuffle= False, num_workers=1, pin_memory=True,sampler=None)

## Checking the duplicate classes between the imagenet of 1000 classes and office31

In [16]:
office_31_training_labels = [word.lower().replace("_", " ") for word in office_31_training_labels]

In [17]:
imagenet_label_list = [word.lower().replace("_", " ") for word in imagenet_label_list[2]]

### Method 1: Exact string match between two datasets classes

In [18]:
masked_list1 = set(office_31_training_labels).intersection(set(imagenet_label_list))

### Method 2 Similarity string matching between two datasets classes: To remove any possible similar strings in both set of classes

In [19]:
import textdistance

In [20]:
def find_similars_dict(source_labels, target_labels, threshold = 1.0):
    similarity_dict =  {}
    threshold = 1.0
    for x in source_labels:
        similarity_dict[x] = []
        for y in target_labels:
            similarity_score = textdistance.smith_waterman.normalized_similarity(x,y)
            if similarity_score >= threshold:
                similarity_dict[x].append([y,similarity_score])
    for each_word_list in similarity_dict:
        similarity_dict[each_word_list].sort(key=lambda x: x[1], reverse = True)
    # levenshtein damerau_levenshtein gotoh smith_waterman
    return similarity_dict

In [21]:
similarity_dict = find_similars_dict(office_31_training_labels,imagenet_label_list)

In [22]:
print(similarity_dict)

{'back pack': [], 'bike': [['mountain bike', 1.0]], 'bike helmet': [], 'bookcase': [['bookcase', 1.0]], 'bottle': [['beer bottle', 1.0], ['pop bottle', 1.0], ['wine bottle', 1.0], ['pill bottle', 1.0], ['water bottle', 1.0]], 'calculator': [], 'desk chair': [], 'desk lamp': [], 'desktop computer': [['desktop computer', 1.0]], 'file cabinet': [], 'headphones': [], 'keyboard': [['computer keyboard', 1.0], ['typewriter keyboard', 1.0]], 'laptop computer': [], 'letter tray': [['tray', 1.0]], 'mobile phone': [], 'monitor': [['monitor', 1.0]], 'mouse': [['mouse', 1.0]], 'mug': [['coffee mug', 1.0]], 'paper notebook': [['notebook', 1.0]], 'pen': [['fountain pen', 1.0]], 'phone': [['microphone', 1.0], ['pay-phone', 1.0], ['cellular telephone', 1.0], ['dial telephone', 1.0]], 'printer': [['printer', 1.0]], 'projector': [['projector', 1.0]], 'punchers': [], 'ring binder': [['binder', 1.0]], 'ruler': [], 'scissors': [], 'speaker': [['loudspeaker', 1.0]], 'stapler': [], 'tape dispenser': [], 'tras

In [23]:
masked_list2 = set()
for key in similarity_dict:
    for values in similarity_dict[key]:
        masked_list2.add(values[0])
print(masked_list2)

{'pop bottle', 'beer bottle', 'pill bottle', 'coffee mug', 'wine bottle', 'printer', 'notebook', 'bookcase', 'desktop computer', 'cellular telephone', 'monitor', 'microphone', 'projector', 'mountain bike', 'mouse', 'dial telephone', 'water bottle', 'typewriter keyboard', 'fountain pen', 'loudspeaker', 'tray', 'computer keyboard', 'pay-phone', 'binder'}


## Save them into files:

In [24]:
masked_list1 = [x.replace(" ", "_") for x in masked_list1]
masked_list2 = [x.replace(" ", "_") for x in masked_list2]
print(masked_list1)
print(masked_list2)

['printer', 'bookcase', 'desktop_computer', 'monitor', 'projector', 'mouse']
['pop_bottle', 'beer_bottle', 'pill_bottle', 'coffee_mug', 'wine_bottle', 'printer', 'notebook', 'bookcase', 'desktop_computer', 'cellular_telephone', 'monitor', 'microphone', 'projector', 'mountain_bike', 'mouse', 'dial_telephone', 'water_bottle', 'typewriter_keyboard', 'fountain_pen', 'loudspeaker', 'tray', 'computer_keyboard', 'pay-phone', 'binder']


In [25]:
masked_list1_df = pd.DataFrame([(imagenet_label_list_dict[i], i) for i in masked_list1])
masked_list2_df = pd.DataFrame([(imagenet_label_list_dict[i], i) for i in masked_list2])
masked_list1_df.to_csv('masked_office31_imagenetlabel1_df.csv',index = False, header = False)
masked_list2_df.to_csv('masked_office31_imagenetlabel2_df.csv',index = False, header = False)


# Open the officehome dataset and extract the labels 

In [26]:
officehome_path =r"/home/junzhin/Project/datasets/OfficeHomeDataset_10072016"
officehome_dir = os.listdir(officehome_path)
art_file_path = os.path.join(officehome_path, officehome_dir[0])
clipart_file_path = os.path.join(officehome_path, officehome_dir[1])
product_file_path = os.path.join(officehome_path, officehome_dir[4])
world_file_path = os.path.join(officehome_path, officehome_dir[5])

In [27]:
print(officehome_path)
print(officehome_dir)

/home/junzhin/Project/datasets/OfficeHomeDataset_10072016
['Art', 'Clipart', 'ImageInfo.csv', 'imagelist.txt', 'Product', 'Real World']


In [28]:
art =  os.listdir(art_file_path)
clipart =  os.listdir(clipart_file_path)
product =  os.listdir(product_file_path)
world =  os.listdir(world_file_path)
print(set(art) == set(clipart) == set(product) == set(world))
print(len(set(art)))

True
65


## Choosing one of the domains for labels extractions:


In [29]:
office_home_train_dataset = datasets.ImageFolder(art_file_path)
office_home_train_dataset.classes
office_home_label_list = [i.lower().replace("_", " ") for i in office_home_train_dataset.classes]

In [30]:
masked_list1 = set(office_home_label_list).intersection(set(imagenet_label_list))
masked_list1

{'backpack',
 'bucket',
 'hammer',
 'laptop',
 'monitor',
 'mouse',
 'notebook',
 'printer',
 'radio',
 'refrigerator',
 'screwdriver'}

In [31]:
similarity_dict = find_similars_dict(office_home_label_list, imagenet_label_list)

In [32]:
print(similarity_dict)


{'alarm clock': [], 'backpack': [['backpack', 1.0]], 'batteries': [], 'bed': [], 'bike': [['mountain bike', 1.0]], 'bottle': [['beer bottle', 1.0], ['pop bottle', 1.0], ['wine bottle', 1.0], ['pill bottle', 1.0], ['water bottle', 1.0]], 'bucket': [['bucket', 1.0]], 'calculator': [], 'calendar': [], 'candles': [], 'chair': [['barber chair', 1.0], ['folding chair', 1.0], ['rocking chair', 1.0]], 'clipboards': [], 'computer': [['desktop computer', 1.0], ['hand-held computer', 1.0]], 'couch': [['studio couch', 1.0]], 'curtains': [], 'desk lamp': [], 'drill': [['power drill', 1.0]], 'eraser': [['rubber eraser', 1.0]], 'exit sign': [], 'fan': [['electric fan', 1.0]], 'file cabinet': [], 'flipflops': [], 'flowers': [], 'folder': [], 'fork': [], 'glasses': [['sunglasses', 1.0]], 'hammer': [['hammer', 1.0]], 'helmet': [['crash helmet', 1.0], ['football helmet', 1.0]], 'kettle': [], 'keyboard': [['computer keyboard', 1.0], ['typewriter keyboard', 1.0]], 'knives': [], 'lamp shade': [], 'laptop': 

In [33]:
masked_list2 = set()
for key in similarity_dict:
    for values in similarity_dict[key]:
        masked_list2.add(values[0])
print(masked_list2)

{'backpack', 'crash helmet', 'pool table', 'dutch oven', 'pop bottle', 'screwdriver', 'beer bottle', 'pill bottle', 'refrigerator', 'coffee mug', 'bucket', 'wine bottle', 'printer', 'laptop', 'notebook', 'desktop computer', 'power drill', 'cellular telephone', 'wooden spoon', 'monitor', 'barber chair', 'mountain bike', 'sunglasses', 'mouse', 'radio', 'frying pan', 'dining table', 'dial telephone', 'water bottle', 'folding chair', 'typewriter keyboard', 'fountain pen', 'rubber eraser', 'hammer', 'rocking chair', 'loudspeaker', 'studio couch', 'football helmet', 'hand-held computer', 'electric fan', 'computer keyboard'}


## Save them into files

In [34]:
masked_list1 = [x.replace(" ", "_") for x in masked_list1]
masked_list2 = [x.replace(" ", "_") for x in masked_list2]
print(masked_list1)
print(masked_list2)

['backpack', 'printer', 'laptop', 'notebook', 'screwdriver', 'refrigerator', 'monitor', 'hammer', 'bucket', 'radio', 'mouse']
['backpack', 'crash_helmet', 'pool_table', 'dutch_oven', 'pop_bottle', 'screwdriver', 'beer_bottle', 'pill_bottle', 'refrigerator', 'coffee_mug', 'bucket', 'wine_bottle', 'printer', 'laptop', 'notebook', 'desktop_computer', 'power_drill', 'cellular_telephone', 'wooden_spoon', 'monitor', 'barber_chair', 'mountain_bike', 'sunglasses', 'mouse', 'radio', 'frying_pan', 'dining_table', 'dial_telephone', 'water_bottle', 'folding_chair', 'typewriter_keyboard', 'fountain_pen', 'rubber_eraser', 'hammer', 'rocking_chair', 'loudspeaker', 'studio_couch', 'football_helmet', 'hand-held_computer', 'electric_fan', 'computer_keyboard']


In [35]:
masked_list1_df = pd.DataFrame([(imagenet_label_list_dict[i], i) for i in masked_list1])
masked_list2_df = pd.DataFrame([(imagenet_label_list_dict[i], i) for i in masked_list2])
masked_list1_df.to_csv('masked_officehome_imagenetlabel1_df.csv',index = False, header = False)
masked_list2_df.to_csv('masked_officehome_imagenetlabel2_df.csv',index = False, header = False)

## Experiment of choosing the subset of the dataset

In [92]:
image_net_train_dataset = datasets.ImageFolder(
        imagenet_training_filePath,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
        ])
        )
image_net_train_dataset

Dataset ImageFolder
    Number of datapoints: 49997
    Root location: /data/gpfs/datasets/Imagenet/val_blurred
    StandardTransform
Transform: Compose(
               RandomResizedCrop(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bilinear)
               RandomHorizontalFlip(p=0.5)
               ToTensor()
           )

In [93]:
masked_label_list_office31 = pd.read_csv(r"/home/junzhin/Project/Summer_project/code/version1.0/masked_office31_imagenetlabel2_df.csv", header = None)
masked_label_list_officehome = pd.read_csv(r"/home/junzhin/Project/Summer_project/code/version1.0/masked_officehome_imagenetlabel2_df.csv", header = None)

In [94]:
chosen_index_office31 = [image_net_train_dataset.class_to_idx[i] for i in image_net_train_dataset.classes if i not in list(masked_label_list_office31[0])]
print(len(chosen_index_office31))
chosen_index_officehome = [image_net_train_dataset.class_to_idx[i]  for i in image_net_train_dataset.classes if i not in list(masked_label_list_officehome[0])]
print(len(chosen_index_officehome))

976
959


In [95]:
masked_image_net_train_dataset_office31 = torch.utils.data.Subset(image_net_train_dataset, chosen_index_office31)
masked_image_net_train_dataset_officehome = torch.utils.data.Subset(image_net_train_dataset, chosen_index_officehome)

In [96]:
train_loader = torch.utils.data.DataLoader(masked_image_net_train_dataset_office31, batch_size=1, shuffle=False, pin_memory=True)
train_loader

<torch.utils.data.dataloader.DataLoader at 0x2afda8001a10>

In [101]:
for i, (images, target) in enumerate(train_loader):
    print(i)
    print("-"*100)
    print(images)
    print("-"*100)
    print(target)

    if i > 2000:
        break

0
----------------------------------------------------------------------------------------------------
tensor([[[[0.7333, 0.7765, 0.6941,  ..., 0.6667, 0.5412, 0.5176],
          [0.6863, 0.8314, 0.8000,  ..., 0.6745, 0.5608, 0.5255],
          [0.7020, 0.7529, 0.8549,  ..., 0.4902, 0.5490, 0.5569],
          ...,
          [0.6706, 0.6039, 0.5020,  ..., 0.4745, 0.3843, 0.3882],
          [0.6314, 0.6745, 0.6000,  ..., 0.3882, 0.3412, 0.3412],
          [0.5059, 0.5804, 0.6275,  ..., 0.4980, 0.4235, 0.3882]],

         [[0.7569, 0.8000, 0.7216,  ..., 0.5961, 0.4863, 0.4745],
          [0.7098, 0.8549, 0.8353,  ..., 0.5412, 0.4667, 0.4824],
          [0.7255, 0.7765, 0.8824,  ..., 0.4275, 0.5020, 0.5216],
          ...,
          [0.7490, 0.7176, 0.5647,  ..., 0.5137, 0.4039, 0.3882],
          [0.6941, 0.7804, 0.7294,  ..., 0.4157, 0.3882, 0.3961],
          [0.5529, 0.6627, 0.7451,  ..., 0.5412, 0.4902, 0.4627]],

         [[0.7451, 0.7882, 0.6941,  ..., 0.4314, 0.2941, 0.2902],
     

KeyboardInterrupt: 