# Loading the relevant libraries

In [13]:
import os
import pandas as pd 
import numpy as np
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import Counter
from torchvision import transforms, utils, datasets
from torch.utils.data import Dataset, DataLoader, random_split, SubsetRandomSampler, WeightedRandomSampler

# Open the file and extract the labels of the imagenet dataset

In [78]:
imagenet_label_list = pd.read_csv(r"/home/junzhin/Project/Summer_project/code/version1.0/imagenet_labels_set.txt", sep = " ",header = None)
len(imagenet_label_list)

1000

In [73]:
len(pd.unique(label_list[0]))/len(label_list) *100 

100.0

In [79]:
imagenet_label_list_dict = dict(zip(label_list[2], label_list[0]))
print(imagenet_label_list_dict)

{'kit_fox': 'n02119789', 'English_setter': 'n02100735', 'Siberian_husky': 'n02110185', 'Australian_terrier': 'n02096294', 'English_springer': 'n02102040', 'grey_whale': 'n02066245', 'lesser_panda': 'n02509815', 'Egyptian_cat': 'n02124075', 'ibex': 'n02417914', 'Persian_cat': 'n02123394', 'cougar': 'n02125311', 'gazelle': 'n02423022', 'porcupine': 'n02346627', 'sea_lion': 'n02077923', 'malamute': 'n02110063', 'badger': 'n02447366', 'Great_Dane': 'n02109047', 'Walker_hound': 'n02089867', 'Welsh_springer_spaniel': 'n02102177', 'whippet': 'n02091134', 'Scottish_deerhound': 'n02092002', 'killer_whale': 'n02071294', 'mink': 'n02442845', 'African_elephant': 'n02504458', 'Weimaraner': 'n02092339', 'soft-coated_wheaten_terrier': 'n02098105', 'Dandie_Dinmont': 'n02096437', 'red_wolf': 'n02114712', 'Old_English_sheepdog': 'n02105641', 'jaguar': 'n02128925', 'otterhound': 'n02091635', 'bloodhound': 'n02088466', 'Airedale': 'n02096051', 'hyena': 'n02117135', 'meerkat': 'n02138441', 'giant_schnauzer

# Checking the directory of the imagenet and make sure the existence and presence of the labels

In [5]:
your_path =r"/data/gpfs/datasets/Imagenet/" 
file_dir = os.listdir(your_path)

In [6]:
train_filenames = os.listdir(your_path + file_dir[1])
valid_filenames = os.listdir(your_path + file_dir[2])
print(len(train_filenames))
print(len(valid_filenames))

1000
1000


In [59]:
print(set(train_filenames) == set(valid_filenames))

True


In [62]:
imagenet_training_filePath = os.path.join(your_path, file_dir[1])
imagenet_training_filePath = os.path.join(your_path, file_dir[2])


In [65]:

image_net_train_dataset = datasets.ImageFolder(
        imagenet_training_filePath,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
           
        ]))
image_net_train_dataset


Dataset ImageFolder
    Number of datapoints: 49997
    Root location: /data/gpfs/datasets/Imagenet/val_blurred
    StandardTransform
Transform: Compose(
               RandomResizedCrop(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bilinear)
               RandomHorizontalFlip(p=0.5)
               ToTensor()
           )

In [66]:
image_net_train_dataset.class_to_idx

{'n01440764': 0,
 'n01443537': 1,
 'n01484850': 2,
 'n01491361': 3,
 'n01494475': 4,
 'n01496331': 5,
 'n01498041': 6,
 'n01514668': 7,
 'n01514859': 8,
 'n01518878': 9,
 'n01530575': 10,
 'n01531178': 11,
 'n01532829': 12,
 'n01534433': 13,
 'n01537544': 14,
 'n01558993': 15,
 'n01560419': 16,
 'n01580077': 17,
 'n01582220': 18,
 'n01592084': 19,
 'n01601694': 20,
 'n01608432': 21,
 'n01614925': 22,
 'n01616318': 23,
 'n01622779': 24,
 'n01629819': 25,
 'n01630670': 26,
 'n01631663': 27,
 'n01632458': 28,
 'n01632777': 29,
 'n01641577': 30,
 'n01644373': 31,
 'n01644900': 32,
 'n01664065': 33,
 'n01665541': 34,
 'n01667114': 35,
 'n01667778': 36,
 'n01669191': 37,
 'n01675722': 38,
 'n01677366': 39,
 'n01682714': 40,
 'n01685808': 41,
 'n01687978': 42,
 'n01688243': 43,
 'n01689811': 44,
 'n01692333': 45,
 'n01693334': 46,
 'n01694178': 47,
 'n01695060': 48,
 'n01697457': 49,
 'n01698640': 50,
 'n01704323': 51,
 'n01728572': 52,
 'n01728920': 53,
 'n01729322': 54,
 'n01729977': 55,
 '

In [67]:
image_net_train_dataset.classes

['n01440764',
 'n01443537',
 'n01484850',
 'n01491361',
 'n01494475',
 'n01496331',
 'n01498041',
 'n01514668',
 'n01514859',
 'n01518878',
 'n01530575',
 'n01531178',
 'n01532829',
 'n01534433',
 'n01537544',
 'n01558993',
 'n01560419',
 'n01580077',
 'n01582220',
 'n01592084',
 'n01601694',
 'n01608432',
 'n01614925',
 'n01616318',
 'n01622779',
 'n01629819',
 'n01630670',
 'n01631663',
 'n01632458',
 'n01632777',
 'n01641577',
 'n01644373',
 'n01644900',
 'n01664065',
 'n01665541',
 'n01667114',
 'n01667778',
 'n01669191',
 'n01675722',
 'n01677366',
 'n01682714',
 'n01685808',
 'n01687978',
 'n01688243',
 'n01689811',
 'n01692333',
 'n01693334',
 'n01694178',
 'n01695060',
 'n01697457',
 'n01698640',
 'n01704323',
 'n01728572',
 'n01728920',
 'n01729322',
 'n01729977',
 'n01734418',
 'n01735189',
 'n01737021',
 'n01739381',
 'n01740131',
 'n01742172',
 'n01744401',
 'n01748264',
 'n01749939',
 'n01751748',
 'n01753488',
 'n01755581',
 'n01756291',
 'n01768244',
 'n01770081',
 'n017

# Open the office31 dataset and extract the labeles 

In [9]:
office31_path =r"/home/junzhin/Project/datasets/domain_adaptation_images/"
office31_dir = os.listdir(office31_path)
amazon_file_path = os.path.join(office31_path, office31_dir[0]+ "/images/")
dslr_file_path = os.path.join(office31_path, office31_dir[1]+ "/images/")
webcam_file_path = os.path.join(office31_path, office31_dir[2]+ "/images/")
print(office31_dir)

['amazon', 'dslr', 'webcam']


## Checking if three domains have the same set of class labels

In [44]:
amazon =  os.listdir(amazon_file_path)
dslr =  os.listdir(dslr_file_path)
webcam =  os.listdir(webcam_file_path)
 

In [45]:
print(set(amazon) == set(dslr) == set(webcam))

True


## Loading one of the domains of the class labels for further manipulation

In [38]:
train_dataset = datasets.ImageFolder(
        amazon_file_path,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
           
        ]))
train_dataset



Dataset ImageFolder
    Number of datapoints: 2817
    Root location: /home/junzhin/Project/datasets/domain_adaptation_images/amazon/images/
    StandardTransform
Transform: Compose(
               RandomResizedCrop(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bilinear)
               RandomHorizontalFlip(p=0.5)
               ToTensor()
           )

In [71]:
print(dict(Counter(train_dataset.targets)))
office_31_training_labels = train_dataset.classes
print(train_dataset.classes[:20])

{0: 92, 1: 82, 2: 72, 3: 82, 4: 36, 5: 94, 6: 91, 7: 97, 8: 97, 9: 81, 10: 99, 11: 100, 12: 100, 13: 98, 14: 100, 15: 99, 16: 100, 17: 94, 18: 96, 19: 95, 20: 93, 21: 100, 22: 98, 23: 98, 24: 90, 25: 75, 26: 100, 27: 99, 28: 99, 29: 96, 30: 64}
['back_pack', 'bike', 'bike_helmet', 'bookcase', 'bottle', 'calculator', 'desk_chair', 'desk_lamp', 'desktop_computer', 'file_cabinet', 'headphones', 'keyboard', 'laptop_computer', 'letter_tray', 'mobile_phone', 'monitor', 'mouse', 'mug', 'paper_notebook', 'pen']


In [55]:
selected_index = [ train_dataset.class_to_idx[i] for i in train_dataset.classes[:20]]
print(selected_index)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


In [58]:
trainset_1 = torch.utils.data.Subset(train_dataset, selected_index)
train_loader = torch.utils.data.DataLoader(trainset_1, batch_size=1, shuffle= False, num_workers=1, pin_memory=True,sampler=None)

## Checking the duplicate classes between the imagenet of 1000 classes and office31

In [153]:
office_31_training_labels = [word.lower().replace("_", " ")for word in office_31_training_labels]
office_31_training_labels

['back pack',
 'bike',
 'bike helmet',
 'bookcase',
 'bottle',
 'calculator',
 'desk chair',
 'desk lamp',
 'desktop computer',
 'file cabinet',
 'headphones',
 'keyboard',
 'laptop computer',
 'letter tray',
 'mobile phone',
 'monitor',
 'mouse',
 'mug',
 'paper notebook',
 'pen',
 'phone',
 'printer',
 'projector',
 'punchers',
 'ring binder',
 'ruler',
 'scissors',
 'speaker',
 'stapler',
 'tape dispenser',
 'trash can']

In [155]:
imagenet_label_list = [word.lower().replace("_", " ") for word in list(imagenet_label_list_dict.keys())]
imagenet_label_list

['kit fox',
 'english setter',
 'siberian husky',
 'australian terrier',
 'english springer',
 'grey whale',
 'lesser panda',
 'egyptian cat',
 'ibex',
 'persian cat',
 'cougar',
 'gazelle',
 'porcupine',
 'sea lion',
 'malamute',
 'badger',
 'great dane',
 'walker hound',
 'welsh springer spaniel',
 'whippet',
 'scottish deerhound',
 'killer whale',
 'mink',
 'african elephant',
 'weimaraner',
 'soft-coated wheaten terrier',
 'dandie dinmont',
 'red wolf',
 'old english sheepdog',
 'jaguar',
 'otterhound',
 'bloodhound',
 'airedale',
 'hyena',
 'meerkat',
 'giant schnauzer',
 'titi',
 'three-toed sloth',
 'sorrel',
 'black-footed ferret',
 'dalmatian',
 'black-and-tan coonhound',
 'papillon',
 'skunk',
 'staffordshire bullterrier',
 'mexican hairless',
 'bouvier des flandres',
 'weasel',
 'miniature poodle',
 'cardigan',
 'malinois',
 'bighorn',
 'fox squirrel',
 'colobus',
 'tiger cat',
 'lhasa',
 'impala',
 'coyote',
 'yorkshire terrier',
 'newfoundland',
 'brown bear',
 'red fox',


### Method 1: Exact string match between two datasets classes

In [97]:
set(office_31_training_labels).intersection(set(imagenet_label_list))

{'bookcase', 'desktop computer', 'monitor', 'mouse', 'printer', 'projector'}

### Method 2 Similarity string matching between two datasets classes: To remove any possible similar strings in both set of classes

In [104]:
import textdistance
print(textdistance.lcsstr.normalized_similarity("monitor","monitor"))

1.0


In [178]:
similarity_dict =  {}
threshold = 0.5
for x in office_31_training_labels:
    similarity_dict[x] = []
    for y in imagenet_label_list:
        similarity_score = textdistance.lcsseq.normalized_similarity(x,y)
        if similarity_score > threshold:
             similarity_dict[x].append([y,similarity_score])
for each_word_list in similarity_dict:
    similarity_dict[each_word_list].sort(key=lambda x: x[1], reverse = True)
# levenshtein damerau_levenshtein gotoh smith_waterman

In [179]:
similarity_dict

{'back pack': [['backpack', 0.8888888888888888],
  ['black swan', 0.6],
  ['black stork', 0.5454545454545454],
  ['book jacket', 0.5454545454545454]],
 'bike': [],
 'bike helmet': [['mobile home', 0.6363636363636364],
  ['crash helmet', 0.5833333333333333],
  ['football helmet', 0.5333333333333333]],
 'bookcase': [['bookcase', 1.0],
  ['bookshop', 0.625],
  ['boathouse', 0.5555555555555556],
  ['book jacket', 0.5454545454545454]],
 'bottle': [['otter', 0.6666666666666667],
  ['bottlecap', 0.6666666666666667],
  ['bolete', 0.6666666666666667],
  ['box turtle', 0.6],
  ['pop bottle', 0.6],
  ['bobsled', 0.5714285714285714],
  ['bittern', 0.5714285714285714],
  ['bow tie', 0.5714285714285714],
  ['beer bottle', 0.5454545454545454],
  ['wine bottle', 0.5454545454545454],
  ['pill bottle', 0.5454545454545454]],
 'calculator': [['cauliflower', 0.5454545454545454]],
 'desk chair': [['barber chair', 0.5833333333333333],
  ['folding chair', 0.5384615384615384],
  ['rocking chair', 0.53846153846