In [1]:
# Installations and imports
!pip install datasets
import pandas as pd
import numpy as np
from datasets import load_dataset
from scipy.spatial import distance

Collecting datasets
  Downloading datasets-1.17.0-py3-none-any.whl (306 kB)
[K     |████████████████████████████████| 306 kB 2.0 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 49.3 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 53.1 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 454 kB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 73.1 MB/s 
Collecting async-timeout<5.0,>=4.0.0a3
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting multidict<7.0,>=4.5
  Downloading multidict-5.2.0-cp37-cp37m-ma

In [2]:
# Mount Google Drive at /content/drive (Colab only); a later cell reads a CSV from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the training split of CIFAR-100 through the `datasets` library (downloaded on first use, then cached).
dataset = load_dataset('cifar100', split='train')

Downloading:   0%|          | 0.00/2.21k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading and preparing dataset cifar100/cifar100 (download: 161.17 MiB, generated: 418.62 MiB, post-processed: Unknown size, total: 579.80 MiB) to /root/.cache/huggingface/datasets/cifar100/cifar100/1.0.0/0f9be8dd0480d385177a5c250878f4480651bbf0fc86d714b33d56c9aaad5160...


Downloading:   0%|          | 0.00/169M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset cifar100 downloaded and prepared to /root/.cache/huggingface/datasets/cifar100/cifar100/1.0.0/0f9be8dd0480d385177a5c250878f4480651bbf0fc86d714b33d56c9aaad5160. Subsequent calls will reuse this data.


In [None]:
# Process training data: tabulate the dataset and discard the coarse label column
cifar100train = pd.DataFrame(dataset).drop(columns=['coarse_label'])
cifar100train.to_csv('CIFAR100TRAIN.csv', index=False)

# Flatten every image into a 1-D vector, one entry per training example
feat_list = [np.array(img).flatten() for img in cifar100train['img']]

# One row per image, with the fine label stored in a trailing "class" column
data = pd.DataFrame(feat_list)
data['class'] = cifar100train['fine_label'].tolist()
data.to_pickle('zeroshot_data.pkl') # Reupload it to correct location / drive

In [3]:
# Split point between unseen (zero-shot) and seen (training) class labels.
cutoff_label = 10  # labels 0-9 -> unseen, labels 10-99 -> seen
total_class = 100  # total number of class labels (0 .. total_class - 1)

In [None]:
# Create empty text files for the seen (train) and unseen (zsl) class labels.
# A later cell opens both files in 'w' mode and writes one label per line.
!touch train_classes.txt
!touch zsl_classes.txt

In [8]:
# Seen labels run from cutoff_label up to total_class - 1; unseen labels are everything below cutoff_label
train_labels = list(range(cutoff_label, total_class))
zsl_labels = list(range(cutoff_label))

# Reupload it to correct location / drive
with open('train_classes.txt', 'w') as f:
    # One label per line, each terminated by a newline
    f.writelines(f"{label}\n" for label in train_labels)

# Reupload it to correct location / drive
with open('zsl_classes.txt', 'w') as f:
    f.writelines(f"{label}\n" for label in zsl_labels)

In [4]:
# Retrieve the merged word vector - VGG feature CSV from the mounted Drive.
# Later cells treat its first 300 columns as the class word vector and the
# columns after those as VGG features.
# NOTE(review): presumably one row per class, ordered by class id — confirm against the CSV.
merged_df = pd.read_csv('/content/drive/MyDrive/Machine Learning/Zero Shot Learning/NTECH/CIFAR/Dataset/Merged_WordVec_VGGFeat.csv')

In [9]:
# Tag row i of merged_df with class id i in a trailing "class" column
cls = list(range(total_class))
merged_df['class'] = cls

In [10]:
# Aliases used below: unseen (zero-shot) labels, seen (training) labels and the class count.
unseen_class =zsl_labels
seen_class =train_labels
total_cls = total_class

# Maps each unseen class label to an array of its distances to every seen class (filled in below).
unseen_wordvec = {}

def minkowskiDists(unseen,seen):
  """Return the Minkowski distance between the word vectors of two classes.

  `unseen` and `seen` are row positions in `merged_df`; a row's word vector is
  read from its first 300 columns. SciPy's default order (p=2) is used.
  """
  word_vectors = merged_df[merged_df.columns[:300]]
  unseen_vec = word_vectors.iloc[unseen].tolist()
  seen_vec = word_vectors.iloc[seen].tolist()
  return distance.minkowski(unseen_vec, seen_vec)


# For every unseen class, store its Minkowski distances to the word vectors of all seen classes
for uc in unseen_class:
  unseen_wordvec[uc] = np.array([minkowskiDists(uc, sc) for sc in seen_class])


# Rescaled VGG features per unseen class, filled in by vgg()
unseen_vgg = {}

# Scale one VGG feature value by one weight
def vgg_wordvec(vgg_elem,wv):
  """Return the product of a single VGG feature value and a single weight."""
  scaled = vgg_elem * wv
  return scaled

# For every VGG feature of an unseen class, rescale it with the class's word-vector distances
def vgg(unseen):
  """Compute rescaled VGG features for one unseen class and store them in `unseen_vgg`.

  Each existing VGG feature of row `unseen` in `merged_df` is multiplied by the
  sum of `unseen_wordvec[unseen]` (the distances to the seen classes) and
  divided by `total_cls`. The result is written to `unseen_vgg[unseen]` as a
  NumPy array; nothing is returned.
  """
  # Existing VGG features: every column after the 300 word-vector columns,
  # excluding the last column (the "class" label).
  vgg_feats = merged_df[merged_df.columns[300:-1]].iloc[unseen].to_numpy(dtype=float)

  # sum_i(feat * w_i) == feat * sum_i(w_i): compute the weight total once
  # instead of re-accumulating it per feature in a Python loop (which also
  # shadowed the builtin `sum`). Results may differ from the looped form only
  # in the last floating-point bits.
  weight_total = unseen_wordvec[unseen].sum()

  # NOTE(review): the divisor is the total class count, not the number of seen
  # classes that contributed weights — confirm this is intended.
  unseen_vgg[unseen] = vgg_feats * weight_total / total_cls

# Compute the rescaled VGG features of every unseen class; vgg() stores each result in `unseen_vgg`.
for uc in unseen_class:
  vgg(uc)

In [11]:
dummy_df = merged_df.copy() # Work on a copy so the row edits made below never touch merged_df.

In [15]:
# For every unseen class, overwrite its row with [original word vector | rescaled VGG features]
for uc in unseen_class:
  # The word vector (first 300 columns) is kept as-is
  wv = merged_df.iloc[:, :300].iloc[uc].to_numpy()
  vg = unseen_vgg[uc]
  new_feat = np.append(wv, vg)
  # Write by position. `.loc[uc, :400]` slices columns by *label*: with
  # non-integer column labels it only worked through a positional fallback that
  # newer pandas rejects, and with integer labels it would be end-inclusive.
  # Sizing the slice from new_feat also removes the hard-coded 400.
  dummy_df.iloc[uc, :len(new_feat)] = new_feat

# Remove the "class" column; errors='ignore' keeps this cell safe to re-run
dummy_df = dummy_df.drop(columns=['class'], errors='ignore')

  


In [16]:
# Build [class_id, vector] pairs: word vector only, and the full (word vector + VGG) row
emptyarray = []
emptyarray_vgg = []
# `class_id` (not `cls`) so the module-level list `cls` is not overwritten by the loop
for class_id in range(total_class):
  row = dummy_df.loc[class_id]
  emptyarray.append( [class_id, np.array(row.iloc[:300])] )
  emptyarray_vgg.append( [class_id, np.array(row)] )

# Each pair mixes an int with an array, so the nested list is ragged.
# dtype=object must be explicit: NumPy deprecated implicit ragged arrays and
# raises ValueError for them from 1.24 on. The saved file is therefore a pickled
# object array and must be read back with np.load(..., allow_pickle=True).

# Reupload it to correct location / drive
with open('class_vec.npy', 'wb') as f:
    np.save(f, np.array(emptyarray, dtype=object))

# Reupload it to correct location / drive
with open('class_vec_new.npy', 'wb') as f:
    np.save(f, np.array(emptyarray_vgg, dtype=object))

  if __name__ == '__main__':
  del sys.path[0]
