In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
from numpy import asarray
import pandas as pd
import os
from keras.optimizers import Adam
import pickle
import random
from PIL import Image, ImageOps
from collections import Counter
import glob
import itertools
import matplotlib.pyplot as plt

## PreProcessing the data

In [3]:
'''
As we are storing the images and retrieving them from the google drive need to make sure that the file path in the dataframe and in the train relationships are same
'''
def get_path(path):
  path = path.split('/')[3:]
  path = '\\'.join(str(elem) for elem in path)
  return path

def get_photos_path(photos):
  ret_list = []
  for items in photos:
    item = items.split('/')[3:]
    path = '\\'.join(str(elem) for elem in item)
    ret_list.append(path)
  ret_photos = tuple(ret_list)
  return ret_photos

In [4]:
df = pd.DataFrame(data = {'family': [], 'person': [], 'photos': []})
for family in glob.glob('drive/MyDrive/Project_Dataset/train/*'):
    df_family = get_path(family)
    for person in glob.glob(family + '/*'):
        df_person = get_path(person)
        photos = tuple(glob.glob(person + '/*'))
        df_photos = get_photos_path(photos)
        df = df.append(pd.Series({'family': df_family, 'person': df_person, 'photos': df_photos}), ignore_index = True)
df

Unnamed: 0,family,person,photos
0,train\F0002,train\F0002\MID1,"(train\F0002\MID1\P00012_face2.jpg, train\F000..."
1,train\F0002,train\F0002\MID3,"(train\F0002\MID3\P00017_face1.jpg, train\F000..."
2,train\F0002,train\F0002\MID2,"(train\F0002\MID2\P00018_face3.jpg, train\F000..."
3,train\F0009,train\F0009\MID1,"(train\F0009\MID1\P10576_face2.jpg, train\F000..."
4,train\F0009,train\F0009\MID2,"(train\F0009\MID2\P11743_face1.jpg, train\F000..."
...,...,...,...
2084,train\F0901,train\F0901\MID4,"(train\F0901\MID4\P09523_face2.jpg, train\F090..."
2085,train\F0901,train\F0901\MID2,"(train\F0901\MID2\P09517_face1.jpg, train\F090..."
2086,train\F0903,train\F0903\MID1,"(train\F0903\MID1\P09537_face2.jpg, train\F090..."
2087,train\F0903,train\F0903\MID3,"(train\F0903\MID3\P09539_face2.jpg, train\F090..."


In [5]:
# Writing the dataframe to a csv so that we don't have to do it everytime.
df.to_csv('combinations.csv', index =False)

In [6]:
# df = pd.read_csv("combinations.csv", converters={"photos":eval})

In [7]:
'''
Creating the combinations of people in a family
'''
family = df.family.unique()
people = df.person.unique()
combinations = []
for fam in family:
    people_in_family = df[df.family == fam].person
    combinations =[*combinations, *list(itertools.combinations(people_in_family, 2))]
    p1 = [x[0] for x in combinations]
p2 = [x[1] for x in combinations]
combinations = pd.DataFrame({'p1': p1, 'p2': p2, 'tuples': [tuple(sorted([p1[i], p2[i]])) for i in range(len(combinations))]})
combinations.head()

Unnamed: 0,p1,p2,tuples
0,train\F0002\MID1,train\F0002\MID3,"(train\F0002\MID1, train\F0002\MID3)"
1,train\F0002\MID1,train\F0002\MID2,"(train\F0002\MID1, train\F0002\MID2)"
2,train\F0002\MID3,train\F0002\MID2,"(train\F0002\MID2, train\F0002\MID3)"
3,train\F0009\MID1,train\F0009\MID2,"(train\F0009\MID1, train\F0009\MID2)"
4,train\F0009\MID1,train\F0009\MID3,"(train\F0009\MID1, train\F0009\MID3)"


In [8]:
train_relationships= pd.read_csv("train_relationships.csv")

In [9]:
train_relationships.p1 = train_relationships.p1.apply(lambda x: 'train/' + x).apply(lambda x: str.replace(x, '/', '\\'))
train_relationships.p2 = train_relationships.p2.apply(lambda x: 'train/' + x).apply(lambda x: str.replace(x, '/', '\\'))

In [10]:
train_relationships

Unnamed: 0,p1,p2
0,train\F0002\MID1,train\F0002\MID3
1,train\F0002\MID2,train\F0002\MID3
2,train\F0005\MID1,train\F0005\MID2
3,train\F0005\MID3,train\F0005\MID2
4,train\F0009\MID1,train\F0009\MID4
...,...,...
3593,train\F1000\MID5,train\F1000\MID8
3594,train\F1000\MID5,train\F1000\MID9
3595,train\F1000\MID6,train\F1000\MID9
3596,train\F1000\MID7,train\F1000\MID8


In [11]:
len(train_relationships) - ((train_relationships.p1.isin(people)) & (train_relationships.p2.isin(people))).sum()

539

In [12]:
train_relationships = train_relationships[(train_relationships.p1.isin(people)) & (train_relationships.p2.isin(people))]
train_relationships['tuples'] = [tuple(sorted([train_relationships.iloc[row, 0], train_relationships.iloc[row, 1]])) for row in range(len(train_relationships))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [13]:
train_relationships

Unnamed: 0,p1,p2,tuples
0,train\F0002\MID1,train\F0002\MID3,"(train\F0002\MID1, train\F0002\MID3)"
1,train\F0002\MID2,train\F0002\MID3,"(train\F0002\MID2, train\F0002\MID3)"
2,train\F0005\MID1,train\F0005\MID2,"(train\F0005\MID1, train\F0005\MID2)"
3,train\F0005\MID3,train\F0005\MID2,"(train\F0005\MID2, train\F0005\MID3)"
4,train\F0009\MID1,train\F0009\MID4,"(train\F0009\MID1, train\F0009\MID4)"
...,...,...,...
3279,train\F0901\MID3,train\F0901\MID1,"(train\F0901\MID1, train\F0901\MID3)"
3280,train\F0901\MID3,train\F0901\MID4,"(train\F0901\MID3, train\F0901\MID4)"
3281,train\F0901\MID5,train\F0901\MID4,"(train\F0901\MID4, train\F0901\MID5)"
3282,train\F0903\MID1,train\F0903\MID2,"(train\F0903\MID1, train\F0903\MID2)"


In [14]:
combinations['kinship'] = 0
combinations.loc[combinations['tuples'].isin(train_relationships['tuples']), 'kinship'] = 1

In [15]:
combinations

Unnamed: 0,p1,p2,tuples,kinship
0,train\F0002\MID1,train\F0002\MID3,"(train\F0002\MID1, train\F0002\MID3)",1
1,train\F0002\MID1,train\F0002\MID2,"(train\F0002\MID1, train\F0002\MID2)",0
2,train\F0002\MID3,train\F0002\MID2,"(train\F0002\MID2, train\F0002\MID3)",1
3,train\F0009\MID1,train\F0009\MID2,"(train\F0009\MID1, train\F0009\MID2)",1
4,train\F0009\MID1,train\F0009\MID3,"(train\F0009\MID1, train\F0009\MID3)",1
...,...,...,...,...
5752,train\F0901\MID3,train\F0901\MID2,"(train\F0901\MID2, train\F0901\MID3)",0
5753,train\F0901\MID4,train\F0901\MID2,"(train\F0901\MID2, train\F0901\MID4)",1
5754,train\F0903\MID1,train\F0903\MID3,"(train\F0903\MID1, train\F0903\MID3)",0
5755,train\F0903\MID1,train\F0903\MID2,"(train\F0903\MID1, train\F0903\MID2)",1


In [16]:
combinations['kinship'].value_counts()

1    3059
0    2698
Name: kinship, dtype: int64

In [17]:
photo_combinations = pd.DataFrame({'tuples' : [], 'kinship' : []})
for row in range(len(combinations)):
    p1 = combinations.p1[row]
    p2 = combinations.p2[row]
    kinship_pair = [combinations.kinship[row]]
    
    photos1 = df[df.person == p1].photos
    photos1 = [list(x) for x in photos1][0]
    photos2 = df[df.person == p2].photos
    photos2 = [list(x) for x in photos2][0]
            
    photo_pairs = list(itertools.product(photos1, photos2))
    kinship_pairs = kinship_pair * len(photo_pairs)

    comb_temp = pd.DataFrame({'tuples' : photo_pairs, 'kinship' : kinship_pairs})
    photo_combinations = photo_combinations.append(comb_temp)
photo_combinations.kinship = pd.to_numeric(photo_combinations.kinship, downcast = 'integer')
photo_combinations = photo_combinations.sample(frac=1).reset_index(drop=True)
photo_combinations

Unnamed: 0,tuples,kinship
0,"(train\F0872\MID4\P09211_face4.jpg, train\F087...",0
1,"(train\F0215\MID1\P02287_face2.jpg, train\F021...",1
2,"(train\F0376\MID9\P10635_face1.jpg, train\F037...",0
3,"(train\F0754\MID2\P07920_face5.jpg, train\F075...",1
4,"(train\F0601\MID19\P12002_face1.jpg, train\F06...",0
...,...,...
422866,"(train\F0601\MID2\P12067_face2.jpg, train\F060...",0
422867,"(train\F0601\MID2\P06288_face1.jpg, train\F060...",1
422868,"(train\F0617\MID4\P06478_face1.jpg, train\F061...",1
422869,"(train\F0198\MID1\P02126_face4.jpg, train\F019...",0


In [18]:
photo_combinations.to_csv("photo_combinations.csv", index = False)