This file prepares the Asian Politicians Dataset (APD) for training with a Siamese network.

In [None]:
# Unzip C.zip with Japanese filenames
# !unzip -O "ISO-10646-J-1" C.zip

This file does the following:

- gets all image names from specified folder (folder `C`)
- gets names of all the people in images from specified folder
- creates empty folders `train` and `test`
- for each person's name, copies 11% of files from specified folder (`C`) to folder `test`
- for each person's name, copies remaining 89% of files from folder (`C`) to folder `train`
- reads negative comparisons from `negative_pairs.txt`
- reads positive comparisons from `positive_pairs.txt`
- combines comparisons into dataframe `info`
- updates `info` to indicate whether files are in `test` or in `train` folders for each comparison
- saves `info` to `info.csv`
- separates `info` based on `train` and `test` files, and saves `test_info.csv` and `train_info.csv`

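**Note**: Even though only 11% of the image files are copied to folder `test`, approximately 20% of the comparisons in `info.csv` are marked as test comparisons, because a comparison counts as a test comparison as soon as either of its two images is a test image.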

**Note 2**: No image file in the `test` folder is also present in the `train` folder.

As a result, the model will see each person at least once when training, while every comparison made during testing involves at least one image of a person that the model has never seen before.

In [1]:
# Load (or reload) the autoreload extension.
# NOTE(review): no `%autoreload 2` follows, so automatic reloading may never be switched on — confirm intent.
%reload_ext autoreload
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# The star imports stay first so that the explicit imports below take
# precedence over any same-named export they bring in.
from fastai import *
from fastai.vision import *
from fastai.metrics import error_rate

import shutil
from pathlib import Path
from subprocess import call

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image

In [None]:
## get list of japanese fonts (on Linux)
# !fc-list :lang=ja family

In [3]:
# Add Japanese Fonts to Matplotlib

# One font per line, written as "family[,full name]" (presumably pasted from
# the output of `fc-list :lang=ja family`). Each line becomes a list of its
# comma-separated parts.
fonts = [
    font_line.split(",")
    for font_line in """
Noto Sans Mono CJK TC,Noto Sans Mono CJK TC Bold
Noto Sans CJK TC,Noto Sans CJK TC Medium
Noto Sans CJK TC,Noto Sans CJK TC DemiLight
Noto Sans CJK KR,Noto Sans CJK KR Black
Noto Sans CJK TC,Noto Sans CJK TC Black
Noto Sans Mono CJK TC,Noto Sans Mono CJK TC Regular
Noto Sans CJK SC,Noto Sans CJK SC Light
Noto Sans Mono CJK SC,Noto Sans Mono CJK SC Regular
Noto Sans CJK SC,Noto Sans CJK SC Regular
Noto Sans CJK JP,Noto Sans CJK JP Light
Noto Sans CJK SC,Noto Sans CJK SC Medium
Noto Sans CJK TC,Noto Sans CJK TC Regular
Noto Sans CJK KR,Noto Sans CJK KR Bold
Noto Sans CJK JP,Noto Sans CJK JP Bold
Noto Sans CJK SC,Noto Sans CJK SC Bold
Noto Sans CJK KR,Noto Sans CJK KR DemiLight
Noto Sans CJK KR,Noto Sans CJK KR Thin
Noto Sans CJK SC,Noto Sans CJK SC Black
Noto Sans CJK JP,Noto Sans CJK JP Thin
Noto Sans CJK SC,Noto Sans CJK SC Thin
Noto Sans CJK TC,Noto Sans CJK TC Bold
Noto Sans CJK JP,Noto Sans CJK JP Medium
Noto Sans CJK JP,Noto Sans CJK JP Black
Noto Sans CJK JP,Noto Sans CJK JP DemiLight
Noto Sans Mono CJK SC,Noto Sans Mono CJK SC Bold
Noto Sans CJK TC,Noto Sans CJK TC Light
Noto Sans CJK SC,Noto Sans CJK SC DemiLight
Noto Sans CJK KR,Noto Sans CJK KR Light
Fixed
Noto Sans Mono CJK JP,Noto Sans Mono CJK JP Regular
TakaoPGothic
Noto Sans CJK KR,Noto Sans CJK KR Medium
Noto Sans CJK JP,Noto Sans CJK JP Regular
Noto Sans Mono CJK KR,Noto Sans Mono CJK KR Regular
Noto Sans CJK TC,Noto Sans CJK TC Thin
Noto Sans CJK KR,Noto Sans CJK KR Regular
Noto Sans Mono CJK KR,Noto Sans Mono CJK KR Bold
Noto Sans Mono CJK JP,Noto Sans Mono CJK JP Bold
""".splitlines()[1:]  # [1:] skips the empty line right after the opening quotes
]

def flatten(a):
    """Return one flat list holding the items of every sub-iterable of `a`, in order."""
    flat = []
    for sublist in a:
        flat.extend(sublist)
    return flat

# Put the CJK-capable fonts ahead of matplotlib's current sans-serif list.
# dict.fromkeys keeps only the first occurrence of each name (order preserved),
# so the repeated family names collapse and re-running this cell no longer
# keeps growing the rcParams entry.
matplotlib.rcParams['font.sans-serif'] = list(dict.fromkeys(
    flatten(fonts) + list(matplotlib.rcParams['font.sans-serif'])
))
plt.rcParams["font.family"] = ["sans-serif"]

# plt.rcParams["font.sans-serif"] # list all fonts in use for matplotlib

In [4]:
# Locations and file type of the raw dataset
path = Path("data/")      # root folder where the data is stored
path_img = path / "C/"    # folder containing all images
extension = ".jpg"        # extension of the files in path_img

In [5]:
# Peek at the image folder to check that the (non-ASCII) filenames were extracted correctly.
# NOTE(review): `Path.ls` is not a standard pathlib method; presumably it is patched in by the fastai star imports — confirm.
path_img.ls()[:10] # first ten files in image path

[PosixPath('data/C/北村経夫_34.jpg'),
 PosixPath('data/C/磯崎仁彦_8.jpg'),
 PosixPath('data/C/三原じゅん子_18.jpg'),
 PosixPath('data/C/自見はなこ_18.jpg'),
 PosixPath('data/C/陳鑑林_55.jpg'),
 PosixPath('data/C/那谷屋正義_44.jpg'),
 PosixPath('data/C/羽田雄一郎_15.jpg'),
 PosixPath('data/C/和田政宗_10.jpg'),
 PosixPath('data/C/黃容根_57.jpg'),
 PosixPath('data/C/森屋宏_10.jpg')]

In [6]:
# CLEAR FOLDERS
# Replace train and test folders with empty folders if they exist.
# Done with shutil/pathlib instead of shelling out to `rm`/`mkdir`: paths are
# passed as-is (no whitespace splitting), it works on any OS, and a failure
# raises instead of being reported only through an ignored exit code.

for split_dir in (path/"train", path/"test"):
    if split_dir.exists():
        shutil.rmtree(split_dir)
    split_dir.mkdir(parents=True)

# remove info.csv if it exists
info_csv = path/"info.csv"
if info_csv.exists():
    info_csv.unlink()

1

In [7]:
# PREPARE INFO DATAFRAME
# (Contains filenames to be compared, correct labels, and whether or not each comparison is training / testing)

shuffle_seed = 42 # fixed so that re-running the notebook reproduces the same row order

# Negative labels: every line is split on tabs into name1, id1, name2, id2
neg_pairs = pd.read_csv(path/"negative_pairs.txt", names=["pairs"])
neg_pairs = pd.DataFrame(neg_pairs["pairs"].str.split("\t").to_list(), columns=["name1","id1","name2","id2"])
neg_pairs = neg_pairs.reindex(sorted(neg_pairs.columns), axis=1)
neg_pairs["label"] = False

# Positive labels: every line is split on tabs into name1, id1, id2;
# both images belong to the same person, so name2 is a copy of name1
pos_pairs = pd.read_csv(path/"positive_pairs.txt", names=["pairs"])
pos_pairs = pd.DataFrame(pos_pairs["pairs"].str.split("\t").to_list(), columns=["name1","id1","id2"])
pos_pairs["name2"] = pos_pairs["name1"]
pos_pairs = pos_pairs.reindex(sorted(pos_pairs.columns), axis=1)
pos_pairs["label"] = True

# Combined labels
info = pd.concat([pos_pairs, neg_pairs], ignore_index=True)
info["file1"] = info["name1"] + "_" + info["id1"] # add filenames (without extension)
info["file2"] = info["name2"] + "_" + info["id2"]

# Keep the full, unshuffled frame (names, ids and filenames): the train/test
# split further down reads the name and file columns from it.
info_sorted = info

# Only the two filenames and the label are needed per comparison; shuffle the
# rows with a fixed seed so the saved CSV files are reproducible.
info = info[["file1","file2","label"]].sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)



# FILL TRAIN AND TEST FOLDERS

test_pct = 0.11 # percentage of files in test folder from all files
# Note: this is different from percentage of comparisons from test set
test_seed = 42 # seed for picking the test files, so the split is the same on every run

info["train"] = True

# Sorted, de-duplicated list of every file (without extension) used in a comparison
file_stems = pd.concat([info_sorted["file1"], info_sorted["file2"]]).drop_duplicates().sort_values(ignore_index=True)

# Get list of all class names
all_names = pd.concat([info_sorted["name1"], info_sorted["name2"]]).sort_values(ignore_index=True).drop_duplicates()
all_names = all_names.reset_index(drop=True)


# One row per file: filename with extension, the person's name and a train flag.
# Filenames are built as "<name>_<id>", so the name is everything before the
# LAST underscore; rsplit keeps names that themselves contain "_" intact.
all_files = pd.DataFrame({
    "file": file_stems + extension,
    "name": file_stems.str.rsplit("_", n=1).str[0],
    "train": True,
})


# Add file extension
info["file1"] += extension
info["file2"] += extension


# For every person, move a random test_pct share of their files to the test set.
# A single seeded generator is shared by all draws, which makes the whole split
# reproducible without reusing the same random pattern for every person.
# NOTE(review): for people with very few images `frac=test_pct` may round down
# to zero test files, so not every class is guaranteed to be in the test set — confirm.
rng = np.random.RandomState(test_seed)
for name in all_names:
    person_files = all_files[all_files["name"] == name]
    test_index = person_files.sample(frac=test_pct, random_state=rng).index
    all_files.loc[test_index, "train"] = False

test_files = all_files.loc[~all_files["train"], "file"]
train_files = all_files.loc[all_files["train"], "file"]


# COPY FILES

# Copy (not move) every image from path_img into the folder of its split; the
# originals are left untouched. shutil.copy receives real paths, so filenames
# are never broken apart the way a whitespace-split `cp` command line would be,
# and no external process is started per file.
missing_files = []
for split_dir, split_files in ((path/"test", test_files), (path/"train", train_files)):
    # make sure the destination is a folder; otherwise the copy would create a file with that name
    split_dir.mkdir(parents=True, exist_ok=True)
    for some_file in split_files:
        src_file = path_img/some_file
        if src_file.is_file():
            shutil.copy(src_file, split_dir)
        else:
            # keep going, but do not hide the problem
            missing_files.append(some_file)

if missing_files:
    print(f"WARNING: {len(missing_files)} listed files were not found in {path_img}, e.g. {missing_files[:5]}")


# UPDATE AND SAVE INFO

# Update info: separate training set from test set.
# A comparison belongs to the test set as soon as either of its two images is a
# test file (one vectorised membership test instead of a Python loop over files).
# NOTE(review): such a comparison can still reference one image stored in the
# `train` folder — confirm that whatever loads test_info.csv looks in both folders.
uses_test_file = info["file1"].isin(test_files) | info["file2"].isin(test_files)
info.loc[uses_test_file, "train"] = False

info.to_csv(path/"info.csv") # save info
all_names.to_csv(path/"class_names.csv") # save class names

# Share of ALL comparisons that ended up in the test set
# (test / total, not test / train — the printed label now says so)
comparison_ratio = len(info[~info["train"]]) / len(info)

print("Fraction of comparisons in the test set: ", comparison_ratio)

info.head()

Ratio of valid to train comparisons:  0.20889153973206093


Unnamed: 0,file1,file2,label,train
0,梁家騮_82.jpg,郭家麒_33.jpg,False,True
1,吳敦義_12.jpg,吳敦義_26.jpg,True,True
2,易志明_58.jpg,易志明_32.jpg,True,True
3,劉銓忠_5.jpg,劉銓忠_8.jpg,True,True
4,李俊毅_24.jpg,姚文智_27.jpg,False,True


In [8]:
# Sanity check: share of image files assigned to the test folder
# (expected to be close to test_pct)
test_files.shape[0] / all_files.shape[0]

0.11077062192303828

In [28]:
# Write the train and the test comparisons to their own CSV files as well

mask = info["train"] == False # True for comparisons that belong to the test set

# Split on the flag, then drop the now constant "train" column and renumber the rows
train_info = info[~mask].drop(columns=["train"]).reset_index(drop=True)
test_info = info[mask].drop(columns=["train"]).reset_index(drop=True)

train_info.to_csv(path/"train_info.csv")
test_info.to_csv(path/"test_info.csv")

test_info.head()

Unnamed: 0,file1,file2,label
0,江崎鐵磨_2.jpg,松下新平_25.jpg,False
1,謝偉銓_2.jpg,方剛_80.jpg,False
2,李嘉進_27.jpg,丁守中_18.jpg,False
3,楊麗環_5.jpg,楊麗環_8.jpg,True
4,張學明_38.jpg,張學明_72.jpg,True
