# International Space Station Archaeology Project: Facial Recognition Data Generation

In [1]:
import Main_Model
import os
import json

### Small Test Set

In [2]:
# Initializes the model.  The last number is the number of threads to use.
model = Main_Model.Master_Model(r"../cropped_Astronaut_photos/",r'./pickles_bin/', 8)

Training on all images in ../cropped_Astronaut_photos/ using 8 threads
Training on image: cropped_pedro_duque&spain.jpg
	Model has already been trained on cropped_pedro_duque&spain.jpg
Training on image: cropped_kathryn_hire&usa.jpg
	Model has already been trained on cropped_kathryn_hire&usa.jpg
Training on image: cropped_michael_lopez-alegria&usa.jpg
	Model has already been trained on cropped_michael_lopez-alegria&usa.jpg
Training on image: cropped_valery_korzun&russia.jpg
	Model has already been trained on cropped_valery_korzun&russia.jpg
Training on image: cropped_stephen_robinson&usa.jpg
	Model has already been trained on cropped_stephen_robinson&usa.jpg
Training on image: cropped_gregory_wiseman&usa.jpg
	Model has already been trained on cropped_gregory_wiseman&usa.jpg
Training on image: cropped_philippe_perrin&france.jpg
	Model has already been trained on cropped_philippe_perrin&france.jpg
Training on image: cropped_chris_hadfield&canada.jpg
	Model has already been trained on cro

In [7]:
# Finds all the faces in a specific directory.  In this case, testing on a folder of exclusively
# Chris Cassidy
model.findFacesDir("./ISS_Photos/" + "Astronaut Chris Cassidy/")
d = model.found_faces

['iss036e029877_9495987996_o', 'jpg']
./pickles_bin/anton_shkaplerov&russia.dat
[]
./pickles_bin/stephen_robinson&usa.dat
[]
./pickles_bin/lee_archambault&usa.dat
[]
./pickles_bin/richard_mastracchio&usa.dat
[]
./pickles_bin/tracy_caldwell&usa.dat
[]
./pickles_bin/paul_lockhart&usa.dat
[]
./pickles_bin/charles_hobaugh&usa.dat
[]
./pickles_bin/david_saint&canada.dat
[]
./pickles_bin/kenneth_bowersox&usa.dat
[]
./pickles_bin/leopold_eyharts&france.dat
[]
./pickles_bin/michael_lopez&usa.dat
[]
./pickles_bin/takao_doi&japan.dat
[]
./pickles_bin/koichi_wakata&japan.dat
[]
./pickles_bin/donald_pettit&usa.dat
[]
./pickles_bin/jose_hernandez&usa.dat
[]
./pickles_bin/brent_jett&usa.dat
[]
./pickles_bin/james_wetherbee&usa.dat
[]
./pickles_bin/lisa_nowak&usa.dat
[]
./pickles_bin/edward_fincke&usa.dat
[]
./pickles_bin/eric_boe&usa.dat
[]
./pickles_bin/chris_hadfield&canada.dat
[]
./pickles_bin/aleksandr_skvortsov&russia.dat
[]
./pickles_bin/frederick_sturckow&usa.dat
[]
./pickles_bin/norishige_ka

In [9]:
print(len(d.values()))

136


In [13]:
# Determining accuracy; this was dramatically improved later.
count = 0
for pic in d.values():
    if 'christopher_cassidy&usa' in pic:
        count += 1
print(count)

66


In [15]:
d['expedition-36-crew-members_9730917229_o.jpg']

{'marc_garneau&canada': [4],
 'aleksandr_kaleri&russia': [5],
 'mark_polansky&usa': [5],
 'christopher_cassidy&usa': [1],
 'vladimir_dezhurov&russia': [2],
 'fyodor_yurchikhin&russia': [4],
 'pavel_vinogradov&russia': [0],
 'luca_parmitano&italy': [5],
 'aleksandr_misurkin&russia': [2],
 'umberto_guidoni&italy': [4],
 'karen_nyberg&usa': [3]}

### Full Set

In [3]:
# Processes the photos in every subdirectory of ISS_Photos
photos = {}
folder_list = os.walk("./ISS_Photos")
folders = [item[0] for item in folder_list]
for f in folders:
    model.findFacesDir(f + "/")
    photos[f] = model.found_faces

Looking for learned faces in all images in ./ISS_Photos/ using 8 threads
Looking for learned faces in all images in ./ISS_Photos/All onboard images/ using 8 threads
Analyzing image s127e006875_9467252116_o.jpg
Analyzing image iss002e5937_9495224751_o.jpg
Analyzing image iss002e5478_9498038088_o.jpg
Analyzing image s118e06106_9411538430_o.jpg
Analyzing image s132e008034_9449949807_o.jpg
Analyzing image s133e007425_9473520684_o.jpg
Analyzing image s127e006850_9467253374_o.jpg
Analyzing image s132e010067_9452718284_o.jpg
Analyzing image iss003e5552_9502196073_o.jpg
Analyzing image s128e007472_9468325058_o.jpg
Analyzing image iss01e5175_9474345590_o.jpg
Analyzing image s135e008061_9505379224_o.jpg
Analyzing image zinnias_24266599379_o.jpg
Analyzing image s129e007954_9404313994_o.jpg
Analyzing image s130e008980_9414676133_o.jpg
Analyzing image s118e09190_9411678570_o.jpg
Analyzing image iss002e5339_9498040622_o.jpg
Analyzing image s131e010135_9449180669_o.jpg
Analyzing image s118e09290_9408

In [8]:
print(len(folders))
print(photos)

2
{'yuri_gidzenko&russia': [0]}


In [9]:
# Save the processed data for fast reuse
with open('rawdata.json', 'w') as fp:
    json.dump(photos['./ISS_Photos'], fp)

In [12]:
from apyori import apriori

In [24]:
# Create a list of lists of all the astronauts photographed together
transactions = [list(list(photos['./ISS_Photos'].values())[i].keys()) for i in range(len(list(photos['./ISS_Photos'].values())))]

In [27]:
# Removes all the transactions which do not contain any astronaut names
finalTransactions = []
for i in transactions:
    if len(i) > 0:
        finalTransactions.append(i)

In [51]:
# Applies an apriori Market Basket Analysis algorithm to find the most frequent pairs.
# The minsupport is set to a very low value, as only a few of the photos will contain
# any specific astronaut.  We then use a very high min confidence, to ensure the model
# is quite sure the relationship is correct.
results = list(apriori(finalTransactions, min_support=0.01, min_confidence=0.80,
                        min_lift=1.0, max_length=None))

In [53]:
print(results[0])

RelationRecord(items=frozenset({'ronald_garan&usa', 'andrei_borisenko&russia'}), support=0.013729977116704805, ordered_statistics=[OrderedStatistic(items_base=frozenset({'ronald_garan&usa'}), items_add=frozenset({'andrei_borisenko&russia'}), confidence=1.0, lift=54.625)])


In [67]:
# Parse the information out of the apriori results
frequentItems = {}
for r in results:
    names = [x for x in r[0]]
    pair = str((names[0], names[1]))
    if pair not in results:
        frequentItems[pair] = {}
        frequentItems[pair]["support"] = r[1]
        frequentItems[pair]["confidence"] = r[2][0][2]
        frequentItems[pair]["lift"] = r[2][0][3]
    elif r[2][0][2] > results[pair]["confidence"]:
        frequentItems[pair]["support"] = r[1]
        frequentItems[pair]["confidence"] = r[2][0][2]
        frequentItems[pair]["lift"] = r[2][0][3]
print(frequentItems)
print(len(results))
print(len(frequentItems))

{"('ronald_garan&usa', 'andrei_borisenko&russia')": {'support': 0.013729977116704805, 'confidence': 1.0, 'lift': 62.42857142857142}, "('andrei_borisenko&russia', 'sergey_volkov&russia')": {'support': 0.016018306636155607, 'confidence': 0.875, 'lift': 54.625}, "('james_reilly&usa', 'janet_kavandi&usa')": {'support': 0.011441647597254004, 'confidence': 1.0, 'lift': 62.42857142857142}, "('james_reilly&usa', 'michael_gernhardt&usa')": {'support': 0.011441647597254004, 'confidence': 1.0, 'lift': 87.4}, "('stephen_lendsey&usa', 'james_reilly&usa')": {'support': 0.011441647597254004, 'confidence': 1.0, 'lift': 87.4}, "('susan_helms&usa', 'james_reilly&usa')": {'support': 0.011441647597254004, 'confidence': 1.0, 'lift': 87.4}, "('michael_gernhardt&usa', 'janet_kavandi&usa')": {'support': 0.011441647597254004, 'confidence': 0.8333333333333334, 'lift': 52.023809523809526}, "('stephen_lendsey&usa', 'michael_gernhardt&usa')": {'support': 0.011441647597254004, 'confidence': 0.8333333333333334, 'lif

In [68]:
# Save the frequent pairs data to a json
with open('frequentpairs.json', 'w') as fp:
    json.dump(frequentItems, fp)

In [74]:
# Find the raw freqencies at which astronauts are found
frequencies = {}
for k in photos['./ISS_Photos'].values():
    for name in k.keys():
        if name not in frequencies:
            frequencies[name] = 1
        else:
            frequencies[name] += 1
print(frequencies)
print(len(frequencies))

{'kimiya_yui&japan': 11, 'robert_behnken&usa': 16, 'nicholas_patrick&usa': 18, 'kathryn_hire&usa': 15, 'james_dutton&usa': 15, 'benjamin_drew&usa': 10, 'yury_usachev&russia': 35, 'george_zamka&usa': 11, 'daniel_bursch&usa': 15, 'scott_horowitz&usa': 5, 'shannon_walker&usa': 1, 'james_voss&usa': 23, 'clayton_anderson&usa': 6, 'catherine_coleman&usa': 14, 'nicole_stott&usa': 14, 'michael_barratt&usa': 11, 'stephen_lendsey&usa': 10, 'william_shepherd&usa': 5, 'claudie_haignere&france': 5, 'paolo_nespoli&italy': 12, 'ellen_ochoa&usa': 1, 'john_olivas&usa': 3, 'mikhail_tyurin&russia': 15, 'vladimir_dezhurov&russia': 21, 'joan_higginbotham&usa': 1, 'terry_virts&usa': 15, 'yuri_onufrienko&russia': 10, 'charles_hobaugh&usa': 15, 'susan_helms&usa': 21, 'james_reilly&usa': 5, 'michael_gernhardt&usa': 6, 'janet_kavandi&usa': 7, 'maksim_surayev&russia': 2, 'roman_romanenko&russia': 6, 'jeffrey_williams&usa': 4, 'robert_satcher&usa': 4, 'leland_melvin&usa': 7, 'frank_winne&belgium': 6, 'chris_hadfi

In [75]:
# Save the frequencies at which pairs appear to a json
with open('frequencies.json', 'w') as fp:
    json.dump(frequencies, fp)