# Creates folders with symlinks for training

Using the accuracy results recorded for each folder, create training folders of symlinks, one for each accuracy threshold.

In [104]:
import glob
import os
from pathlib import Path
import pandas as pd

# Directory whose entries are globbed as the source folders for the symlinks
SPECTROGRAM_PATH = '/earthquake-recognition/data/all-spectrograms/'
# Root directory under which one symlink folder per accuracy threshold is written
SYMLINK_PATH = '/earthquake-recognition/data/all-spectrograms-symlinks/'
# Results CSV loaded with pandas (a relative path, resolved against the working directory)
CSV_PATH = 'csv/weighted_results_everywhere.csv'

In [89]:
# Every entry directly under the spectrogram directory, plus its base name
folders = glob.glob(SPECTROGRAM_PATH + "*")
names = [os.path.basename(folder) for folder in folders]

In [126]:
# Allow up to 1000 rows to be shown when a frame is displayed
pd.set_option('display.max_rows', 1000)

df = pd.read_csv(CSV_PATH)

# Keep only the second "/"-separated component of each recorded name
df['Name'] = df['Name'].map(lambda full_name: full_name.split("/")[1])

# Per-class accuracy: correct count divided by total count
df['Noise Percent Correct'] = df['Amount Correct Noise'] / df['Amount Total Noise']
df['Local Percent Correct'] = df['Amount Correct Local'] / df['Amount Total Local']

# Positional column order applied after sorting by overall accuracy
new_order = [0,6,1,2,3,4,7,8,5]
df = (
    df.sort_values('Total Percent Correct', ascending=False)
      .iloc[:, new_order]
      .reset_index(drop=True)
)

# Preview the ten best rows
df.head(10)

Unnamed: 0,Name,Epochs,Amount Correct Noise,Amount Total Noise,Amount Correct Local,Amount Total Local,Noise Percent Correct,Local Percent Correct,Total Percent Correct
0,18.468--67.111099,10,45,45,467,467,1.0,1.0,1.0
1,33.599991--117.195427,10,1466,1466,1094,1094,1.0,1.0,1.0
2,40.723--111.907173,10,134,134,122,122,1.0,1.0,1.0
3,39.5089--119.836601,10,1524,1524,1036,1036,1.0,1.0,1.0
4,18.018099--66.022209,10,229,229,283,283,1.0,1.0,1.0
5,33.6688--116.672997,10,1261,1261,786,787,1.0,0.998729,0.999512
6,20.125248--155.777374,10,1281,1281,766,767,1.0,0.998696,0.999512
7,32.820301--117.056702,10,989,989,1058,1059,1.0,0.999056,0.999512
8,19.493172--155.386017,10,1435,1436,1123,1124,0.999304,0.99911,0.999219
9,19.493172--155.386017-minmag1,10,1453,1455,1105,1105,0.998625,1.0,0.999219


In [127]:
def limit_percent_correct(df, limit):
    """
    Select the rows that clear an accuracy threshold.

    A row is kept only when both its 'Total Percent Correct' and its
    'Local Percent Correct' (the earthquake class) are strictly greater
    than ``limit``. The original index and columns are preserved.
    """
    total_above = df['Total Percent Correct'] > limit
    local_above = df['Local Percent Correct'] > limit

    return df.loc[total_above & local_above]

# Labels for the accuracy thresholds, strictest first
df_names = ['99.5', '99', '98', '97', '96', '95', '90']

# One filtered frame per threshold, in the same order as df_names
df_995, df_99, df_98, df_97, df_96, df_95, df_90 = (
    limit_percent_correct(df, limit)
    for limit in (.995, .99, .98, .97, .96, .95, .9)
)

dfs = [df_995, df_99, df_98, df_97, df_96, df_95, df_90]

In [132]:
# Root directory that the symlinks created by make_symlinks point into.
# When this runs inside Docker, the in-container path would be wrong for the
# links, so the location on the real (host) filesystem is used instead.
# NOTE: despite the name, this is an absolute path.
RELATIVE_SYMLINK_PATH = '/home/audretj/developer-projects/earthquake-recognition/data/all-spectrograms'

def make_symlinks(folder_names, read_path, write_path, link_target_root=None):
    """
    Create a symlink in write_path for each entry of read_path named in folder_names.

    Args:
        folder_names: iterable of base names to link.
        read_path: directory whose immediate entries are the candidates; a
            trailing separator is optional.
        write_path: directory that receives the symlinks; created if missing.
        link_target_root: directory the symlinks point into. Defaults to the
            module-level RELATIVE_SYMLINK_PATH.

    Entries that already exist in write_path are left untouched, so the
    function can be called repeatedly.
    """
    if link_target_root is None:
        link_target_root = RELATIVE_SYMLINK_PATH

    # Build the lookup set once rather than once per globbed entry
    wanted = set(folder_names)

    os.makedirs(write_path, exist_ok=True)

    # os.path.join keeps the pattern inside read_path even without a trailing
    # separator; plain concatenation would also match sibling directories.
    for folder in sorted(glob.glob(os.path.join(read_path, "*"))):
        name = os.path.basename(folder)
        if name not in wanted:
            continue

        link_path = os.path.join(write_path, name)

        # lexists, unlike exists, is True for a dangling symlink. Links whose
        # target is only valid on the host look dangling from inside Docker,
        # and os.symlink would raise FileExistsError on a re-run.
        if not os.path.lexists(link_path):
            os.symlink(os.path.join(link_target_root, name), link_path)
        

In [133]:
# Build one symlink folder per accuracy threshold, named after the threshold.
# The loop variable is deliberately not called `df`: a module-level `for`
# rebinds its target, which would leave `df` pointing at the last subset and
# make any re-run of the filtering cell start from already-filtered data.
for subset, df_name in zip(dfs, df_names):
    folder_names = subset['Name'].values
    symlink_path = os.path.join(SYMLINK_PATH, df_name)
    make_symlinks(folder_names, SPECTROGRAM_PATH, symlink_path)