In [1]:
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
import hypertools as hyp
import time, glob, itertools
import geoplot as gplt

from scipy import spatial

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(style='ticks', context='talk')
%matplotlib inline

In [2]:
path = r'J:\\Geology\\WSGS\\Projects\\Critical Minerals\\probability maps\\'
allFiles=glob.glob(path+"*.csv")
frame=pd.DataFrame
listed=[]
for file in allFiles:
    df = pd.read_csv(file, index_col=[0])
    listed.append(df)
frame = pd.concat(listed)

In [3]:
huc12=gpd.read_file(r'J:\Geology\WSGS\GIS\huc_12.shp') #reads in the hydraulic basin units and adds an area column
huc12= huc12.to_crs({'init': 'epsg:3732'})


In [4]:
len(huc12.HUC_12.unique())

2382

In [5]:
frame['HUC_12']=frame.HUC_12.astype(str)

In [6]:
CaptainHucstable = huc12.merge(frame, on='HUC_12' )

In [7]:
inHouse=pd.read_csv(r'J:\Geology\WSGS\Projects\Critical Minerals\inhouse_elemental.csv')

In [8]:
from geopandas import GeoDataFrame
from shapely.geometry import Point
import fiona
#writes the point data to a shapefile in the dir called data.shp
geometry = [Point(xy) for xy in zip(inHouse.longitude, inHouse.latitude)]
crs = {'init': 'epsg:3732'} 
geo_df = GeoDataFrame(inHouse, crs={'init': 'epsg:4326'}, geometry=geometry)
#geo_df.to_file(driver='ESRI Shapefile', filename='data.shp')
projGeoDF=geo_df.to_crs(crs)

In [9]:
val_huc = gpd.sjoin(huc12, projGeoDF)

In [10]:
validations=pd.read_csv(r'J:\Geology\WSGS\Projects\Critical Minerals\validationmap.csv')

In [11]:
val_huc = val_huc.merge(validations, on='Sample_ID') #this is the inhouse data joined with the HUC dataset

In [21]:
true_vals = []
for samples in tqdm(range(len(val_huc))):
    true_vals.append(val_huc.loc[samples].HUC_12)

bflat = []  
for samples in tqdm(range(len(val_huc))):
    formation = val_huc.loc[samples].formation[0:10]
    b = []
    for element in range(34,99):
        ppm = val_huc.columns[element] #select each element in the list
        valued = val_huc.loc[samples][element] #select each elements values 
        #from the big list, select element, formation, and concentration greater than the sample has, and create a list of the
        #basins that we should explore
        b0 = CaptainHucstable[(CaptainHucstable['element']== ppm) & (CaptainHucstable['formation'].str.contains(formation)) &(
            CaptainHucstable['bins_ppm']>= valued) & (CaptainHucstable['prob']>0.6)].HUC_12.values
        b.append(b0)
    bflat.append(np.unique(np.asarray([item for sublist in b for item in sublist])))
preds = np.unique(np.asarray([item for sublist in bflat for item in sublist]))
TP = len(set(true_vals).intersection(preds))
FP = len(preds)-len(set(true_vals).intersection(preds))
TN = len(huc12.HUC_12.unique())-len(preds)
FN = len(set(true_vals).intersection(set(huc12.HUC_12.unique()) ^ set(preds)))

precision = TP/(TP+FP)
recall = TP/(TP+FN)
acc = (TP+TN)/(TP+TN+FP+FN)

100%|██████████████████████████████████████████████████████████████████████████████| 101/101 [00:00<00:00, 4204.92it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 101/101 [2:58:47<00:00, 106.21s/it]


In [22]:
print([precision, recall, acc ])

[0.025193798449612403, 0.8125, 0.36511919698870765]


In [23]:
F1 = 2*((precision*recall)/(precision+recall))
F1

0.04887218045112781

In [24]:
print([TP, FP, TN, FN])

[39, 1509, 834, 9]


In [None]:
#df=pd.DataFrame(columns=val_huc.columns[34:-1])
#df['Sample_ID'] = val_huc.Sample_ID
#choices = []
Fone = []



for samples in tqdm(range(len(val_huc))):
    TP = []
    FP = []
    TN = []
    FN = []
    a = val_huc.loc[samples].HUC_12 #select the samples huc
    formation = val_huc.loc[samples].formation[0:10]
    b = []
    for element in range(34,99):
        ppm = val_huc.columns[element] #select each element in the list
        valued = val_huc.loc[samples][element] #select each elements values 
        #from the big list, select element, formation, and concentration greater than the sample has, and create a list of the
        #basins that we should explore
        b0 = CaptainHucstable[(CaptainHucstable['element']== ppm) & (CaptainHucstable['formation'].str.contains(formation)) &(
            CaptainHucstable['bins_ppm']>= valued) & (CaptainHucstable['prob']>0.1)].HUC_12.values
        b.append(b0)
    bflat = np.unique(np.asarray([item for sublist in b for item in sublist]))

    if a in bflat:
        #df[ppm].iloc[samples]=1 #if the actual HUC matches the list of predicted HUC's then we record a 1
        TP.append(1)
        FP.append(len(b)-1)
        TN.append(len(huc12.HUC_12.unique())-len(b))
        FN.append(0)
                       
    else: 
        TP.append(0)
        FP.append(len(b))
        TN.append(len(huc12.HUC_12.unique())-1)
        FN.append(1)
            #df[ppm].iloc[samples]=0 #if they don't match we record a 0
        #choices.append(len(b))
    precision = np.nan_to_num(np.asarray(TP)/(np.asarray(TP)+np.asarray(FP)))
    recall = np.asarray(TP)/(np.asarray(TP)+np.asarray(FN))
    acc = (np.asarray(TP)+np.asarray(TN))/(np.asarray(TP)+np.asarray(TN)+np.asarray(FP)+np.asarray(FN))
    F1 = np.nan_to_num(2*((precision*recall)/(precision+recall)))
    Fone.append(F1)
#df.to_csv(r'J:\Geology\WSGS\Projects\Critical Minerals\accuracy_60percent_probability.csv')


In [24]:
inHouse.columns

Index(['Publication', 'Sample_ID', 'Old_Sample_ID', 'WyoDoG', 'WyoDoG_Name',
       'Sample_Desc', 'latitude', 'longitude', 'ag_ppm', 'al_pct', 'as_ppm',
       'au_ppm', 'ba_ppm', 'be_ppm', 'bi_ppm', 'ca_pct', 'cd_ppm', 'cd_ppm.1',
       'co_ppm', 'cr_ppm', 'cs_ppm', 'cu_ppm', 'dy_ppm', 'er_ppm', 'eu_ppm',
       'fe_pct', 'ga_ppm', 'gd_ppm', 'ge_ppm', 'hf_ppm', 'hg_ppm', 'ho_ppm',
       'in_ppm', 'k_pct', 'la_ppm', 'li_ppm', 'lu_ppm', 'mg_pct', 'mn_ppm',
       'mo_ppm', 'na_pct', 'nb_ppm', 'nd_ppm', 'ni_ppm', 'p_ppm', 'pb_ppm',
       'pd_ppm', 'pr_ppm', 'pt_ppm', 'rb_ppm', 're_ppm', 's_pct', 'sb_ppm',
       'sc_ppm', 'se_ppm', 'sm_ppm', 'sn_ppm', 'sr_ppm', 'ta_ppm', 'tb_ppm',
       'te_ppm', 'th_ppm', 'ti_ppm1', 'ti_pct', 'ti_ppm', 'tm_ppm', 'u_ppm',
       'v_ppm', 'w_ppm', 'y_ppm', 'yb_ppm', 'zn_ppm', 'zr_ppm', 'geometry'],
      dtype='object')