In [529]:
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
import hypertools as hyp
import time, glob, itertools
import geoplot as gplt
import cartopy.crs as ccrs
import cartopy.io.img_tiles as cimgt
from scipy import spatial

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(style='ticks', context='talk')
%matplotlib inline

In [530]:
path = r'J:\\Geology\\WSGS\\Projects\\Critical Minerals\\probability maps\\'
allFiles=glob.glob(path+"*.csv")
frame=pd.DataFrame
listed=[]
for file in allFiles:
    df = pd.read_csv(file, index_col=[0])
    listed.append(df)
frame = pd.concat(listed)

In [531]:
huc12=gpd.read_file(r'J:\Geology\WSGS\GIS\huc_12.shp') #reads in the hydraulic basin units and adds an area column
huc12= huc12.to_crs({'init': 'epsg:3732'})


In [532]:
frame['HUC_12']=frame.HUC_12.astype(str)

In [533]:
CaptainHucstable = huc12.merge(frame, on='HUC_12' )

In [534]:
inHouse=pd.read_csv(r'J:\Geology\WSGS\Projects\Critical Minerals\inhouse_elemental.csv')

In [535]:
from geopandas import GeoDataFrame
from shapely.geometry import Point
import fiona
#writes the point data to a shapefile in the dir called data.shp
geometry = [Point(xy) for xy in zip(inHouse.longitude, inHouse.latitude)]
crs = {'init': 'epsg:3732'} 
geo_df = GeoDataFrame(inHouse, crs={'init': 'epsg:4326'}, geometry=geometry)
geo_df.to_file(driver='ESRI Shapefile', filename='data.shp')
projGeoDF=geo_df.to_crs(crs)

In [536]:
val_huc = gpd.sjoin(huc12, projGeoDF)

In [537]:
validations=pd.read_csv(r'J:\Geology\WSGS\Projects\Critical Minerals\validationmap.csv')

In [538]:
val_huc[val_huc['Sample_ID']==validations.loc[0].Sample_ID].HUC_12.unique()

array(['101800061003'], dtype=object)

In [None]:
predictionError=[]
binList=[]
elementList=[]

for item in CaptainHucstable.element.unique():
    try:
        distances=[]
#selects the HUC basin for each sample, predicts the concentration for that huc given the probabilities for that formation
#it samples from a uniform distribution for each bin given the probability of each bin
#it saves the predictions and compares the number of predictions per bin to the number of actual counts per bin
    
        for i in tqdm(range(len(validations.Sample_ID))):
            basin = val_huc[val_huc['Sample_ID']==validations.loc[i].Sample_ID].HUC_12.unique()[0]
            predicted = CaptainHucstable[(CaptainHucstable['element']==item)& (CaptainHucstable['HUC_12']==str(basin)) 
                        &(CaptainHucstable['formation'].str.contains(validations.loc[i].formation[0:10]))
                           ][['bins_ppm', 'prob', 'formation']].append(pd.DataFrame([[1, 0, 0]], columns=['bins_ppm', 'prob', 'formation'])
                                                                      ).sort_values(by=['bins_ppm'], ascending=True).iloc[0:]
            inhouse_samples = sorted(val_huc[val_huc['HUC_12']== str(basin)]['co_ppm'].dropna().values)
    
            bins=np.insert(CaptainHucstable[(CaptainHucstable['element']==item)].bins_ppm.unique(), 0, 0)
            hist, bin_edges = np.histogram(inhouse_samples, bins=bins)
    
            counts = np.floor(predicted.prob.values*len(inhouse_samples)).astype(int).tolist()
            counts.pop(0)
            try:
                distances.append(counts-hist)
            except:
                pass    
        differences = pd.DataFrame(distances, columns=bins[0:-1])
        predictionError.append([differences.iloc[0:,0].mean(),differences.iloc[0:,1].mean(),differences.iloc[0:,2].mean(),differences.iloc[0:,3].mean()])
        binList.append(bins)
        elementList.append(item)
    except:
        pass


100%|██████████████████████████████████████████████████████████████████████████████████| 99/99 [03:39<00:00,  2.22s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 99/99 [03:37<00:00,  2.20s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 99/99 [03:37<00:00,  2.20s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 99/99 [03:32<00:00,  2.15s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 99/99 [03:37<00:00,  2.20s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 99/99 [03:41<00:00,  2.23s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 99/99 [03:39<00:00,  2.22s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 99/99 [03:31<00:00,  2.14s/it]
100%|███████████████████████████████████

In [644]:
predictionError

[[9.0, 1.0, -6.0, -10.0], [15.0, 0.0, 5.0, -18.0]]

In [645]:
binList

[array([  0.  ,   2.39,   3.08,   4.19, 224.2 ]),
 array([  0.,   0.,   1.,   3., 194.])]

In [642]:
print([differences.iloc[0:,0].mean(),differences.iloc[0:,1].mean(),differences.iloc[0:,2].mean(),differences.iloc[0:,3].mean()])
#negative numbers are underprediction and positive numbers are over prediction
#again these are the difference between predicted counts per bin and actual counts per bin

[9.0, 1.0, -6.0, -10.0]


In [627]:
differences.iloc[0:,0].mean()

-1.462686567164179