In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import dump
from joblib import load
from train import train_rbf_simple
from utils import generate_migrants
from utils import calculate_thresholds_basic
from migrant_detection import predict_migrant_linear
from migrant_detection import adaptive_threshold
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression statement in a cell, not only
# the last one. Later cells rely on this to show several `.head()` results and
# plot objects from a single cell.
InteractiveShell.ast_node_interactivity = "all"

Test basic migrant detection on a linearly separable dataset with a linear SVM (still included for the visualization)

In [None]:
# Load the labelled reference dataset and the fitted classifier
# (both paths are placeholders that must be filled in before running).
data = load("insert dataset")
clf = load("insert model")

# Generate 10 migrants and a datapoint for each year since the migration
n = 10
migrants = generate_migrants(data, n)

# Only the first generated migrant is inspected in this cell.
migrant = migrants[0]
# Extract the first two dimensions for plotting
x = migrant[:, 0].flatten()
y = migrant[:, 1].flatten()

# Background layer: all reference samples in the two plotted isotope ratios.
x1 = data['87Sr/86Sr']
y1 = data['208Pb/204Pb']
plt.scatter(x1, y1)

# Generate basic threshold for distance to hyperplane (t1) and distance to
# cluster connecting vector (t2). This call receives the labelled frame.
t1, t2 = calculate_thresholds_basic(data, clf)

# Drop the labels into a separate frame so `data` keeps its 'cluster' column.
features = data.drop(columns=['cluster'])

predictions_migrant = predict_migrant_linear(migrant, features, clf, t1, t2)
print(predictions_migrant)

# Plot detected migrants, coloured by the prediction for each point.
# NOTE(review): the slice assumes 20 datapoints per migrant — confirm against
# generate_migrants.
plt.scatter(x[0:20], y[0:20], c=predictions_migrant, cmap='viridis')

# Plot support vectors (first two dimensions only). A literal colour is used,
# so no `cmap` is passed: matplotlib ignores it and emits a UserWarning when
# `c` carries no data to map.
support = np.asarray(clf.support_vectors_)[:, :2]
plt.scatter(support[:, 0], support[:, 1], c='red')

In [None]:
# Test migrant prediction on the reference clusters themselves - still
# included for the visualization.
# (Both paths are placeholders that must be filled in before running.)
data = load("insert dataset")
clf = load("insert model")

# Label-free copy of the dataset; `data` itself keeps the 'cluster' column.
data_no_cluster = data.drop(columns=['cluster'])

# Extract the first two dimensions for plotting
x1 = data['87Sr/86Sr']
y1 = data['208Pb/204Pb']
plt.scatter(x1, y1)

# Generate basic threshold for distance to hyperplane (t1) and distance to
# cluster connecting vector (t2). This call receives the labelled frame.
t1, t2 = calculate_thresholds_basic(data, clf)

# Every reference sample is scored as if it were a migrant candidate. The
# label-free frame is computed once and used both as the samples to score and
# as the reference data.
predictions_migrant = predict_migrant_linear(data_no_cluster.values, data_no_cluster, clf, t1, t2)
print(predictions_migrant)

# Plot detected migrants, coloured by the prediction for each sample.
plt.scatter(x1, y1, c=predictions_migrant, cmap='viridis')

# Plot support vectors (first two dimensions only). A literal colour is used,
# so no `cmap` is passed: matplotlib ignores it and emits a UserWarning when
# `c` carries no data to map.
support = np.asarray(clf.support_vectors_)[:, :2]
plt.scatter(support[:, 0], support[:, 1], c='red')


Test Migrant Generation from generated Data

In [None]:
# Included for the visualization
# (The path is a placeholder that must be filled in before running.)
data = load("insert dataset")
n = 1
migrants = generate_migrants(data, n)

# `migrants` is indexed with three axes below. The colour pattern
# (0..years-1 repeated once per migrant) treats axis 0 as the migrant and
# axis 1 as the year, so both counts are read from the array instead of
# hard-coding 20 years.
n_migrants, n_years = migrants.shape[:2]

# Extract the first two dimensions for plotting
x = migrants[:, :, 0].flatten()
y = migrants[:, :, 1].flatten()

# Background layer: all reference samples.
x1 = data['87Sr/86Sr']
y1 = data['208Pb/204Pb']
plt.scatter(x1, y1)
#sns.kdeplot(data=cluster_0, x='87Sr/86Sr', y='208Pb/204Pb')
#sns.kdeplot(data=cluster_1, x='87Sr/86Sr', y='208Pb/204Pb')

# One colour value per point: the year index, repeated for every migrant so it
# lines up with the row-major flatten() above.
colors = np.tile(np.arange(n_years), n_migrants)

# Create a scatter plot of the migrant trajectory
plt.scatter(x, y, c=colors, cmap='magma')
# Plot single migrant
#plt.scatter(x[10], y[10], c='red')
# Add labels and title
plt.xlabel('87Sr/86Sr')
plt.ylabel('208Pb/204Pb')
#plt.title('Process if linear Mixing after Migration')

# Show colorbar for reference
cbar = plt.colorbar()
cbar.set_label('Year')

# Display the plot
#plt.show()
#plt.savefig('linear_mixing_process.svg')

In [None]:
# Look at preparatory experiments for threshold calculations
# (The stray empty-string literal that preceded this comment was removed: it
# is an expression statement and was echoed as '' in the cell output.)
from pprint import pprint
from joblib import load
import seaborn as sns
import numpy as np
import pandas as pd

results = load("experiment_result/preparatory_experiment_threshold_3_multiplier_fine_tuning.pkl")

# NOTE(review): the second groupby keys on the *averaged* 'Seed' column produced
# by the first one — confirm this is the intended summary.
print(results.groupby(['Multiplier']).mean().groupby(['Seed']).mean())

# Average the scores over all seeds for each multiplier and plot them.
results = (
    results
    .drop(columns=['Seed'])
    .rename(columns={"Score": "F1-Score"})
    .groupby(['Multiplier'])
    .mean()
)
sns.lineplot(results)
plt.show()

In [None]:
# Visualize the distance distributions
# Loads one pickled collection of distances, shows its skewness and kurtosis,
# and draws a Q-Q plot of it.
#%pip install statsmodels # install statsmodels if needed
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
  
# Exactly one of the following inputs is active at a time; switch by (un)commenting.
#data = np.array(load('Plots/distances_cluster_0.pkl'))
data = np.array(load('Plots/distances_cluster_1.pkl'))
#data = load('Plots/distances_ccv_[0.89797944 0.18919623 0.08775629 0.19234022 0.27453981 0.14073406].pkl')
#data = np.array(load('Plots/distances_ccv_[0.89797944 0.18919623 0.08775629 0.19234022 0.27453981 0.14073406].pkl'))
# `std` and `mean` are only referenced by the disabled KDE block kept as a
# string literal further down in this cell.
std = np.std(data)
mean = np.mean(data)
# Bare expressions: their values are shown because the notebook sets
# ast_node_interactivity = "all" in its first cell.
stats.skew(data)
# fisher=False selects Pearson's definition of kurtosis (a normal distribution gives 3.0).
stats.kurtosis(data,fisher=False)
cols =['Distance to CCV']
#data = pd.DataFrame(data,columns= cols)
#sns.displot(data, x=cols[0], kind='kde')
#plt.savefig("Distances_ccv_kde_plot_c1.svg")

# Q-Q plot of the sample; line='s' adds the standardized reference line.
sm.qqplot(data, line='s') 
#plt.show() 

#fig = plt.figure()
#ax = fig.add_subplot(111)
#res = stats.probplot(data, plot=sns.mpl.pyplot)

#ax.set_title("Probability Plot for the Distances of Cluster 1 to the CCV")

#plt.savefig("Distances_ccv_qq_plot_c1.svg")
#plt.show()
'''
g = sns.displot(
    data, 
    x=cols[0],
    kind='kde',
    facet_kws=dict(sharey=False, sharex=False)
)
# extract and flatten the axes from the figure
axes = g.axes.flatten()

# iterate through each axes
for ax in axes:
    # extract the species name
    #spec = ax.get_title().split(' = ')[1]
    
    # select the data for the species
    #data = pen_g.loc[spec, :]
    
    # print data as needed or comment out
    #print(data)
    
    # plot the lines
    ax.axvline(x=mean, c='k', ls='-', lw=1.8)
    ax.axvline(x=mean + std, c='yellow', ls='--', lw=1.8)
    ax.axvline(x=mean - std, c='yellow', ls='--', lw=1.8)

    ax.axvline(x=mean + 2*std, c='orange', ls='--', lw=1.8)
    ax.axvline(x=mean - 2*std, c='orange', ls='--', lw=1.8)

    ax.axvline(x=mean + 3*std, c='red', ls='--', lw=1.8)
    ax.axvline(x=mean - 3*std, c='red', ls='--', lw=1.8)

plt.savefig("Distances_ccv_kde_plot_quantiles_c1.svg")
'''

In [None]:
# Look at Experiment 2 Results Cluster Size
# NOTE(review): the heading says "Experiment 2" but the file loaded below is
# named experiment03_unbalanced_clusters — confirm which is correct.
# (The stray empty-string literal that preceded this comment was removed: it
# is an expression statement and was echoed as '' in the cell output.)
from pprint import pprint
from joblib import load
import seaborn as sns
import numpy as np
import pandas as pd

results = load('experiment_result/experiment03_unbalanced_clusters.pkl')

# Average every metric over the seeds for each sample size.
results = (
    results
    .drop(columns=['Seed'])
    .rename(columns={"Score": "F1-Score"})
    .groupby(['Samples'])
    .mean()
    .reset_index()
)

# Long format: one row per (Samples, Metric) pair so each metric becomes a hue level.
ax = sns.pointplot(
    data=pd.melt(results, ['Samples'], var_name='Metric', value_name='Score'),
    x='Samples',
    y='Score',
    hue='Metric',
)
# Put the legend outside the axes so it does not cover the lines.
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
plt.xticks(rotation=45)
#ax.set_xticks(['25','50','100','200','500','1000','10000','25000', '50000', '75000', '100000', '150000', '250000'])
#ax.set_xticks(range(13), labels=['25','50','100','200','500','1000','10000','25000', '50000', '75000', '100000', '150000', '250000']) # <--- set the ticks first
plt.show()

In [None]:
# Experiment 1: migration progress, evaluated on groups of years
from pprint import pprint
from joblib import load
import seaborn as sns
import numpy as np
import pandas as pd

cols = ['Start_Year', 'End_Year']

# Load the raw results, label every row with its "<start>-<end>" year range and
# average the remaining metrics per range (the seeds are averaged out).
results = (
    load('experiment_result/experiment01_groups_of_years_seeds_multiplier_1_5.pkl')
    .drop(columns=['Mahalanobis_migrant_c0', 'Mahalanobis_migrant_c1', 'Min_Mahalanobis', 'Mahalanobis_c0', 'Mahalanobis_c1'])
    .assign(Years=lambda frame: frame[cols].apply(lambda row: '-'.join(row.values.astype(str)), axis=1))
    .drop(columns=cols)
    .drop(columns=['Seed'])
    .rename(columns={"Score": "F1-Score"})
    .groupby(['Years'])
    .mean()
    .reset_index()
)
print(results)

# Explicit category order for the x axis.
# NOTE(review): the last label is "16-20" although the previous one ends at 16 —
# confirm it matches the 'Years' labels present in the data.
order = ["1-4", "5-8", "9-12", "13-16", "16-20"]
long_results = results.melt(id_vars=['Years'], var_name='Metric', value_name='Score')
ax = sns.pointplot(data=long_results, x='Years', y='Score', hue='Metric', order=order)
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
ax.set_ylim(0.5, 1.01)
plt.xticks(rotation=45)
plt.show()

# Column reference for the results file. The full set also contains
# 'Mahalanobis_migrant_c0', 'Mahalanobis_migrant_c1', 'Min_Mahalanobis',
# 'Mahalanobis_c0' and 'Mahalanobis_c1'.
columns = ['Seed', 'Start_Year', 'End_Year', 'Accuracy', 'Recall', 'Precision', 'Score']

In [None]:
# Look at Experiment 1 Migration Progress per Year
# (The stray empty-string literal that preceded this comment was removed: it
# is an expression statement and was echoed as '' in the cell output.)
from pprint import pprint
from joblib import load
import seaborn as sns
import numpy as np
import pandas as pd
results = load("experiment_result/experiment01_per_years_seeds_multiplier_2.pkl")
# Mahalanobis Distance Visualization
results = results.drop(columns=['End_Year'])
results = results.drop(columns=['Seed'])
# Reduce each 'Min_Mahalanobis' entry to a single number before averaging per year.
results['Min_Mahalanobis'] = results['Min_Mahalanobis'].apply(np.mean)

results = results.drop(columns=[ 'Migrant_Prediction', 'Mahalanobis_migrant_c0', 'Mahalanobis_migrant_c1', 'Mahalanobis_c0', 'Mahalanobis_c1'])
results = results.groupby(['Start_Year']).mean()
results = results.reset_index()
results = results.rename(columns={"Start_Year": "Year", "Min_Mahalanobis": "Mahalanobis Distance"})
# Keep the Axes returned by this plot. The cell previously called
# sns.move_legend on an `ax` that it never assigned, i.e. on whatever axes an
# earlier cell had left behind (a NameError on a fresh kernel). This plot has
# no hue and therefore no legend to move, so that call is dropped.
ax = sns.pointplot(data=results, x='Year', y='Mahalanobis Distance')
#ax = sns.pointplot(data = pd.melt(results, ['Year'], var_name = 'Data', value_name = 'Mahalanobis Distance'), x='Year', y='Mahalanobis Distance', hue='Metric')
"""
# Average Prediction of Migrant
results = results.drop(columns=['End_Year'])
results = results.drop(columns=['Seed'])
results['Migrant_Prediction'] = results['Migrant_Prediction'].apply(np.sum)
results = results.drop(columns=[ 'Min_Mahalanobis', 'Mahalanobis_migrant_c0', 'Mahalanobis_migrant_c1', 'Mahalanobis_c0', 'Mahalanobis_c1'])
results = results.groupby(['Start_Year']).mean()
results = results.reset_index()
sns.pointplot( data = results, x='Start_Year', y='Migrant_Prediction')
plt.xticks(rotation=45)
plt.show()
"""
"""
# Metric Visualization
results = results.drop(columns=[ 'Migrant_Prediction', 'Mahalanobis_migrant_c0', 'Mahalanobis_migrant_c1', 'Min_Mahalanobis', 'Mahalanobis_c0', 'Mahalanobis_c1'])
print(results.copy().groupby(['Start_Year']).mean())
print(results.copy().groupby(['Start_Year']).mean().groupby(['Seed']).mean())
#sns.barplot(results, x='Samples', y='Recall')
#plt.show()
#cols = ['Start_Year', 'End_Year']
#results['Years'] = results[cols].apply(lambda row: '-'.join(row.values.astype(str)), axis=1)
results = results.drop(columns=['End_Year'])
results = results.drop(columns=['Seed'])
results = results.rename(columns={"Score": "F1-Score", "Start_Year": "Year"})
#results = results[results['Samples']<1000]
results = results.groupby(['Year']).mean()
results = results.reset_index()
print(results)

#sns.pointplot(data = results, x=results.index, y='F1-Score')
#sns.pointplot(data = results, x=results.index, y='Recall')
ax = sns.pointplot(data = pd.melt(results, ['Year'], var_name = 'Metric', value_name = 'Score'), x='Year', y='Score', hue='Metric')
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
#ax.set(ylim=(0.5, 1.01))
plt.xticks(rotation=45)
#ax.set_xticks(['25','50','100','200','500','1000','10000','25000', '50000', '75000', '100000', '150000', '250000'])
#ax.set_xticks(range(13), labels=['25','50','100','200','500','1000','10000','25000', '50000', '75000', '100000', '150000', '250000']) # <--- set the ticks first
plt.show()
"""
# Column reference for the per-year results file. `columns` is only assigned
# here and not read by the live code of this cell.
columns = ['Seed', 'Start_Year', 'End_Year', 'Migrant_Prediction' ,'Mahalanobis_migrant_c0', 'Mahalanobis_migrant_c1', 'Min_Mahalanobis', 'Mahalanobis_c0', 'Mahalanobis_c1', 'Accuracy', 'Recall', 'Precision', 'Score']

# Shorter variant without the Mahalanobis and prediction columns; this second
# assignment overwrites the list above.
columns=['Seed', 'Start_Year', 'End_Year', 'Accuracy', 'Recall', 'Precision', 'Score']


In [None]:
# Look at results for Experiment Spread Differences
# Loads the experiment results, prints two grouped summaries, averages the
# metrics per 'Multiplier for STD' and plots them as one line per metric.
from pprint import pprint
from joblib import load
import seaborn as sns
import numpy as np
import pandas as pd
results = load('experiment_result/experiment04_Spread_Differences_C10E10_rbf.pkl')
# Mean of every column per multiplier.
print(results.copy().groupby(['Multiplier for STD']).mean())
# NOTE(review): the second groupby keys on the *averaged* 'Seed' column produced
# by the first one — confirm this is the intended summary.
print(results.copy().groupby(['Multiplier for STD']).mean().groupby(['Seed']).mean())
# NOTE(review): "Determinant Cluster 0" is listed twice and "Determinant Cluster 1"
# never. The reference list at the end of this cell repeats the same pair, so the
# stored frame may really carry the duplicated name — verify against the pickle
# before changing this.
results = results.drop(columns=['Seed','Trace Cluster 0', "Trace Cluster 1","Determinant Cluster 0", "Determinant Cluster 0","Confusion Matrix"])
results = results.rename(columns={"Score": "F1-Score"})
results = results.groupby(['Multiplier for STD']).mean()
results = results.reset_index()

# Long format: one row per (multiplier, metric) pair so each metric becomes a hue level.
ax = sns.pointplot(data = pd.melt(results, ['Multiplier for STD'], var_name = 'Metric', value_name = 'Score'), x='Multiplier for STD', y='Score', hue='Metric')
# Put the legend outside the axes.
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
#ax.set(ylim=(0.5, 1.01))
plt.xticks(rotation=45)
#ax.set_xticks(['25','50','100','200','500','1000','10000','25000', '50000', '75000', '100000', '150000', '250000'])
#ax.set_xticks(range(13), labels=['25','50','100','200','500','1000','10000','25000', '50000', '75000', '100000', '150000', '250000']) # <--- set the ticks first
plt.show()

# Column reference; `columns` is only assigned here and not read by this cell.
columns = ['Seed', 'Multiplier for STD','Trace Cluster 0', "Trace Cluster 1","Determinant Cluster 0", "Determinant Cluster 0","Confusion Matrix", 'Accuracy', 'Recall', 'Precision', 'Score']

In [None]:
# Spread-differences experiment, additional infos: average number of
# overlapping dimensions per STD multiplier.
from pprint import pprint
from joblib import load
import seaborn as sns
import numpy as np
import pandas as pd

results = load('experiment_result/experiment04_Spread_Differences_10E10_rbf_additional_Infos.pkl')
# Overwrite the stored column names with an explicit, unambiguous set.
results.columns = ['Seed', 'Multiplier for STD','Trace Cluster 0', "Trace Cluster 1","Determinant Cluster 0", "Determinant Cluster 1","Confusion Matrix", 'Accuracy', 'Recall', 'Precision', 'Score', 'SVM F1-Score', 'Number of overlapping Dimensions']

# Of the thirteen columns named above only the grouping key and the plotted
# quantity are kept; the mean is taken over all runs of each multiplier.
kept_columns = ['Multiplier for STD', 'Number of overlapping Dimensions']
results = (
    results[kept_columns]
    .groupby(['Multiplier for STD'])
    .mean()
    .reset_index()
)

ax = sns.pointplot(data=results, x="Multiplier for STD", y="Number of overlapping Dimensions")

plt.xticks(rotation=45)
plt.show()

# Column reference; `columns` is only assigned here and not read by this cell.
columns = ['Seed', 'Multiplier for STD','Trace Cluster 0', "Trace Cluster 1","Determinant Cluster 0", "Determinant Cluster 0","Confusion Matrix", 'Accuracy', 'Recall', 'Precision', 'Score']

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
"""
The Fritzens-Sanzeno/reduced dataset was published by Grupe, G., Klaut, D., Otto, L., Mauder, M., Lohrer, J., Kröger, P., and Lang, A. (2020).
The genesis and spread of the early Fritzens-Sanzeno culture (5th/4th cent. BCE)–Stable
isotope analysis of cremated and uncremated skeletal finds. Journal of Archaeological
Science: Reports, 29:102121. Publisher: Elsevier.

This should be cited accordingly when used
"""
# Generate combined results from grid searches
# Look at results of gridsearch
#grid = load("isotope_prediction/parameter_tuning/grid_search_isotope_2-3.pkl")
#grid = load('isotope_prediction/parameter_tuning/grid_search_isotope_1-2.pkl')
#grid = load('models/grid_search_isotope_1-2_linear_n100.pkl')

# Step 1: Filter rows where 'param_svc__kernel' is 'linear'
#linear_df = results[results['param_svc__kernel'] == 'linear']
# Step 2: Find duplicates in 'param_svc__C' within the filtered data
#duplicates = linear_df.duplicated(subset='param_svc__C')
# Combine the duplicate mask with the 'linear' condition back onto the original DataFrame
# To get indices of the original DataFrame to drop
#indices_to_drop = linear_df[duplicates].index

# Step 3: Drop these indices from the original DataFrame
#results = results.drop(index=indices_to_drop)


def _ranked_cv_results(grid_search):
    """Return ``grid_search.cv_results_`` as a DataFrame sorted by ``rank_test_score``.

    Parameters
    ----------
    grid_search : object exposing a ``cv_results_`` mapping (here: the objects
        unpickled from the ``models/grid_search_*.pkl`` files).

    Returns
    -------
    pandas.DataFrame
        One row per evaluated parameter combination, best rank first.
    """
    return pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')


# The same load -> frame -> sort sequence was pasted once per kernel; it now
# goes through the helper above. `grid` is still rebound after every load, so
# it ends up referring to the polynomial search as before.
grid = load('models/grid_search_isotope_reduced_linear_n100.pkl')
results = _ranked_cv_results(grid)
#groups = dict(tuple(results.groupby('Group')))
results.head(10)

grid = load('models/grid_search_isotope_reduced_rbf_n100.pkl')
results1 = _ranked_cv_results(grid)
results1.head(10)

grid = load('models/grid_search_isotope_reduced_poly_n20.pkl')
results2 = _ranked_cv_results(grid)
results2.head(10)

#res = pd.concat([results2.head(2), results.head(2),results1.head(2)], axis=0, ignore_index=True) # Best two per kernel
# Stack the results of all three kernels and rank them by mean test score.
res = pd.concat([results2, results, results1], axis=0, ignore_index=True) # All results
res = res.sort_values('mean_test_score', ascending=False)
res.head(20)
#dump(res, "models/grid_search_isotope_1-3_combined")

"""
grid = load('models/grid_search_isotope_reduced_poly_n100.pkl')
grid = pd.DataFrame(grid.cv_results_)
grid = grid[(grid['mean_test_score'].isna()) ]
grid.sort_values('param_svc__gamma')
"""
"""# Read Isotope Data Set
isodata = pd.read_csv("dataset")
# Create Groups for North of Alps, Inneralpine, and southern Tirol
site_group = pd.DataFrame(isodata["site code"].values /100)
isodata['site group'] = site_group
isodata['site group'] = isodata['site group'].astype('int64')
cols = ["87Sr/86Sr", "208Pb/204Pb", "207Pb/204Pb", "206Pb/204Pb", "208Pb/207Pb", "206Pb/207Pb"]

# Select two groups
isodata = isodata.query('`site group` == 1 | `site group` == 2')
isodata.loc[isodata['site group'] == 1, 'site group'] =0
isodata.loc[isodata['site group'] == 2, 'site group'] =1

scaler = MinMaxScaler() #Can be changed to robustscaler or standardscaler



# Scaling adjusted before splitting
isodata[cols] = scaler.fit_transform(isodata[cols])

print(
    "The best parameters are %s with a score of %0.2f"
    % (grid.best_params_, grid.best_score_)
)
predictions = grid.predict(isodata[cols])
print(classification_report(isodata['site group'],predictions))
cm = confusion_matrix(isodata['site group'], predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()
"""

In [None]:
# Visualize reduced isotope dataset prediction
from joblib import load
import seaborn as sns
from itertools import cycle
import numpy as np
import pandas as pd
from matplotlib.container import BarContainer
models = load('isotope_prediction/models/best_models_reduced_dataset.pkl')
# The single-model prediction file used to be loaded into `result` and was
# overwritten by the very next statement, so that load only cost I/O (and would
# fail the cell if the file was absent). It is kept here as a switchable
# alternative instead.
#result = load('isotope_prediction/predictions/reduced_datasetKernel: rbf C: 100 Gamma: scale.pkl')
result = load('isotope_prediction/predictions/reduced_dataset.pkl')
cols = result.columns
"""
The Fritzens-Sanzeno/reduced dataset was published by Grupe, G., Klaut, D., Otto, L., Mauder, M., Lohrer, J., Kröger, P., and Lang, A. (2020).
The genesis and spread of the early Fritzens-Sanzeno culture (5th/4th cent. BCE)–Stable
isotope analysis of cremated and uncremated skeletal finds. Journal of Archaeological
Science: Reports, 29:102121. Publisher: Elsevier.

This should be cited accordingly when used
"""
"""
result = result.sort_values(by=['Earliest positive Prediction at Multiplier'], ascending=False).head(50)
#result[['Migrant','Earliest positive Prediction at Multiplier']]
plot_data = result.loc[result['Earliest positive Prediction at Multiplier'] != -1].sort_values(by=['Earliest positive Prediction at Multiplier'], ascending=False)
ax = sns.barplot(plot_data, y='Burial', x='Earliest positive Prediction at Multiplier', hue='Migrant')
#plt.figure(figsize=(100,8))
#plt.xticks(rotation=45)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=5)
"""


"""
# For best 10

# Create Plot based on average Rank with new logarithmic grid
m1 = load("isotope_prediction/predictions/reduced_dataset_exhaustive_gridKernel: rbf C: 265608778.2946684 Gamma: 3.5111917342151347e-06.pkl")
m2 = load("isotope_prediction/predictions/reduced_dataset_exhaustive_gridKernel: rbf C: 351119173.4215127 Gamma: 3.5111917342151347e-06.pkl")
m3 = load("isotope_prediction/predictions/reduced_dataset_exhaustive_gridKernel: rbf C: 613590727.3413188 Gamma: 3.5111917342151347e-06.pkl")
m4 = load("isotope_prediction/predictions/reduced_dataset_exhaustive_gridKernel: rbf C: 1072267222.0103253 Gamma: 3.5111917342151347e-06.pkl")
m5 = load("isotope_prediction/predictions/reduced_dataset_exhaustive_gridKernel: rbf C: 1873817422.8603868 Gamma: 3.5111917342151347e-06.pkl")
m6 = load("isotope_prediction/predictions/reduced_dataset_exhaustive_gridKernel: rbf C: 2477076355.991714 Gamma: 3.5111917342151347e-06.pkl")
m7 = load("/isotope_prediction/predictions/reduced_dataset_exhaustive_gridKernel: rbf C: 3274549162.877732 Gamma: 3.5111917342151347e-06.pkl")
m8 = load("isotope_prediction/predictions/reduced_dataset_exhaustive_gridKernel: rbf C: 4328761281.083061 Gamma: 3.5111917342151347e-06.pkl")
m9 = load("isotope_prediction/predictions/reduced_dataset_exhaustive_gridKernel: rbf C: 5722367659.35022 Gamma: 3.5111917342151347e-06.pkl")
m10 = load("isotope_prediction/predictions/reduced_dataset_exhaustive_gridKernel: rbf C: 10000000000.0 Gamma: 3.5111917342151347e-06.pkl")

m1["rank1"] = m1["Earliest positive Prediction at Multiplier"].rank(method='min', ascending=False)
m1["rank2"] = m2["Earliest positive Prediction at Multiplier"].rank(method='min', ascending=False)
m1["rank3"] = m3["Earliest positive Prediction at Multiplier"].rank(method='min', ascending=False)
m1["rank4"] = m4["Earliest positive Prediction at Multiplier"].rank(method='min', ascending=False)
m1["rank5"] = m5["Earliest positive Prediction at Multiplier"].rank(method='min', ascending=False)
m1["rank6"] = m6["Earliest positive Prediction at Multiplier"].rank(method='min', ascending=False)
m1["rank7"] = m7["Earliest positive Prediction at Multiplier"].rank(method='min', ascending=False)
m1["rank8"] = m8["Earliest positive Prediction at Multiplier"].rank(method='min', ascending=False)
m1["rank9"] = m9["Earliest positive Prediction at Multiplier"].rank(method='min', ascending=False)
m1["rank10"] = m10["Earliest positive Prediction at Multiplier"].rank(method='min', ascending=False)
m1["Average Rank"] = m1[["rank1", "rank2", "rank3", "rank4", "rank5", "rank6", "rank7", "rank8", "rank9", "rank10"]].mean(axis=1)
m1[["rank1", "rank2", "rank3", "rank4", "rank5", "rank6", "rank7", "rank8", "rank9", "rank10", "Average Rank"]]
print(m1["Average Rank"])
"""

# For best 2 per kernel
# Prediction files of the two best models per kernel on the reduced dataset,
# in the order linear, linear, poly, poly, rbf, rbf.
best_two_per_kernel_files = [
    'isotope_prediction/predictions/reduced_dataset_exhaustive_grid_best_2_per_kernelKernel: linear C: 705.4802310718645 Gamma: nan.pkl',
    'isotope_prediction/predictions/reduced_dataset_exhaustive_grid_best_2_per_kernelKernel: linear C: 932.60334688322 Gamma: nan.pkl',
    'isotope_prediction/predictions/reduced_dataset_exhaustive_grid_best_2_per_kernelKernel: poly C: 0.7847599703514611 Gamma: 3.792690190732254.pkl',
    'isotope_prediction/predictions/reduced_dataset_exhaustive_grid_best_2_per_kernelKernel: poly C: 1623776.7391887177 Gamma: 0.0069519279617756054.pkl',
    'isotope_prediction/predictions/reduced_dataset_exhaustive_grid_best_2_per_kernelKernel: rbf C: 1072267222.0103253 Gamma: 3.5111917342151347e-06.pkl',
    'isotope_prediction/predictions/reduced_dataset_exhaustive_grid_best_2_per_kernelKernel: rbf C: 5722367659.35022 Gamma: 3.5111917342151347e-06.pkl',
]
m1, m2, m3, m4, m5, m6 = (load(path) for path in best_two_per_kernel_files)

# Rank the samples within every model (largest multiplier gets rank 1, ties
# share the lowest rank) and collect all ranks as columns of the first frame.
rank_columns = []
for position, predictions in enumerate((m1, m2, m3, m4, m5, m6), start=1):
    column = f"rank{position}"
    m1[column] = predictions["Earliest positive Prediction at Multiplier"].rank(method='min', ascending=False)
    rank_columns.append(column)

# Row-wise mean over the per-model ranks.
m1["Average Rank"] = m1[rank_columns].mean(axis=1)


# Create barplot of ranks
plot_data = m1.sort_values(by=["Average Rank"], ascending=True).reset_index(drop=True)
# Remember the rank order; the join below re-indexes the frame.
plot_data['order'] = plot_data.index
plot_data = plot_data.reset_index()
# 1-based position as text, used as a prefix to make the bar labels unique.
plot_data['index'] = (plot_data['index'] + 1).astype(str)
plot_data['Burial_unique'] = plot_data['index'] + ": " + plot_data['Site'] + " " + plot_data['Burial']

# Add additional archaeologic context
# (The path is a placeholder that must be filled in before running.)
archaeologic_context = pd.read_csv("path dataset with additional archaeologic context")
plot_data['Identifier'] = plot_data['Site'] + plot_data['Burial']
archaeologic_context['Identifier'] = archaeologic_context['Site'] + archaeologic_context['Burial']
archaeologic_context = archaeologic_context[['Identifier', 'Archaeologic Context']]
plot_data = plot_data.astype({'Identifier': str})
archaeologic_context = archaeologic_context.astype({'Identifier': str, 'Archaeologic Context' : str})

plot_data = plot_data.set_index('Identifier').join(archaeologic_context.set_index('Identifier'))

plt.figure(figsize=(10,20))
max_rank = plot_data['Average Rank'].max()
# Remove all rows with the maximum rank - these were never detected as migrant candidates
plot_data = plot_data[plot_data['Average Rank'] != max_rank]

plot_data = plot_data.sort_values(by=["order"], ascending=True).reset_index(drop=True)
ax = sns.barplot(plot_data, y='Burial_unique', x="Average Rank", hue='Sample Information', dodge=False, errorbar=None)#  order=plot_data.index,

# Add hatches based on additional archaeologic context
hatch_map = {
    "0" : "//",
    "1" : None,
    "2" : "\\\\"
}
# NOTE(review): this overwrites the context codes with hatch patterns, and the
# overwritten column is what gets dumped at the end of the cell — confirm that
# is intended.
plot_data['Archaeologic Context'] = plot_data['Archaeologic Context'].map(hatch_map)

plt.legend(fontsize='xx-large', title_fontsize='xx-large')

# One hatch per row of plot_data, looked up once instead of on every iteration.
hatches_list = plot_data['Archaeologic Context'].to_numpy()
patches_list = list(ax.patches)
n_hue_levels = 3  # if there are more hue categories, change the 3
# NOTE(review): assumes ax.patches holds the bars hue level by hue level, each
# level contributing num_bars patches — confirm for the installed seaborn version.
num_bars = len(patches_list) // n_hue_levels
for i in range(min(len(patches_list), len(hatches_list))):
    # The same row appears once per hue level, so hatch all of its patches.
    for level in range(n_hue_levels):
        patches_list[i + level * num_bars].set_hatch(hatches_list[i])

#ax.set_xticklabels(plot_data["Average Rank"])
plt.xlabel("Average Rank", fontsize=20)
plt.ylabel("Burial", fontsize=20)
#selection = plot_data.head(34)
plt.show()
dump(plot_data, "isotope_prediction/datasets/Fritzens_Sanzeno_Candidates.pkl")

"""
# Select all samples that have been predicted as migrants at some point
plot_data[plot_data['Average Rank'] < plot_data['Average Rank'].max()]
labeled = plot_data[plot_data['Average Rank'] < plot_data['Average Rank'].max()]
"""
"""
# Create pairplot of results
unscaled_isodata = pd.read_csv("path to dataset fritzens sanzeno")
unscaled_isodata['Average Rank'] = plot_data['Average Rank']
# Define marker styles "Circle", "Square", "Diamond"
marker_styles = ['o', 's', 'D']  
# Map species to marker styles  (read again as  "Circle", "Square", "Diamond")
species_to_marker = {0: 'o', 1: 'D'}
unscaled_isodata['markers'] = unscaled_isodata['site group'].map(species_to_marker)
# Pass marker style column to scatterplot arguments
# This sort of works:  at least maps the variable to the default marker style:
scatter_kws = {'s': 100, 'alpha': 0.8, 'style': unscaled_isodata['site group'], 'markers': species_to_marker}  # 'style' assigns DEFAULT markers which can be interpreted as "circle, x, square"  

cols = ["87Sr/86Sr", "208Pb/204Pb", "207Pb/204Pb", "206Pb/204Pb", "208Pb/207Pb", "206Pb/207Pb"]
p = sns.pairplot(unscaled_isodata, vars =cols, hue="Average Rank", plot_kws=scatter_kws)#, hue='site group')

# ensure axes match on each pairplot
for ax in p.axes.flatten():
    xlab = ax.get_xlabel()
    if len(xlab)==0: continue
    ax.set_xlim([unscaled_isodata[xlab].min() - unscaled_isodata[xlab].std(),unscaled_isodata[xlab].max() + unscaled_isodata[xlab].std()])

    ylab = ax.get_ylabel()
    if len(ylab)==0: continue
    ax.set_xlim([unscaled_isodata[xlab].min() - unscaled_isodata[xlab].std(),unscaled_isodata[xlab].max() + unscaled_isodata[xlab].std()])


lims_by_col = {'87Sr/86Sr':[0.708, 0.724]}#, "208Pb/204Pb":[37.9, 39], "207Pb/204Pb":[,15.725]}
"""
"""
# ensure axes match on each pairplotplt.legend(title='Team', fontsize='medium', title_fontsize='x-large')
for ax in p.axes.flatten():
    xlab = ax.get_xlabel()
    if len(xlab)==0: continue
    if xlab == '87Sr/86Sr':
        ax.set_xlim([lims_by_col[xlab]])

    ylab = ax.get_ylabel()
    if len(ylab)==0: continue
    if xlab == '87Sr/86Sr':
        ax.set_xlim(lims_by_col[ylab])
"""



"""
#results = load('experiment_result/experiment01_groups_of_years_seeds_multiplier_1_5.pkl')
# columns = ['Seed', 'Samples', 'Accuracy', 'Recall', 'Precision', 'Score']
#results = load('experiment_result/experiment01_real_gaussian_C1.pkl')
print(results.copy().groupby(['Multiplier for STD']).mean())
print(results.copy().groupby(['Multiplier for STD']).mean().groupby(['Seed']).mean())
#sns.barplot(results, x='Samples', y='Recall')
#plt.show()
#cols = ['Start_Year', 'End_Year']
#results['Years'] = results[cols].apply(lambda row: '-'.join(row.values.astype(str)), axis=1)
results = results.drop(columns=['Seed','Trace Cluster 0', "Trace Cluster 1","Determinant Cluster 0", "Determinant Cluster 0","Confusion Matrix"])
results = results.rename(columns={"Score": "F1-Score"})
#results = results[results['Samples']<1000]
results = results.groupby(['Multiplier for STD']).mean()
results = results.reset_index()
print(results)

#sns.pointplot(data = results, x=results.index, y='F1-Score')
#sns.pointplot(data = results, x=results.index, y='Recall')
ax = sns.pointplot(data = pd.melt(results, ['Multiplier for STD'], var_name = 'Metric', value_name = 'Score'), x='Multiplier for STD', y='Score', hue='Metric')
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
#ax.set(ylim=(0.5, 1.01))
plt.xticks(rotation=45)
#ax.set_xticks(['25','50','100','200','500','1000','10000','25000', '50000', '75000', '100000', '150000', '250000'])
#ax.set_xticks(range(13), labels=['25','50','100','200','500','1000','10000','25000', '50000', '75000', '100000', '150000', '250000']) # <--- set the ticks first
plt.show()

columns = ['Seed', 'Multiplier for STD','Trace Cluster 0', "Trace Cluster 1","Determinant Cluster 0", "Determinant Cluster 0","Confusion Matrix", 'Accuracy', 'Recall', 'Precision', 'Score']

"""