In [1]:
# Feb 24

In [2]:
from fastai.tabular import *
from fastai.basic_data import *
from fastai.metrics import accuracy

In [3]:
from tqdm import tqdm

In [4]:
import pandas
from tools import scoring_function
from sklearn.metrics import classification_report, confusion_matrix

In [5]:
# # For sands
# target = 'sand'
# class1 = 'SS'
# class2 = 'LS'

In [6]:
# For lith
target = 'lith'
class1 = 'Sand'
class2 = 'Shale'

In [7]:
f1_scores = []
accuracy_scores = []
confusion_matrices = []

In [8]:
for i in tqdm(range(10)):
    ground_truths = pandas.read_csv('../data/with_lith_sand.csv', index_col=0)
    ss = ground_truths[ground_truths[target] == class1].sample(frac=1)
    ls = ground_truths[ground_truths[target] == class2].sample(frac=1)

    # For sands
    # stratified_sands = pandas.concat([ss.iloc[:3000], ls.iloc[:1000]], axis=0)
    
    # For lith
    stratified_sands = pandas.concat([ss.iloc[:25000], ls.iloc[:25000]], axis=0)
    
    # It's easy to accidentally uncomment this by default
    # Don't touch this line!
    stratified_sands = stratified_sands.sample(frac=1)
    
    # For sands
    # train_data = stratified_sands.iloc[0:2500]
    # test_data = stratified_sands.iloc[2500:3000]

    # For lith
    train_data = stratified_sands.iloc[0:40000]
    test_data = stratified_sands.iloc[40000:]
    
    pred_index = test_data.index
    
    test_classes = test_data[target].unique().tolist()
    train_classes = train_data[target].unique().tolist()
    print(test_classes, train_classes)
    
    dep_var = target
    cat_names = [] #['well']
    cont_names = ['DENS', 'DTS', 'GR', 'PEF', 'RESD', 'RESM', 'RESS', 'NEUT', 'SP']
    # cont_names = ['tvdss', 'BS', 'CALI', 'DENS', 'DRHO', 'DTC', 'GR', 'NEUT', 'PEF','RESD', 'RESM', 'RESS', 'SP', 'DTS', 'GR_CORR','NEUT_CORR', 'RESD_CORR', 'RESS_CORR', 'TEMP', 'TENS']
    procs = [FillMissing, Categorify, Normalize]
    # procs = [FillMissing, Normalize]

    test = TabularList.from_df(test_data, path='.', cat_names=cat_names, cont_names=cont_names)

    data = (TabularList.from_df(train_data, path='.', cat_names=cat_names, cont_names=cont_names, procs=procs)
                               .random_split_by_pct()
                               .label_from_df(cols=dep_var)
                               .add_test(test)
                               .databunch())

    learn = tabular_learner(data, layers=[200,100], metrics=[accuracy])
    learn.fit(3)

    softmax, _ = learn.get_preds(ds_type=DatasetType.Test)
    preds = pandas.np.argmax(softmax, axis=1)
    
    # TODO: The classes don't align somehow
    # Mismatch w/ validation accuracy that is constantly >98%
    if test_classes[0] != train_classes[0]:
        preds = 1 - preds
        
    y_true = test_data[target]
    y_pred = pandas.Series(data=[test_classes[i] for i in preds], index=pred_index)
    confusion_matrices.append(confusion_matrix(y_true, y_pred))
    kamus = scoring_function(y_true, y_pred)
    f1_score = kamus['f1_score']
    accuracy_score = kamus['accuracy_score']
    f1_scores.append(f1_score)
    accuracy_scores.append(accuracy_score)

100%|██████████| 10/10 [02:27<00:00, 14.56s/it]


In [9]:
for i in confusion_matrices:
    print(i)
    print('\n')

[[4886  103]
 [   0 5011]]


[[ 215 4804]
 [4981    0]]


[[  97 4961]
 [4942    0]]


[[ 102 4938]
 [4951    9]]


[[ 300 4734]
 [4966    0]]


[[ 243 4780]
 [4977    0]]


[[4886  175]
 [   0 4939]]


[[ 148 4846]
 [5006    0]]


[[4882   62]
 [   0 5056]]


[[4831  180]
 [   0 4989]]




In [10]:
import matplotlib.pyplot as plt
%matplotlib inline

In [11]:
f1_scores

[0.9898271604938271,
 0.0,
 0.0,
 0.0018168971434339356,
 0.0,
 0.0,
 0.9825922610166119,
 0.0,
 0.9939060349911539,
 0.9822799763733018]

In [12]:
corrected_f1_score =list(map(lambda x:(1-x) if x < 0.5 else x, f1_scores))

In [13]:
corrected_accuracy_score =list(map(lambda x:(1-x) if x < 0.5 else x, accuracy_scores))

In [14]:
import numpy

In [15]:
numpy.array(corrected_f1_score).mean()

0.9946788535731461

In [16]:
numpy.array(corrected_f1_score).std()

0.006912357117955646

In [17]:
numpy.array(corrected_accuracy_score).mean()

0.9836599999999999

In [18]:
numpy.array(corrected_accuracy_score).std()

0.007027830390668233