# Base Case

In [1]:
# Feb 23

In [2]:
import pandas
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
import lightgbm as lgb
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import f1_score

# Load data + Preprocess

In [3]:
# main = pandas.read_csv('../data/processed/completed_data/mainset.csv', index_col=0)
# addi = pandas.read_csv('../data/processed/completed_data/addi.csv', index_col=0)

In [4]:
# data = pandas.concat([main.reset_index(), addi.reset_index()], axis=0, sort=False).reset_index(drop=True)

In [5]:
# data.rename(columns={'index':'tvdss'}, inplace=True)

In [6]:
# data['well'].unique()

array(['cheal-a10', 'cheal-a11', 'cheal-a12', 'cheal-b8', 'cheal-c3',
       'cheal-c4', 'cheal-g1', 'cheal-g2', 'cheal-g3', 'cheal-a6',
       'cheal-b1', 'cheal-1', 'cheal-2', 'cheal-a4', 'cheal-a3x',
       'cheal-b4', 'cheal-a7'], dtype=object)

In [7]:
# data.drop(columns=['md', 'DEPTH', 'tvd', 'TVD'], inplace=True) # 'md'

In [18]:
# data.columns

Index(['tvdss', 'BS', 'CALI', 'DENS', 'DRHO', 'DTC', 'GR', 'NEUT', 'PEF',
       'RESD', 'RESM', 'RESS', 'SP', 'fm', 'well', 'DTS', 'GR_CORR',
       'NEUT_CORR', 'RESD_CORR', 'RESS_CORR', 'TEMP', 'TENS'],
      dtype='object')

In [21]:
# data.to_csv('../data/processed/completed_data/datablob.csv')

In [23]:
data = pandas.read_csv('../data/processed/completed_data/datablob.csv', index_col=0)

In [24]:
data.columns

Index(['tvdss', 'BS', 'CALI', 'DENS', 'DRHO', 'DTC', 'GR', 'NEUT', 'PEF',
       'RESD', 'RESM', 'RESS', 'SP', 'fm', 'well', 'DTS', 'GR_CORR',
       'NEUT_CORR', 'RESD_CORR', 'RESS_CORR', 'TEMP', 'TENS'],
      dtype='object')

In [25]:
data['well'].unique()

array(['cheal-a10', 'cheal-a11', 'cheal-a12', 'cheal-b8', 'cheal-c3',
       'cheal-c4', 'cheal-g1', 'cheal-g2', 'cheal-g3', 'cheal-a6',
       'cheal-b1', 'cheal-1', 'cheal-2', 'cheal-a4', 'cheal-a3x',
       'cheal-b4', 'cheal-a7'], dtype=object)

In [26]:
for remove_well in data['well'].unique():
    holdout_data =  data[data['well'] == remove_well]
    train_data = data[data['well'] != remove_well]

    # Should still randomize the rows before passing to train
    train_data = train_data.sample(frac=1).reset_index(drop=True)
    holdout_data = holdout_data.sample(frac=1).reset_index(drop=True)

    # Store target variable
    # Store categorical & numerical variable names
    target = 'fm'
    cat_names = ['well']
    num_names = data.columns.tolist()
    num_names.remove('well')
    num_names.remove('fm')
    
    train_data = train_data.dropna(subset=[target])
    holdout_data = holdout_data.dropna(subset=[target])
    
    X_train = train_data.drop(columns=target)
    X_test = holdout_data.drop(columns=target)
    y_train = train_data[target]
    y_test = holdout_data[target]
    
    scaler = StandardScaler()
    scaler.fit(X_train.drop(columns=cat_names))

    # Scale X_train's numerical variables
    transformed_X_train = scaler.transform(X_train.drop(columns=cat_names))
    transformed_X_train = pandas.np.concatenate((transformed_X_train, X_train[cat_names]), axis=1)

    # Scale X_test's numerical variables
    transformed_X_test = scaler.transform(X_test.drop(columns=cat_names))
    transformed_X_test = pandas.np.concatenate((transformed_X_test, X_test[cat_names]), axis=1)

    # Need to rebuild as dropping the columns prolly messed w/ the order
    feature_names = X_train.drop(columns=cat_names).columns.tolist() + cat_names

    # Putting them into df
    transformed_X_train = pandas.DataFrame(data=transformed_X_train, columns=feature_names)
    transformed_X_test = pandas.DataFrame(data=transformed_X_test, columns=feature_names)

    # Change to categorical type
    transformed_X_train[cat_names] = transformed_X_train[cat_names].astype('category')
    transformed_X_test[cat_names] = transformed_X_test[cat_names].astype('category')

    # No idea how this happened but somehow we lost the float datatype to object in the process
    transformed_X_train[num_names] = transformed_X_train[num_names].astype(float)
    transformed_X_test[num_names] = transformed_X_test[num_names].astype(float)

    # Labels was 1 to 9, but we need this to be 0 to 8
    oe = OrdinalEncoder()
    oe.fit(y_train.values.reshape(-1, 1))
    transformed_y_train = oe.transform(y_train.values.reshape(-1, 1))
    transformed_y_test = oe.transform(y_test.values.reshape(-1, 1))

    num_class = len(oe.categories_[0])

    train_data = lgb.Dataset(transformed_X_train, label=transformed_y_train.ravel())
    test_data = lgb.Dataset(transformed_X_test, label=transformed_y_test.ravel())

    params = {
        'objective':'multiclass',
        'num_class':num_class,
        'metric':'softmax',
    }

    model = lgb.train(params, train_set=train_data)

    # Output is the softmax probabilities so need to grab the class w/ highest confidence
    transformed_y_train_hat = pandas.np.argmax(model.predict(transformed_X_train), axis=1)
    transformed_y_test_hat = pandas.np.argmax(model.predict(transformed_X_test), axis=1)
    print(f'Well removed: {remove_well}')
    print(f1_score(transformed_y_test, transformed_y_test_hat, average='micro'))

Well removed: cheal-a10
0.7852813852813852
Well removed: cheal-a11
0.897510133178923
Well removed: cheal-a12
0.8316378038362972


ValueError: Found unknown categories ['MM1 SST', 'Tikorangi LST', 'Lower Manganui Marker'] in column 0 during transform