In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import os
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from keras import Input
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import preprocessing, linear_model, tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


1. Read in Data

In [2]:
df = pd.read_csv('../data/liqdata_augmented.csv')

In [3]:
print(df.describe())

         velocity  weightpercentw  diametermm  thicknessmm    heightin  \
count  256.000000      256.000000  256.000000   256.000000  256.000000   
mean     4.447728        0.375000    0.017500     0.002250   43.750000   
std      1.424205        0.365578    0.005601     0.001349   25.636983   
min      2.545294        0.100000    0.010000     0.000500   13.000000   
25%      3.487517        0.137500    0.013750     0.001250   25.000000   
50%      4.446090        0.200000    0.017500     0.002250   40.500000   
75%      5.406302        0.437500    0.021250     0.003250   59.250000   
max      6.353439        1.000000    0.025000     0.004000   81.000000   

       craterdiameterfromouteredgesmm  craterdiameterfromouteredgesmmno  \
count                      256.000000                        256.000000   
mean                        53.556250                         64.258594   
std                         21.430322                         25.712146   
min                         13.50

2. Separate Column Paramters into separate int pd vects and scalars

In [4]:
out_full = df.iloc[:, 0]
vel = df.iloc[:, 2]
wp = df.iloc[:, 3]
d_mm = df.iloc[:, 4]
t_mm = df.iloc[:, 5]
sigma = df.iloc[:, 24]
nu = df.iloc[:, 25]
pi1 = df.iloc[:, 26]
pi2 = df.iloc[:, 27]
pi3 = df.iloc[:, 28]
pi4 = df.iloc[:, 29]
pi5 = df.iloc[:, 30]
pi6 = df.iloc[:, 31]
out_trunk = df.iloc[:, 41]
rho = 1000
g = 9.82

print(out_trunk)

0                      Splash
1                      Splash
2                      Splash
3      Broken or Intact Sheet
4                      Splash
                ...          
251                      Lump
252                      Lump
253                      Lump
254                      Lump
255                      Lump
Name: newcat1, Length: 256, dtype: object


3. Concat pi groups together and the dimension paramters together

In [5]:
params = pd.concat([vel, wp, d_mm, t_mm, sigma, nu], axis=1)
print(params)

     velocity  weightpercentw  diametermm  thicknessmm  sigma    nu
0    6.353439             0.1       0.010       0.0005     13  0.23
1    6.353439             0.1       0.010       0.0015     13  0.23
2    6.353439             0.1       0.010       0.0030     13  0.23
3    6.353439             0.1       0.010       0.0040     13  0.23
4    6.353439             0.1       0.015       0.0005     13  0.23
..        ...             ...         ...          ...    ...   ...
251  2.545294             1.0       0.020       0.0040    106  2.10
252  2.545294             1.0       0.025       0.0005    106  2.10
253  2.545294             1.0       0.025       0.0015    106  2.10
254  2.545294             1.0       0.025       0.0030    106  2.10
255  2.545294             1.0       0.025       0.0040    106  2.10

[256 rows x 6 columns]


In [6]:
pi_groups = pd.concat([pi1, pi2, pi3, pi4, pi5, pi6], axis=1)
print(pi_groups)

             pi1           pi2       pi3       pi4         pi5         pi6
0    3105.091400  2.457000e+09  0.061437  0.050000   13.750660  275.013180
1    3105.091400  2.457000e+09  0.552930  0.150000   40.889824  272.598820
2    3105.091400  2.457000e+09  2.211720  0.300000   80.716718  269.055720
3    3105.091400  2.457000e+09  3.931947  0.400000  106.697760  266.744380
4    3105.091400  1.092000e+09  0.061437  0.033333   13.750660  412.519770
..           ...           ...       ...       ...         ...         ...
251    61.118151  6.009071e+07  0.384581  0.200000    4.491864   22.459320
252    61.118151  3.845805e+07  0.006009  0.020000    0.600072   30.003622
253    61.118151  3.845805e+07  0.054082  0.060000    1.765548   29.425803
254    61.118151  3.845805e+07  0.216327  0.120000    3.431956   28.599632
255    61.118151  3.845805e+07  0.384581  0.160000    4.491864   28.074151

[256 rows x 6 columns]


4. Create the Model

In [8]:
def train_and_predict_using_model(model_name = "", model =None):
    model.fit(X_train, Y_train)
    Y_pred_train = model.predict(X_train)
    cm_train = confusion_matrix(Y_train, Y_pred_train)
    print(model_name)
    print("====================================")
    print("Training Confusion Matrix: ")
    print(cm_train)
    acc_train = accuracy_score(Y_train, Y_pred_train)
    
    print("Training Accuracy: %.2f%%" % (acc_train*100))
    print("====================================")
    
    Y_pred = model.predict(X_test)
    cm_test = confusion_matrix(Y_test, Y_pred)
    print("Testing Confusion Matrix: ")
    print(cm_test)
    acc_test = acc_train = accuracy_score(Y_test, Y_pred)
    
    print("Testing Accuracy: %.2f%%" % (acc_test*100))
    print("====================================")

In [20]:
def train_and_predict_using_pimodel(model_name = "", model =None):
    model.fit(scaled_X_train_pi, Y_train_pi)
    Y_pred_train_pi = model.predict(scaled_X_train_pi)
    cm_train_pi = confusion_matrix(Y_train_pi, Y_pred_train_pi)
    print(model_name)
    print("====================================")
    print("Training Confusion Matrix: ")
    print(cm_train_pi)
    acc_train = (np.trace(cm_train_pi)) / np.sum(np.sum(cm_train_pi))
    
    print("Training Accuracy: %.2f%%" % (acc_train*100))
    print("====================================")
    
    Y_pred_pi = model.predict(scaled_X_test_pi)
    cm_test_pi = confusion_matrix(Y_test_pi, Y_pred_pi)
    print("Testing Confusion Matrix: ")
    print(cm_test_pi)
    acc_test = acc_train = np.trace(cm_test_pi) / np.sum(np.sum(cm_test_pi))
    
    print("Testing Accuracy: %.2f%%" % (acc_test*100))
    print("====================================")

In [133]:
X_train, X_test, Y_train, Y_test = train_test_split(params, out_trunk, test_size=0.5, random_state=42)
X_train_pi, X_test_pi, Y_train_pi, Y_test_pi = train_test_split(pi_groups, out_trunk, test_size=0.25)

# scaler = preprocessing.StandardScaler().fit(X_train)
# scaled_X_train = scaler.transform(X_train)
# scaled_X_test = scaler.transform(X_test)

scaler_pi = preprocessing.StandardScaler().fit(X_train_pi)
scaled_X_train_pi = scaler_pi.transform(X_train_pi)
scaled_X_test_pi = scaler_pi.transform(X_test_pi)
encoder = LabelEncoder()
encoder.fit(out_trunk)
encoded_ytest_pi = encoder.transform(Y_test_pi)
encoded_ytrain_pi = encoder.transform(Y_train_pi)
encoded_y_pi = encoder.transform(out_trunk)
# One-hot encode the target variable
encoded_ytrain_pi_onehot = to_categorical(encoded_ytrain_pi)
encoded_ytest_pi_onehot = to_categorical(encoded_ytest_pi)
encoded_y_pi_onehot = to_categorical(encoded_y_pi)

In [84]:


print(encoded_y_pi_onehot)



[[0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 ...
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]]


5. Try some stuff


In [128]:
# Logistic Regression
linear_classifier = linear_model.LogisticRegression(random_state=123)
train_and_predict_using_model("Logistic Regression", linear_classifier)

Logistic Regression
Training Confusion Matrix: 
[[46  5  0 11]
 [ 3 11  0  0]
 [ 0  1  8  0]
 [ 9  0  0 34]]
Training Accuracy: 77.34%
Testing Confusion Matrix: 
[[24  5  0  9]
 [10  8  0  0]
 [ 0  2  8  0]
 [20  0  0 42]]
Testing Accuracy: 64.06%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [127]:
linear_classifier_pi = linear_model.LogisticRegression()
train_and_predict_using_pimodel("Pi Logistic Regression", linear_classifier_pi)

Pi Logistic Regression
Training Confusion Matrix: 
[[59  0  0  3]
 [ 8  6  0  0]
 [ 8  1  0  0]
 [ 9  0  0 34]]
Training Accuracy: 77.34%
Testing Confusion Matrix: 
[[37  0  0  1]
 [14  4  0  0]
 [ 4  6  0  0]
 [ 8  0  0 54]]
Testing Accuracy: 74.22%


Decision Tree



In [25]:
# Decision Tree
decision_tree_clf = tree.DecisionTreeClassifier()
train_and_predict_using_model('Decision Tree', decision_tree_clf)

Decision Tree
Training Confusion Matrix: 
[[62  0  0  0]
 [ 0 14  0  0]
 [ 0  0  9  0]
 [ 0  0  0 43]]
Training Accuracy: 100.00%
Testing Confusion Matrix: 
[[25  3  0 10]
 [ 9  9  0  0]
 [ 1  0  9  0]
 [ 8  0  0 54]]
Testing Accuracy: 75.78%


In [39]:
# Decision Tree and Random Forests
decision_tree_clf_pi = tree.DecisionTreeClassifier()
train_and_predict_using_pimodel('Decision Tree', decision_tree_clf_pi)

Decision Tree
Training Confusion Matrix: 
[[62  0  0  0]
 [ 0 14  0  0]
 [ 0  0  9  0]
 [ 0  0  0 43]]
Training Accuracy: 100.00%
Testing Confusion Matrix: 
[[32  3  0  3]
 [ 5 12  1  0]
 [ 0  0 10  0]
 [12  0  0 50]]
Testing Accuracy: 81.25%


Random Forest

In [40]:
forest = RandomForestClassifier(n_estimators=100, random_state=123, max_depth=5, max_features=6)
train_and_predict_using_model('Random Forest', forest)

Random Forest
Training Confusion Matrix: 
[[61  0  0  1]
 [ 2 12  0  0]
 [ 0  1  8  0]
 [ 2  0  0 41]]
Training Accuracy: 95.31%
Testing Confusion Matrix: 
[[32  2  0  4]
 [11  7  0  0]
 [ 1  1  8  0]
 [10  0  0 52]]
Testing Accuracy: 77.34%


In [45]:
forest = RandomForestClassifier(n_estimators=100, random_state=123, max_depth=5, max_features=8)
train_and_predict_using_pimodel('Random Forest', forest)

Random Forest
Training Confusion Matrix: 
[[62  0  0  0]
 [ 1 13  0  0]
 [ 0  0  9  0]
 [ 1  0  0 42]]
Training Accuracy: 98.44%
Testing Confusion Matrix: 
[[34  2  0  2]
 [ 4 13  1  0]
 [ 0  1  9  0]
 [12  0  0 50]]
Testing Accuracy: 82.81%


Neural Net


In [119]:
def train_and_predict_pinets(opt=None, model=None):
    
    if opt is None:
        model.compile(loss='categorical_crossentropy', metrics=['accuracy'])
    else:
    
        model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
    
    model.fit(scaled_X_train_pi, encoded_ytrain_pi_onehot, epochs=200, batch_size=4, verbose=0)
    
    
    Y_pred_train_pi = model.predict(scaled_X_train_pi)

    # Evaluate the model
    scores = model.evaluate(scaled_X_train_pi, encoded_ytrain_pi_onehot)

    print("Neural Network Trainset: \n%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

    Y_pred_pi = model.predict(scaled_X_test_pi)

    # Convert predictions to class labels
    Y_pred_pi_labels = np.argmax(Y_pred_pi, axis=1)
    Y_test_pi_labels = np.argmax(encoded_ytest_pi_onehot, axis=1)
    # import pdb; pdb.set_trace()
    # cm_test = confusion_matrix(Y_test_pi_labels, Y_pred_pi_labels)
    # print("Testing Confusion Matrix: ")
    # print(cm_test)
    acc_test = accuracy_score(Y_test_pi_labels, Y_pred_pi_labels)

    print("Testing Accuracy: %.2f%%" % (acc_test * 100))
    print("====================================")
    
    print("Classification Report")
    print(classification_report(Y_test_pi_labels, Y_pred_pi_labels))
    
    print("====================================")
    # print(Y_pred_pi)

In [123]:
# Neural Network
def create_pinet(comp = False):

    pi_net = Sequential()

    pi_net.add(Input(shape=(6,))) 
    pi_net.add(Dense(64, activation='tanh'))
    pi_net.add(Dense(128, activation='relu'))
    pi_net.add(Dropout(0.5))

    pi_net.add(Dense(64, activation='relu'))
    pi_net.add(Dropout(0.5))
    pi_net.add(Dense(32, activation='gelu'))
    pi_net.add(Dense(4, activation='softmax'))
    
    if comp is True:
        pi_net.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
        
    return pi_net




In [None]:



# pi_net.add(Input(shape=(6,))) 
# pi_net.add(Dense(10, activation='relu'))
# pi_net.add(Dense(256, activation='relu'))
# pi_net.add(Dropout(0.1))
# pi_net.add(Dense(256, activation='tanh'))
# pi_net.add(Dropout(0.1))

# pi_net.add(Dense(38, activation='elu'))
# pi_net.add(Dropout(0.2))
# pi_net.add(Dense(16, activation='tanh'))
# pi_net.add(Dropout(0.2))
# pi_net.add(Dense(4, activation='sigmoid'))
# Add a final Dense layer with 1 output and 'sigmoid' activation function
# pi_net.add(Dense(1, activation='softmax'))

In [132]:
train_and_predict_pinets(model=create_pinet(True), opt='adam')

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9843 - loss: 0.0774  
Neural Network Trainset: 
compile_metrics: 98.44%
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Testing Accuracy: 84.38%
Classification Report
              precision    recall  f1-score   support

           0       0.76      0.93      0.83        27
           1       1.00      0.67      0.80         9
           2       1.00      1.00      1.00         6
           3       0.89      0.77      0.83        22

    accuracy                           0.84        64
   macro avg       0.91      0.84      0.87        64
weighted avg       0.86      0.84      0.84        64



Keras Classifier

In [106]:
# Use KerasClassifier for scikit-learn compatibility
model = KerasClassifier(build_fn=create_pinet(True), epochs=200, batch_size=4, verbose=0)

# Perform cross-validation
kfold = KFold(n_splits=4, shuffle=True)
results = cross_val_score(model, pi_groups, encoded_y_pi_onehot, cv=kfold)

print(f'Cross-Validation Accuracy: {results.mean():.2f} (+/- {results.std():.2f})')

# train_and_predict_pinets(model=create_pinet(True), opt='adam')

  saveable.load_own_variables(weights_store.get(inner_path))
  X, y = self._initialize(X, y)
  saveable.load_own_variables(weights_store.get(inner_path))
  X, y = self._initialize(X, y)
  saveable.load_own_variables(weights_store.get(inner_path))
  X, y = self._initialize(X, y)
  saveable.load_own_variables(weights_store.get(inner_path))
  X, y = self._initialize(X, y)


Cross-Validation Accuracy: 0.39 (+/- 0.02)
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.0042  
Neural Network Trainset: 
compile_metrics: 100.00%
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Testing Accuracy: 80.47%
Classification Report
              precision    recall  f1-score   support

           0       0.72      0.82      0.77        38
           1       0.62      0.44      0.52        18
           2       0.62      0.80      0.70        10
           3       0.95      0.90      0.93        62

    accuracy                           0.80       128
   macro avg       0.73      0.74      0.73       128
weighted avg       0.81      0.80      0.80       128

