In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics


from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import GaussianMixture

In [3]:
def remove_NaN(df, t):
    return df.dropna(axis=1, thresh=int(((100-t)/100)*df.shape[0] + 1))

In [4]:
def scale(X_train, X_val, X_test):
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
    X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    return X_train, X_val, X_test

In [5]:
def impute(X_train, X_val, X_test):
    imputer = SimpleImputer(strategy = 'most_frequent')
    imputer.fit(X_train)
    X_train = pd.DataFrame(imputer.transform(X_train), columns=X_train.columns)
    X_val = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)
    X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)
    return X_train, X_val, X_test

In [6]:
sm = SMOTE(random_state=42)
def over_sampling(X, Y): 
    x, y = sm.fit_resample(X, Y)
    return x,y


In [7]:
def score_calc(model, y_val,  y_pred):
    print(model)
    print("Confusion Matrix")
    print(metrics.confusion_matrix(y_val, y_pred))
    print(f"Validation F3 Score: {metrics.fbeta_score(y_val, y_pred, beta=3)}")
    print(f"Classification Report\n{metrics.classification_report(y_val, y_pred)}")

In [8]:
def save_model(model, test_pred, name, best_estimator=False):
    if best_estimator:
        print(f"Best Estimator: {model.best_estimator_}")
    dataset = pd.DataFrame({'Id':  np.arange(test_pred.size), 'Predicted':test_pred}, columns=['Id', 'Predicted'])
    dataset.to_csv(f"{name}_MB3VKJ.csv", index=False)

In [9]:
X = pd.read_csv('data/X_train.csv')
X.drop(["Id"], axis=1, inplace=True)
y = pd.read_csv('data/Y_train.csv')
y.drop(["Id"], axis=1, inplace=True)
X_test = pd.read_csv('data/X_test.csv')
X_test.drop(["Id"], axis=1, inplace=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.25)


# 20, 30, 40, 60
X_train_20 = remove_NaN(X_train, 20)
X_val_20 = X_val[X_train_20.columns.tolist()]
X_test_20 = X_test[X_train_20.columns.tolist()]
X_train_20, X_val_20, X_test_20 = scale(X_train_20, X_val_20, X_test_20)
X_train_20, X_val_20, X_test_20 = impute(X_train_20, X_val_20, X_test_20)

X_train_30 = remove_NaN(X_train, 30)
X_val_30 = X_val[X_train_30.columns.tolist()]
X_test_30 = X_test[X_train_30.columns.tolist()]
X_train_30, X_val_30, X_test_30 = scale(X_train_30, X_val_30, X_test_30)
X_train_30, X_val_30, X_test_30 = impute(X_train_30, X_val_30, X_test_30)

X_train_40 = remove_NaN(X_train, 40)
X_val_40 = X_val[X_train_40.columns.tolist()]
X_test_40 = X_test[X_train_40.columns.tolist()]
X_train_40, X_val_40, X_test_40 = scale(X_train_40, X_val_40, X_test_40)
X_train_40, X_val_40, X_test_40 = impute(X_train_40, X_val_40, X_test_40)

X_train_60 = remove_NaN(X_train, 60)
X_val_60 = X_val[X_train_60.columns.tolist()]
X_test_60 = X_test[X_train_60.columns.tolist()]
X_train_60, X_val_60, X_test_60 = scale(X_train_60, X_val_60, X_test_60)
X_train_60, X_val_60, X_test_60 = impute(X_train_60, X_val_60, X_test_60)


# # 0.99 0.95
pca_95 = PCA(0.95)
X_train_20_95 = pca_95.fit(X_train_20).transform(X_train_20)
X_val_20_95 = pca_95.transform(X_val_20)
X_test_20_95 = pca_95.transform(X_test_20)

pca_95 = PCA(0.95)
X_train_30_95 = pca_95.fit(X_train_30).transform(X_train_30)
X_val_30_95 = pca_95.transform(X_val_30)
X_test_30_95 = pca_95.transform(X_test_30)

pca_95 = PCA(0.95)
X_train_40_95 = pca_95.fit(X_train_40).transform(X_train_40)
X_val_40_95 = pca_95.transform(X_val_40)
X_test_40_95 = pca_95.transform(X_test_40)

pca_95 = PCA(0.95)
X_train_60_95 = pca_95.fit(X_train_60).transform(X_train_60)
X_val_60_95 = pca_95.transform(X_val_60)
X_test_60_95 = pca_95.transform(X_test_60)

pca_99 = PCA(0.99)
X_train_20_99 = pca_99.fit(X_train_20).transform(X_train_20)
X_val_20_99 = pca_99.transform(X_val_20)
X_test_20_99 = pca_99.transform(X_test_20)

pca_99 = PCA(0.99)
X_train_30_99 = pca_99.fit(X_train_30).transform(X_train_30)
X_val_30_99 = pca_99.transform(X_val_30)
X_test_30_99 = pca_99.transform(X_test_30)

pca_99 = PCA(0.99)
X_train_40_99 = pca_99.fit(X_train_40).transform(X_train_40)
X_val_40_99 = pca_99.transform(X_val_40)
X_test_40_99 = pca_99.transform(X_test_40)

pca_99 = PCA(0.99)
X_train_60_99 = pca_99.fit(X_train_60).transform(X_train_60)
X_val_60_99 = pca_99.transform(X_val_60)
X_test_60_99 = pca_99.transform(X_test_60)

In [10]:
X, y = over_sampling(X_train_20_95, y_train)
y_train_ravel = y.values.ravel()

params = [{'criterion': ['gini', 'entropy'], 
           'max_features': ['sqrt'], 
           'n_estimators': range(125,201,25),
           'max_depth': [10,20,30]}]
model = RandomForestClassifier(random_state=42)
gridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
gridSearch.fit(X, y_train_ravel)
y_pred_val = gridSearch.predict(X_val_20_95)
test_pred = gridSearch.predict(X_test_20_95)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 2/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 3/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 4/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 5/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 4/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.981 total time=  43.4s
[CV 1/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.988 total time=  43.5s
[CV 5/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.983 total time=  43.4s
[CV 1/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 2/5; 2/24] START criterion=gini, max_depth=10, max_feature

In [22]:
# X_train_20_95
score_calc("Random Forest NAN 20 PCA 95%", y_val,  y_pred_val)
save_model(gridSearch, test_pred, "Random_Forest_20_95", True)

Random Forest NAN 20 PCA 95%
Confusion Matrix
[[9683  112]
 [  24  156]]
Validation F3 Score: 0.8262711864406782
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      9795
           1       0.58      0.87      0.70       180

    accuracy                           0.99      9975
   macro avg       0.79      0.93      0.84      9975
weighted avg       0.99      0.99      0.99      9975

Best Estimator: RandomForestClassifier(criterion='entropy', max_depth=30, max_features='sqrt',
                       n_estimators=150, random_state=42)


In [10]:
X, y = over_sampling(X_train_20_95, y_train)
y_train_ravel = y.values.ravel()

params = [{"n_components": [2],
          "covariance_type": ["spherical", "tied", "diag", "full"]}]
model = GaussianMixture(random_state=42)
CgridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
CgridSearch.fit(X, y_train_ravel)
y_pred_val = CgridSearch.predict(X_train_20_95)
test_pred = CgridSearch.predict(X_test_20_95)

train_pred = CgridSearch.predict(X_train_20_95)
pred = np.reshape(train_pred, (train_pred.shape[0], 1))
X_train_20_95_cluster = np.append(X_train_20_95, pred, axis=1)

y_pred_val = CgridSearch.predict(X_val_20_95)
pred = np.reshape(y_pred_val, (y_pred_val.shape[0], 1))
X_val_20_95_cluster = np.append(X_val_20_95, pred, axis=1)

test_pred = CgridSearch.predict(X_test_20_95)
pred = np.reshape(test_pred, (test_pred.shape[0], 1))
X_test_20_95_cluster = np.append(X_test_20_95, pred, axis=1)

print(X_train_20_95_cluster[0].size)
print(X_val_20_95_cluster[0].size)
print(X_test_20_95_cluster[0].size)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 3/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 1/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 2/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 4/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 5/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 5/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.964 total time=   0.9s
[CV 4/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.962 total time=   0.9s
[CV 2/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 1/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 3/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.959 total time=   1.0s
[CV 3/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 1/5; 1/4]

In [11]:
X, y = over_sampling(X_train_20_95_cluster, y_train)
y_train_ravel = y.values.ravel()

params = [{'criterion': ['gini', 'entropy'], 
           'max_features': ['sqrt'], 
           'n_estimators': range(125,201,25),
           'max_depth': [10,20,30]}]
model = RandomForestClassifier(random_state=42)
gridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
gridSearch.fit(X, y_train_ravel)
y_pred_val = gridSearch.predict(X_val_20_95_cluster)
test_pred = gridSearch.predict(X_test_20_95_cluster)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 2/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 3/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 1/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 4/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 5/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 4/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.990 total time=  47.5s
[CV 5/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.988 total time=  47.5s
[CV 1/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 2/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 2/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=

In [12]:
score_calc("Random Forest With Cluster NAN 20 PCA 95%", y_val,  y_pred_val)
save_model(gridSearch, test_pred, "Random_Forest_Cluster_20_95", True)

Random Forest With Cluster NAN 20 PCA 95%
Confusion Matrix
[[9673  121]
 [  22  159]]
Validation F3 Score: 0.8328968046097434
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      9794
           1       0.57      0.88      0.69       181

    accuracy                           0.99      9975
   macro avg       0.78      0.93      0.84      9975
weighted avg       0.99      0.99      0.99      9975

Best Estimator: RandomForestClassifier(criterion='entropy', max_depth=20, max_features='sqrt',
                       n_estimators=200, random_state=42)


In [23]:
X, y = over_sampling(X_train_30_95, y_train)
y_train_ravel = y.values.ravel()

params = [{'criterion': ['gini', 'entropy'], 
           'max_features': ['sqrt'], 
           'n_estimators': range(125,201,25),
           'max_depth': [10,20,30]}]
model = RandomForestClassifier(random_state=42)
gridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
gridSearch.fit(X, y_train_ravel)
y_pred_val = gridSearch.predict(X_val_30_95)
test_pred = gridSearch.predict(X_test_30_95)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 2/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 3/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 4/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 5/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 1/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.981 total time=  45.7s
[CV 5/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.980 total time=  45.7s
[CV 1/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 2/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.981 total time=  45.7s
[CV 2/5; 2/24] START criterion=gini, max_depth=10, max_feature

In [24]:
score_calc("Random Forest NAN 30 PCA 95%", y_val,  y_pred_val)
save_model(gridSearch, test_pred, "Random_Forest_30_95", True)

Random Forest NAN 30 PCA 95%
Confusion Matrix
[[9677  118]
 [  29  151]]
Validation F3 Score: 0.7993647432503972
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      9795
           1       0.56      0.84      0.67       180

    accuracy                           0.99      9975
   macro avg       0.78      0.91      0.83      9975
weighted avg       0.99      0.99      0.99      9975

Best Estimator: RandomForestClassifier(criterion='entropy', max_depth=20, max_features='sqrt',
                       n_estimators=175, random_state=42)


In [13]:
X, y = over_sampling(X_train_30_95, y_train)
y_train_ravel = y.values.ravel()

params = [{"n_components": [2],
          "covariance_type": ["spherical", "tied", "diag", "full"]}]
model = GaussianMixture(random_state=42)
CgridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
CgridSearch.fit(X, y_train_ravel)
y_pred_val = CgridSearch.predict(X_train_30_95)
test_pred = CgridSearch.predict(X_test_30_95)

train_pred = CgridSearch.predict(X_train_30_95)
pred = np.reshape(train_pred, (train_pred.shape[0], 1))
X_train_30_95_cluster = np.append(X_train_30_95, pred, axis=1)

y_pred_val = CgridSearch.predict(X_val_30_95)
pred = np.reshape(y_pred_val, (y_pred_val.shape[0], 1))
X_val_30_95_cluster = np.append(X_val_30_95, pred, axis=1)

test_pred = CgridSearch.predict(X_test_30_95)
pred = np.reshape(test_pred, (test_pred.shape[0], 1))
X_test_30_95_cluster = np.append(X_test_30_95, pred, axis=1)

print(X_train_30_95_cluster[0].size)
print(X_val_30_95_cluster[0].size)
print(X_test_30_95_cluster[0].size)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 2/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 1/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 4/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 3/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 5/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 4/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.960 total time=   1.1s
[CV 5/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.964 total time=   1.1s
[CV 1/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 2/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 3/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.957 total time=   1.2s
[CV 3/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 1/5; 1/4]

In [14]:
X, y = over_sampling(X_train_30_95_cluster, y_train)
y_train_ravel = y.values.ravel()

params = [{'criterion': ['gini', 'entropy'], 
           'max_features': ['sqrt'], 
           'n_estimators': range(125,201,25),
           'max_depth': [10,20,30]}]
model = RandomForestClassifier(random_state=42)
gridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
gridSearch.fit(X, y_train_ravel)
y_pred_val = gridSearch.predict(X_val_30_95_cluster)
test_pred = gridSearch.predict(X_test_30_95_cluster)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 3/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 1/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 2/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 5/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 4/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 1/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.985 total time=  46.1s
[CV 1/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 5/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.984 total time=  46.4s
[CV 4/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.985 total time=  46.4s
[CV 3/5; 2/24] START criterion=gini, max_depth=10, max_feature

In [15]:
score_calc("Random Forest With Cluster NAN 30 PCA 95%", y_val,  y_pred_val)
save_model(gridSearch, test_pred, "Random_Forest_Cluster_30_95", True)

Random Forest With Cluster NAN 30 PCA 95%
Confusion Matrix
[[9665  129]
 [  22  159]]
Validation F3 Score: 0.8294209702660408
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      9794
           1       0.55      0.88      0.68       181

    accuracy                           0.98      9975
   macro avg       0.77      0.93      0.84      9975
weighted avg       0.99      0.98      0.99      9975

Best Estimator: RandomForestClassifier(criterion='entropy', max_depth=30, max_features='sqrt',
                       n_estimators=150, random_state=42)


In [25]:
X, y = over_sampling(X_train_40_95, y_train)
y_train_ravel = y.values.ravel()

params = [{'criterion': ['gini', 'entropy'], 
           'max_features': ['sqrt'], 
           'n_estimators': range(125,201,25),
           'max_depth': [10,20,30]}]
model = RandomForestClassifier(random_state=42)
gridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
gridSearch.fit(X, y_train_ravel)
y_pred_val = gridSearch.predict(X_val_40_95)
test_pred = gridSearch.predict(X_test_40_95)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 2/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 1/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 3/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 4/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 5/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 2/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.983 total time=  56.6s
[CV 1/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 1/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.982 total time=  56.6s
[CV 2/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 3/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=

In [26]:
score_calc("Random Forest NAN 40 PCA 95%", y_val,  y_pred_val)
save_model(gridSearch, test_pred, "Random_Forest_40_95", True)

Random Forest NAN 40 PCA 95%
Confusion Matrix
[[9688  107]
 [  26  154]]
Validation F3 Score: 0.8187134502923978
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      9795
           1       0.59      0.86      0.70       180

    accuracy                           0.99      9975
   macro avg       0.79      0.92      0.85      9975
weighted avg       0.99      0.99      0.99      9975

Best Estimator: RandomForestClassifier(criterion='entropy', max_depth=20, max_features='sqrt',
                       n_estimators=125, random_state=42)


In [16]:
X, y = over_sampling(X_train_40_95, y_train)
y_train_ravel = y.values.ravel()

params = [{"n_components": [2],
          "covariance_type": ["spherical", "tied", "diag", "full"]}]
model = GaussianMixture(random_state=42)
CgridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
CgridSearch.fit(X, y_train_ravel)
y_pred_val = CgridSearch.predict(X_train_40_95)
test_pred = CgridSearch.predict(X_test_40_95)

train_pred = CgridSearch.predict(X_train_40_95)
pred = np.reshape(train_pred, (train_pred.shape[0], 1))
X_train_40_95_cluster = np.append(X_train_40_95, pred, axis=1)

y_pred_val = CgridSearch.predict(X_val_40_95)
pred = np.reshape(y_pred_val, (y_pred_val.shape[0], 1))
X_val_40_95_cluster = np.append(X_val_40_95, pred, axis=1)

test_pred = CgridSearch.predict(X_test_40_95)
pred = np.reshape(test_pred, (test_pred.shape[0], 1))
X_test_40_95_cluster = np.append(X_test_40_95, pred, axis=1)

print(X_train_40_95_cluster[0].size)
print(X_val_40_95_cluster[0].size)
print(X_test_40_95_cluster[0].size)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 3/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 2/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 4/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 5/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 4/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.947 total time=   0.9s
[CV 1/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 5/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.951 total time=   1.0s
[CV 2/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 3/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.940 total time=   1.1s
[CV 3/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 1/5; 1/4]

In [17]:
X, y = over_sampling(X_train_40_95_cluster, y_train)
y_train_ravel = y.values.ravel()

params = [{'criterion': ['gini', 'entropy'], 
           'max_features': ['sqrt'], 
           'n_estimators': range(125,201,25),
           'max_depth': [10,20,30]}]
model = RandomForestClassifier(random_state=42)
gridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
gridSearch.fit(X, y_train_ravel)
y_pred_val = gridSearch.predict(X_val_40_95_cluster)
test_pred = gridSearch.predict(X_test_40_95_cluster)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 2/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 1/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 3/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 4/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 5/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 2/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.986 total time=  46.1s
[CV 1/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 3/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.988 total time=  46.3s
[CV 2/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 1/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=

In [18]:
score_calc("Random Forest With Cluster NAN 40 PCA 95%", y_val,  y_pred_val)
save_model(gridSearch, test_pred, "Random_Forest_Cluster_40_95", True)

Random Forest With Cluster NAN 40 PCA 95%
Confusion Matrix
[[9671  123]
 [  21  160]]
Validation F3 Score: 0.8368200836820083
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      9794
           1       0.57      0.88      0.69       181

    accuracy                           0.99      9975
   macro avg       0.78      0.94      0.84      9975
weighted avg       0.99      0.99      0.99      9975

Best Estimator: RandomForestClassifier(criterion='entropy', max_depth=20, max_features='sqrt',
                       n_estimators=150, random_state=42)


In [27]:
X, y = over_sampling(X_train_60_95, y_train)
y_train_ravel = y.values.ravel()

params = [{'criterion': ['gini', 'entropy'], 
           'max_features': ['sqrt'], 
           'n_estimators': range(125,201,25),
           'max_depth': [10,20,30]}]
model = RandomForestClassifier(random_state=42)
gridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
gridSearch.fit(X, y_train_ravel)
y_pred_val = gridSearch.predict(X_val_60_95)
test_pred = gridSearch.predict(X_test_60_95)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 5/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 2/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 3/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 4/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 1/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 2/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.985 total time= 1.0min
[CV 1/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 1/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.982 total time= 1.0min
[CV 4/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.980 total time= 1.0min
[CV 2/5; 2/24] START criterion=gini, max_depth=10, max_feature

In [28]:
score_calc("Random Forest NAN 60 PCA 95%", y_val,  y_pred_val)
save_model(gridSearch, test_pred, "Random_Forest_60_95", True)

Random Forest NAN 60 PCA 95%
Confusion Matrix
[[9680  115]
 [  28  152]]
Validation F3 Score: 0.8055113937466879
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      9795
           1       0.57      0.84      0.68       180

    accuracy                           0.99      9975
   macro avg       0.78      0.92      0.84      9975
weighted avg       0.99      0.99      0.99      9975

Best Estimator: RandomForestClassifier(criterion='entropy', max_depth=30, max_features='sqrt',
                       n_estimators=150, random_state=42)


In [19]:
X, y = over_sampling(X_train_60_95, y_train)
y_train_ravel = y.values.ravel()

params = [{"n_components": [2],
          "covariance_type": ["spherical", "tied", "diag", "full"]}]
model = GaussianMixture(random_state=42)
CgridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
CgridSearch.fit(X, y_train_ravel)
y_pred_val = CgridSearch.predict(X_train_60_95)
test_pred = CgridSearch.predict(X_test_60_95)

train_pred = CgridSearch.predict(X_train_60_95)
pred = np.reshape(train_pred, (train_pred.shape[0], 1))
X_train_60_95_cluster = np.append(X_train_60_95, pred, axis=1)

y_pred_val = CgridSearch.predict(X_val_60_95)
pred = np.reshape(y_pred_val, (y_pred_val.shape[0], 1))
X_val_60_95_cluster = np.append(X_val_60_95, pred, axis=1)

test_pred = CgridSearch.predict(X_test_60_95)
pred = np.reshape(test_pred, (test_pred.shape[0], 1))
X_test_60_95_cluster = np.append(X_test_60_95, pred, axis=1)

print(X_train_60_95_cluster[0].size)
print(X_val_60_95_cluster[0].size)
print(X_test_60_95_cluster[0].size)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 2/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 4/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 3/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 5/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 4/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.938 total time=   1.1s[CV 5/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.942 total time=   1.1s

[CV 2/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 1/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 3/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.070 total time=   1.1s
[CV 3/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 1/5; 1/4]

In [21]:
X, y = over_sampling(X_train_60_95_cluster, y_train)
y_train_ravel = y.values.ravel()

params = [{'criterion': ['gini', 'entropy'], 
           'max_features': ['sqrt'], 
           'n_estimators': range(125,201,25),
           'max_depth': [10,20,30]}]
model = RandomForestClassifier(random_state=42)
gridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
gridSearch.fit(X, y_train_ravel)
y_pred_val = gridSearch.predict(X_val_60_95_cluster)
test_pred = gridSearch.predict(X_test_60_95_cluster)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 4/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 2/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 3/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 5/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 4/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.988 total time=  42.0s
[CV 1/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 2/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.989 total time=  42.1s
[CV 2/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 3/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=

In [22]:
score_calc("Random Forest With Cluster NAN 60 PCA 95%", y_val,  y_pred_val)
save_model(gridSearch, test_pred, "Random_Forest_Cluster_60_95", True)

Random Forest With Cluster NAN 60 PCA 95%
Confusion Matrix
[[9662  132]
 [  18  163]]
Validation F3 Score: 0.8471933471933472
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      9794
           1       0.55      0.90      0.68       181

    accuracy                           0.98      9975
   macro avg       0.78      0.94      0.84      9975
weighted avg       0.99      0.98      0.99      9975

Best Estimator: RandomForestClassifier(criterion='entropy', max_depth=20, max_features='sqrt',
                       n_estimators=150, random_state=42)


In [11]:
X, y = over_sampling(X_train_20_99, y_train)
y_train_ravel = y.values.ravel()

params = [{'criterion': ['gini', 'entropy'], 
           'max_features': ['sqrt'], 
           'n_estimators': range(125,201,25),
           'max_depth': [10,20,30]}]
model = RandomForestClassifier(random_state=42)
gridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
gridSearch.fit(X, y_train_ravel)
y_pred_val = gridSearch.predict(X_val_20_99)
test_pred = gridSearch.predict(X_test_20_99)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 2/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 1/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 3/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 4/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 5/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 3/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.973 total time=  54.8s
[CV 4/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.974 total time=  54.8s
[CV 5/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.971 total time=  54.8s
[CV 1/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 1/5; 1/24] END criterion=gini, max_depth=10, max_features=

In [12]:
score_calc("Random Forest NAN 20 PCA 99%", y_val,  y_pred_val)
save_model(gridSearch, test_pred, "Random_Forest_20_99", True)

Random Forest NAN 20 PCA 99%
Confusion Matrix
[[9654  141]
 [  11  169]]
Validation F3 Score: 0.8756476683937823
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      9795
           1       0.55      0.94      0.69       180

    accuracy                           0.98      9975
   macro avg       0.77      0.96      0.84      9975
weighted avg       0.99      0.98      0.99      9975

Best Estimator: RandomForestClassifier(criterion='entropy', max_depth=30, max_features='sqrt',
                       n_estimators=175, random_state=42)


In [9]:
X, y = over_sampling(X_train_20_99, y_train)
y_train_ravel = y.values.ravel()

params = [{"n_components": [2],
          "covariance_type": ["spherical", "tied", "diag", "full"]}]
model = GaussianMixture(random_state=42)
CgridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
CgridSearch.fit(X, y_train_ravel)
y_pred_val = CgridSearch.predict(X_val_20_99)
test_pred = CgridSearch.predict(X_test_20_99)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 2/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 3/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 4/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 5/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 4/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.959 total time=   1.3s
[CV 3/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.962 total time=   1.4s
[CV 5/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.960 total time=   1.3s
[CV 3/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 2/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 1/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 1/5; 1/4]

In [34]:
score_calc("GMM NAN 20 PCA 99%", y_val,  y_pred_val)

Random Forest NAN 20 PCA 99%
Confusion Matrix
[[8575 1220]
 [   3  177]]
Validation F3 Score: 0.5866755054690088
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.88      0.93      9795
           1       0.13      0.98      0.22       180

    accuracy                           0.88      9975
   macro avg       0.56      0.93      0.58      9975
weighted avg       0.98      0.88      0.92      9975



In [10]:
train_pred = CgridSearch.predict(X_train_20_99)
pred = np.reshape(train_pred, (train_pred.shape[0], 1))
X_train_20_99_cluster = np.append(X_train_20_99, pred, axis=1)

y_pred_val = CgridSearch.predict(X_val_20_99)
pred = np.reshape(y_pred_val, (y_pred_val.shape[0], 1))
X_val_20_99_cluster = np.append(X_val_20_99, pred, axis=1)

test_pred = CgridSearch.predict(X_test_20_99)
pred = np.reshape(test_pred, (test_pred.shape[0], 1))
X_test_20_99_cluster = np.append(X_test_20_99, pred, axis=1)

print(X_train_20_99_cluster[0].size)
print(X_val_20_99_cluster[0].size)
print(X_test_20_99_cluster[0].size)




98
98
98


In [11]:
X, y = over_sampling(X_train_20_99_cluster, y_train)
y_train_ravel = y.values.ravel()

params = [{'criterion': ['gini', 'entropy'], 
           'max_features': ['sqrt'], 
           'n_estimators': range(125,201,25),
           'max_depth': [10,20,30]}]
model = RandomForestClassifier(random_state=42)
gridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
gridSearch.fit(X, y_train_ravel)
y_pred_val = gridSearch.predict(X_val_20_99_cluster)
test_pred = gridSearch.predict(X_test_20_99_cluster)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 2/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 4/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 5/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 3/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 4/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.983 total time=  50.3s
[CV 1/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 1/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.985 total time=  50.4s
[CV 2/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 2/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=

In [12]:
score_calc("Random Forest With Cluster NAN 20 PCA 99%", y_val,  y_pred_val)
save_model(gridSearch, test_pred, "Random_Forest_Cluster_20_99", True)

Random Forest With Cluster NAN 20 PCA 99%
Confusion Matrix
[[9649  146]
 [  16  164]]
Validation F3 Score: 0.8497409326424871
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      9795
           1       0.53      0.91      0.67       180

    accuracy                           0.98      9975
   macro avg       0.76      0.95      0.83      9975
weighted avg       0.99      0.98      0.99      9975

Best Estimator: RandomForestClassifier(criterion='entropy', max_depth=30, max_features='sqrt',
                       n_estimators=175, random_state=42)


In [11]:
X, y = over_sampling(X_train_30_99, y_train)
y_train_ravel = y.values.ravel()

params = [{"n_components": [2],
          "covariance_type": ["spherical", "tied", "diag", "full"]}]
model = GaussianMixture(random_state=42)
CgridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
CgridSearch.fit(X, y_train_ravel)
y_pred_val = CgridSearch.predict(X_train_30_99)
test_pred = CgridSearch.predict(X_test_30_99)

train_pred = CgridSearch.predict(X_train_30_99)
pred = np.reshape(train_pred, (train_pred.shape[0], 1))
X_train_30_99_cluster = np.append(X_train_30_99, pred, axis=1)

y_pred_val = CgridSearch.predict(X_val_30_99)
pred = np.reshape(y_pred_val, (y_pred_val.shape[0], 1))
X_val_30_99_cluster = np.append(X_val_30_99, pred, axis=1)

test_pred = CgridSearch.predict(X_test_30_99)
pred = np.reshape(test_pred, (test_pred.shape[0], 1))
X_test_30_99_cluster = np.append(X_test_30_99, pred, axis=1)

print(X_train_30_99_cluster[0].size)
print(X_val_30_99_cluster[0].size)
print(X_test_30_99_cluster[0].size)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 2/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 3/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 4/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 5/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 4/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.960 total time=   1.4s
[CV 5/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.955 total time=   1.3s
[CV 1/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 2/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 3/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.959 total time=   1.5s
[CV 3/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 2/5; 1/4]

In [12]:
X, y = over_sampling(X_train_30_99_cluster, y_train)
y_train_ravel = y.values.ravel()

params = [{'criterion': ['gini', 'entropy'], 
           'max_features': ['sqrt'], 
           'n_estimators': range(125,201,25),
           'max_depth': [10,20,30]}]
model = RandomForestClassifier(random_state=42)
gridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
gridSearch.fit(X, y_train_ravel)
y_pred_val = gridSearch.predict(X_val_30_99_cluster)
test_pred = gridSearch.predict(X_test_30_99_cluster)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 2/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 3/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 4/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 1/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 5/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 1/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.992 total time=  54.1s
[CV 2/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.991 total time=  54.1s
[CV 1/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 2/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 5/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=

In [13]:
score_calc("Random Forest With Cluster NAN 30 PCA 99%", y_val,  y_pred_val)
save_model(gridSearch, test_pred, "Random_Forest_Cluster_30_99", True)

Random Forest With Cluster NAN 30 PCA 99%
Confusion Matrix
[[9672  122]
 [  20  161]]
Validation F3 Score: 0.8420502092050208
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      9794
           1       0.57      0.89      0.69       181

    accuracy                           0.99      9975
   macro avg       0.78      0.94      0.84      9975
weighted avg       0.99      0.99      0.99      9975

Best Estimator: RandomForestClassifier(criterion='entropy', max_depth=30, max_features='sqrt',
                       n_estimators=200, random_state=42)


In [13]:
X, y = over_sampling(X_train_40_99, y_train)
y_train_ravel = y.values.ravel()

params = [{'criterion': ['gini', 'entropy'], 
           'max_features': ['sqrt'], 
           'n_estimators': range(125,201,25),
           'max_depth': [10,20,30]}]
model = RandomForestClassifier(random_state=42)
gridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
gridSearch.fit(X, y_train_ravel)
y_pred_val = gridSearch.predict(X_val_40_99)
test_pred = gridSearch.predict(X_test_40_99)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 2/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125[CV 3/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125

[CV 4/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 5/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 2/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.973 total time=  58.9s
[CV 5/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.974 total time=  58.8s
[CV 1/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 3/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.972 total time=  58.9s
[CV 2/5; 2/24] START criterion=gini, max_depth=10, max_feature

In [14]:
score_calc("Random Forest NAN 40 PCA 99%", y_val,  y_pred_val)
save_model(gridSearch, test_pred, "Random_Forest_40_99", True)

Random Forest NAN 40 PCA 99%
Confusion Matrix
[[9665  130]
 [  12  168]]
Validation F3 Score: 0.875912408759124
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      9795
           1       0.56      0.93      0.70       180

    accuracy                           0.99      9975
   macro avg       0.78      0.96      0.85      9975
weighted avg       0.99      0.99      0.99      9975

Best Estimator: RandomForestClassifier(criterion='entropy', max_depth=20, max_features='sqrt',
                       n_estimators=125, random_state=42)


In [9]:
X, y = over_sampling(X_train_40_99, y_train)
y_train_ravel = y.values.ravel()

params = [{"n_components": [2],
          "covariance_type": ["spherical", "tied", "diag", "full"]}]
model = GaussianMixture(random_state=42)
CgridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
CgridSearch.fit(X, y_train_ravel)
y_pred_val = CgridSearch.predict(X_train_40_99)
test_pred = CgridSearch.predict(X_test_40_99)

train_pred = CgridSearch.predict(X_train_40_99)
pred = np.reshape(train_pred, (train_pred.shape[0], 1))
X_train_40_99_cluster = np.append(X_train_40_99, pred, axis=1)

y_pred_val = CgridSearch.predict(X_val_40_99)
pred = np.reshape(y_pred_val, (y_pred_val.shape[0], 1))
X_val_40_99_cluster = np.append(X_val_40_99, pred, axis=1)

test_pred = CgridSearch.predict(X_test_40_99)
pred = np.reshape(test_pred, (test_pred.shape[0], 1))
X_test_40_99_cluster = np.append(X_test_40_99, pred, axis=1)

print(X_train_40_99_cluster[0].size)
print(X_val_40_99_cluster[0].size)
print(X_test_40_99_cluster[0].size)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 2/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 3/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 4/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 5/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 5/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.948 total time=   1.3s
[CV 4/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.946 total time=   1.4s
[CV 1/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 2/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 3/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.947 total time=   1.6s
[CV 3/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 1/5; 1/4]

In [10]:
X, y = over_sampling(X_train_40_99_cluster, y_train)
y_train_ravel = y.values.ravel()

params = [{'criterion': ['gini', 'entropy'], 
           'max_features': ['sqrt'], 
           'n_estimators': range(125,201,25),
           'max_depth': [10,20,30]}]
model = RandomForestClassifier(random_state=42)
gridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
gridSearch.fit(X, y_train_ravel)
y_pred_val = gridSearch.predict(X_val_40_99_cluster)
test_pred = gridSearch.predict(X_test_40_99_cluster)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 2/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 4/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 1/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 3/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 5/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 1/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.986 total time=  57.5s
[CV 1/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 4/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.986 total time=  57.6s
[CV 2/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 5/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=

In [11]:
score_calc("Random Forest With Cluster NAN 40 PCA 99%", y_val,  y_pred_val)
save_model(gridSearch, test_pred, "Random_Forest_Cluster_40_99", True)

Random Forest With Cluster NAN 40 PCA 99%
Confusion Matrix
[[9654  140]
 [  27  154]]
Validation F3 Score: 0.8008320332813313
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      9794
           1       0.52      0.85      0.65       181

    accuracy                           0.98      9975
   macro avg       0.76      0.92      0.82      9975
weighted avg       0.99      0.98      0.99      9975

Best Estimator: RandomForestClassifier(criterion='entropy', max_depth=20, max_features='sqrt',
                       n_estimators=125, random_state=42)


In [15]:
X, y = over_sampling(X_train_60_99, y_train)
y_train_ravel = y.values.ravel()

params = [{'criterion': ['gini', 'entropy'], 
           'max_features': ['sqrt'], 
           'n_estimators': range(125,201,25),
           'max_depth': [10,20,30]}]
model = RandomForestClassifier(random_state=42)
gridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
gridSearch.fit(X, y_train_ravel)
y_pred_val = gridSearch.predict(X_val_60_99)
test_pred = gridSearch.predict(X_test_60_99)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 2/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 4/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 3/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 5/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 1/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.976 total time= 1.2min
[CV 2/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.975 total time= 1.2min
[CV 3/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.976 total time= 1.2min
[CV 1/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 3/5; 2/24] START criterion=gini, max_depth=10, max_feature

In [16]:
score_calc("Random Forest NAN 60 PCA 99%", y_val,  y_pred_val)
save_model(gridSearch, test_pred, "Random_Forest_60_99", True)

Random Forest NAN 60 PCA 99%
Confusion Matrix
[[9660  135]
 [  13  167]]
Validation F3 Score: 0.8688865764828305
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      9795
           1       0.55      0.93      0.69       180

    accuracy                           0.99      9975
   macro avg       0.78      0.96      0.84      9975
weighted avg       0.99      0.99      0.99      9975

Best Estimator: RandomForestClassifier(criterion='entropy', max_depth=20, max_features='sqrt',
                       n_estimators=200, random_state=42)


In [23]:
X, y = over_sampling(X_train_60_99, y_train)
y_train_ravel = y.values.ravel()

params = [{"n_components": [2],
          "covariance_type": ["spherical", "tied", "diag", "full"]}]
model = GaussianMixture(random_state=42)
CgridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
CgridSearch.fit(X, y_train_ravel)
y_pred_val = CgridSearch.predict(X_val_60_99)
test_pred = CgridSearch.predict(X_test_60_99)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 2/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 3/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 4/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 5/5; 1/4] START covariance_type=spherical, n_components=2...................
[CV 4/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.063 total time=   1.3s
[CV 5/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.940 total time=   1.3s
[CV 1/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 2/5; 2/4] START covariance_type=tied, n_components=2........................
[CV 2/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.955 total time=   1.6s
[CV 3/5; 1/4] END covariance_type=spherical, n_components=2;, score=0.933 total time=   1.6s
[

In [24]:
score_calc("GMM NAN 60 PCA 99%", y_val,  y_pred_val)

GMM NAN 60 PCA 99%
Confusion Matrix
[[1200 8594]
 [ 179    2]]
Validation F3 Score: 0.0019559902200489
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.12      0.21      9794
           1       0.00      0.01      0.00       181

    accuracy                           0.12      9975
   macro avg       0.44      0.07      0.11      9975
weighted avg       0.85      0.12      0.21      9975



In [25]:
train_pred = CgridSearch.predict(X_train_60_99)
pred = np.reshape(train_pred, (train_pred.shape[0], 1))
X_train_60_99_cluster = np.append(X_train_60_99, pred, axis=1)

y_pred_val = CgridSearch.predict(X_val_60_99)
pred = np.reshape(y_pred_val, (y_pred_val.shape[0], 1))
X_val_60_99_cluster = np.append(X_val_60_99, pred, axis=1)

test_pred = CgridSearch.predict(X_test_60_99)
pred = np.reshape(test_pred, (test_pred.shape[0], 1))
X_test_60_99_cluster = np.append(X_test_60_99, pred, axis=1)

print(X_train_60_99_cluster[0].size)
print(X_val_60_99_cluster[0].size)
print(X_test_60_99_cluster[0].size)

109
109
109


In [26]:
X, y = over_sampling(X_train_60_99_cluster, y_train)
y_train_ravel = y.values.ravel()

params = [{'criterion': ['gini', 'entropy'], 
           'max_features': ['sqrt'], 
           'n_estimators': range(125,201,25),
           'max_depth': [10,20,30]}]
model = RandomForestClassifier(random_state=42)
gridSearch = GridSearchCV(model, params, cv=5, scoring='recall', verbose=10, n_jobs=5) 
gridSearch.fit(X, y_train_ravel)
y_pred_val = gridSearch.predict(X_val_60_99_cluster)
test_pred = gridSearch.predict(X_test_60_99_cluster)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 2/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 3/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 4/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 5/5; 1/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125
[CV 2/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.986 total time=  53.9s
[CV 1/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 4/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=0.985 total time=  53.9s
[CV 2/5; 2/24] START criterion=gini, max_depth=10, max_features=sqrt, n_estimators=150
[CV 1/5; 1/24] END criterion=gini, max_depth=10, max_features=sqrt, n_estimators=125;, score=

In [27]:
score_calc("Random Forest With Cluster NAN 60 PCA 99%", y_val,  y_pred_val)
save_model(gridSearch, test_pred, "Random_Forest_Cluster_60_99", True)

Random Forest With Cluster NAN 60 PCA 99%
Confusion Matrix
[[9664  130]
 [  21  160]]
Validation F3 Score: 0.8337675872850443
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      9794
           1       0.55      0.88      0.68       181

    accuracy                           0.98      9975
   macro avg       0.77      0.94      0.84      9975
weighted avg       0.99      0.98      0.99      9975

Best Estimator: RandomForestClassifier(criterion='entropy', max_depth=20, max_features='sqrt',
                       n_estimators=175, random_state=42)
