In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [None]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [None]:
# Database details: ...

# Column names selected from the CSV when it is loaded (empty placeholders here)
columns = [
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
]

# Column(s) dropped from the feature set when X is built
target = [""]

In [None]:
# Read the CSV (skipping its first line), discard the final two rows,
# and keep only the columns listed in `columns`
file_path = Path('fileName.csv')
df = pd.read_csv(file_path, skiprows=1).iloc[:-2]
df = df.loc[:, columns].copy()

# Remove columns that are entirely null, then any row that still has a null
df = df.dropna(axis='columns', how='all').dropna()

# Keep every row except those whose 'columnName' equals 'class'
all_but_specific_detail = df['columnName'].ne('class')
df = df[all_but_specific_detail]

# Strip the percent sign and rescale the value by 1/100 as a float
df[''] = df[''].str.replace('%', '').astype('float') / 100

# Map individual values onto replacement strings
x = {'': ''}
df = df.replace(x)

# Map a group of values onto one shared replacement string
x = dict.fromkeys(['', '', '', ''], '')
df = df.replace(x)

# Renumber the rows from zero after the filtering above
df = df.reset_index(drop=True)

df.head()

In [None]:
# Create the OneHotEncoder instance
from sklearn.preprocessing import OneHotEncoder

# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old spelling was later removed, so try the new
# name first and fall back for older installations.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Fit the encoder and produce the encoded DataFrame
encode_df = pd.DataFrame(enc.fit_transform(df.ColumnName.values.reshape(-1, 1)))

# Rename encoded columns. get_feature_names() was removed from scikit-learn
# in favour of get_feature_names_out(), so use whichever this version offers.
if hasattr(enc, 'get_feature_names_out'):
    encode_df.columns = enc.get_feature_names_out(['ColumnName'])
else:
    encode_df.columns = enc.get_feature_names(['ColumnName'])

# Merge the two DataFrames on their index and drop the original 'ColumnName'
# column (keyword form: the positional axis argument is rejected by pandas 2).
# The merged frame is only displayed here; df itself is left unchanged.
df.merge(encode_df, left_index=True, right_index=True).drop(columns='ColumnName')

# Split the Data into Training and Testing

In [None]:
# Features: every column except the target, with dummy (indicator) columns
# generated by pandas for the remaining data
feature_frame = df.drop(target, axis='columns')
X = pd.get_dummies(feature_frame)

# Target: the single label column
y = df['']

In [None]:
# Display summary statistics for the feature columns
X.describe()

In [None]:
# Check the balance of our target values by counting the rows in each class
# Class imbalance to be addressed in specific section below
y.value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Stratify on y and fix the seed so the split is repeatable
split_parts = train_test_split(X, y, stratify=y, random_state=1)
X_train, X_test, y_train, y_test = split_parts
X_train.shape

# Scaling the Data

In [None]:
# StandardScaler is not imported by any earlier cell, so import it here
from sklearn.preprocessing import StandardScaler

# Creating StandardScaler instance
scaler = StandardScaler()
# Fit on the training split only, so the scaling parameters come from training data
X_scaler = scaler.fit(X_train)
# Apply the same fitted scaling to both splits
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Algorithms to train and evaluate performance

Determine which algorithm results in the best performance.   
Logistic Regression.   
Support Vector Machines (not added yet).  
Balanced Random Forest Classifier.      
Easy Ensemble AdaBoost Classifier.  
Gradient Boosting Classifier.  
Neural Networks and Deep Learning (not added yet).

For each algorithm, we perform the following steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classification report using `classification_report_imbalanced` from imbalanced-learn.
5. For the Balanced Random Forest Classifier only, print the feature importances sorted in descending order (most important feature to least important) along with each feature's score.

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Logistic Regression

In [None]:
# Fit a logistic regression classifier on the scaled training data
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(random_state=1, solver='lbfgs')
lr_model = lr_model.fit(X_train_scaled, y_train)
lr_model

In [None]:
# Calculate the balanced accuracy score.
# Predict with lr_model, the classifier trained in this section; the name
# `model` is not assigned until the resampling section further down, so using
# it here fails on a fresh kernel (or silently scores a different model).
y_pred = lr_model.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix of the test labels against the latest predictions
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Build the imbalanced classification report, then print it
lr_report = classification_report_imbalanced(y_test, y_pred)
print(lr_report)

### Balanced Random Forest Classifier

In [None]:
# Fit a balanced random forest (100 estimators) on the scaled training data
from imblearn.ensemble import BalancedRandomForestClassifier

brfc_model = BalancedRandomForestClassifier(random_state=1, n_estimators=100)
brfc_model = brfc_model.fit(X_train_scaled, y_train)
brfc_model

In [None]:
# Predict on the scaled test set, then compute the balanced accuracy score
y_pred = brfc_model.predict(X_test_scaled)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix for the balanced random forest predictions
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Build the imbalanced classification report, then print it
from imblearn.metrics import classification_report_imbalanced

brfc_report = classification_report_imbalanced(y_test, y_pred)
print(brfc_report)

In [None]:
# Pair each importance score with its feature name, then order the pairs
# from the highest score to the lowest
importance_pairs = list(zip(brfc_model.feature_importances_, X.columns))
importance_pairs.sort(reverse=True)
importance_pairs

### Easy Ensemble AdaBoost Classifier

In [None]:
# Fit an Easy Ensemble classifier (100 estimators) on the scaled training data
from imblearn.ensemble import EasyEnsembleClassifier

eec_model = EasyEnsembleClassifier(random_state=1, n_estimators=100)
eec_model = eec_model.fit(X_train_scaled, y_train)
eec_model

In [None]:
# Calculate the balanced accuracy score.
# eec_model was fitted on X_train_scaled, so it must predict on the scaled
# test features as well; passing the unscaled X_test gives the model inputs
# on a different scale from the one it was trained on.
y_pred = eec_model.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix for the Easy Ensemble predictions
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Build the imbalanced classification report, then print it
eec_report = classification_report_imbalanced(y_test, y_pred)
print(eec_report)

### Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit one gradient boosting classifier per candidate learning rate and report
# its accuracy on the training data and on the held-out test data
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=5,
        max_depth=3,
        random_state=0,
    )
    classifier.fit(X_train_scaled, y_train)

    # Score once per split, then print in the same order as before
    train_accuracy = classifier.score(X_train_scaled, y_train)
    test_accuracy = classifier.score(X_test_scaled, y_test)

    print("Learning rate: ", learning_rate)
    print(f"Accuracy score (training): {train_accuracy:.3f}")
    print(f"Accuracy score (validation): {test_accuracy:.3f}")
    print()

In [None]:
# Choose a learning rate and create classifier
classifier = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=0.5,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)
# Fit the model
classifier.fit(X_train_scaled, y_train)
# Make predictions on the scaled test set
predictions = classifier.predict(X_test_scaled)
# The evaluation cells that follow read `y_pred`; without this assignment they
# would report on the previous section's predictions instead of these.
y_pred = predictions
# Balanced accuracy score, as computed for the other algorithms
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix for the gradient boosting model.
# Use `predictions` (this section's output); `y_pred` was last assigned by the
# Easy Ensemble section.
confusion_matrix(y_test, predictions)

In [None]:
# Print the imbalanced classification report for the gradient boosting model.
# Use `predictions` (this section's output); `y_pred` was last assigned by the
# Easy Ensemble section.
print(classification_report_imbalanced(y_test, predictions))

# Class Imbalance
Compare sampling algorithms performance.   
Naive random oversampling  
SMOTE   
Cluster Centroids  
SMOTEENN  

For each algorithm, we perform the following steps:  
View the count of the target classes using Counter from the collections library.  
Use the resampled data to train a logistic regression model (other model to be added).  
Calculate the balanced accuracy score from sklearn.metrics.  
Print the confusion matrix from sklearn.metrics.  
Generate a classification report using `classification_report_imbalanced` from imbalanced-learn. 

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests.  

### Naive Random Oversampling

In [None]:
# Resample the training data with the RandomOverSampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)
# y_train is a single target column (a Series), so count its class labels
# directly; indexing the resampled target by a column name raises KeyError.
print(Counter(y_resampled))

In [None]:
# Fit logistic regression on the randomly oversampled training data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=1, solver='lbfgs')
model = model.fit(X_resampled, y_resampled)
model

In [None]:
# Predict on the scaled test set, then compute the balanced accuracy score
y_pred = model.predict(X_test_scaled)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix for the model trained on randomly oversampled data
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Build the imbalanced classification report, then print it
ros_report = classification_report_imbalanced(y_test, y_pred)
print(ros_report)

### SMOTE Oversampling

In [None]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train_scaled, y_train)
# y_train is a single target column (a Series), so count its class labels
# directly; indexing the resampled target by a column name raises KeyError.
print(Counter(y_resampled))

In [None]:
# Fit logistic regression on the SMOTE-resampled training data
model2 = LogisticRegression(random_state=1, solver='lbfgs')
model2 = model2.fit(X_resampled, y_resampled)
model2

In [None]:
# Predict on the scaled test set, then compute the balanced accuracy score
y_pred = model2.predict(X_test_scaled)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix for the model trained on SMOTE-resampled data
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Build the imbalanced classification report, then print it
smote_report = classification_report_imbalanced(y_test, y_pred)
print(smote_report)

### Cluster Centroids Undersampling

In [None]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train_scaled, y_train)
# y_train is a single target column (a Series), so count its class labels
# directly; indexing the resampled target by a column name raises KeyError.
print(Counter(y_resampled))

In [None]:
# Fit logistic regression on the ClusterCentroids-undersampled training data
model3 = LogisticRegression(random_state=1, solver='lbfgs')
model3 = model3.fit(X_resampled, y_resampled)
model3

In [None]:
# Predict on the scaled test set, then compute the balanced accuracy score
y_pred = model3.predict(X_test_scaled)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix for the model trained on undersampled data
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Build the imbalanced classification report, then print it
cc_report = classification_report_imbalanced(y_test, y_pred)
print(cc_report)

### SMOTEENN

In [None]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN
# Seed of 1 to match the other samplers, per the note that every sampling
# algorithm uses a random state of 1 (this cell previously used 0)
smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled, y_train)
# y_train is a single target column (a Series), so count its class labels
# directly; indexing the resampled target by a column name raises KeyError.
print(Counter(y_resampled))

In [None]:
# Fit logistic regression on the SMOTEENN-resampled training data
model4 = LogisticRegression(random_state=1, solver='lbfgs')
model4 = model4.fit(X_resampled, y_resampled)
model4

In [None]:
# Predict on the scaled test set, then compute the balanced accuracy score
y_pred = model4.predict(X_test_scaled)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix for the model trained on SMOTEENN-resampled data
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Build the imbalanced classification report, then print it
smoteenn_report = classification_report_imbalanced(y_test, y_pred)
print(smoteenn_report)