In [1]:
# Load Libaries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Preprocessing
df = pd.read_csv('data.csv')
df = df.drop(df.columns[0], axis=1)
print(df.head())

# Define features and target
X = df.values[:, 1:7]
Y = df.values[:,0]

# Split Data in Train and Test Data
X_traindata, X_testdata, Y_traindata, Y_testdata = train_test_split(X, Y, test_size=0.30,random_state=101)

   returnLabel  totalAmount  c_0  c_1  c_2  c_3  c_4  c_5
0            0          240    1    0    3    1    3    0
1            0           79    0    0    1    0    0    0
2            0           24    0    0    1    1    0    0
3            0          504    0    3    0    0    1    2
4            0          190    1    0    1    1    2    0


In [3]:
# Modify the Data to have one dataframe
df1 = pd.DataFrame(X_traindata, columns=['totalAmount', 'c_1', 'c_2', 'c_3', 'c_4', 'c_5'])
df2 = pd.DataFrame(Y_traindata, columns=['returnLabel'])
df_train = pd.concat([df2, df1], axis=1)

# Class count
count_label_0, count_label_1 = df_train.returnLabel.value_counts()

# Divide by class
df_label_0 = df_train[df_train['returnLabel'] == 0]
df_label_1 = df_train[df_train['returnLabel'] == 1]

# print(df_label_0.head())
# print(df_label_1.head())

# Count the returnLabel for 0 and 1
print("ReturnLabels 0: ", df_label_0['returnLabel'].count())
print("ReturnLabels 1: ", df_label_1['returnLabel'].count())

ReturnLabels 0:  21080
ReturnLabels 1:  620


In [9]:
# Random undersampling
# Reduce the record for the ReturnLabel = 1 to the size of the record for the ReturnLabel = 0
df_label_0_undersampling = df_label_0.sample(count_label_1, random_state=1)
df_undersampled = pd.concat([df_label_0_undersampling, df_label_1], axis=0)
print(df_undersampled.head())

       returnLabel  totalAmount  c_1  c_2  c_3  c_4  c_5
19457            0           60    2    0    1    0    0
21613            0          440    0    0    4    0    1
14937            0          145    1    2    1    0    0
9666             0           77    0    0    1    0    0
18153            0           10    0    0    1    0    0


In [10]:
# Preprocessing for the RandomForest with the undersampled data
# Define features and target
X_under = df.values[:, 1:7]
Y_under = df.values[:,0]

In [11]:
# RandomForest with the undersampled data
rf = RandomForestClassifier(random_state=0, n_jobs=-1)
print(rf)

# Train the model
model = rf.fit(X_under, Y_under)

predictions = model.predict(X_testdata)
print(classification_report(Y_testdata,predictions))
print("Accuracy:", accuracy_score(Y_testdata, predictions))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      9007
           1       0.95      0.60      0.74       293

   micro avg       0.99      0.99      0.99      9300
   macro avg       0.97      0.80      0.87      9300
weighted avg       0.99      0.99      0.98      9300

Accuracy: 0.9864516129032258


In [12]:
# Compare with a RandomForest with imbalanced data
rfComp = RandomForestClassifier(random_state=0, n_jobs=-1)
print(rfComp)

# Train the model
modelComp = rf.fit(X_traindata, Y_traindata)

predictionsComp = modelComp.predict(X_testdata)
print(classification_report(Y_testdata,predictionsComp))
print("Accuracy:", accuracy_score(Y_testdata, predictionsComp))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      9007
           1       0.32      0.11      0.16       293

   micro avg       0.96      0.96      0.96      9300
   macro avg       0.65      0.55      0.57      9300
weighted avg       0.95      0.96      0.96      9300

Accuracy: 0.9646236559139785
