In [15]:
###Basic Imports required for random 
import pandas as pd
from sklearn import model_selection , preprocessing
from sklearn.model_selection import cross_val_score , train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

In [16]:
# As Random Forest classifier doesnt accept String values , all categorical data with string categories need to converted to
# numeric categorical values which is done using LabelEncoder() method

## The fit_transform() method helps to convert the values and store them in the attribute.

df = pd.read_csv("../Datasets/FinalMergedDataset/sample_dataset.csv")

df["Source"] = preprocessing.LabelEncoder().fit_transform(df["Source"])

df["Destination"] = preprocessing.LabelEncoder().fit_transform(df["Destination"])

df["Flight Name"] = preprocessing.LabelEncoder().fit_transform(df["Flight Name"])

df["type"] = preprocessing.LabelEncoder().fit_transform(df["type"])


df = df[["Source","Destination","Flight Name","type" , "Status" ,"Delay" , "Delay_Status"]]
df.head(20)

Unnamed: 0,Source,Destination,Flight Name,type,Status,Delay,Delay_Status
0,19,40,27,0,0,1.0,On time
1,19,40,2,0,0,29.0,Delayed
2,43,10,48,1,1,13.0,On time
3,43,21,27,1,1,25.0,Delayed
4,8,40,27,0,0,15.0,On time
5,1,40,23,0,0,48.0,Delayed
6,23,40,25,0,0,23.0,Delayed
7,43,14,27,1,1,50.0,Delayed
8,43,44,2,1,1,72.0,Delayed
9,29,40,25,0,0,31.0,Delayed


In [17]:
# X and Y are independent and dependent variables , in order to do that , we split the dataset into the class to be predicted
# and class of all other attributes namely stored in the variables X and Y respectively
X = df.drop("Delay_Status" , axis =1 )
Y = df["Delay_Status"]

In [18]:
# In order to train and test the model , the train_test_split divides the dataset into 2 paritions , 1 for training 
#and 1 for testing the data

X_train , X_test , Y_train , Y_test = train_test_split(X,Y,test_size = 0.2)

In [19]:
#This cell initializes the Random Classifier model and fits the training data into the model
rfc = RandomForestClassifier()
rfc.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [20]:
# To predict the data we send the test data of other attributes to be predicted
rfc_predict = rfc.predict(X_test)

In [21]:
# A score for accuracty is generated from the above test data
score = cross_val_score(rfc,X,Y,cv = 10 , scoring="roc_auc")

In [22]:
# Confusion matrix of the test data

print("=== Confusion Matrix ===")
print(confusion_matrix(Y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(Y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", score.mean())

=== Confusion Matrix ===
[[53  0]
 [ 0 45]]


=== Classification Report ===
              precision    recall  f1-score   support

     Delayed       1.00      1.00      1.00        53
     On time       1.00      1.00      1.00        45

    accuracy                           1.00        98
   macro avg       1.00      1.00      1.00        98
weighted avg       1.00      1.00      1.00        98



=== All AUC Scores ===
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  1.0


In [23]:
### if partition size is changed to the inverse ratio of training and testing
# ie. Training data is 0.1% and testing data is 99.9%
X_train , X_test , Y_train , Y_test = train_test_split(X,Y,test_size = 0.99)
rfc = RandomForestClassifier()
rfc.fit(X_train,Y_train)

rfc_predict = rfc.predict(X_test)
score = cross_val_score(rfc,X,Y,cv = 10 , scoring="roc_auc")

print("=== Confusion Matrix ===")
print(confusion_matrix(Y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(Y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", score.mean())

=== Confusion Matrix ===
[[230  66]
 [  0 188]]


=== Classification Report ===
              precision    recall  f1-score   support

     Delayed       1.00      0.78      0.87       296
     On time       0.74      1.00      0.85       188

    accuracy                           0.86       484
   macro avg       0.87      0.89      0.86       484
weighted avg       0.90      0.86      0.87       484



=== All AUC Scores ===
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  1.0
