In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import zscore
import matplotlib.pyplot as pt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
from six import StringIO  
from sklearn.tree import export_graphviz
import pydotplus, graphviz
from IPython.display import Image  
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix

In [None]:
#!pip install graphviz
#brew install graphviz
#!conda install python-graphviz

In [None]:
#!pip install pydotplus

In [None]:
#conda install -c anaconda graphviz   

In [None]:
# Directory holding the project CSV files; edit this single constant to point
# the notebook at a different copy of the data.
DATA_DIR = '/Users/huzaifkherani/Desktop/AML/Project/DATA'

# Training data (contains the Severity target) and the test data that is
# scored at the end of the notebook.
data_train = pd.read_csv(f'{DATA_DIR}/data.csv')
data_test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Preview the first rows of the training frame.
data_train.head()

# Exploratory Data Analysis

In [None]:
# Column names, dtypes and non-null counts of the training frame.
data_train.info()

In [None]:
data_train.isnull().sum() # Per-column count of null values in the dataset

In [None]:
# Accident_ID is presumably just a row identifier, so it is excluded from the
# analysis. Reassigning with errors='ignore' (instead of dropping in place)
# keeps this cell safe to re-run: a second run is a no-op, not a KeyError.
data_train = data_train.drop(columns=['Accident_ID'], errors='ignore')

In [None]:
# Correlation matrix of the predictors only: the Severity target is left out.
features_only = data_train.drop(columns=["Severity"])
features_only.corr()

# Observations
#### Days since inspection has a strong negative correlation with Safety Score
#### Turbulence in gforces has a strong negative correlation with Control Metric
#### Accident type code has a weak positive correlation with the Safety Score
#### Adverse Weather Metric has a strong negative correlation with Accident Type Code
#### Adverse Weather Metric has a weak positive correlation with Max Elevation

In [None]:
# Checking correlation
# Severity is not label-encoded until later in the notebook, and
# DataFrame.corr() raises on pandas >= 2.0 when the frame contains non-numeric
# columns, so the matrix is computed on the numeric columns only.
numeric_corr = data_train.select_dtypes(include='number').corr()

# One full-width axes; the previous subplot(1, 2, 1) left the right half of
# the saved figure empty.
fig, ax = pt.subplots(figsize=(15, 7))
sns.heatmap(numeric_corr, ax=ax)
ax.set_title("Train Data")
fig.savefig('Correlation Heatmap.png')

In [None]:
# Rows whose Days_Since_Inspection value is exactly 1.
inspected_after_one_day = data_train["Days_Since_Inspection"].eq(1)
inspec = data_train.loc[inspected_after_one_day]
inspec

In [None]:
# Number of records per Severity class.
pt.figure()
# Name the column through the x= keyword: seaborn >= 0.12 treats a single
# positional argument as `data`, which no longer produces the per-class count.
sns.countplot(x='Severity', data=data_train)
# Rotate the labels after plotting so the rotation applies to the category
# ticks seaborn has just created.
pt.xticks(rotation = 90)
pt.savefig('Severity vs Count graph.png')

In [None]:
# "Accident_Type_Code" and "Severity" are categorical variables, so both are
# left out of the box plot (Accident_Type_Code was previously still listed,
# contradicting this note).
pt.figure(figsize=(16,6))
data_train.boxplot(column=['Safety_Score', 'Days_Since_Inspection', 'Total_Safety_Complaints',
                           'Control_Metric', 'Cabin_Temperature', 'Violations'])
pt.savefig('Box plot 1.png')

In [None]:
# Box plot of Max_Elevation on its own figure, written to 'Boxplot 2.png'.
fig, ax = pt.subplots(figsize=(12, 6))
data_train.boxplot(column=['Max_Elevation'], ax=ax)
fig.savefig('Boxplot 2.png')

In [None]:
# Box plot of the two remaining metrics, written to 'Boxplot 3.png'.
fig, ax = pt.subplots(figsize=(12, 6))
data_train.boxplot(column=['Turbulence_In_gforces', 'Adverse_Weather_Metric'], ax=ax)
fig.savefig('Boxplot 3.png')

In [None]:
# Re-check the dtypes: the next cell selects columns by their float64 dtype.
data_train.info()

In [None]:
# Separate frame holding only the float64 columns of the training data.
data_num = data_train.select_dtypes(include=['float64']).copy()
data_num.head()

In [None]:
# Applying zscore

In [None]:
# Replace every column of data_num with its z-score; DataFrame.apply hands
# scipy.stats.zscore one column at a time.
data_num=data_num.apply(zscore)

In [None]:
# Preview the standardised values.
data_num.head()

# Removing all records with a z-score greater than 3 or less than -3.

In [None]:
# A row is an outlier when ANY standardised float feature falls outside
# [-3, 3]. One vectorised mask replaces the earlier per-column loop of
# repeated in-place drops and removes exactly the same rows.
standardised = data_num.select_dtypes(include=['float64'])
outlier_mask = ((standardised > 3) | (standardised < -3)).any(axis=1)
outlier_index = data_num.index[outlier_mask]

# Drop the same row labels from both frames so their indexes stay aligned
# when the scaled columns are merged back into data_train.
data_num.drop(index=outlier_index, inplace=True)
data_train.drop(index=outlier_index, inplace=True)

In [None]:
# Row count after outlier removal.
data_num.info()

# 493 records were removed as they were considered outliers

# Merging the scaled columns back to the original dataframe

In [None]:
data_train.drop(data_train.columns[data_train.dtypes == 'float64'],axis=1,inplace=True)

In [None]:
data_train.head()

In [None]:
for column in data_num.columns:
    data_train[column]=data_num[column]

In [None]:
data_train.head()

# Label Encoding the Target Column

In [None]:
# Distinct Severity values before encoding.
data_train['Severity'].unique()

In [None]:
# Fit the encoder on the Severity values, then overwrite the column with the
# resulting integer codes. `encoder` is kept so predictions can be decoded later.
encoder = LabelEncoder().fit(data_train['Severity'])
data_train['Severity'] = encoder.transform(data_train['Severity'])

In [None]:
# Preview the frame after Severity has been replaced by the encoder's output.
data_train.head()

In [None]:
# Checking the unique values of the dependent variable (Severity)
data_train.Severity.unique()

In [None]:
# Checking the unique values in Accident_Type_Code
data_train.Accident_Type_Code.unique()

In [None]:
# Checking the unique values in Violations
data_train.Violations.unique()

In [None]:
# Checking the unique values in Days_Since_Inspection
data_train.Days_Since_Inspection.unique()

In [None]:
data_train['Severity'].describe()
# A description (4-level factor) of the severity of the crash

In [None]:
data_train['Safety_Score'].describe() 
# A measure of how safe the plane was deemed to be.

In [None]:
data_train['Days_Since_Inspection'].describe() 
# A measure of how long the plane went without inspection before the incident.

In [None]:
data_train['Total_Safety_Complaints'].describe()
# No. of complaints from mechanics prior to the accident.

In [None]:
data_train['Control_Metric'].describe()
# An estimation of how much control the pilot had during the incident given the factors at play.

In [None]:
data_train['Turbulence_In_gforces'].describe()
# Recorded turbulence experienced at the time of the accident.

In [None]:
data_train['Cabin_Temperature'].describe()
# Last recorded temp before the incident.

In [None]:
data_train['Max_Elevation'].describe()
# Height from the ground in mts.

In [None]:
data_train['Violations'].describe()
# Number of violations the aircraft received during inspection.

In [None]:
# Bar chart of Severity against Days_Since_Inspection, saved next to the notebook.
inspection_ax = sns.barplot(data=data_train, x="Days_Since_Inspection", y="Severity")
inspection_ax.figure.savefig('Days since inspection vs Severity.png')

In [None]:
# Bar chart of Severity against Accident_Type_Code, saved next to the notebook.
accident_type_ax = sns.barplot(data=data_train, x="Accident_Type_Code", y="Severity")
accident_type_ax.figure.savefig('Accident type code vs Severity.png')

# Feature Engineering

In [None]:
# Replace each listed feature x by 2**x.
# The base is the float 2.0 on purpose: with an integer base, NumPy raises on
# negative integer exponents and silently wraps around once the result no
# longer fits in int64. (The two count columns are presumably integer-typed --
# confirm with data_train.info().)
# NOTE: this cell overwrites its own input, so run it once per kernel session.
for feature in ('Total_Safety_Complaints', 'Days_Since_Inspection', 'Safety_Score'):
    data_train[feature] = np.power(2.0, data_train[feature])

In [None]:
X=data_train.drop(['Severity'],axis=1)

In [None]:
y=data_train['Severity']

# Splitting Data into Train and Test

In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.1, "random_state": 1}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# Shapes of the training and held-out feature matrices, in that order.
for feature_split in (X_train, X_test):
    print(feature_split.shape)

# Decision Tree

In [None]:
# Decision tree tuned by an exhaustive 5-fold grid search.
model = DecisionTreeClassifier(random_state=1234)

# Hyper-parameter grid.
# 'auto' is deliberately not listed under max_features: for a classifier it was
# only an alias of 'sqrt' (so it doubled the work for identical models), and
# scikit-learn >= 1.3 rejects it.
param_grid = {'max_features': ['sqrt', 'log2'],
              'min_samples_split': [5, 10, 15, 20, 25, 50, 100],
              'min_samples_leaf': [5, 6, 7, 8, 9, 10, 11],
              'max_depth': [5, 10, 15, 25, 100],
              'criterion': ['gini', 'entropy']}

# Create grid search object
clf1 = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=5)

# Fit on the training split. fit() returns the search object itself, so
# best_clf_dt and clf1 refer to the same fitted object.
best_clf_dt = clf1.fit(X_train, y_train)

# Predict on the held-out split using the refitted best estimator.
predictions = best_clf_dt.predict(X_test)

print("Accuracy", accuracy_score(y_test,predictions))
print("CLASSIFICATION - REPORT \n")
print("Confusion matrix \n",confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

In [None]:
# Decision tree refitted with the best parameter combination found by the search.
clf1.best_estimator_

In [None]:
# Render the tuned decision tree as a PNG through Graphviz.
dot_data = StringIO()
export_graphviz(
    clf1.best_estimator_,
    out_file=dot_data,
    filled=True,
    rounded=True,
    # Take the names from the training split: `X` is overwritten by the
    # synthetic grid-search example further down the notebook. A plain list is
    # used because some scikit-learn releases reject a pandas Index here.
    feature_names=list(X_train.columns),
    # export_graphviz expects class names in ascending order of the encoded
    # label. LabelEncoder stores its classes in exactly that order, so reading
    # them from the fitted encoder avoids the mislabelled nodes that the
    # previous hand-written (non-alphabetical) list produced.
    class_names=[str(label) for label in encoder.classes_],
)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())


# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Search space explored for the random forest.
param_grid = dict(
    criterion=['gini', 'entropy'],
    n_estimators=[1, 2, 3, 4, 5],
    min_samples_leaf=[1, 2, 3],
    min_samples_split=[3, 4, 5, 6, 7],
)

# Seeded forest wrapped in a 5-fold exhaustive grid search.
model = RandomForestClassifier(random_state=1234)
clf = GridSearchCV(model, param_grid=param_grid, n_jobs=-1, cv=5)

# Fit on the training split, then score the held-out split.
best_clf_rf = clf.fit(X_train, y_train)
predictions = best_clf_rf.predict(X_test)

# Accuracy, confusion matrix and per-class report for the held-out split.
print("Accuracy of Random Forest: ", accuracy_score(y_test, predictions))
print("Confusion matrix \n", confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

In [None]:
# Random forest refitted with the best parameter combination found by the search.
clf.best_estimator_

# Gradient Boosting

In [None]:
# Search space for gradient boosting: number of boosting stages and tree depth.
param_grid = dict(
    n_estimators=[10, 20, 40, 100],
    max_depth=[3, 4, 5, 6],
)

In [None]:
# Seed the estimator so the search results are reproducible across runs,
# matching the seeded tree and forest models above.
gb_model = GradientBoostingClassifier(random_state=1234)

In [None]:
# Exhaustive search over param_grid with GridSearchCV's default settings.
grid = GridSearchCV(gb_model,param_grid)

In [None]:
# Run the search on the training split.
grid.fit(X_train,y_train)

In [None]:
# Parameter combination with the best cross-validated score.
grid.best_params_

In [None]:
# Predict the held-out split with the refitted best estimator.
predictions = grid.predict(X_test)

In [None]:
predictions

In [None]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test,predictions))

In [None]:
# Feature importances of the best estimator, in the column order of X_train.
grid.best_estimator_.feature_importances_

In [None]:
# Example of grid searching key hyperparameters for gradient boosting on a
# SYNTHETIC classification dataset (not the accident data).
# Every name created here carries a demo_ prefix so the cell no longer
# overwrites the notebook's real X, y, model and grid objects; before, re-running
# any earlier cell after this one operated on the synthetic data or on a dict.
# NOTE: 180 parameter combinations x 30 folds -- this cell is slow.
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold

# define dataset
demo_X, demo_y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                                     n_redundant=5, random_state=7)
# define the model with default hyperparameters
demo_model = GradientBoostingClassifier()
# define the grid of values to search
demo_param_grid = {
    'n_estimators': [10, 50, 100, 500],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
    'subsample': [0.5, 0.7, 1.0],
    'max_depth': [3, 7, 9],
}
# define the evaluation procedure
demo_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the grid search procedure
demo_search = GridSearchCV(estimator=demo_model, param_grid=demo_param_grid,
                           n_jobs=-1, cv=demo_cv, scoring='accuracy')
# execute the grid search
demo_result = demo_search.fit(demo_X, demo_y)
# summarize the best score and configuration
print("Best: %f using %s" % (demo_result.best_score_, demo_result.best_params_))
# summarize all scores that were evaluated
demo_means = demo_result.cv_results_['mean_test_score']
demo_stds = demo_result.cv_results_['std_test_score']
demo_params = demo_result.cv_results_['params']
for mean, stdev, param in zip(demo_means, demo_stds, demo_params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Extreme Gradient Boosting

In [None]:
# XGBoost tuned with a randomised search (3 sampled candidates, 10-fold CV).
# Pipeline, XGBClassifier and RandomizedSearchCV come from the import cell at
# the top of the notebook.

# Pipeline with a single XGBoost step; seeded so repeated runs fit the same model.
pipe_XGB = Pipeline([('XGB', XGBClassifier(random_state=1234))])

# Parameter grid (the XGB__ prefix routes each value to the pipeline step).
param_grid = {'XGB__learning_rate': [0.1, 0.2],
              'XGB__max_depth': [5, 10],
              'XGB__gamma': [0.1, 0.3]}

# random_state fixes which 3 candidates are sampled, so the reported best
# parameters no longer change from run to run.
Random_XGB = RandomizedSearchCV(pipe_XGB, param_distributions=param_grid,
                                cv=10, n_iter=3, random_state=1234)

# Fitting the data in the model
Random_XGB.fit(X_train, y_train)

print(" Best cross-validation score obtained is: {:.2f}". format( Random_XGB.best_score_)) 
print(" Best parameters as part of Gridsearch is: ", Random_XGB.best_params_) 
print(" Train set score obtained is: {:.2f}". format( Random_XGB.score( X_train, y_train)))
print(" Test set score obtained is: {:.2f}". format( Random_XGB.score( X_test, y_test)))

In [None]:
# Held-out predictions from the best estimator found by the randomised search.
y_pred=Random_XGB.predict(X_test)

In [None]:
# Macro-averaged metrics for the XGBoost predictions on the held-out split.
# The results get their own xgb_* names: assigning to `accuracy_score` used to
# replace the function imported from sklearn.metrics with a float, so
# re-running the decision-tree or random-forest cell afterwards failed with
# "'float' object is not callable".
xgb_accuracy = metrics.accuracy_score(y_test, y_pred)
xgb_precision = metrics.precision_score(y_test, y_pred, average='macro')
xgb_recall = metrics.recall_score(y_test, y_pred, average='macro')
xgb_f1 = metrics.f1_score(y_test, y_pred, average='macro')
print("The Accuracy of this model is {0:.2f}%".format(xgb_accuracy*100))
print("The Percision of this model is {0:.2f}%".format(xgb_precision*100))
print("The Recall score of this model is {0:.2f}%".format(xgb_recall*100))
print("The f1 score of this model is {0:.2f}%".format(xgb_f1*100))

In [None]:
# Full cross-validation record for every sampled candidate.
Random_XGB.cv_results_

# Predicting the test data

In [None]:
# Drop the identifier column, as was done for the training frame. Reassigning
# with errors='ignore' keeps the cell re-runnable (no KeyError on a second run).
data_test = data_test.drop(columns=['Accident_ID'], errors='ignore')
data_test.head()

In [None]:
# Column dtypes and non-null counts of the test frame.
data_test.info()

In [None]:
# Separate frame holding only the float64 columns of the test data.
num = data_test.select_dtypes(include=['float64']).copy()
num.head()

In [None]:
# Standardise the test features with the TRAINING mean and standard deviation.
# The models were fitted on values scaled by the training statistics, so
# z-scoring the test set against its own mean/std (as before) put the two sets
# on different scales. The raw training file is re-read because data_train no
# longer holds unscaled values at this point. ddof=0 matches the default of
# scipy.stats.zscore used for the training data.
# Assumes every float column of the test file also exists in the training file.
train_raw = pd.read_csv('/Users/huzaifkherani/Desktop/AML/Project/DATA/data.csv')
train_mean = train_raw[num.columns].mean()
train_std = train_raw[num.columns].std(ddof=0)
num = (num - train_mean) / train_std

In [None]:
# Remove the unscaled float64 columns from data_test.
test_float_columns = data_test.select_dtypes(include=['float64']).columns
data_test.drop(columns=test_float_columns, inplace=True)
data_test.head()

In [None]:
for column in num.columns:
    data_test[column]=num[column]

In [None]:
data_test.head()

In [None]:
# Apply the same 2**x feature transform that was applied to the training data.
# The base is the float 2.0 on purpose: with an integer base, NumPy raises on
# negative integer exponents and silently wraps around once the result no
# longer fits in int64.
# NOTE: this cell overwrites its own input, so run it once per kernel session.
for feature in ('Total_Safety_Complaints', 'Days_Since_Inspection', 'Safety_Score'):
    data_test[feature] = np.power(2.0, data_test[feature])

# Predictions using Extreme Gradient Boosting


In [None]:
# Predict encoded Severity classes for the prepared test frame.
# NOTE(review): the next cell adds a Severity column to data_test, so this cell
# presumably fails if re-run afterwards (extra column) -- confirm.
testPredictions=Random_XGB.predict(data_test)

In [None]:
# Map the integer predictions back to the original Severity values.
data_test['Severity']=encoder.inverse_transform(testPredictions)

In [None]:
data_test.head()

In [None]:
# Re-read the untouched test file so the output keeps the original columns.
final_test = pd.read_csv('/Users/huzaifkherani/Desktop/AML/Project/DATA/test.csv')

In [None]:
# Attach the predicted labels; the assignment aligns on the row index.
final_test['Severity']=data_test['Severity']

In [None]:
final_test.head()