# BDSS Datathon 2023

## Team name: work in progress

### Jupyter Notebook

Python can be run on [Jupyter Notebook](http://jupyter.org/) too.

Jupyter Notebook is a computing environment supporting various programming languages (Python, R, Lua, etc.) through the concept of kernels.  
It allows you to enrich your code with complex comments formatted in Markdown and $\LaTeX$, as well as to place the results of your computation right below your code. Besides, it has all the features provided by the ipython interpreter, like tab auto-completion. 

Jupyter Notebook runs as a web server. To run this lab sheet, navigate to the folder containing the file `labsheet1.ipynb` and run Jupyter (for example with the command `jupyter notebook`).

### Imports:

In [1]:
import numpy as np
from scipy import stats
from pprint import pprint
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd

# Jupyter magic: draw matplotlib figures inline, below the cell that made them.
%matplotlib inline
# notebook
# NOTE(review): the bare word above presumably records the alternative
# `%matplotlib notebook` backend -- confirm with the author.
import matplotlib.pylab as pylab
# Notebook-wide plot defaults: a 16 x 12 figure size and a font size of 24.
pylab.rcParams['figure.figsize'] = (16.0, 12.0)
pylab.rcParams['font.size'] = 24

In [2]:
import math
import xgboost as xgb
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import csv

### Feature Selection and Visualisation:

In [3]:
# Load the casualty and vehicle training tables as DataFrames.
casualty_data = pd.read_csv("casualty_train.csv", delimiter=",")
vehicle_data = pd.read_csv("vehicle_train.csv", delimiter=",")

# Join the two tables on "accident_reference", the label both CSV files
# share. how='outer' keeps rows from either side even when the other side
# has no match (comparable to a SQL full outer join).
all_data = casualty_data.merge(vehicle_data, on='accident_reference', how='outer')

# Handy checks on the merged frame's structure (left disabled):
# print(all_data.columns)
# print(len(all_data.columns))
# print(list(all_data.columns))

# print("Features")
# print(casualty_data.columns)
# print("=====")

# Column headers of the merged frame as a plain list.
features = all_data.columns.tolist()

# Count the occurrences of each field value among rows with a given severity code
def fieldByAccidentCode(code, field, data=None):
    """Tally the values of ``field`` over rows whose ``casualty_severity`` is ``code``.

    Args:
        code: Severity value to select. The original author's note lists the
            set {1, 2, 3}.
        field: Header of the column whose values are counted.
        data: DataFrame to read from. Defaults to the module-level
            ``all_data`` so existing two-argument calls keep working.

    Returns:
        dict mapping each value seen in ``field`` to the number of selected
        rows carrying it, in order of first appearance.
    """
    from collections import Counter

    frame = all_data if data is None else data

    # A boolean mask selects the matching rows in one step; this replaces a
    # per-row Python loop that indexed the frame by position.
    selected = frame.loc[frame["casualty_severity"] == code, field]

    # Counter records the first sighting of a value as 1. The previous
    # hand-written tally initialised new keys to 0, so every count came out
    # one too low.
    return dict(Counter(selected.tolist()))

# Report summary statistics for one column of the merged data
def getBasicStats(field):
    """Print the mean, standard deviation, variance, mode and median of ``all_data[field]``.

    Each statistic is computed and printed in turn, so an exception raised
    by one of them stops the report at that point.
    """
    column = all_data.loc[:, field]

    # (label, bound method) pairs, listed in the order they are printed.
    report = (
        ("Mean:", column.mean),
        ("Standard Deviation:", column.std),
        ("Variance:", column.var),
        ("Modal:", column.mode),
        ("Median:", column.median),
    )
    for label, compute in report:
        print(label, compute())
    
# Draw a bar chart from a {category: count} mapping
def constructBarChar(data):
    """Plot the keys of ``data`` against its values as bars on a new figure.

    The figure is left open as matplotlib's current figure; nothing is returned.
    """
    _, axes = plt.subplots()
    axes.bar(data.keys(), data.values())
    # Axis labels are intentionally left unset; examples kept for reference:
    #     axes.set_xlabel("Age")
    #     axes.set_ylabel("Number of accidents")


# Blank lines to separate this cell's output from whatever precedes it.
print("\n\n")
# Running counter, starting at zero.
count = 0
# Column names excluded from the per-feature visualisations.
ignore = ["lsoa_of_casualty", "generic_make_model", "lsoa_of_driver"]

# The bare string below is a no-op expression statement used as a heading.
'''Generating Visualisations per class:'''
def printAndSaveVisuals():
    """Print statistics and save two bar charts for every usable column of ``all_data``.

    For each column except the first one and those named in the module-level
    ``ignore`` list:

    * its header is printed in upper case, followed by the output of
      ``getBasicStats`` when the column supports those statistics;
    * ``<feature>_1.png`` is saved with the value counts for severity
      codes 1 and 2 combined;
    * ``<feature>_2.png`` is saved with the value counts for severity code 3;
    * both figures are then shown.
    """
    from collections import Counter

    # enumerate() supplies the column position. The earlier version
    # incremented a module-level ``count`` without declaring it global,
    # which raised UnboundLocalError on the very first iteration.
    for position, feature in enumerate(all_data.columns):
        print(feature.upper())
        if position == 0 or feature in ignore:
            continue

        try:
            getBasicStats(feature)
        except (TypeError, ValueError) as exc:
            # Columns without these statistics (e.g. text) are reported and
            # skipped instead of being silently swallowed by a bare except.
            print("No basic statistics for", feature, "-", exc)

        print("\n\n")

        # Add the per-value counts of severity codes 1 and 2 together.
        combined = Counter(fieldByAccidentCode(1, feature))
        combined.update(fieldByAccidentCode(2, feature))
        constructBarChar(dict(combined))
        plt.savefig(feature + "_1.png")

        constructBarChar(fieldByAccidentCode(3, feature))
        plt.savefig(feature + "_2.png")

        plt.show()
        
# printAndSaveVisuals()






'Generating Visualisations per class:'

### Training and modelling:

In [4]:
# Load the labelled training set and the test set as DataFrames.
casualty_data = pd.read_csv("casualty_train.csv", delimiter=",")
casualty_test = pd.read_csv("casualty_test.csv", delimiter=",")

# Target label for the classifiers.
y = casualty_data['casualty_severity']

# Columns left out of the feature matrix. The author's stated reason: they
# don't show a strong gaussian relationship.
ignore = [
    "accident_reference",
    "lsoa_of_casualty",
    "bus_or_coach_passenger",
    "pedestrian_location",
    "pedestrian_movement",
    "pedestrian_road_maintenance_worker",
]

# Training features: everything except the ignored columns and the target.
casualty_data = casualty_data.drop(columns=ignore + ["casualty_severity"])
# NOTE(review): assumes casualty_test.csv has the same remaining columns as
# the training features and no "casualty_severity" column -- confirm.
casualty_test = casualty_test.drop(columns=ignore)

# Split first, then fit the scaler on the training rows only. Fitting it on
# all rows (as before) lets the hold-out rows influence the mean/variance
# the models train on, which makes the hold-out scores optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    casualty_data, y, test_size=0.25, random_state=0
)

standardizer = StandardScaler()
X_train = standardizer.fit_transform(X_train)
X_test = standardizer.transform(X_test)

# Full training matrix under the same scaling, kept under its original name.
X = standardizer.transform(casualty_data)

# Candidate classifiers, keyed by display name (insertion order is kept).
models = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machines': LinearSVC(),
    'Decision Trees': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': GaussianNB(),
    'K-Nearest Neighbor': KNeighborsClassifier(),
    'XGBoost': xgb.XGBClassifier(objective="binary:logistic", random_state=42),
}

# One {model name: score} table per metric.
accuracy, precision, recall, roc, f1 = {}, {}, {}, {}, {}

scorers = (
    (accuracy, accuracy_score),
    (precision, precision_score),
    (recall, recall_score),
    (roc, roc_auc_score),
    (f1, f1_score),
)

# Fit every model on the training split and score it on the hold-out split.
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    for table, scorer in scorers:
        try:
            # scikit-learn metrics take (y_true, y_pred). The arguments used
            # to be passed the other way round, which swaps precision with
            # recall and computes ROC AUC against the wrong reference.
            # NOTE(review): ROC AUC is computed from hard class predictions
            # here, as before, rather than from scores or probabilities.
            table[name] = scorer(y_test, predictions)
        except ValueError as exc:
            # A metric that cannot be computed is recorded as NaN and
            # reported. The previous fallback filled in random.uniform(0.5, 1)
            # (with `random` never imported), i.e. fabricated scores.
            print("Could not compute", scorer.__name__, "for", name, "-", exc)
            table[name] = float("nan")


# Tabulate the hold-out metrics: one row per model, one column per metric.
casualty_model = pd.DataFrame(
    index=list(models),
    columns=['Accuracy', 'Precision', 'Recall', 'Roc', 'F1', "Summary"],
)

# Summary score: the mean of ROC AUC and F1. It is built by iterating over
# `models` so it follows the same order as the rows; the earlier version
# iterated over a set, whose order is arbitrary, so summary values could be
# attached to the wrong model.
summary = {name: 0.5 * (roc.get(name, 0) + f1.get(name, 0)) for name in models}

score_tables = (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('Roc', roc),
    ('F1', f1),
    ('Summary', summary),
)
for column, scores in score_tables:
    # A Series is aligned on the model name when assigned, so each score
    # lands on its own row regardless of dictionary ordering.
    casualty_model[column] = pd.Series(scores, dtype=float)

print(casualty_model)

casualty_model.to_csv("model.csv")

# Scale the test features with the scaler that was already fitted for
# training. Fitting a fresh StandardScaler on the test set (as before) puts
# the test features on a different scale from the one the models learned.
X = standardizer.transform(casualty_test)

# Only the last model added to `models` was ever submitted, so predict with
# that one alone instead of running every classifier over the test set.
submission_model = 'XGBoost'
submission = models[submission_model].predict(X)

# Save the predicted classes to the submission CSV.
pd.DataFrame({"casualty_severity": np.asarray(submission)}).to_csv("workinprogress.csv", index=False)

<class 'pandas.core.indexes.base.Index'>
0         0
1         1
2         1
3         1
4         0
         ..
124378    1
124379    1
124380    1
124381    1
124382    0
Name: casualty_severity, Length: 124383, dtype: int64
===
   vehicle_reference_x  casualty_reference  casualty_class  sex_of_casualty  \
0                    1                   1               1                2   
1                    1                   2               2                2   
2                    1                   1               1                1   
3                    1                   1               1                1   
4                    2                   2               2                2   

   age_of_casualty  age_band_of_casualty  casualty_severity  car_passenger  \
0               19                     4                  0              0   
1               17                     4                  1              1   
2               52                     8                  1 

TypeError: Feature names are only supported if all input features have string names, but your input has ['bool', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.