# Machine Learning Pipeline - Data Analysis

Implementation of each of the steps in the Machine Learning Pipeline. 

1. **Data Analysis**
2. Feature Engineering
3. Feature Selection
4. Model Training
5. Obtaining Predictions / Scoring

Plane Crash Dataset available on [Kaggle.com](https://www.kaggle.com/datasets/kamilkarczmarczyk/plane-crash-dataset-03042023). See below for more details.

===================================================================================================

Data description:
- Date: Date of accident, in the format - January 01, 2001
- Time: Local time, in 24 hr. format unless otherwise specified
- Airline/Op: Airline or operator of the aircraft
- Flight #: Flight number assigned by the aircraft operator
- Route: Complete or partial route flown prior to the accident
- AC Type: Aircraft type
- Reg: ICAO registration of the aircraft
- cn / ln: Construction or serial number / Line or fuselage number
- Aboard: Total aboard (passengers / crew)
- Fatalities: Total fatalities aboard (passengers / crew)
- Ground: Total killed on the ground
- Summary: Brief description of the accident and cause if known

# Data Analysis

In [None]:
# ! python -m spacy download en_core_web_sm

In [None]:
# to handle datasets
import pandas as pd
import numpy as np
import datetime as dt

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go

# for the yeo-johnson transformation
import scipy.stats as stats

# to display all the columns of the dataframe in the notebook
# and render floats with four decimal places
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", "{:.4f}".format)

import spacy
import gensim
from typing import Union
import logging
import itertools

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

import spacy.cli
import gensim.downloader as api
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

%matplotlib inline

In [None]:
# Load the large English pipeline. Download it only when it is not installed
# yet, so re-running this cell does not hit the network every time.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")
# nlp = spacy.load("en_core_web_sm")

In [None]:
logging.getLogger().setLevel(logging.INFO)

In [None]:
# load dataset
raw_data = pd.read_csv("data/raw_data.csv", sep=";")

# rows and columns of the data
print(raw_data.shape)

# visualise the dataset
raw_data.head()

In [None]:
raw_data.info()

In [None]:
class DataSchema:
    """Column names used throughout the notebook, collected in one place.

    Referencing columns through these constants avoids repeating the raw
    header strings (several of which end with a colon) in every cell.
    """

    # --- columns looked up in the loaded dataset ---
    DATE = "Date"
    TIME = "Time:"
    LOCATION = "Location:"
    # NOTE(review): the run of spaces presumably mirrors the CSV header exactly;
    # do not normalise it without checking the file.
    AC_TYPE = "AC        Type:"
    OPERATOR = "Operator:"
    ROUTE = "Route:"
    CN_LN = "cn / ln:"
    FLIGHT_N = "Flight #:"
    IS_MILITARY = "Is_military"
    MILITARY_COUNTRY = "Military country"
    ABOARD_ALL = "Aboard_all"
    ABOARD_PASSENGERS = "Aboard_passengers"
    FATALITIES_ALL = "Fatalities_all"
    FATALITIES_PASSENGERS = "Fatalities_passengers"
    GROUND = "Ground:"
    REGISTRATION = "Registration:"
    SUMMARY = "Summary:"
    # --- columns created by this notebook ---
    YEAR = "Year"
    MONTH = "Month"
    HOUR = "Hour"
    ROUTES_N = "Routes_Number"
    VECTOR = "Vector"
    FATALITIES = "Fatalities"
    SURVIVED = "Survived"
    SURVIVED_PCT = "Survived_pct"

In [None]:
redundant_columns = []
data = raw_data.copy()

In [None]:
data = data.replace("?", np.nan)
data.head()

In [None]:
def get_pct_missing_and_unique(col: str, alias: str = None) -> None:
    """Print missing-value and uniqueness statistics for one column of ``data``.

    Args:
        col: Name of the column in the global ``data`` frame.
        alias: Human-readable label used in the printed messages; falls back
            to ``col`` when empty or not given.

    Prints the counts of missing / non-missing entries, then the share of
    missing values (red) and the share of distinct values (green), both
    relative to the number of rows in ``data``.
    """
    label = alias or col
    column = data[col]
    n_rows = data.shape[0]
    missing_mask = column.isna()

    print(missing_mask.value_counts())
    print(f"\033[31m{label} field has {missing_mask.sum()/n_rows:.2%} missing values\033[0m")
    print(f"\033[32m{label} field has {column.nunique()/n_rows:.2%} unique values\033[0m")

## Missing values

In [None]:
# make a list of the variables that contain missing values
vars_with_na = [var for var in data.columns if data[var].isnull().sum() > 0]

# determine the fraction of missing values per variable (expressed as decimals)
# and display the result ordered by share of missing data, highest first

data[vars_with_na].isnull().mean().sort_values(ascending=False)

In [None]:
# plot

data[vars_with_na].isnull().mean().sort_values(
    ascending=False).plot.bar(figsize=(10, 4))
plt.ylabel('Percentage of missing data')
plt.axhline(y=0.80, color='r', linestyle='-')
plt.axhline(y=0.15, color='g', linestyle='-')

plt.show()

## Date: extract year and month

In [None]:
# check missing values
get_pct_missing_and_unique(DataSchema.DATE)

In [None]:
# extract year and month from date into separate columns
# (the column is parsed once and the result reused for both derived fields)
parsed_dates = pd.to_datetime(data[DataSchema.DATE])
data[DataSchema.YEAR] = parsed_dates.dt.year
data[DataSchema.MONTH] = parsed_dates.dt.month
redundant_columns.append(DataSchema.DATE)

## Time: extract hour

In [None]:
# check missing values
get_pct_missing_and_unique(DataSchema.TIME, "Time")

In [None]:
# extract hour from time; rows without a time get the sentinel hour 25
# TODO change hour 25 for eg. with most frequent value for same year, month, operator
# Parsing the whole column at once treats every kind of missing value alike
# (not only the ``np.nan`` singleton); malformed strings still raise.
parsed_times = pd.to_datetime(data[DataSchema.TIME], format="%H:%M:%S")
data[DataSchema.HOUR] = parsed_times.dt.hour.fillna(25).astype(int)
redundant_columns.append(DataSchema.TIME)

## Location: check number of unique values, extract state

In [None]:
get_pct_missing_and_unique(DataSchema.LOCATION, "Location")

In [None]:
data[data[DataSchema.LOCATION].isna()]

In [None]:
def get_locations(text: str) -> Union[None, float, str]:
    """Return the last geopolitical entity (GPE) spaCy recognises in ``text``.

    Args:
        text: Free-text location description, or a missing value.

    Returns:
        The text of the last entity labelled ``GPE``. The input is returned
        unchanged when it is not a string (e.g. NaN / None) or when no GPE
        entity is found.
    """
    # Missing values are not guaranteed to be the ``np.nan`` singleton, so
    # check the type instead of relying on an identity comparison.
    if not isinstance(text, str):
        return text
    locations = [ent.text for ent in nlp(text).ents if ent.label_ == "GPE"]
    return locations[-1] if locations else text

data[DataSchema.LOCATION] = data[DataSchema.LOCATION].apply(lambda x: get_locations(x))

In [None]:
print(f"\033[31mLocation field after parsing for country has {data[DataSchema.LOCATION].nunique()/data.shape[0]:.2%} unique values\033[0m")

## AC Type: check for missing values

In [None]:
get_pct_missing_and_unique(DataSchema.AC_TYPE, "Aircraft Type")

In [None]:
data[data[DataSchema.AC_TYPE].isna()]

In [None]:
# TODO group Aircraft types

## Operator

In [None]:
get_pct_missing_and_unique(DataSchema.OPERATOR, "Operator")

In [None]:
data[data[DataSchema.OPERATOR].isna()]

In [None]:
data[DataSchema.OPERATOR]

In [None]:
def get_operatos_list(text: str) -> Union[None, float, str]:
    """Return the last organisation (ORG) spaCy recognises in an operator name.

    Args:
        text: Raw operator string, or a missing value.

    Returns:
        The text of the last entity labelled ``ORG``. The input is returned
        unchanged when it is not a string (e.g. NaN / None), when it contains
        the word "test" (case-insensitive), or when no ORG entity is found.
    """
    # Missing values are not guaranteed to be the ``np.nan`` singleton, so
    # check the type instead of relying on an identity comparison.
    if not isinstance(text, str):
        return text
    # Entries mentioning "test" are kept verbatim and never sent through NER.
    if "test" in text.lower():
        return text
    operators = [ent.text for ent in nlp(text).ents if ent.label_ == "ORG"]
    return operators[-1] if operators else text

# test
# x = [np.nan, "Military - U.S. Army Air Corps", "Test", "American Flyers Airline", "KLM Royal Dutch Airlines", "Air Canada"]
# for i in x:
#     y = get_operatos_list(i)
#     print(y)
#     print("==========================")

In [None]:
tmp = data.copy()
tmp["tmp"] = tmp[DataSchema.OPERATOR].apply(lambda x: get_operatos_list(x))

In [None]:
tmp["tmp"].sample(5)

In [None]:
print(f"\033[35mOperator field after organisation extraction has {tmp['tmp'].nunique()/data.shape[0]:.2%} unique values\033[0m")

In [None]:
# TODO consider to use tf-idf
redundant_columns.append(DataSchema.OPERATOR)

## Route

In [None]:
get_pct_missing_and_unique(DataSchema.ROUTE, "Route")

In [None]:
data[DataSchema.ROUTE].sample(5)

In [None]:
def get_multiple_locations(text: str) -> Union[None, float, list]:
    """Split a route description into the list of its individual stops.

    The route is first split on ``" - "`` (leg separator) and every leg is
    then split on ``", "``, so ``"Paris, France - London"`` becomes
    ``["Paris", "France", "London"]``.

    Args:
        text: Raw route string, or a missing value.

    Returns:
        A flat list of stop names, or the input unchanged when it is not a
        string (e.g. NaN / None).
    """
    # Missing values are not guaranteed to be the ``np.nan`` singleton, so
    # check the type instead of relying on an identity comparison.
    if not isinstance(text, str):
        return text
    # Stops are kept verbatim: no entity normalisation is applied to them.
    # (A previous per-leg NER pass only rebound its loop variable and never
    # influenced the result, so it was dropped to avoid the wasted spaCy calls.)
    legs = text.split(" - ")
    return list(itertools.chain.from_iterable(leg.split(", ") for leg in legs))


# test
# routes = [
#     "Sukhumi - Kutaisi", "Test flight", np.nan, "Bombing run", 
#     "Boston - NY - Washington DC - Jacksonville - Miami", "Lima - Pucallpa - Iquitos"]
# for _ in routes:
#     x = get_multiple_locations(_)
#     print(x)

In [None]:
data[DataSchema.ROUTE] = data[DataSchema.ROUTE].apply(lambda x: get_multiple_locations(x))

In [None]:
data[DataSchema.ROUTE].sample(5)

In [None]:
data[DataSchema.ROUTES_N] = data[DataSchema.ROUTE].apply(lambda x: 0 if x is np.nan else len(x))

In [None]:
data[DataSchema.ROUTES_N].value_counts()

In [None]:
redundant_columns.append(DataSchema.ROUTE)

## Construction or serial number / Line or fuselage number

In [None]:
get_pct_missing_and_unique(DataSchema.CN_LN, "Construction/fuselage number")

In [None]:
data[data[DataSchema.CN_LN].notna()][DataSchema.CN_LN]

In [None]:
redundant_columns.append(DataSchema.CN_LN)

## Flight number

In [None]:
get_pct_missing_and_unique(DataSchema.FLIGHT_N, "Flight number")

In [None]:
data[data[DataSchema.FLIGHT_N].notna()][DataSchema.FLIGHT_N].sample(5)

In [None]:
redundant_columns.append(DataSchema.FLIGHT_N)

## Is Military

In [None]:
get_pct_missing_and_unique(DataSchema.IS_MILITARY, "Is Military")

In [None]:
# TODO consider to drop military samples
redundant_columns.append(DataSchema.IS_MILITARY)

## Military country

In [None]:
get_pct_missing_and_unique(DataSchema.MILITARY_COUNTRY)

In [None]:
redundant_columns.append(DataSchema.MILITARY_COUNTRY)

## Aboard All

In [None]:
get_pct_missing_and_unique(DataSchema.ABOARD_ALL, "Aboard All")

In [None]:
data.dropna(subset=[DataSchema.ABOARD_ALL], inplace=True)

## Aboard Passengers

In [None]:
# Label fixed: the column holds passengers *aboard*, not "abroad".
get_pct_missing_and_unique(DataSchema.ABOARD_PASSENGERS, "Aboard Passengers")

## Fatalities All

In [None]:
get_pct_missing_and_unique(DataSchema.FATALITIES_ALL, "Fatalities All")

In [None]:
data.dropna(subset=[DataSchema.FATALITIES_ALL], inplace=True)

## Fatalities Passengers

In [None]:
get_pct_missing_and_unique(DataSchema.FATALITIES_PASSENGERS, "Fatalities Passengers")

## Ground

In [None]:
get_pct_missing_and_unique(DataSchema.GROUND, "Ground")

In [None]:
# Assign the filled column back explicitly: ``data[col].fillna(..., inplace=True)``
# is chained assignment and may act on a temporary copy, leaving ``data``
# unchanged (this is the behaviour under pandas Copy-on-Write).
data[DataSchema.GROUND] = data[DataSchema.GROUND].fillna(0)

In [None]:
data[DataSchema.GROUND].value_counts()
# data[data[DataSchema.GROUND]==2750]

In [None]:
# TODO replace np.nan with mean which is 0 and change type to int

In [None]:
fatalities = [
    DataSchema.ABOARD_ALL, DataSchema.ABOARD_PASSENGERS, 
    DataSchema.FATALITIES_ALL, DataSchema.FATALITIES_PASSENGERS, DataSchema.GROUND]
data[fatalities].sample(10)

In [None]:
def _as_pct_of_aboard(row, required_columns, count_fatalities):
    """Return ``count_fatalities(row)`` as a percentage of everyone aboard.

    Returns the sentinel -1 when any of ``required_columns`` is missing in
    ``row`` or when nobody was aboard (which would divide by zero).
    """
    if any(pd.isna(row[column]) for column in required_columns):
        return -1
    aboard = int(row[DataSchema.ABOARD_ALL])
    if aboard == 0:
        return -1
    return count_fatalities(row) / aboard * 100


def get_pct_fatalities(row):
    """Fatalities among all people aboard, in percent of all aboard (-1 if unknown)."""
    return _as_pct_of_aboard(
        row,
        (DataSchema.ABOARD_ALL, DataSchema.FATALITIES_ALL),
        lambda r: int(r[DataSchema.FATALITIES_ALL]),
    )


def get_pct_fatalities_passenger(row):
    """Passenger fatalities, in percent of all aboard (-1 if unknown)."""
    return _as_pct_of_aboard(
        row,
        (DataSchema.ABOARD_ALL, DataSchema.FATALITIES_PASSENGERS),
        lambda r: int(r[DataSchema.FATALITIES_PASSENGERS]),
    )


def get_pct_fatalities_crew(row):
    """Non-passenger (crew) fatalities, in percent of all aboard (-1 if unknown)."""
    return _as_pct_of_aboard(
        row,
        (
            DataSchema.ABOARD_ALL,
            DataSchema.FATALITIES_ALL,
            DataSchema.FATALITIES_PASSENGERS,
        ),
        lambda r: int(r[DataSchema.FATALITIES_ALL]) - int(r[DataSchema.FATALITIES_PASSENGERS]),
    )


# Backward-compatible alias: this misspelled name used to be defined twice and
# ended up bound to the crew formula.
get_pct_fatalities_passwnger = get_pct_fatalities_crew

# Each series now uses its own formula (previously all three were computed
# with ``get_pct_fatalities``). The overall series is no longer called ``all``
# so the builtin of that name stays usable in later cells.
fatalities_pct_all = data.apply(get_pct_fatalities, axis=1)
passengers = data.apply(get_pct_fatalities_passenger, axis=1)
crew = data.apply(get_pct_fatalities_crew, axis=1)

sns.histplot(fatalities_pct_all)

In [None]:
redundant_columns.append(DataSchema.GROUND)

## Registration

In [None]:
get_pct_missing_and_unique(DataSchema.REGISTRATION, "Registration")

In [None]:
data[DataSchema.REGISTRATION].sample(5)

In [None]:
redundant_columns.append(DataSchema.REGISTRATION)

## Summary

In [None]:
get_pct_missing_and_unique(DataSchema.SUMMARY, "Summary")

In [None]:
data.dropna(subset=[DataSchema.SUMMARY], inplace=True)
data[DataSchema.SUMMARY]

In [None]:
info = api.info()
for model_name, model_data in sorted(info["models"].items()):
    print("%s (%d records) %s..." % (model_name, model_data.get("num_records", -1), model_data["description"][:40]))

In [None]:
w2v = api.load("word2vec-google-news-300")

### Custom data Word2Vec model

In [None]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError, so this cell
# cannot execute; presumably it is a deliberate stop marker that halts
# "Run All" before the cells below — confirm.
break

In [None]:
def get_preprocessed(text):
    """Run ``text`` through the spaCy pipeline and return its content lemmas.

    Punctuation and stop-word tokens are skipped; the lemma of every other
    token is returned, in document order, as a list of strings.
    """
    return [
        token.lemma_
        for token in nlp(text)
        if not (token.is_punct or token.is_stop)
    ]

corpus = data[DataSchema.SUMMARY].apply(lambda text: get_preprocessed(text)).tolist()
model_default = Word2Vec(sentences=corpus) # 100-dimentional vector by default
model_vs100 = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)
model_vs50 = Word2Vec(sentences=corpus, vector_size=50, window=5, min_count=1, workers=4)
# model.save("word2vec.model")
# model = Word2Vec.load("word2vec.model")


In [None]:
# plot embeddings in 2-dimensional space

def reduce_dimensions(model):
    """Project the word vectors of ``model`` onto two t-SNE components.

    Args:
        model: Trained model exposing ``wv.vectors`` and ``wv.index_to_key``.

    Returns:
        ``(x_vals, y_vals, labels)``: two lists with the first and second
        t-SNE coordinate of every vector, and an array with the vocabulary
        keys used as point labels.
    """
    labels = np.asarray(model.wv.index_to_key)
    word_vectors = np.asarray(model.wv.vectors)

    # fixed seed so the layout is reproducible between runs
    projector = TSNE(n_components=2, random_state=42)
    embedded = projector.fit_transform(word_vectors)

    x_vals = list(embedded[:, 0])
    y_vals = list(embedded[:, 1])
    return x_vals, y_vals, labels


def plot_embeddings(x_vals, y_vals, labels):
    """Show an interactive scatter plot of 2-D embeddings and return the figure.

    Args:
        x_vals: x coordinate of every point.
        y_vals: y coordinate of every point.
        labels: Text attached to every point.

    Returns:
        The plotly figure that was displayed.
    """
    points = go.Scatter(x=x_vals, y=y_vals, mode="markers", text=labels)
    fig = (
        go.Figure()
        .add_trace(points)
        .update_layout(title="Word2Vec - Visualisation embedding with TSNE")
    )
    fig.show()
    return fig


# No variable named `model` exists in this notebook; visualise the explicitly
# configured 100-dimensional model (`model_vs100`) instead.
x_vals, y_vals, labels = reduce_dimensions(model_vs100)
plot = plot_embeddings(x_vals, y_vals, labels)

In [None]:
# preprocessing (remove stop words, lemmatize)

def text2vec(text):
    """Embed ``text`` as the mean pre-trained word vector of its content lemmas.

    The text is tokenised with spaCy; punctuation and stop words are dropped
    and the remaining lemmas are averaged with ``w2v.get_mean_vector``.

    Args:
        text: Text to embed.

    Returns:
        A 1-D vector of length ``w2v.vector_size``. When no lemma survives the
        filtering, a zero vector of that length is returned.
    """
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_punct or token.is_stop)
    ]
    if not lemmas:
        # get_mean_vector raises ValueError on an empty key list; a zero
        # vector keeps every row the same shape for later stacking.
        return np.zeros(w2v.vector_size, dtype=w2v.vectors.dtype)
    return w2v.get_mean_vector(lemmas, pre_normalize=True)

### Sense check

In [None]:
# print(w2v.similarity("plane","passanger"))
# print(w2v.most_similar(positive=["plane"], topn=3))

In [None]:
data[DataSchema.VECTOR] = data[DataSchema.SUMMARY].apply(lambda text: text2vec(text))

In [None]:
data[DataSchema.VECTOR]

# Target

In [None]:
for col in [DataSchema.ABOARD_ALL, DataSchema.FATALITIES_ALL, DataSchema.GROUND]:
    data[col] = data[col].astype(int)

In [None]:
data[DataSchema.FATALITIES] = data[DataSchema.FATALITIES_ALL]+data[DataSchema.GROUND]
data[DataSchema.SURVIVED] = data[DataSchema.ABOARD_ALL]-data[DataSchema.FATALITIES_ALL]

In [None]:
data[DataSchema.FATALITIES].value_counts()

In [None]:
data[DataSchema.SURVIVED].value_counts()

In [None]:
data[DataSchema.SURVIVED] = np.where(data[DataSchema.SURVIVED] > 0, 1, data[DataSchema.SURVIVED])

In [None]:
data[DataSchema.SURVIVED].value_counts()

In [None]:
# SURVIVED has already been collapsed to a 0/1 flag at this point, so the
# percentage is derived from the head counts instead. Rows with nobody aboard
# yield NaN rather than a division by zero.
survivors = data[DataSchema.ABOARD_ALL] - data[DataSchema.FATALITIES_ALL]
data[DataSchema.SURVIVED_PCT] = survivors / data[DataSchema.ABOARD_ALL].replace(0, np.nan) * 100
data[DataSchema.SURVIVED_PCT].value_counts()

### Summary vectors as features to predict survival percentage

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    data[DataSchema.VECTOR].values,
    data[DataSchema.SURVIVED],
    test_size=0.2,
    random_state=42,
    stratify=data[DataSchema.SURVIVED]
)

In [None]:
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

In [None]:
print(f"{X_train_2d.shape=}")
print(f"{X_test_2d.shape=}")

In [None]:
# Seed the classifier so the reported metrics are reproducible across runs,
# in line with the seeded train/test split.
clf = GradientBoostingClassifier(random_state=42)
clf.fit(X_train_2d, y_train)
y_pred = clf.predict(X_test_2d)
print(classification_report(y_test, y_pred))