# Data Preparation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


### Task 2:
1.	The data is divided over two tables, which is inconvenient for doing analysis. Using the merge function, merge them into one data frame using the Accident_Index field. 

In [None]:
# Both CSV files are read with the same dialect and the same set of
# missing-value markers, so the reader options are declared once.
csv_options = dict(header=0, quotechar='"', sep=",",
                   na_values=['na', '-', '.', ''], low_memory=False)

# Accidents table
dfA = pd.read_csv('Accidents_2015.csv', **csv_options)
# Casualties table
dfC = pd.read_csv('Casualties_2015.csv', **csv_options)

# Join the two tables on their shared key column (pandas' default join type).
merge_key = 'Accident_Index'
df_merged = dfA.merge(dfC, left_on=merge_key, right_on=merge_key)
df_merged

2.	The "Accident_Severity" variable needs to be recoded. You need to replace the code by: 1=Minor, 2=Medium, 3=Severe.

In [None]:
# Recode Accident_Severity from its numeric code to a label:
# 1 = Minor, 2 = Medium, 3 = Severe.
# The whole column is swapped in one replace() call instead of writing
# strings cell-by-cell into a numeric column through .loc, which recent
# pandas versions flag as an incompatible-dtype assignment.
# Codes that are not in the mapping are left unchanged.
severity_rank = ['Minor', 'Medium', 'Severe']
severity_labels = dict(enumerate(severity_rank, start=1))  # {1: 'Minor', ...}
dfA_2 = dfA.copy()
dfA_2['Accident_Severity'] = dfA_2['Accident_Severity'].replace(severity_labels)
dfA_2

3.	For a set of variables, missing values have been replaced by -1. Detect these values and report the names of the columns in each table that contain such values.

In [None]:
# The value -1 is a disguised missing value. Report which columns of each
# table contain it, then blank out only the affected cells.
# NOTE: indexing `.loc` with a row mask alone and assigning None would wipe
# every column of each matching row, not just the cell that holds -1.
for table_name, table in (('Accidents', dfA), ('Casualties', dfC)):
    has_sentinel = table.eq(-1).any()          # one boolean per column
    print("Columns containing -1 in the", table_name, "table:",
          list(has_sentinel[has_sentinel].index))

# Copy of the accidents table with every -1 cell replaced by NaN.
dfA_3 = dfA.mask(dfA.eq(-1))

4.	For all variables, check if there are any clearly extreme values, or values that do not belong in that column. If you find any, remove these records from the dataset.

In [None]:
# Collect the names of the float64 columns of the accidents table.
# Iterating over (name, dtype) pairs avoids integer-position lookups on the
# label-indexed dtypes Series, which pandas has deprecated.
dataTypeSeries = dfA.dtypes
continuous_var = [name for name, dtype in dataTypeSeries.items()
                  if dtype == "float64"]
continuous_var


In [None]:
# Statistical outlier rule: a value counts as an outlier when it lies more
# than three standard deviations away from the column mean.
def stat_outliers(col):
    """Return the set of distinct values in *col* farther than 3*std from the mean.

    *col* is expected to offer ``mean()``, ``std()`` and ``unique()`` (as a
    pandas Series does). NaN entries never satisfy the comparison, so they
    are not reported.
    """
    center = col.mean()
    spread = col.std()
    return {value for value in col.unique() if abs(value - center) > 3 * spread}

# Report the statistical outliers of every continuous column.
# The column *name* is printed next to its outliers; printing dfA[col] here
# would dump the entire Series into the message.
for col in continuous_var:
    print("Outliers in column ", col, " are: ", stat_outliers(dfA[col]))

5.	Compute a variable called is_minor that indicates whether a casualty is a minor or an adult. Being an adult is defined as having an age of 18 or above. The column should only contain the values ‘Yes’ and ‘No’. The field name is "Age_of_Casualty" in the "Casualties_2015.csv" table. 

In [None]:
# Flag each casualty as a minor ("Yes", age below 18) or an adult ("No",
# age 18 or above) in a new 'is_Minor' column. Rows whose age matches
# neither comparison (e.g. NaN) receive no label.
# NOTE(review): ages stored as -1 fall under "< 18" and are labelled "Yes";
# confirm whether -1 is a missing-value code in this column.
dfC_4 = dfC.copy()
casualty_age = dfC_4['Age_of_Casualty']
for label, selector in (("Yes", casualty_age < 18), ("No", casualty_age >= 18)):
    dfC_4.loc[selector, 'is_Minor'] = label
dfC_4

6.	The 'Location_Easting_OSGR' variable has about 27 missing values - solve this by imputing the average of 'Location_Easting_OSGR' over all records. That is, calculate the average of all the available values in 'Location_Easting_OSGR' and fill the missing cells in the column with that average value. 

### Task 3:

For this task, use the dataset pid.csv. This dataset was publicly available, but it has been removed from the repository, so use it only for this assignment and don’t redistribute it. The name of the table and the names of the columns have been changed to anonymize the data. 

In [None]:
# Load pid.csv; the first row is the header, and 'na', '-', '.' and empty
# fields are read as missing values.
pid_na_markers = ['na', '-', '.', '']
df_pid = pd.read_csv('pid.csv', sep=",", quotechar='"', header=0,
                     na_values=pid_na_markers)

1.	Remove the disguised values from the table -- We need to remove the values that equal 0 from columns C, D and F, as these are missing values that have been disguised by the value 0. Remove the value but keep the record (i.e., change the value to null).

In [None]:
# In columns C, D and F a value of 0 is a disguised missing value.
# Replace only those cells with NaN and keep the rest of each record.
# NOTE: indexing `.loc` with a row mask alone and assigning None would blank
# every column of each matching row, destroying the record.
df_pid_1 = df_pid.copy()
disguised_cols = ['C', 'D', 'F']
df_pid_1[disguised_cols] = df_pid_1[disguised_cols].mask(
    df_pid_1[disguised_cols].eq(0))

2.	Remove the Label column and remove one of the columns if their correlation is greater than 0.5. That is, if there are two columns with correlation value > 0.5 then remove one of them and keep the other. The input for this step is the original dataframe not the one that has been produced at step 1.

In [None]:
# Work on a copy restricted to the first eight columns of the original
# table, then compute their pairwise correlation matrix.
# NOTE(review): presumably the first eight columns are the features and the
# Label column comes after them - confirm against pid.csv.
df_pid_cor = df_pid.iloc[:, :8].copy()
corr_mat = df_pid_cor.corr()
corr_mat

In [None]:
# For every pair of columns whose correlation is strictly greater than 0.5,
# mark the earlier column of the pair as redundant and keep the later one.
# Only the upper triangle (j < i) is scanned, so each pair is visited once.
corr_threshold = 0.5
extra_cols = list()
for i in range(1, len(corr_mat.columns)):
    for j in range(i):
        if corr_mat.iloc[j, i] > corr_threshold:
            redundant_name = corr_mat.columns[j]
            # A column can be correlated with several others; record it once.
            if redundant_name not in extra_cols:
                extra_cols.append(redundant_name)

# Keep the non-redundant columns in their original order. (A set difference
# would return them in arbitrary order.)
new_df_cols = [name for name in df_pid_cor.columns if name not in extra_cols]
new_df = df_pid_cor[new_df_cols]
new_df

3.	Use a Python or R-library to find the principal components and project the data on those components. Plot the projected data on the first and the second (principal components) PCs as a scatter plot. If you are working with R, use the (prcomp) R-function. For Python, check https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html 

In [None]:
# The first eight columns of the table are used as the feature set.
features = df_pid.columns[:8]
# Pull the feature values out as an array and standardize them in one step
# (StandardScaler is used with its default settings).
x = StandardScaler().fit_transform(df_pid.loc[:, features].values)

In [None]:
# Fit a PCA that keeps eight components and project the standardized data
# onto them; the projected columns are labelled PC1 ... PC8.
pca1 = PCA(n_components=8)
pcs1 = pca1.fit_transform(x)
pc_names = ['PC{}'.format(k) for k in range(1, 9)]
pcsDF1 = pd.DataFrame(data=pcs1, columns=pc_names)

In [None]:
# Reduced view of the projection: only the first two principal components.
pcsDF_red = pcsDF1.loc[:, ['PC1', 'PC2']]

In [None]:
# Keep the fitted principal axes and the variance explained by each one,
# then draw the explained variances as a bar chart (one bar per component).
eigenvectors1 = pca1.components_
eigenvalues1 = pca1.explained_variance_
bar_positions = np.arange(8)
plt.bar(bar_positions, eigenvalues1, color='green')
plt.xticks(bar_positions, ['PC' + str(k) for k in range(1, 9)])
plt.show()

In [None]:
# Scatter plot of the data projected on the first two principal components,
# with one arrow per component showing its direction and spread.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(pcsDF1["PC1"], pcsDF1["PC2"])
K = 2
mu = pcs1.mean(axis=0)

# The scatter lives in PC-score space (the output of fit_transform), where
# the k-th principal axis is simply the k-th coordinate axis. The rows of
# pca1.components_ are expressed in the original 8-dimensional feature space,
# so slicing them to two entries does not give a direction in this plot.
origin = mu[:K]
for i, color in zip(range(1, K + 1), ["red", "green"]):
    direction = np.zeros(K)
    direction[i - 1] = 1.0
    # explained_variance_ holds variances; the square root is the standard
    # deviation of the scores along this component. Draw two of them.
    delta = 2 * np.sqrt(eigenvalues1[i - 1]) * direction
    tip = origin + delta
    pc = 'PC' + str(i)
    # ax.arrow takes the start point and the displacement (dx, dy),
    # not the end point.
    ax.arrow(origin[0], origin[1], delta[0], delta[1],
             head_width=0.2, head_length=0.3, fc=color, ec=color)
    ax.annotate(pc, (tip[0] + 0.1, tip[1] + 0.1), fontsize=14)

ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_aspect('equal')
plt.show()