# Assignment 1  
Simon Karumbi  
s3455453  
### Question 1
Importing and preprocessing of Credit Approval Data

In [1]:
import pandas as pd
import numpy as np
import os, ssl

if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
    getattr(ssl, '_create_unverified_context', None)): 
    ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
attributeNames = [
    'A1',
    'A2',
    'A3',
    'A4',
    'A5',
    'A6',
    'A7',
    'A8',
    'A9',
    'A10',
    'A11',
    'A12',
    'A13',
    'A14',
    'A15',
    'A16'
]

In [3]:
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data', names = attributeNames)

In [4]:
data.shape

(690, 16)

In [5]:
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [6]:
data.dtypes

A1      object
A2      object
A3     float64
A4      object
A5      object
A6      object
A7      object
A8     float64
A9      object
A10     object
A11      int64
A12     object
A13     object
A14     object
A15      int64
A16     object
dtype: object

In [7]:
data.describe(include = np.number).round(3)

Unnamed: 0,A3,A8,A11,A15
count,690.0,690.0,690.0,690.0
mean,4.759,2.223,2.4,1017.386
std,4.978,3.347,4.863,5210.103
min,0.0,0.0,0.0,0.0
25%,1.0,0.165,0.0,0.0
50%,2.75,1.0,0.0,5.0
75%,7.208,2.625,3.0,395.5
max,28.0,28.5,67.0,100000.0


In [8]:
data.describe(include = np.object)

Unnamed: 0,A1,A2,A4,A5,A6,A7,A9,A10,A12,A13,A14,A16
count,690,690,690,690,690,690,690,690,690,690,690,690
unique,3,350,4,4,15,10,2,2,2,3,171,2
top,b,?,u,g,c,v,t,f,f,g,0,-
freq,468,12,519,519,137,399,361,395,374,625,132,383


### Missing Values
Replacing unknown numerical values with NaN, so that they can be converted to the correct data type.

In [9]:
data['A2'] = data['A2'].replace({'?': np.nan}).astype('float64')
data['A14'] = data['A14'].replace({'?': np.nan}).astype('float64')

In [10]:
for col in (data.select_dtypes(include = 'object')):
    print(data[col].unique())

['b' 'a' '?']
['u' 'y' '?' 'l']
['g' 'p' '?' 'gg']
['w' 'q' 'm' 'r' 'cc' 'k' 'c' 'd' 'x' 'i' 'e' 'aa' 'ff' 'j' '?']
['v' 'h' 'bb' 'ff' 'j' 'z' '?' 'o' 'dd' 'n']
['t' 'f']
['t' 'f']
['f' 't']
['g' 's' 'p']
['+' '-']


### Missing Values (Categorical)  
Column 'A4' doesn't have any 't' values as described in the names document, however, all missing values are to be encoded as a missing value, and therefore the mode of the feature. 
Encoding all numerical 'NaN' values as the median of the feature. 
Encoding all categorical '?' values as the mode of the feature.

In [11]:
for col in (data.select_dtypes(include = 'object')):       
    data[col] = data[col].replace({'?': data[col].mode()[0]})

for col in (data.select_dtypes(exclude = 'object')):
    data[col] = data[col].fillna(data[col].median())

### Integer Encoding
Now discretising column A2 by equal-frequency binning named "low", "medium", and "high", then applying integer encoding.

We can also go ahead and perform a manual replace encoding for the A16 target feature, where we assume that '+' indicates a credit approval (1), and '-' indicates no approval (0).

In [12]:
data['A2'] = pd.qcut(data['A2'], q = 3, labels =('low','medium','high'))

In [13]:
data['A2'].value_counts()

medium    231
low       230
high      229
Name: A2, dtype: int64

In [14]:
level_mapping = {'low': 0, 'medium': 1, 'high': 2}
data['A2'] = data['A2'].replace(level_mapping)
data['A16'] = data['A16'].replace({'+': 1, '-': 0})

In [15]:
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,1,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,2,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,1,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,1,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,0,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


### One Hot Encoding
Now that the A2 has been integer encoded, we can go ahead and perform one hot encoding on the rest of the categorical features. We can separate the target feature A16 from the rest of the dataset, however, I believe for the purpose of haivng a perfectly clean dataset, scaling of the target feature is necessary. 

In [16]:
# Optional removal of the target feature prior to scaling, to be merged later
# target = data['A16'].values
# data = data.drop(columns = 'A16')

In [17]:
categorical = data.select_dtypes(include ='object').columns.tolist()

for col in categorical:
    if(len(data[col].unique() ==  2)):
        data[col] = pd.get_dummies(data[col], drop_first = True)

data = pd.get_dummies(data)     

In [18]:
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,1,1,0.0,1,0,0,0,1.25,1,1,1,0,0,202.0,0,1
1,0,2,4.46,1,0,0,0,3.04,1,1,6,0,0,43.0,560,1
2,0,1,0.5,1,0,0,0,1.5,1,0,0,0,0,280.0,824,1
3,1,1,1.54,1,0,0,0,3.75,1,1,5,1,0,100.0,3,1
4,1,0,5.625,1,0,0,0,1.71,1,0,0,0,0,120.0,0,1


### Scaling
We scale the features (including the target feature) for use in SciKitLearn and export to CSV

In [19]:
from sklearn import preprocessing

In [20]:
standard = preprocessing.StandardScaler().fit_transform(data)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [21]:
df_clean = pd.DataFrame(standard, columns = data.columns)
df_clean.rename(columns={'A16':'target'}, inplace=True)

In [22]:
for col in data.columns:
    data[col] = data[col].round(3)

### Summary and Export

In [23]:
df_clean.shape

(690, 16)

In [24]:
df_clean.describe(include='all').round(3) 

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
count,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0
mean,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0
std,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001
min,-1.512,-1.224,-0.957,-1.784,-0.054,-0.518,-0.094,-0.665,-1.048,-0.864,-0.494,-0.919,-0.108,-1.067,-0.195,-0.895
25%,-1.512,-1.224,-0.756,0.561,-0.054,-0.518,-0.094,-0.616,-1.048,-0.864,-0.494,-0.919,-0.108,-0.602,-0.195,-0.895
50%,0.661,0.002,-0.404,0.561,-0.054,-0.518,-0.094,-0.366,0.955,-0.864,-0.494,-0.919,-0.108,-0.137,-0.194,-0.895
75%,0.661,1.228,0.492,0.561,-0.054,-0.518,-0.094,0.12,0.955,1.157,0.123,1.088,-0.108,0.514,-0.119,1.117
max,0.661,1.228,4.672,0.561,18.547,1.93,10.677,7.858,0.955,1.157,13.294,1.088,9.233,10.557,19.012,1.117


In [25]:
df_clean.head(5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,0.661438,0.001777,-0.956613,0.560612,-0.053916,-0.518056,-0.093659,-0.291083,0.95465,1.157144,-0.288101,-0.919195,-0.108306,0.107155,-0.195413,1.116941
1,-1.511858,1.227857,-0.060051,0.560612,-0.053916,-0.518056,-0.093659,0.24419,0.95465,1.157144,0.74083,-0.919195,-0.108306,-0.816912,-0.087852,1.116941
2,-1.511858,0.001777,-0.856102,0.560612,-0.053916,-0.518056,-0.093659,-0.216324,0.95465,-0.864196,-0.493887,-0.919195,-0.108306,0.560471,-0.037144,1.116941
3,0.661438,0.001777,-0.647038,0.560612,-0.053916,-0.518056,-0.093659,0.456505,0.95465,1.157144,0.535044,1.087908,-0.108306,-0.485643,-0.194837,1.116941
4,0.661438,-1.224303,0.174141,0.560612,-0.053916,-0.518056,-0.093659,-0.153526,0.95465,-0.864196,-0.493887,-0.919195,-0.108306,-0.369408,-0.195413,1.116941


In [26]:
df_clean.to_csv('df_clean.csv', index = False)

# Question 2


### Exercise 5
Setting up the data frame 

In [27]:
country = [
    'Afghanistan',
    'Haiti',
    'Nigeria',
    'Egypt',
    'Argentina',
    'China',
    'Brazil',
    'Israel',
    'USA',
    'Ireland',
    'UK',
    'Germany',
    'Canada',
    'Australia',
    'Sweden',
    'New Zealand'
]

In [28]:
Afghanistan = [59.61,23.21,74.30,4.44,0.40,1.5171]
Haiti = [45.00,47.67,73.10,0.09,3.40,1.7999]
Nigeria = [51.30,38.23,82.60,1.07,4.10,2.4493]
Egypt = [70.48,26.58,19.60,1.86,5.30,2.8622]
Argentina = [75.77,32.30,13.30,0.76,10.10,2.9961]
China = [74.87,29.98,13.70,1.95,6.40,3.6356]
Brazil = [73.12,42.93,14.50,1.43,7.20,3.7741]
Israel = [81.30,28.80,3.60,6.77,12.50,5.8069]
USA = [78.51,29.85,6.30,4.72,13.70,7.1357]
Ireland = [80.15,27.23,3.50,0.60,11.50,7.5360]
UK = [80.09,28.49,4.40,2.59,13.00,7.7751]
Germany = [80.24,22.07,3.50,1.31,12.00,8.0461]
Canada = [80.99,24.79,4.90,1.42,14.20,8.6725]
Australia = [82.09,25.40,4.20,1.86,11.50,8.8442]
Sweden = [81.43,22.18,2.40,1.27,12.80,9.2985]
NewZealand = [80.67,27.81,4.90,1.13,12.30,9.4627]

In [29]:
col_names = [
    'life_exp',
    'top10_income',
    'infant_mort',
    'mil_spend',
    'school_years',
    'cpi'
]

In [30]:
cpi = pd.DataFrame([Afghanistan,Haiti,Nigeria,Egypt,Argentina,China,Brazil,Israel,USA,Ireland,UK,Germany,Canada,Australia,Sweden,NewZealand], columns = col_names, index = country)

In [31]:
cpi

Unnamed: 0,life_exp,top10_income,infant_mort,mil_spend,school_years,cpi
Afghanistan,59.61,23.21,74.3,4.44,0.4,1.5171
Haiti,45.0,47.67,73.1,0.09,3.4,1.7999
Nigeria,51.3,38.23,82.6,1.07,4.1,2.4493
Egypt,70.48,26.58,19.6,1.86,5.3,2.8622
Argentina,75.77,32.3,13.3,0.76,10.1,2.9961
China,74.87,29.98,13.7,1.95,6.4,3.6356
Brazil,73.12,42.93,14.5,1.43,7.2,3.7741
Israel,81.3,28.8,3.6,6.77,12.5,5.8069
USA,78.51,29.85,6.3,4.72,13.7,7.1357
Ireland,80.15,27.23,3.5,0.6,11.5,7.536


In [32]:
Russia = (67.62, 31.68, 10.00, 3.87, 12.90, '?')

### Calculating the Manhattan Distance
Question 3a. Calculating the Manhattan Distance of for the three nearest neighbours of Russia

In [33]:
# Defining the function to caclulate 3NN
def manhattan(row, instance):
    difference = 0
    row, instance = np.array(row), np.array(instance)
    
    for i in range(len(row) - 1):
        difference += abs(row[i] - instance[i])
        
    return difference

In [34]:
# Creating an array to hold the differences between each of the countries in the dataframe
difference = []
for i in range(len(cpi)):
    difference.append(manhattan(cpi.iloc[i][:-1],Russia[:-1]))

In [35]:
# Adding differences as a new column and sorting from smallest to largest Manhattan Distance
cpi['difference'] = difference
cpi = cpi.sort_values(by=['difference'])
cpi

Unnamed: 0,life_exp,top10_income,infant_mort,mil_spend,school_years,cpi,difference
China,74.87,29.98,13.7,1.95,6.4,3.6356,14.57
Argentina,75.77,32.3,13.3,0.76,10.1,2.9961,15.18
USA,78.51,29.85,6.3,4.72,13.7,7.1357,17.27
Egypt,70.48,26.58,19.6,1.86,5.3,2.8622,19.57
UK,80.09,28.49,4.4,2.59,13.0,7.7751,22.54
Brazil,73.12,42.93,14.5,1.43,7.2,3.7741,23.69
New Zealand,80.67,27.81,4.9,1.13,12.3,9.4627,24.76
Israel,81.3,28.8,3.6,6.77,12.5,5.8069,25.86
Ireland,80.15,27.23,3.5,0.6,11.5,7.536,26.75
Canada,80.99,24.79,4.9,1.42,14.2,8.6725,27.81


In [36]:
# 3 nearest neighbours
knn = 3
estimate = 0

for i in range(knn):
    estimate += cpi.iloc[i].cpi
estimate = estimate/knn

print(estimate)

4.589133333333334


__Answer: The estimate for Russia's CPI using 3 Nearest Neighbours is 4.5891__

### Calculating Weighted KNN 
Question 3b. Calculating the __weighted knn__ where k = 16 

In [37]:
knn = 16 
estimate = 0 

for i in range(knn):
    estimate += cpi.iloc[i].cpi/(i+1)

estimate = estimate/knn
print(estimate)

1.0480002437926657


__Answer: The CPI for Russia when using weighted KNN over the full data set is 1.0480__

### Calcluating a Standardised, 3 Nearest Neighbours
Question 3b. Calculating the 3 nearest neighbours when using Range Normalisation to Standardise the Dataset

In [38]:
# Creating a copy of the dataset for Normalisation
cpi_copy = cpi.copy()

In [39]:
# Range Normalisation
for column in cpi.columns:
    max_value = cpi[column].max()
    min_value = cpi[column].min()
    cpi_copy[column] = (cpi[column] - min_value) / (max_value - min_value)
cpi_copy

Unnamed: 0,life_exp,top10_income,infant_mort,mil_spend,school_years,cpi,difference
China,0.805338,0.308984,0.140898,0.278443,0.434783,0.266626,0.0
Argentina,0.829604,0.399609,0.13591,0.100299,0.702899,0.186141,0.006709
USA,0.903478,0.303906,0.048628,0.693114,0.963768,0.707134,0.029696
Egypt,0.686978,0.176172,0.214464,0.26497,0.355072,0.169289,0.054993
UK,0.946077,0.250781,0.024938,0.374251,0.913043,0.787606,0.087659
Brazil,0.758156,0.814844,0.150873,0.200599,0.492754,0.284057,0.100308
New Zealand,0.961715,0.224219,0.031172,0.155689,0.862319,1.0,0.112077
Israel,0.9787,0.262891,0.014963,1.0,0.876812,0.539896,0.124175
Ireland,0.947695,0.201563,0.013716,0.076347,0.804348,0.757514,0.133964
Canada,0.970342,0.10625,0.031172,0.199102,1.0,0.900549,0.145623


In [40]:
# 3 nearest neighbours
knn = 3
estimate = 0

for i in range(knn):
    estimate += cpi_copy.iloc[i].cpi
estimate = estimate/knn

estimate = estimate * cpi['cpi'].max()
print(estimate)

3.6585946716841184


__Answer: 3.6586 is the estimated CPI of Russie using 3 nearest neighbours, standardised through range normalisation__

### Calculating the weighted Nearest Neighbour for Russia using Range Normalisation
Question 3d. What would the CPI estimate be for Russia when k = 16, using Range Normalisation?

In [41]:
# Using the Standardised, copied Data Frame from the previous section 
knn = 16 
estimate = 0 

for i in range(knn):
    estimate += cpi_copy.iloc[i].cpi/(i+1)

estimate = estimate/knn
estimate = estimate * cpi['cpi'].max()

print(estimate)

0.8663388467364912


__Answer: The estimate for Russie's CPI using weighted Nearest Neighbour is 0.8663__

### Russia's CPI
Question 3e. Russia's actual CPI was 2.4488

According to my calculations, using the KNN where K = 3, with a standardised range was most accurate, with a CPI of 3.6585. Using a KNN algorithm where K = 3 resulted in a higher difference, with a CPI of 4.5891, and both versions of the weight KNN algorithm resulted in lower scores with the non standardised estimate being 1.0480 and the standardised estimate falling far too short at 0.8663.

One reason that the weighted scores might have been more sensitive to  outliers such as the US, and the UK, which shared similar feature spaces to Russia but had drastically different CPI values. This was more heavily accentuated when using a standardised scale as this increased those difference by a multitude. Choosing a lower K value helped, however, this was still susceptible to noise, where the top 3 similar values shared very different CPI values. Using a standardised scale helped to smooth out the noise in the data and therefore resulted in a more accurate model. 