# Import libraries

In [None]:
%matplotlib inline

import pandas as pd
import numpy as np

# Load data

In [None]:
# Load both source tables. 'cpr' is read as text because later cells slice it
# character by character with the .str accessor.
df_demo, df_pers = (
    pd.read_csv(csv_path, dtype={'cpr': str})
    for csv_path in ('Data/df_demography.csv', 'Data/df_personal_characteristic.csv')
)

In [None]:
# Cache the personal-characteristics table as a pickle next to the CSV files.
df_pers.to_pickle('Data/df_personal_characteristic.pkl')

In [None]:
# Reload the table from the pickle written in the previous cell.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that this notebook (or another trusted source) produced.
df_pers = pd.read_pickle('Data/df_personal_characteristic.pkl')

# Data preprocessing / Feature engineering

#### Merge data from different sources

In [None]:
# Peek at the first five demography rows (5 is the default for head()).
df_demo.head()

In [None]:
# Peek at the first five personal-characteristics rows (5 is the default for head()).
df_pers.head()

In [None]:
# Inner join on the shared 'cpr' key: only people present in both tables are kept.
df = df_demo.merge(df_pers, how='inner', on='cpr')

In [None]:
# First five rows of the merged table.
df.head()

In [None]:
# Number of rows that survived the merge.
df.shape[0]

#### Generate new variables

In [None]:
# Derive gender from the final digit of the CPR number: even -> 'woman', odd -> 'man'.
# The last character is used instead of fixed position 9, so the result is the
# same for 10-character strings and still correct if the value is stored with a
# separator (e.g. 'DDMMYY-SSSS'). astype(int) is the vectorised form of apply(int).
df['gender'] = np.where(df['cpr'].str[-1].astype(int) % 2 == 0, 'woman', 'man')

In [None]:
# Check the new 'gender' column on the first five rows.
df.head()

In [None]:
# Characters 4-5 of the CPR hold the two-digit birth year; the century is
# hard-coded to '19', i.e. every person is treated as born in 1900-1999.
# NOTE(review): confirm nobody in the data set was born in another century.
df['year'] = '19' + df['cpr'].str.slice(4, 6)

In [None]:
# Characters 2-3 of the CPR are the two-digit birth month (kept as text).
df['month'] = df['cpr'].str.slice(2, 4)

In [None]:
# Characters 0-1 of the CPR are the two-digit birth day (kept as text).
df['day'] = df['cpr'].str.slice(0, 2)

In [None]:
# Glue the text parts together as DDMMYYYY and parse them into real timestamps.
df['birthday'] = pd.to_datetime(
    df['day'].str.cat([df['month'], df['year']]),
    format='%d%m%Y',
)

In [None]:
# Check the parsed 'birthday' column on the first five rows.
df.head()

In [None]:
from datetime import datetime

# Age in completed years, stored as a float column.
# The elapsed time since birth is divided by the mean Gregorian year
# (365.2425 days) and rounded down. This replaces astype('timedelta64[Y]'),
# which pandas >= 2.0 rejects (only s/ms/us/ns resolutions are supported).
# Because a mean year length is used, the value can be off by one within a
# day or so of a birthday.
df['age'] = np.floor((datetime.now() - df['birthday']) / pd.Timedelta(days=365.2425))

In [None]:
# Check the new 'age' column on the first five rows.
df.head()

In [None]:
# Keep only the first 1500 rows for the rest of the notebook.
df = df.head(1500)

# Split into train and test set

In [None]:
# Feature matrix (columns: weight, height) and target vector (gender) as arrays.
X, y = df.loc[:, ['weight', 'height']].values, df.loc[:, 'gender'].values

In [None]:
# Display the feature matrix; each row is one person as [weight, height].
X

In [None]:
# Display the target vector ('man' / 'woman' per person).
y

In [None]:
# Features of the first person: [weight, height].
X[0]

In [None]:
# Label of the first person.
y[0]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state=0 fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Shapes of the full, training and test feature matrices, one per line.
print(X.shape, X_train.shape, X_test.shape, sep='\n')

# Build and train model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier that decides by looking at the 5 nearest
# training samples.
model = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Train on the training split only; the test split is kept for evaluation.
model.fit(X_train, y_train)

In [None]:
# Number of neighbours used and the class labels the fitted model knows about.
print(model.n_neighbors, model.classes_, sep='\n')

# Make predictions

In [None]:
# Classify one hand-written sample, given in the training column order [weight, height].
prediction = model.predict([[81, 1.81]])
print(prediction)

In [None]:
# Class probabilities for one sample [weight, height]; the columns follow the
# order of model.classes_.
prediction_probability = model.predict_proba([[80, 1.7]])
print(prediction_probability)

In [None]:
# Predicted label for every row of the held-out test set.
y_pred = model.predict(X_test)

In [None]:
# Display the predicted labels.
y_pred

# Performance metrics

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report

In [None]:
# First ten true labels followed by the first ten predicted labels.
print(y_test[:10], y_pred[:10], sep='\n')

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels, both in
# sorted label order.
confusion_matrix(y_test, y_pred)

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=None):
    """
    Plot a confusion matrix as an annotated heat map in a new 8x8 inch figure.

    Parameters
    ----------
    cm : array-like, shape (n_classes, n_classes)
        Confusion matrix. Row ``i`` is drawn at y-position ``i`` ("True label"),
        column ``j`` at x-position ``j`` ("Predicted label").
    classes : sequence
        Tick labels, given in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        If True, every row is divided by its row sum before plotting, so each
        cell shows the fraction of that true class. Rows that sum to zero are
        shown as 0 instead of NaN.
    title : str
        Figure title.
    cmap : matplotlib colormap, optional
        Colormap for the heat map. When omitted, a white-to-green gradient is
        used (the look this function has always produced). Previously this
        argument was silently ignored.

    Notes
    -----
    Prints one line stating whether the matrix was normalized. The figure is
    left open on the current pyplot state; the caller decides when to show it.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Guard against empty rows: dividing by zero would fill them with NaN.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    if cmap is None:
        # Default palette: linear gradient from white to green.
        cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
            'some_name', ['#ffffff', '#35af5d'], N=256, gamma=1.0)

    plt.figure(figsize=(8, 8))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    # Annotate every cell. The former `> 1` threshold hid counts of 0 and 1 and
    # suppressed all annotations when normalize=True (all values are <= 1).
    fmt = '.2f' if normalize else 'd'
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 verticalalignment="center",
                 color="black",
                 fontsize=12)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout last so the axis labels are included in the computation.
    plt.tight_layout()

In [None]:
# The tick labels must name the real classes in the same order as the matrix
# rows/columns, so both the matrix and the labels are driven by the fitted
# model's classes_ (previously the axes were mislabelled 'No' / 'Yes').
plot_confusion_matrix(confusion_matrix(y_test, y_pred, labels=model.classes_), model.classes_)

In [None]:
# Accuracy: fraction of test samples whose predicted label equals the true label.
accuracy_score(y_test, y_pred)

# sum(y_test == y_pred) / len(y_test)

In [None]:
# Precision with 'woman' as the positive class: of all samples predicted
# 'woman', the fraction that truly are.
precision_score(y_test, y_pred, pos_label='woman')

# tp / (tp + fp)

In [None]:
# Recall with 'woman' as the positive class: of all true 'woman' samples, the
# fraction the model found.
recall_score(y_test, y_pred, pos_label='woman')

# tp / (tp + fn)

In [None]:
# Text summary of the per-class metrics computed from the test-set predictions.
print(classification_report(y_test, y_pred))

# Visualize

In [None]:
# First ten test samples as [weight, height].
X_test[:10]

In [None]:
# True labels of the first ten test samples.
y_test[:10]

In [None]:
# The two feature columns for the first five people.
df.loc[:, ['weight', 'height']].head()

In [None]:
# Column 0 of the test features is weight, column 1 is height.
weights, heights = X_test[:, 0], X_test[:, 1]

In [None]:
import matplotlib.pyplot as plt

# Unlabelled view of the test set: one black dot per person,
# weight on the x-axis and height on the y-axis.
plt.figure(figsize=(8, 8))
plt.scatter(weights, heights, c='k', marker='.')
plt.xlim(40, 110)
plt.ylim(1.4, 2.2)
plt.title('How is data distributed?')
plt.xlabel('weight')
plt.ylabel('height')
plt.show()

In [None]:
# y_test == 'woman'

In [None]:
# One matplotlib colour code per test sample.
# True labels:      'b' (blue) = woman, 'r' (red)     = man.
# Predicted labels: 'c' (cyan) = woman, 'm' (magenta) = man.
color_true = np.where(y_test == 'woman', 'b', 'r')
color_pred = np.where(y_pred == 'woman', 'c', 'm')

In [None]:
# Test-set scatter coloured by the TRUE label (color_true).
# The title now says what the colours mean; it used to be a copy of the
# unlabelled plot's title, which made the figures indistinguishable.
plt.figure(figsize=(8,8))
plt.scatter(weights, heights, c=color_true, marker='.')
plt.axis([40, 110, 1.4, 2.2])
plt.xlabel('weight')
plt.ylabel('height')
plt.title('True gender (blue = woman, red = man)')
plt.show()

In [None]:
# Test-set scatter coloured by the PREDICTED label (color_pred).
# The title now says what the colours mean; it used to be a copy of the
# unlabelled plot's title, which made the figures indistinguishable.
plt.figure(figsize=(8,8))
plt.scatter(weights, heights, c=color_pred, marker='.')
plt.axis([40, 110, 1.4, 2.2])
plt.xlabel('weight')
plt.ylabel('height')
plt.title('Predicted gender (cyan = woman, magenta = man)')
plt.show()

# Øvelser - Fine tune model

- Prøv at træne modellen med forskellige antal naboer ($n$-værdier) og se hvordan det påvirker performance metrics.
- Prøv at tilføje en ny kolonne til træningsdata. Dvs. tilføj også ```salary``` til $X$ og træn modellen igen. Se hvordan det påvirker performance metrics.
- Importer en anden klassifikationsmodel og lav prædiktioner med denne på samme måde som ovenstående. Brug logistisk regression fra scikit-learn.
  
  ```from sklearn.linear_model import LogisticRegression```

# Exercise - Fine tune model

- Try training the model with different numbers of neighbors ($n$) and see how it affects the performance metrics.
- Try adding a new column to the training data. I.e. add the ```salary``` column and retrain the model. See how it affects the performance metrics.
- Import another classification model and make predictions in the same way as above. Use logistic regression from scikit-learn.
  
  ```from sklearn.linear_model import LogisticRegression```