## Import libraries

In [2]:
import pandas as pd
import numpy as np

## Load data

In [3]:
# Read both source tables. `cpr` is forced to text because later cells slice
# it by character position; `sep` is omitted since ',' is read_csv's default.
df_demo = pd.read_csv(
    'Data/df_demography.csv',
    dtype={'cpr': str},
)
df_pers = pd.read_csv(
    'Data/df_personal_characteristic.csv',
    dtype={'cpr': str},
)

In [4]:
# Cache the personal-characteristics table as a pickle next to the CSV files.
df_pers.to_pickle('Data/df_personal_characteristic.pkl')

In [5]:
# Reload the table from the pickle, replacing the CSV-loaded `df_pers`.
# NOTE(review): unpickling can execute arbitrary code — only load pickle
# files that this notebook (or another trusted source) produced.
df_pers = pd.read_pickle('Data/df_personal_characteristic.pkl')

## Data preprocessing / Feature engineering

### Merge data sources

In [6]:
# Preview the first rows of the demography table (head() defaults to 5 rows).
df_demo.head()

Unnamed: 0,cpr,municipality,salary
0,1203913055,Esbjerg,377218
1,1308952064,Sønderborg,286903
2,2410936634,Thisted,337835
3,1605952976,København,273928
4,2802942759,Silkeborg,318220


In [7]:
# Preview the first rows of the personal-characteristics table (default: 5 rows).
df_pers.head()

Unnamed: 0,cpr,height,weight,hair_len
0,1203913055,1.8,82.1,7.5
1,1308952064,1.58,61.6,47.2
2,2410936634,1.66,73.7,28.4
3,1605952976,1.71,74.2,46.8
4,2802942759,1.91,83.8,6.3


In [8]:
# Inner-join demographics with personal characteristics on the shared `cpr` key.
df = df_demo.merge(df_pers, on='cpr')

In [9]:
# Check the merged frame: one row per `cpr` with columns from both sources.
df.head()

Unnamed: 0,cpr,municipality,salary,height,weight,hair_len
0,1203913055,Esbjerg,377218,1.8,82.1,7.5
1,1308952064,Sønderborg,286903,1.58,61.6,47.2
2,2410936634,Thisted,337835,1.66,73.7,28.4
3,1605952976,København,273928,1.71,74.2,46.8
4,2802942759,Silkeborg,318220,1.91,83.8,6.3


### Generate new variables

In [10]:
# The character at index 9 of `cpr` is read as a digit; its parity picks the
# label: even -> 'woman', odd -> 'man'.
df['gender'] = np.where(
    df['cpr'].str[9].astype(int).mod(2).eq(0),
    'woman',
    'man',
)

In [11]:
# Confirm the new `gender` column on the first rows.
df.head()

Unnamed: 0,cpr,municipality,salary,height,weight,hair_len,gender
0,1203913055,Esbjerg,377218,1.8,82.1,7.5,man
1,1308952064,Sønderborg,286903,1.58,61.6,47.2,woman
2,2410936634,Thisted,337835,1.66,73.7,28.4,woman
3,1605952976,København,273928,1.71,74.2,46.8,woman
4,2802942759,Silkeborg,318220,1.91,83.8,6.3,man


In [12]:
# The two-digit birth year sits at cpr[4:6]; the century is hard-coded, i.e.
# every person is assumed to be born in the 1900s.
df['year'] = '19' + df['cpr'].str[4:6]

In [13]:
# Birth month: characters 2-3 of `cpr`, kept as text.
df['month'] = df['cpr'].str.slice(2, 4)

In [14]:
# Birth day: the first two characters of `cpr`, kept as text.
df['day'] = df['cpr'].str.slice(0, 2)

In [15]:
# Join the text parts into DDMMYYYY and parse the result into datetimes.
df['birthday'] = pd.to_datetime(
    df['day'].str.cat([df['month'], df['year']]),
    format='%d%m%Y',
)

In [16]:
# Inspect the derived year / month / day / birthday columns.
df.head()

Unnamed: 0,cpr,municipality,salary,height,weight,hair_len,gender,year,month,day,birthday
0,1203913055,Esbjerg,377218,1.8,82.1,7.5,man,1991,3,12,1991-03-12
1,1308952064,Sønderborg,286903,1.58,61.6,47.2,woman,1995,8,13,1995-08-13
2,2410936634,Thisted,337835,1.66,73.7,28.4,woman,1993,10,24,1993-10-24
3,1605952976,København,273928,1.71,74.2,46.8,woman,1995,5,16,1995-05-16
4,2802942759,Silkeborg,318220,1.91,83.8,6.3,man,1994,2,28,1994-02-28


In [17]:
from datetime import datetime

# Age in whole years as of today, computed from calendar fields.
# The previous `astype('timedelta64[Y]')` conversion is rejected by
# pandas >= 2.0 (only s/ms/us/ns units are supported) and treated a year as a
# fixed-length duration, which can be off by one close to a birthday.
today = datetime.now()
birthday = df['birthday']

# True where this year's birthday has already occurred (or is today).
birthday_passed = (birthday.dt.month < today.month) | (
    (birthday.dt.month == today.month) & (birthday.dt.day <= today.day)
)

# Subtract one year for people whose birthday is still to come this year.
# Kept as float so the column has the same dtype as before (e.g. 28.0).
df['age'] = (
    today.year - birthday.dt.year - (~birthday_passed).astype(int)
).astype(float)

In [18]:
# Inspect the derived `age` column.
df.head()

Unnamed: 0,cpr,municipality,salary,height,weight,hair_len,gender,year,month,day,birthday,age
0,1203913055,Esbjerg,377218,1.8,82.1,7.5,man,1991,3,12,1991-03-12,28.0
1,1308952064,Sønderborg,286903,1.58,61.6,47.2,woman,1995,8,13,1995-08-13,23.0
2,2410936634,Thisted,337835,1.66,73.7,28.4,woman,1993,10,24,1993-10-24,25.0
3,1605952976,København,273928,1.71,74.2,46.8,woman,1995,5,16,1995-05-16,24.0
4,2802942759,Silkeborg,318220,1.91,83.8,6.3,man,1994,2,28,1994-02-28,25.0


## Split into train and test set

In [19]:
# Feature matrix with columns ordered [weight, height]; target is the gender label.
X = df.loc[:, ['weight', 'height']].values
y = df.loc[:, 'gender'].values

In [20]:
# Display the feature matrix; each row is one person as [weight, height].
X

array([[82.1 ,  1.8 ],
       [61.6 ,  1.58],
       [73.7 ,  1.66],
       ...,
       [78.7 ,  1.67],
       [84.5 ,  1.91],
       [66.  ,  1.82]])

In [21]:
# Display the target array of 'man' / 'woman' labels.
y

array(['man', 'woman', 'woman', ..., 'woman', 'man', 'woman'],
      dtype=object)

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [23]:
# Shapes of the full, training and test feature matrices, one per line.
print(X.shape, X_train.shape, X_test.shape, sep='\n')

(7767, 2)
(6213, 2)
(1554, 2)


## Build and train model

In [24]:
from sklearn.neighbors import KNeighborsClassifier

In [25]:
# k-nearest-neighbours classifier that looks at the 2 closest training points.
# NOTE(review): with an even k and two classes a vote can tie — confirm the
# tie-breaking behaviour is acceptable, or consider an odd k.
# NOTE(review): the features (weight, height) are on very different scales and
# are not standardised before fitting — confirm this is intended.
model = KNeighborsClassifier(n_neighbors=2)

In [27]:
# Fit the classifier on the training split only; the test split stays unseen.
model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=2, p=2,
           weights='uniform')

In [30]:
# Show the configured k and the class labels learned during fitting.
print(model.n_neighbors, model.classes_, sep='\n')

2
['man' 'woman']


## Make predictions

In [None]:
# The original statement had no right-hand side (a SyntaxError).
# Predict a label for every row of the held-out test set.
pred = model.predict(X_test)

In [None]:
# The model was fitted on two features per sample, ordered [weight, height],
# so every query row must supply both values; a one-value row raises ValueError.
print(model.predict([[82.1, 1.80]]))
print(model.predict_proba([[61.6, 1.58]]))

In [None]:
# Predict with the fitted classifier `model`; the name `log_reg` is not
# defined anywhere in this notebook.
y_pred = model.predict(X_test)

## Performance metrics

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix: rows are the true labels, columns the predicted labels.
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
# Manual accuracy as a sanity check: the share of test labels predicted correctly.
print((y_test == y_pred).mean())

## Visualize

In [None]:
from scipy import stats

N = 1500
df_gender = df[df['gender'] == 'man']

# Draw N rows (with replacement) in a single step so that every sampled height
# stays paired with the same person's weight. Sampling the two columns
# independently would pair unrelated people and destroy the relationship the
# regression is meant to measure. The fixed seed makes the sample repeatable.
sample = df_gender.sample(n=N, replace=True, random_state=0)

heights = sample['height'].values
weights = sample['weight'].values

# Least-squares line: height as a function of weight.
slope, intercept, r_value, p_value, std_err = stats.linregress(weights, heights)

In [None]:
import matplotlib.pyplot as plt

# Sampled points as blue dots with the fitted regression line drawn in red.
fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(weights, heights, 'b.')
ax.plot(weights, slope * weights + intercept, 'r-')
ax.axis([40, 110, 1.4, 2.2])
ax.set_xlabel('weight')
ax.set_ylabel('height')
ax.set_title('Hvordan er data fordelt?')
plt.show()

In [None]:
# `X_scale` and a `cluster` column are not defined anywhere in the visible
# notebook, so they are derived here from what does exist.
# NOTE(review): confirm this matches the intended visualisation.

# Standardise each feature column to zero mean and unit variance.
X_scale = (X - X.mean(axis=0)) / X.std(axis=0)
weights = X_scale[:, 0]
heights = X_scale[:, 1]

# One colour per row: true label (blue/red) and the model's prediction (cyan/magenta).
color_true = np.where(df['gender'] == 'man', 'b', 'r')
color_pred = np.where(model.predict(X) == 'man', 'c', 'm')

## Fine tune model

## Exercises