In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression  
import matplotlib.pyplot as plt
import datetime as dt
import pickle

In [None]:
from utils import fractional_years

## Clean Data

In [None]:
# Directory holding the input spreadsheets, relative to this notebook.
data_path = '../data'
# Full path of the raw registrations workbook that is loaded below.
raw_xlsx_path = f'{data_path}/inscriptos_2018_4_6.xlsx'
df_raw = pd.read_excel(raw_xlsx_path)

In [None]:
# Keep only the document number and the birth date, renamed to `dni` and
# `birthdate`, then derive a numeric `birthyear` column from the birth date.
attrs = ['nrodocumento', 'FechaNacimiento']
df = (
    df_raw[attrs]
    .rename(columns={'nrodocumento': 'dni', 'FechaNacimiento': 'birthdate'})
    .assign(
        dni=lambda frame: frame['dni'].astype(int),
        # `fractional_years` is imported from the project's `utils` module.
        birthyear=lambda frame: frame['birthdate'].apply(fractional_years),
    )
)

# Row filters; all three must hold for a row to be kept.
birthyear_in_range = (df['birthyear'] > 1920) & (df['birthyear'] < 2006)
dni_in_range = (df['dni'] > 1e6) & (df['dni'] < 100e6)
# NOTE(review): presumably a birthyear of exactly 2000 marks a placeholder
# date in the source data - confirm against the spreadsheet.
not_exactly_2000 = df['birthyear'] != 2000

df = df[birthyear_in_range & dni_in_range & not_exactly_2000]

In [None]:
# Remove fully duplicated rows first (birthdate still takes part in the
# comparison), then discard the raw birthdate column.
df2 = df.drop_duplicates().drop(columns=['birthdate'])

In [None]:
%matplotlib inline
%matplotlib notebook
df2.plot(x='dni', y='birthyear', style='o', color='black', alpha=0.05)

## Try with splines

In [None]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures, SplineTransformer
from sklearn.pipeline import make_pipeline

In [None]:
# Fit a cubic B-spline + ridge regression that maps a DNI number to a
# fractional birth year.

# Share of the filtered rows used for fitting, and the seed that makes the
# subsample reproducible. With SAMPLE_FRACTION = 1 every filtered row is used
# and the seed only pins the shuffle order.
SAMPLE_FRACTION = 1
RANDOM_STATE = 0

data = df2.copy()

# Filter noise: keep only the points lying below the straight line that goes
# from birthyear 1960 at dni 0 to birthyear 2000 at dni 19e6.
noise_ceiling = 1960 + df2.dni * (2000 - 1960) / 19e6
filtered_data = df2[df2.birthyear < noise_ceiling]

# Hand-picked knot positions, written in millions of DNI.
knot_list = [3.5,4,5,6.7,7.4,8.8,10,15,18.7,18.8,18.9,19.05,19.2,20,30,40,46.8,46.9,47,47.1,60,90,91,92,93,94,95,96]
# SplineTransformer takes knots as a column of shape (n_knots, n_features).
knots = (np.array(knot_list) * 1e6).reshape(-1, 1)

# Seeded subsample, ordered by dni.
n_sample_rows = int(filtered_data.shape[0] * SAMPLE_FRACTION)
data_sample = (
    filtered_data
    .sample(n=n_sample_rows, random_state=RANDOM_STATE)
    .sort_values('dni')
)

bspline_model = make_pipeline(
    SplineTransformer(degree=3, knots=knots, extrapolation='constant'),
    Ridge(alpha=1e-3),
)
# Both X and y are passed as single-column 2-D arrays.
bspline_model.fit(
    data_sample.dni.values.reshape(-1, 1),
    data_sample.birthyear.values.reshape(-1, 1),
)

In [None]:
%matplotlib notebook
bspline_y_hat = bspline_model.predict(data.dni.values.reshape(-1,1)).reshape(-1)
plt.plot(data.dni,data.birthyear,'o', color='black', alpha=0.1)
plt.plot(data.dni, bspline_y_hat, 'o', color='red')

In [None]:
# Persist the fitted pipeline to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling raises.
filename = 'mega_spline_model.pickle'
with open(filename, 'wb') as model_file:
    pickle.dump(bspline_model, model_file)