Análise TSE
==========

Initial Configuration
-----------------------

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

import matplotlib.pyplot as plt
from scipy.stats import skew
# pearsonr is taken from the public scipy.stats namespace; the private
# scipy.stats.stats module is deprecated.
from scipy.stats import pearsonr


%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook
%matplotlib inline
%config IPCompleter.greedy=True

Load all data
---------------

In [None]:
# Read the elections CSV into memory and preview the first rows.
complete_set = pd.read_csv(filepath_or_buffer="eleicoes_2006_a_2010.csv")
complete_set.head(5)

Create Train and test
-------------------------

In [None]:
# Columns the analysis treats as not representative; they are removed from
# the modelling frame while complete_set keeps the raw data untouched.
unused_columns = [
    'sequencial_candidato', 'nome', 'uf', 'ocupacao',
    'partido', 'estado_civil', 'cargo',
]
complete_filtered = complete_set.drop(columns=unused_columns)



Pre Processing
------------------


In [None]:
# Per column: does it contain at least one missing value?
complete_set.isna().any()

In [None]:
# Per column: how many values are missing?
complete_set.isna().sum()

In [None]:
# Remove the four per-source revenue columns from the modelling frame.
revenue_columns = [
    'recursos_de_outros_candidatos/comites',
    'recursos_de_pessoas_fisicas',
    'recursos_de_pessoas_juridicas',
    'recursos_proprios',
]
complete_filtered = complete_filtered.drop(columns=revenue_columns)


Disclaimer
------------
Todas as colunas com valores nulos são relacionadas à receita. Se fossem poucas linhas, poderíamos apenas descartar esses valores, porém não é o caso. É uma decisão complicada, mas, como estamos seguindo este pipeline (https://www.kaggle.com/apapiu/regularized-linear-models), que recomenda "Replace the numeric missing values (NaN's) with the mean of their respective columns", aplicaremos a mesma técnica.

In [None]:
# Split the raw data by election year (2006 and 2010).
# The mask is built from complete_set itself instead of a lambda that ignores
# its argument and indexes with another frame. .copy() makes both results
# independent frames, so assigning columns to them later does not write into
# a slice of complete_set (no SettingWithCopyWarning).
train_2006 = complete_set.loc[complete_set['ano'] == 2006].copy()
test_2010 = complete_set.loc[complete_set['ano'] == 2010].copy()

Skewness
========

Quantidade_doacoes
-------------------------

In [None]:
# Default figure size (width, height in inches) for the following plots.
matplotlib.rc('figure', figsize=(12.0, 6.0))

In [None]:
# Histogram of the 2006 vote counts next to their log1p transform.
raw_votes = train_2006["votos"]
votos = pd.DataFrame({
    "log(votos + 1)": np.log1p(raw_votes),
    "votos": raw_votes,
})
votos.hist()

In [None]:


# Names of the non-object (numeric) columns of the modelling frame; these are
# the candidates for the log transform of skewed features.
# The dtype mask is taken from complete_filtered itself: masking with
# complete_set.dtypes only worked through implicit index alignment and would
# raise as soon as complete_filtered had a column complete_set lacks.
numeric_feats = complete_filtered.dtypes[complete_filtered.dtypes != "object"].index

print(numeric_feats)

In [None]:
# Log-transform the target in the per-year raw frames. assign() returns new
# frames, so nothing is written into a slice of complete_set.
train_2006 = train_2006.assign(votos=np.log1p(train_2006["votos"]))
test_2010 = test_2010.assign(votos=np.log1p(test_2010["votos"]))

# The models are trained on frames derived from complete_filtered, so the
# target has to be transformed there explicitly. Previously it was only
# transformed if the already-logged 2006 target happened to pass the skew
# threshold below.
complete_filtered["votos"] = np.log1p(complete_filtered["votos"])

# Skewness is measured on the 2006 rows of the feature columns only. The
# target ('votos') is handled above and the split key ('ano') is constant
# within a single year, so both are left out of the selection.
feature_cols = [col for col in numeric_feats if col not in ("votos", "ano")]
skewed_feats = train_2006[feature_cols].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.75].index

# Apply log1p to the skewed features in the modelling frame.
complete_filtered[skewed_feats] = np.log1p(complete_filtered[skewed_feats])

print(skewed_feats)

In [None]:
# One-hot encode the remaining categorical columns.
complete_filtered = pd.get_dummies(data=complete_filtered)


In [None]:
# Impute every remaining missing value with the mean of its column.
column_means = complete_filtered.mean()
complete_filtered = complete_filtered.fillna(value=column_means)

In [None]:
# Split the preprocessed frame back into the 2006 and 2010 elections.
is_2006 = complete_filtered['ano'] == 2006
is_2010 = complete_filtered['ano'] == 2010

train_2006_pre = complete_filtered.loc[is_2006]
test_2010_pre = complete_filtered.loc[is_2010]

train_2006_pre.shape

In [None]:
# Inspect the target column of the 2006 frame.
train_2006_pre["votos"]

In [None]:
from sklearn.model_selection import train_test_split

target = pd.DataFrame({"log(votos + 1)":np.log1p(complete_filtered["votos"]), "votos":complete_filtered["votos"],})

# The feature matrix must not contain the target ('votos'), otherwise the
# model can simply read the answer. 'ano' is removed as well, matching the
# columns the notebook drops before model tuning.
features = complete_filtered.drop(columns=['votos', 'ano'])

# train+validation / test split (80% / 20%, shuffled, reproducible)
X_intermediate, X_test, y_intermediate, y_test = train_test_split(features,
                                                                  complete_filtered['votos'],
                                                                  shuffle=True,
                                                                  test_size=0.2,
                                                                  random_state=15)

# train / validation split (75% / 25% of the intermediate set).
# No random_state here: with shuffle=False the split is positional and the
# seed would be ignored.
X_train, X_validation, y_train, y_validation = train_test_split(X_intermediate,
                                                                y_intermediate,
                                                                shuffle=False,
                                                                test_size=0.25)


In [None]:
def calc_train_error(X_train, y_train, model):
    '''Return the in-sample mean squared error (MSE) of an already fitted model.

    X_train: features the model is asked to predict on.
    y_train: true target values for X_train.
    model: fitted estimator exposing predict().
    '''
    predictions = model.predict(X_train)
    # The value returned is the MSE itself, not its square root.
    return mean_squared_error(y_train, predictions)
    
def calc_validation_error(X_test, y_test, model):
    '''Return the out-of-sample mean squared error (MSE) of an already fitted model.

    X_test: held-out features the model is asked to predict on.
    y_test: true target values for X_test.
    model: fitted estimator exposing predict().
    '''
    predictions = model.predict(X_test)
    # The value returned is the MSE itself, not its square root.
    return mean_squared_error(y_test, predictions)
    
def calc_metrics(X_train, y_train, X_test, y_test, model):
    '''Fit model on the training data and return (train_error, validation_error).

    Both values are whatever calc_train_error and calc_validation_error
    report for the freshly fitted model.
    '''
    model.fit(X_train, y_train)
    return (
        calc_train_error(X_train, y_train, model),
        calc_validation_error(X_test, y_test, model),
    )

In [None]:
# Train on the 2006 election and evaluate on the 2010 one.
# NOTE(review): both feature frames are the full preprocessed frames, so they
# still carry the 'votos' (target) and 'ano' columns.
X_train, y_train = train_2006_pre, train_2006_pre["votos"]
X_test, y_test = test_2010_pre, test_2010_pre["votos"]

In [None]:
# Number of training targets (sanity check).
y_train.shape

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression


lr = LinearRegression(fit_intercept=True)

# Keep the target ('votos') and the split key ('ano') out of the predictors:
# with 'votos' among the features the fit is trivially perfect and the
# reported errors are meaningless. errors='ignore' keeps this working when
# the frames were already stripped of those columns.
leak_cols = ['votos', 'ano']
train_features = X_train.drop(columns=leak_cols, errors='ignore')
test_features = X_test.drop(columns=leak_cols, errors='ignore')

train_error, test_error = calc_metrics(train_features, y_train, test_features, y_test, lr)

# Compute the ratio from the unrounded errors so that a small train error
# rounded to 0.0 cannot turn it into a division by zero.
error_ratio = test_error / train_error
train_error, test_error = round(train_error, 3), round(test_error, 3)

print('train error: {} | test error: {}'.format(train_error, test_error))
# The ratio is test error over train error, so it is labelled accordingly.
print('test/train: {}'.format(round(error_ratio, 1)))

* Train error lower than test error.
* Test error 30% worse

#### Isso é um indicador de que possivelmente a variância está alta e há overfitting. Para diminuir a complexidade do modelo, precisamos usar regularização.

## Model Tuning

In [None]:
# Single-column frame holding the 2006 target.
votos_2006 = train_2006_pre[["votos"]]

In [None]:
from sklearn.model_selection import train_test_split

# Columns that must not be used as predictors: the target itself and the
# year used only to separate the two elections.
leak_cols = ['votos', 'ano']

# train/validation split of the 2006 data (75% / 25%, positional because
# shuffle=False, so no random_state is needed).
# The columns are dropped before splitting; the previous code called
# X_train.drop(...) twice without keeping the result, so nothing was removed
# and X_validation was never handled.
X_train, X_validation, y_train, y_validation = train_test_split(
    train_2006_pre.drop(columns=leak_cols),
    votos_2006,
    shuffle=False,
    test_size=0.25)

# The 2010 hold-out used by the tuning cells must expose exactly the same
# feature columns as X_train, so it is rebuilt here the same way.
X_test = test_2010_pre.drop(columns=leak_cols)
y_test = test_2010_pre['votos']

X_train.shape, X_validation.shape, X_test.shape


In [None]:
# Preview the validation target produced by the split.
y_validation

In [None]:
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score

def rmse_cv(model):
    """Return the per-fold RMSE of `model` under 5-fold cross-validation.

    The model is scored on the notebook-level X_train / y_train with
    scikit-learn's "neg_mean_squared_error" scorer; the sign is flipped
    before taking the square root.
    """
    neg_mse_per_fold = cross_val_score(
        model,
        X_train,
        y_train,
        scoring="neg_mean_squared_error",
        cv=5,
    )
    return np.sqrt(-neg_mse_per_fold)

In [None]:
# Regularization strengths to sweep, and one error list per data set.
alphas = [0, 0.001, 0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 1, 3, 5, 10, 30, 50, 100]
train_error = []
test_error = []

report_line = 'alpha: {:7} | train error: {:5} | test error: {}'
for alpha in alphas:
    # fit a fresh Ridge model for this alpha
    ridge = Ridge(alpha=alpha, fit_intercept=True, random_state=99)
    ridge.fit(X_train, y_train)

    # record the MSE on the training and on the test data
    train_error.append(mean_squared_error(y_train, ridge.predict(X_train)))
    test_error.append(mean_squared_error(y_test, ridge.predict(X_test)))

    # report the values just recorded
    print(report_line.format(alpha,
                             round(train_error[-1], 3),
                             round(test_error[-1], 3)))

In [None]:
# Sanity check: there should be one recorded error per alpha.
for collected in (train_error, alphas):
    print(len(collected))

In [None]:
# Train and test MSE as a function of the Ridge regularization strength.
fig, ax = plt.subplots()

# Each curve gets a label so the two lines can be told apart in the legend.
ax.plot(alphas, train_error, label='train error')
ax.plot(alphas, test_error, label='test error')

# The y axis shows both curves, so it is labelled with the metric rather
# than with "train_error".
ax.set(xlabel='lambda', ylabel='mean squared error',
       title='Ridge: error vs. regularization strength')
ax.legend()
ax.grid()

In [None]:
# Ridge estimator with default parameters.
# NOTE(review): not referenced by any other cell visible in this notebook.
model_ridge = Ridge()

In [None]:
# Mean cross-validated RMSE of Ridge for each candidate alpha.
alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = []
for alpha in alphas:
    fold_rmse = rmse_cv(Ridge(alpha=alpha))
    cv_ridge.append(fold_rmse.mean())

In [None]:
# Index the scores by alpha and plot the validation curve.
cv_ridge = pd.Series(data=cv_ridge, index=alphas)
ax = cv_ridge.plot(title="Validation")
ax.set_xlabel("alpha")
ax.set_ylabel("rmse")


In [None]:
# Lowest mean cross-validated RMSE among the alphas tried.
cv_ridge.min()


Lasso
-------

In [None]:
# Fit a cross-validated Lasso on the training split.
# The target is y_train (the name `y` is never defined in this notebook);
# np.ravel flattens it because y_train may be a single-column DataFrame.
model_lasso = LassoCV(alphas = [1, 0.1, 0.001, 0.0005], max_iter=100000).fit(X_train, np.ravel(y_train))


In [None]:
# Mean 5-fold cross-validated RMSE of the Lasso model.
rmse_cv(model_lasso).mean()

In [None]:
# Lasso coefficients labelled with the feature they belong to.
coef = pd.Series(data=model_lasso.coef_, index=X_train.columns)

In [None]:
# Count the features Lasso kept (non-zero coefficient) and discarded (zero).
n_kept = sum(coef != 0)
n_dropped = sum(coef == 0)
print("Lasso picked " + str(n_kept) + " variables and eliminated the other " +  str(n_dropped) + " variables")

In [None]:
# The ten most negative and the ten most positive coefficients.
ranked_coef = coef.sort_values()
imp_coef = pd.concat([ranked_coef.head(10), ranked_coef.tail(10)])

In [None]:
# Horizontal bar chart of the selected coefficients on a tall figure.
matplotlib.rc('figure', figsize=(8.0, 10.0))
ax = imp_coef.plot(kind="barh")
ax.set_title("Coefficients in the Lasso Model")

Residuals
--------------

In [None]:
# Residuals of the Lasso model on the training split.
matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)

# The true values come from y_train (the name `y` is never defined in this
# notebook). Both columns are flattened to 1-D and share X_train's index so
# they line up row by row even when y_train is a single-column DataFrame.
preds = pd.DataFrame({"preds": np.ravel(model_lasso.predict(X_train)),
                      "true": np.ravel(y_train)},
                     index=X_train.index)
preds["residuals"] = preds["true"] - preds["preds"]
preds.plot(x = "preds", y = "residuals",kind = "scatter")


In [None]:
# delete intermediate variables
del X_intermediate, y_intermediate

# Print each split's share of the data.
# The denominator is the total size of the three splits; the previous
# len(votos) counted only the 2006 rows, so the shares did not add up to 1.
# '{:.0%}' renders the fraction as a real percentage instead of printing a
# fraction followed by a '%' sign.
n_total = len(y_train) + len(y_validation) + len(y_test)
print('train: {:.0%} | validation: {:.0%} | test {:.0%}'.format(len(y_train) / n_total,
                                                                 len(y_validation) / n_total,
                                                                 len(y_test) / n_total))