In [None]:
import pandas as pd
from sklearn import preprocessing
from sklearn import tree
from sklearn.model_selection import train_test_split
import graphviz 
import warnings
import numpy as np
import sqlite3
import json
import matplotlib
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, roc_curve, log_loss, confusion_matrix, precision_score, recall_score, classification_report, accuracy_score
import joblib
from joblib import dump, load
from pathlib import Path
import datetime

%matplotlib inline
warnings.filterwarnings('ignore')

In [None]:
# Load the survey responses from a CSV path relative to the notebook's working directory.
# Later cells read the columns 'candidatoId', 'fecha', 'id' and the answer columns '1'..'26'.
df = pd.read_csv('csvs/data.csv')
# Preview the first rows.
df.head()

In [None]:
# Load the candidate catalogue. get_party/get_name iterate it as a list of entries,
# each carrying a 'party' value and a 'candidates' list of {'id', 'name'} records.
# The context manager closes the file handle deterministically, and the encoding is
# pinned to UTF-8 (the JSON interchange encoding) instead of the platform default.
with open('predictor_pol/candidatos.json', 'r', encoding='utf-8') as cand_file:
    cand_data = json.load(cand_file)

In [None]:
def get_party(id):
    """Return the 'party' of the first catalogue entry holding a candidate with this id.

    Scans the module-level `cand_data` in order; returns the string 'n/a' when no
    candidate matches.
    """
    matching_parties = (
        party_entry['party']
        for party_entry in cand_data
        for candidate in party_entry['candidates']
        if candidate['id'] == id
    )
    return next(matching_parties, 'n/a')

def get_name(id):
    """Return the 'name' of the first candidate in `cand_data` whose id equals `id`.

    Scans every catalogue entry's 'candidates' list in order; returns the string
    'n/a' when no candidate matches.
    """
    matching_names = (
        candidate['name']
        for party_entry in cand_data
        for candidate in party_entry['candidates']
        if candidate['id'] == id
    )
    return next(matching_names, 'n/a')

In [None]:
# Resolve each response's candidate id into the party and candidate name from the catalogue
# ('n/a' when the id is not found).
candidate_ids = df['candidatoId']
df['partido'] = candidate_ids.apply(get_party)
df['nombre'] = candidate_ids.apply(get_name)

In [None]:
# Show one row to confirm the 'partido' and 'nombre' columns were added.
df.head(1)

# Análisis exploratorio

In [None]:
# Report how many rows the dataset holds.
print(f"cantidad de registros: {len(df)}")

In [None]:
# Default figure size for this and the following plots.
plt.rcParams['figure.figsize'] = [10, 5]
# Number of responses per party, most chosen first.
party_counts = df.groupby('partido').size().sort_values(ascending=False)
party_counts.plot.bar(title='Partido elegido por usuarios')

In [None]:
# Number of responses per candidate, most chosen first; only the top 10 are plotted.
candidate_counts = df.groupby('nombre').size().sort_values(ascending=False)
candidate_counts.head(10).plot.bar(title='Candidato elegido por usuarios')

In [None]:
# Per-day survey counts, ordered by 'dia' from latest to earliest.
by_day = pd.read_csv('csvs/encuestas_por_dia.csv').sort_values(by='dia', ascending=False)

In [None]:
# Horizontal bar chart of the 'cant' column for each 'dia'.
by_day.plot.barh(x='dia',y='cant',title='Cantidad de encuestas por dia')

In [None]:
# Keep only the days after the date six days before now. 'dia' is compared as a string
# against a 'YYYY-MM-DD' cutoff (assumes 'dia' is stored in that format — TODO confirm).
# NOTE(review): the title says 5 days while the cutoff uses days=6 — confirm which is intended.
d = datetime.datetime.now() - datetime.timedelta(days=6)
cutoff_day = d.strftime("%Y-%m-%d")
is_recent = by_day['dia'] > cutoff_day
by_day = by_day[is_recent]
by_day.plot.barh(x='dia', y='cant', title='Por dia (ultimos 5 dias)')

In [None]:
# Candidate name plus calendar day for every response. 'dia' is the first 10 characters
# of 'fecha' (assumes 'fecha' holds strings starting with 'YYYY-MM-DD' — TODO confirm).
# The frame is built with assign()/drop() so a new object is produced; the previous
# version added a column and dropped in place on a slice of df, the pattern pandas
# reports as SettingWithCopyWarning (silenced by the global warnings filter).
by_candidate = (
    df[['nombre', 'fecha']]
    .assign(dia=lambda frame: frame['fecha'].str[:10])
    .drop(columns=['fecha'])
)
by_candidate.head(10)
# TODO: plot the number of responses per day for each candidate

# Entrenamiento de modelo

In [None]:
# Preview the frame again before selecting the model features.
df.head()

In [None]:
# Names of the 26 answer columns ('1'..'26'), kept in lexicographic string order
# ('1', '10', '11', ..., '19', '2', '20', ..., '9').
features = sorted(str(question) for question in range(1, 27))

In [None]:
def add_total(row, features):
    """Return the sum of `row[key]` over every key in `features` (0 when `features` is empty)."""
    return sum(row[key] for key in features)

# Row-wise total of the answer columns. A vectorised column sum replaces the per-row
# Python callback (df.apply with axis=1), which is much slower on large frames.
# skipna=False keeps the semantics of plain addition: a missing answer makes the total NaN.
df['suma_respuestas'] = df[features].sum(axis=1, skipna=False)

In [None]:
# Count surveys where every question received the same answer (all 5, all 3, all 1).
# The check is done per question instead of on the row sum: a total of 26*3 is also
# reached by mixed answers (e.g. thirteen 1s and thirteen 5s), so the sum-based test
# over-counted the all-3 bucket. It also removes the hard-coded question count.
# Assumes answers are on a 1-5 scale — TODO confirm.
answers = df[features]
todo_5 = int((answers == 5).all(axis=1).sum())
todo_3 = int((answers == 3).all(axis=1).sum())
todo_1 = int((answers == 1).all(axis=1).sum())
print('cant encuestas todo 5: {}'.format(todo_5))
print('cant encuestas todo 1: {}'.format(todo_1))
print('cant encuestas todo 3: {}'.format(todo_3))

In [None]:
# TODO: decide whether the all-5 and all-1 surveys should be removed before training

# LogisticRegression (en producción)

In [None]:
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the responses for evaluation. The seed is fixed so the split, and
# therefore the metrics reported below, are the same on every Restart & Run All.
df_train, df_test = train_test_split(df, test_size=0.20, random_state=0)

In [None]:
# Try to predict candidate
clf = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial',max_iter=700).fit(df_train[features], df_train.candidatoId)
df_test['predicted_candidate'] = clf.predict(df_test[features])

df_test['predicted_candidate'] = df_test['predicted_candidate'].apply(get_name)
df_test['candidato'] = df_test['candidatoId'].apply(get_name)


In [None]:
# Per-candidate precision/recall/F1 of the logistic regression on the held-out rows.
lr_report = classification_report(df_test['nombre'], df_test['predicted_candidate'])
print(lr_report)

### Este parecería ser el mejor de los 3; se pone en producción

# El modelo original

In [None]:
# Rows with id below 206, used below as the training set of the original model.
# The 206 cut-off is hard-coded; its origin is not shown in this notebook.
df_original = df.loc[df['id'] < 206]

In [None]:
# Number of rows available to train the original model.
len(df_original)

In [None]:
n = 17  # number of PCA components
k = 7   # number of neighbours for the KNN classifier

# Train on the original dataset (id < 206) and test on the remaining rows.
df_later = df[df.id >= 206]

pca = PCA(n_components=n)
pca.fit(df_original[features])
x_train = pca.transform(df_original[features])
y_train = df_original.candidatoId
x_test = pca.transform(df_later[features])
y_test = df_later.candidatoId

# Fix: the neighbour count is k. Previously k was assigned but never used and
# n_neighbors received n (the PCA dimensionality), so the model ran with 17 neighbours.
candidate_model = KNeighborsClassifier(n_neighbors=k)
candidate_model.fit(x_train, y_train)

# Ground truth and predictions as candidate names, both indexed like y_test.
predicted_ids = pd.Series(candidate_model.predict(x_test), index=y_test.index)
results = pd.DataFrame({
    'truth': y_test.apply(get_name),
    'prediction': predicted_ids.apply(get_name),
})

print(classification_report(results.truth, results.prediction))

# Ahora probamos un nuevo KNeighborsClassifier

In [None]:
n = 22  # number of PCA components
k = 22  # number of neighbours for the KNN classifier

y = df.candidatoId

# Split BEFORE fitting PCA (80% train / 20% test, fixed seed for reproducibility).
# Fitting the projection on the full dataset let the held-out rows influence the
# components, which leaks test information into training.
raw_train, raw_test, y_train, y_test = train_test_split(
    df[features], y, test_size=0.20, random_state=0
)

pca = PCA(n_components=n)
pca.fit(raw_train)
X_train = pca.transform(raw_train)
X_test = pca.transform(raw_test)
# Projection of the whole dataset with the train-fitted PCA; kept so the name X
# remains available as before.
X = pca.transform(df[features])

# The neighbour count is k (the previous code passed n; both are 22 here).
candidate_model = KNeighborsClassifier(n_neighbors=k)
candidate_model.fit(X_train, y_train)

# Ground truth and predictions as candidate names, both indexed like y_test.
predicted_ids = pd.Series(candidate_model.predict(X_test), index=y_test.index)
results = pd.DataFrame({
    'truth': y_test.apply(get_name),
    'prediction': predicted_ids.apply(get_name),
})

print(classification_report(results.truth, results.prediction))

### Gana el LR