In [205]:
import json
import os

import numpy as np
import pandas as pd
import pandas_profiling as pp
import requests
import seaborn as sns

# from sklearn.model_selection import train_test_split
# from sklearn import linear_model
# from sklearn import tree
# from sklearn import ensemble
# from sklearn import metrics

%matplotlib inline

### Read the full dataset and generate a profiling report with a dataset overview

In [206]:
# Load the whole CSV, then write its profiling report as an HTML file
# next to the notebook.
full_data = pd.read_csv("../data/test3.csv")
full_report = pp.ProfileReport(full_data)
full_report.to_file(outputfile="./profile-test-full.html")

### Load Dataset
- Manually select features based on previous report analysis
- Load data, both to_predict and training datasets

In [250]:
# Score columns chosen by hand after reading the profiling report.
feature_cols = ["NU_NOTA_CN", "NU_NOTA_CH", "NU_NOTA_LC", "NU_NOTA_REDACAO"]
# Extra columns consumed by the answer-generation cells further down.
mt_cols = ["CO_PROVA_MT", "TX_RESPOSTAS_MT"]

# Rows to predict: only the id, the features and the extra columns are read.
data_to_predict = pd.read_csv(
    "../data/test3.csv",
    usecols=["NU_INSCRICAO", *feature_cols, *mt_cols],
)

# Training rows additionally carry TX_GABARITO_MT; any row with a missing
# value in the selected columns is dropped.
data_train = pd.read_csv(
    "../data/train.csv",
    usecols=["NU_INSCRICAO", "TX_GABARITO_MT", *feature_cols, *mt_cols],
).dropna()

# Either frame can be profiled with pp.ProfileReport(frame).to_file(...) if needed.

In [251]:
# Show the answer string stored under index label 0.
data_to_predict.loc[0, "TX_RESPOSTAS_MT"]

'BCEDADCCAECEABABDCEBABEDAAECCDDBDBABDADB'

### Random Strategy

In [209]:
options = ['A', 'B', 'C', 'D', 'E']


def generate_answer():
    """Return a 5-character string drawn uniformly, with replacement, from `options`."""
    return ''.join(np.random.choice(options, size=5, replace=True))
# One independent uniform guess per row; the value already stored in the
# column is ignored by the lambda.
random_answers = data_to_predict["TX_RESPOSTAS_MT"].apply(lambda _prev: generate_answer())

# Keep only the two columns that go into the submission.
submission_cols = ["NU_INSCRICAO", "TX_RESPOSTAS_MT"]
result = data_to_predict.assign(TX_RESPOSTAS_MT=random_answers)[submission_cols]

In [210]:
# Ids of the rows whose recorded answers equal the all-'C' string below;
# those rows get 'CCCCC' instead of the generated answer.
all_c_mask = data_to_predict.TX_RESPOSTAS_MT == 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC'
random_guys = data_to_predict[all_c_mask].NU_INSCRICAO
is_random_guy = result["NU_INSCRICAO"].isin(random_guys)
result["TX_RESPOSTAS_MT"] = np.where(is_random_guy, 'CCCCC', result["TX_RESPOSTAS_MT"])

### Random answers using the frequencies of previous answers as probabilities

In [246]:
options = ['A', 'B', 'C', 'D', 'E']


def generate_prob_answer(prev_answers):
    """Draw five answers weighted by how often each option occurs in `prev_answers`.

    Each option's weight is its number of occurrences in ``prev_answers``
    plus one, so an option that never occurred can still be drawn.

    A uniform draw is used instead when ``prev_answers`` contains ``'*'`` or
    is not iterable (for example a missing value read as NaN).

    Characters that are neither an option nor ``'*'`` are ignored. Counting
    them used to make the probability vector longer than ``options``, which
    made ``np.random.choice`` raise ``ValueError``.

    Parameters
    ----------
    prev_answers : str or iterable of str
        The answers already given, one character per question.

    Returns
    -------
    str
        Five characters, each taken from ``options``.
    """
    try:
        answers = list(prev_answers)
    except TypeError:
        # Not iterable (e.g. NaN): there is no history to weight by.
        answers = None

    if answers is None or '*' in answers:
        return ''.join(np.random.choice(options, 5, replace=True))

    # Count only the valid options, in the same order as `options`, so the
    # probability vector always lines up with the choices.
    counts = np.array([answers.count(option) + 1 for option in options])
    probs = counts / np.sum(counts)
    return ''.join(np.random.choice(options, 5, p=probs, replace=True))
# Draw a frequency-weighted answer for every row from that row's own
# recorded answers.
random_answers = data_to_predict["TX_RESPOSTAS_MT"].apply(generate_prob_answer)

# Keep only the two columns that go into the submission.
submission_cols = ["NU_INSCRICAO", "TX_RESPOSTAS_MT"]
result = data_to_predict.assign(TX_RESPOSTAS_MT=random_answers)[submission_cols]

### Use the answer key (gabarito)

In [293]:
# Build a {CO_PROVA_MT -> TX_GABARITO_MT} lookup from the training data.
# `indicies` holds the position of the first row seen for each distinct code.
unique, indicies = np.unique(data_train.CO_PROVA_MT, return_index=True)

# Take all answer keys with one positional index instead of growing an array
# with np.append inside a loop (which re-copies the array on every iteration
# and starts from an empty float array that must be promoted to strings).
gabaritos = data_train.TX_GABARITO_MT.values[indicies]
dict_mt_answers = dict(zip(unique, gabaritos))

# Attach the answer key to every row to predict. Series.replace leaves any
# code that is absent from the lookup unchanged.
data_to_predict['TX_GABARITO_MT'] = data_to_predict.CO_PROVA_MT.replace(dict_mt_answers)

#### Assume everybody answered the last 5 questions correctly

In [294]:
# Use the last five characters of each row's TX_GABARITO_MT as its answer.
last_five = data_to_predict["TX_GABARITO_MT"].apply(lambda gabarito: gabarito[-5:])

# Keep only the two columns that go into the submission.
submission_cols = ["NU_INSCRICAO", "TX_RESPOSTAS_MT"]
result = data_to_predict.assign(TX_RESPOSTAS_MT=last_five)[submission_cols]

In [295]:
# Ids of the rows whose recorded answers equal the all-'C' string below;
# those rows get 'CCCCC' instead of the answer computed above.
all_c_mask = data_to_predict.TX_RESPOSTAS_MT == 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC'
random_guys = data_to_predict[all_c_mask].NU_INSCRICAO
is_random_guy = result["NU_INSCRICAO"].isin(random_guys)
result["TX_RESPOSTAS_MT"] = np.where(is_random_guy, 'CCCCC', result["TX_RESPOSTAS_MT"])

### Generate Submission

In [296]:
# Build the submission payload and save a copy to disk.
#
# The API token and e-mail are read from the environment so that they are not
# stored in the notebook. Set CODENATION_TOKEN and CODENATION_EMAIL before
# running this cell; a missing variable raises KeyError.
# NOTE: submission.json will contain the token, so keep that file out of
# version control.
answer = result.to_dict("records")  # one {column: value} dict per row
submission = {
    "token": os.environ["CODENATION_TOKEN"],
    "email": os.environ["CODENATION_EMAIL"],
    "answer": answer,
}
with open("submission.json", "w") as fp:
    json.dump(submission, fp)

### Send a POST request to the Codenation API

In [297]:
# POST the payload as JSON and display the decoded JSON response.
# requests has no default timeout, so without one this cell could block
# forever if the API never answers.
url = 'https://api.codenation.com.br/v1/user/acceleration/data-science/challenge/enem-3/submit'
r = requests.post(url, json=submission, timeout=30)
r.json()

{'score': 25.474999999999998}