In [93]:
import json
import os

import numpy as np
import pandas as pd
import pandas_profiling as pp
import requests
import seaborn as sns

# from sklearn.model_selection import train_test_split
# from sklearn import linear_model
# from sklearn import tree
# from sklearn import ensemble
# from sklearn import metrics
%matplotlib inline

### Read all data and generate report with dataset overview

In [94]:
# Read every column of the prediction file and write an HTML profiling report for it.
full_data = pd.read_csv("../data/test3.csv")
full_report = pp.ProfileReport(full_data)
full_report.to_file(outputfile="./profile-test-full.html")

### Load Dataset
- Manually select features based on previous report analysis
- Load data, both to_predict and training datasets

In [95]:
feature_cols = ["NU_NOTA_CN", "NU_NOTA_CH", "NU_NOTA_LC", "NU_NOTA_REDACAO"]
mt_cols = ["CO_PROVA_MT", "TX_RESPOSTAS_MT"]

# The training file is read with the same columns plus the answer key (TX_GABARITO_MT).
predict_columns = ["NU_INSCRICAO"] + feature_cols + mt_cols
train_columns = ["NU_INSCRICAO", "TX_GABARITO_MT"] + feature_cols + mt_cols

data_to_predict = pd.read_csv("../data/test3.csv", usecols=predict_columns)
# pp.ProfileReport(data_to_predict).to_file(outputfile="./profile-test.html")

# Only the training data has rows with missing values removed.
data_train = pd.read_csv("../data/train.csv", usecols=train_columns)
data_train = data_train.dropna()
# pp.ProfileReport(data_train).to_file(outputfile="./profile-train.html")

In [96]:
# Display the stored answer string of the row labelled 0 to see what the column looks like.
data_to_predict.TX_RESPOSTAS_MT[0]

'BCEDADCCAECEABABDCEBABEDAAECCDDBDBABDADB'

### Random Strategy

In [97]:
options = ['A', 'B', 'C', 'D', 'E']


def generate_answer():
    """Return a 5-character string of letters drawn uniformly, with replacement, from ``options``."""
    return ''.join(np.random.choice(options, size=5, replace=True))
# One independent random guess per row; the stored answer string itself is ignored.
random_answers = data_to_predict.TX_RESPOSTAS_MT.apply(lambda _previous: generate_answer())
with_random_answers = data_to_predict.assign(TX_RESPOSTAS_MT=random_answers)
result = with_random_answers[["NU_INSCRICAO", "TX_RESPOSTAS_MT"]]

In [98]:
# Rows whose stored answers are nothing but 'C' get 'CCCCC' as their prediction.
answered_only_c = data_to_predict.TX_RESPOSTAS_MT == 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC'
random_guys = data_to_predict[answered_only_c].NU_INSCRICAO
is_random_guy = result["NU_INSCRICAO"].isin(random_guys)
result["TX_RESPOSTAS_MT"] = np.where(is_random_guy, 'CCCCC', result["TX_RESPOSTAS_MT"])

### Random, using each candidate's previous answer frequencies as probabilities

In [99]:
options = ['A', 'B', 'C', 'D', 'E']


def generate_prob_answer(prev_answers):
    """Draw five answers, weighting each option by how often it occurs in ``prev_answers``.

    Every option gets one extra pseudo-count, so an option that never occurs
    still has a non-zero probability.

    Parameters
    ----------
    prev_answers : str
        Previously given answers, one character per question.

    Returns
    -------
    str
        Five letters from ``options``.

    Notes
    -----
    * If ``prev_answers`` contains ``'*'`` the draw is uniform.
    * Characters that are neither an option nor ``'*'`` are ignored when
      counting. Previously such a character produced a probability vector
      longer than ``options`` and ``np.random.choice`` raised ``ValueError``.
    * A non-string value (for example a missing value read as NaN) also falls
      back to the uniform draw instead of raising.
    """
    if not isinstance(prev_answers, str) or '*' in prev_answers:
        return ''.join(np.random.choice(options, 5, replace=True))

    # Count only the valid options, in the same order as `options`, so the
    # probability vector always lines up with the choices.
    counts = np.array([prev_answers.count(letter) + 1 for letter in options])
    probs = counts / counts.sum()
    return ''.join(np.random.choice(options, 5, p=probs, replace=True))
# Each row's prediction is drawn from the letter frequencies of its own stored answers.
random_answers = data_to_predict.TX_RESPOSTAS_MT.apply(generate_prob_answer)
with_weighted_answers = data_to_predict.assign(TX_RESPOSTAS_MT=random_answers)
result = with_weighted_answers[["NU_INSCRICAO", "TX_RESPOSTAS_MT"]]

### Use the gabarito

In [100]:
# `indicies` holds the first position at which each distinct exam code occurs in the training data.
unique, indicies = np.unique(data_train.CO_PROVA_MT, return_index=True)
gabarito_values = data_train.TX_GABARITO_MT.values
# The answer key stored at that first occurrence is taken as the key for the code.
gabaritos = np.array([gabarito_values[first_row] for first_row in indicies])
dict_mt_answers = dict(zip(unique, gabaritos))
# Codes without an entry in the dictionary are left unchanged by `replace`.
data_to_predict['TX_GABARITO_MT'] = data_to_predict.CO_PROVA_MT.replace(dict_mt_answers)

#### Assume everybody answered the last 5 questions correctly

In [101]:
# Predict the last five characters of the answer key for every row.
last_five_of_key = data_to_predict.TX_GABARITO_MT.apply(lambda ans: ans[-5:])
result = data_to_predict.assign(TX_RESPOSTAS_MT=last_five_of_key)[["NU_INSCRICAO", "TX_RESPOSTAS_MT"]]

In [102]:
# Rows whose stored answers are nothing but 'C' get 'CCCCC' as their prediction.
answered_only_c = data_to_predict.TX_RESPOSTAS_MT == 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC'
random_guys = data_to_predict[answered_only_c].NU_INSCRICAO
is_random_guy = result["NU_INSCRICAO"].isin(random_guys)
result["TX_RESPOSTAS_MT"] = np.where(is_random_guy, 'CCCCC', result["TX_RESPOSTAS_MT"])

#### Consider previous right answers

In [103]:
def levenshtein(seq1, seq2):
    """Return the edit distance between ``seq1`` and ``seq2``.

    Insertions, deletions and substitutions each cost 1. The distance is read
    from a float matrix, so the result is a NumPy float.
    """
    n_rows = len(seq1) + 1
    n_cols = len(seq2) + 1
    dist = np.zeros((n_rows, n_cols))
    # Turning a prefix into the empty sequence (or back) costs its length.
    dist[:, 0] = np.arange(n_rows)
    dist[0, :] = np.arange(n_cols)

    for i in range(1, n_rows):
        for j in range(1, n_cols):
            substitution_cost = 0 if seq1[i-1] == seq2[j-1] else 1
            dist[i, j] = min(
                dist[i-1, j] + 1,                      # deletion
                dist[i-1, j-1] + substitution_cost,    # match or substitution
                dist[i, j-1] + 1,                      # insertion
            )
    return dist[n_rows - 1, n_cols - 1]

In [104]:
total_answered = 40
# SCORE = 1 - (edit distance between the stored answers and the answer key) / total_answered.
# The values are collected in a list and wrapped once in a float Series that
# shares the frame's index. This replaces growing a dtype-less empty Series
# one element at a time, which triggers a pandas deprecation warning on the
# empty constructor and re-allocates on every assignment.
score = pd.Series(
    [
        1 - levenshtein(given, key)/total_answered
        for given, key in zip(data_to_predict['TX_RESPOSTAS_MT'], data_to_predict['TX_GABARITO_MT'])
    ],
    index=data_to_predict.index,
    dtype=float,
)
data_to_predict['SCORE'] = score

In [105]:
options = ['A', 'B', 'C', 'D', 'E']


def toss(prob):
    """Return True with probability ``prob`` and False otherwise."""
    return np.random.choice([True, False], p=[prob, 1-prob])


def generate_prob_right_answer(row):
    """Build a 5-letter prediction for one row.

    Parameters
    ----------
    row : mapping
        Must provide ``'TX_RESPOSTAS_MT'`` (stored answers), ``'TX_GABARITO_MT'``
        (answer key) and ``'SCORE'``.

    Returns
    -------
    str
        The last five letters of the answer key when ``SCORE`` is above 0.2.
        Otherwise five letters drawn from ``options``, weighted by how often
        each option occurs in the stored answers (plus one pseudo-count each),
        or uniformly when the stored answers contain ``'*'``.

    Notes
    -----
    Characters that are neither an option nor ``'*'`` are ignored when
    counting. Previously such a character made the probability vector longer
    than ``options`` and ``np.random.choice`` raised ``ValueError``.
    """
    last_five_gabarito = row['TX_GABARITO_MT'][-5:]
    # The score does not change between positions, so decide once.
    if row['SCORE'] > 0.2:
        # Explicit indexing keeps the failure loud if the key has fewer than five characters.
        return ''.join(last_five_gabarito[i] for i in range(5))

    prev_answers = list(row['TX_RESPOSTAS_MT'])
    if '*' in prev_answers:
        probs = np.ones(5)/5
    else:
        # Count only valid options, in the order of `options`, so that
        # `probs` always lines up with the choices.
        counts = np.array([prev_answers.count(letter) + 1 for letter in options])
        probs = counts/np.sum(counts)
    return ''.join(np.random.choice(options, 5, p=probs))

# Build all predictions in one pass and wrap them in a Series that shares the
# frame's index. This replaces growing a dtype-less empty Series one element
# at a time.
answers = pd.Series(
    [generate_prob_right_answer(row) for _, row in data_to_predict.iterrows()],
    index=data_to_predict.index,
    dtype=object,
)
result = (
    data_to_predict
    .assign(TX_RESPOSTAS_MT=answers)
    [["NU_INSCRICAO", "TX_RESPOSTAS_MT"]]
)
result.head()

Unnamed: 0,NU_INSCRICAO,TX_RESPOSTAS_MT
0,060edb14439f3a7b0e736493d8dc5a45de16ed51,ADCEE
1,63929f181ec794c4a94176b61f0d1def6f4799fe,EACBA
2,11c89c0cb5dc38aa74b242f702bf9df1edc23b5e,BAEBC
3,297931c1902ee1c5cd7bf2ee16b148dedd8e3a9f,EACBA
4,14126c55c1c250f0d769526cf4d5383add31873d,ADCEE


In [106]:
# Rows whose stored answers are nothing but 'C' get 'CCCCC' as their prediction.
answered_only_c = data_to_predict.TX_RESPOSTAS_MT == 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC'
random_guys = data_to_predict[answered_only_c].NU_INSCRICAO
is_random_guy = result["NU_INSCRICAO"].isin(random_guys)
result["TX_RESPOSTAS_MT"] = np.where(is_random_guy, 'CCCCC', result["TX_RESPOSTAS_MT"])

### Use the most frequent answers, separated by exam code, from the train data

In [123]:
codes, indicies = np.unique(data_train.CO_PROVA_MT, return_index=True)

# ans0..ans4 hold the fifth-from-last through the last character of each training answer string.
tail_columns = {}
for position in range(5):
    offset = position - 5
    tail_columns["ans{}".format(position)] = data_train.TX_RESPOSTAS_MT.apply(
        lambda x, offset=offset: x[offset]
    )
data_train = data_train.assign(**tail_columns)

# For each exam code, join the most frequent value of each of the five columns into one string.
dict_most_freq_5 = {}
for code in codes:
    rows_for_code = data_train[data_train.CO_PROVA_MT == code]
    dict_most_freq_5[code] = ''.join(
        rows_for_code["ans{}".format(i)].value_counts().index[0]
        for i in range(5)
    )

# Codes without an entry in the dictionary are left unchanged by `replace`.
predicted_by_code = data_to_predict.CO_PROVA_MT.replace(dict_most_freq_5)
result = data_to_predict.assign(TX_RESPOSTAS_MT=predicted_by_code)[["NU_INSCRICAO", "TX_RESPOSTAS_MT"]]
result.head()

Unnamed: 0,NU_INSCRICAO,TX_RESPOSTAS_MT
0,060edb14439f3a7b0e736493d8dc5a45de16ed51,ADCAE
1,63929f181ec794c4a94176b61f0d1def6f4799fe,BCCBA
2,11c89c0cb5dc38aa74b242f702bf9df1edc23b5e,CAABA
3,297931c1902ee1c5cd7bf2ee16b148dedd8e3a9f,BCCBA
4,14126c55c1c250f0d769526cf4d5383add31873d,ADCAE


In [108]:
# Rows whose stored answers are nothing but 'C' get 'CCCCC' as their prediction.
answered_only_c = data_to_predict.TX_RESPOSTAS_MT == 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC'
random_guys = data_to_predict[answered_only_c].NU_INSCRICAO
is_random_guy = result["NU_INSCRICAO"].isin(random_guys)
result["TX_RESPOSTAS_MT"] = np.where(is_random_guy, 'CCCCC', result["TX_RESPOSTAS_MT"])

### Use the frequency distribution of answers, separated by exam code, from the train data

In [130]:
options = ['A', 'B', 'C', 'D', 'E']
codes, indicies = np.unique(data_train.CO_PROVA_MT, return_index=True)
# ans0..ans4 hold the fifth-from-last through the last character of each training answer string.
data_train = (
    data_train
    .assign(ans0=lambda df: df.TX_RESPOSTAS_MT.apply(lambda x: x[-5]))
    .assign(ans1=lambda df: df.TX_RESPOSTAS_MT.apply(lambda x: x[-4]))
    .assign(ans2=lambda df: df.TX_RESPOSTAS_MT.apply(lambda x: x[-3]))
    .assign(ans3=lambda df: df.TX_RESPOSTAS_MT.apply(lambda x: x[-2]))
    .assign(ans4=lambda df: df.TX_RESPOSTAS_MT.apply(lambda x: x[-1]))
)
# Maps each exam code to a (5, 5) array: one row per position, one column per
# option (in the order of `options`), each row summing to 1.
dict_dist_freq_5 = {}
for code in codes:
    data = data_train[data_train.CO_PROVA_MT == code]
    dist_freq_5 = np.empty((0, 5))
    for i in range(5):
        column = data["ans{}".format(i)]
        # Pick the imputation value among valid options only. If the overall
        # most frequent value were not an option, imputing with it would leave
        # six distinct values and the row could not be appended to the
        # five-column array.
        valid = column[column.isin(options)]
        if len(valid) > 0:
            most_freq = valid.value_counts().index[0]
            # Appending `options` gives every option one pseudo-count.
            ans_data = np.append(column, options)
            clean_data = np.where(np.isin(ans_data, options), ans_data, most_freq)
        else:
            # No valid answer at this position: only the pseudo-counts remain (uniform).
            clean_data = np.array(options)
        unique, dist_freq = np.unique(clean_data, return_counts=True)
        dist_freq = dist_freq/sum(dist_freq)
        dist_freq_5 = np.append(dist_freq_5, [dist_freq], axis=0)
    dict_dist_freq_5[code] = dist_freq_5

def generate_prob_answer_from_code(code):
    """Draw one letter per row of ``dict_dist_freq_5[code]`` and join them.

    Each row of the stored array is used as the probability vector over
    ``options`` for one position.
    """
    picks = [
        np.random.choice(options, 1, p=weights)[0]
        for weights in dict_dist_freq_5[code]
    ]
    return ''.join(picks)
    
# Draw a fresh answer for every row from the distributions stored for that row's exam code.
sampled_answers = data_to_predict.CO_PROVA_MT.apply(generate_prob_answer_from_code)
result = data_to_predict.assign(TX_RESPOSTAS_MT=sampled_answers)[["NU_INSCRICAO", "TX_RESPOSTAS_MT"]]

# Rows whose stored answers are nothing but 'C' get 'CCCCC' as their prediction.
answered_only_c = data_to_predict.TX_RESPOSTAS_MT == 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC'
random_guys = data_to_predict[answered_only_c].NU_INSCRICAO
is_random_guy = result["NU_INSCRICAO"].isin(random_guys)
result["TX_RESPOSTAS_MT"] = np.where(is_random_guy, 'CCCCC', result["TX_RESPOSTAS_MT"])

### Gabarito + Most frequent answer

In [177]:
def generate_prob_right_answer2(row):
    """Build a 5-letter prediction for one row.

    Returns the last five letters of the row's answer key when ``SCORE`` is
    above 0.46; otherwise the five letters stored in ``dict_most_freq_5`` for
    the row's exam code.
    """
    most_freqs = dict_most_freq_5[row['CO_PROVA_MT']]
    last_five_gabarito = row['TX_GABARITO_MT'][-5:]
    use_gabarito = row['SCORE'] > 0.46

    letters = []
    for position in range(5):
        # The key is indexed at every position, whichever source ends up being used.
        right_answer = last_five_gabarito[position]
        letters.append(right_answer if use_gabarito else most_freqs[position])
    return ''.join(letters)

# Build `answers` from scratch for this strategy. Writing into a Series left
# over from an earlier cell only works if that cell has already run, and it
# would keep stale entries for any index not overwritten here.
answers = pd.Series(
    [generate_prob_right_answer2(row) for _, row in data_to_predict.iterrows()],
    index=data_to_predict.index,
    dtype=object,
)
result = (
    data_to_predict
    .assign(TX_RESPOSTAS_MT=answers)
    [["NU_INSCRICAO", "TX_RESPOSTAS_MT"]]
)
# Rows whose stored answers are nothing but 'C' get 'CCCCC' as their prediction.
random_guys = data_to_predict[data_to_predict.TX_RESPOSTAS_MT == 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC'].NU_INSCRICAO
result["TX_RESPOSTAS_MT"] = np.where(result["NU_INSCRICAO"].isin(random_guys), 'CCCCC', result["TX_RESPOSTAS_MT"])

### Generate Submission

In [178]:
# One {"NU_INSCRICAO": ..., "TX_RESPOSTAS_MT": ...} record per row of `result`.
answer = result.to_dict("records")
submission = {
    # The API token is a credential and must not live in the notebook: set the
    # CODENATION_TOKEN environment variable before running this cell
    # (a missing variable raises KeyError).
    "token": os.environ["CODENATION_TOKEN"],
    # CODENATION_EMAIL overrides the default account e-mail when set.
    "email": os.environ.get("CODENATION_EMAIL", "igor.a.r.y@gmail.com"),
    "answer": answer
}
# NOTE: submission.json contains the token; keep it out of version control.
with open("submission.json", "w") as fp:
    json.dump(submission, fp)

### Send a POST request to the Codenation API

In [179]:
url = 'https://api.codenation.com.br/v1/user/acceleration/data-science/challenge/enem-3/submit'
# Without a timeout, requests waits indefinitely if the server never answers.
r = requests.post(url, json=submission, timeout=30)
# Fail loudly on an HTTP error status instead of trying to parse an error page as JSON.
r.raise_for_status()
r.json()

{'score': 29.95}