In [35]:
import pandas as pd
import re

# Modalities A and B
Modalities A and B are structured very similarly, so it makes sense to process them together. Modality C needs a few adjustments, so we'll deal with it later.

<b>Objectives</b>
- Retrieve the number of right answers per question
- Retrieve the number of candidates for each score

In [578]:
# Full results CSVs, keyed by modality. The files are read with explicit
# column names ('code', 'score'), so their first line is treated as data.
total = {
    'A': pd.read_csv('./data/mod_a.csv', names=['code', 'score'], index_col=False),
    'B': pd.read_csv('./data/mod_b.csv', names=['code', 'score'], index_col=False),
}
mod_a_results, mod_b_results = total['A'], total['B']

# Number of candidates for each modality (one row per candidate)
total_mod_A = mod_a_results.shape[0]
total_mod_B = mod_b_results.shape[0]

In [566]:
# Reading individual rooms results

def _read_text(path):
    """Return the whole content of the UTF-8 text file at `path`.

    The file is opened in a `with` block so its handle is closed as soon as
    the content has been read, instead of being left open until garbage
    collection.
    """
    with open(path, encoding='utf-8') as handle:
        return handle.read()


# Paths of the per-room result sheets, grouped by modality
room_paths = {
    'A': [
        './data/mod_a_sala_4.txt',
        './data/mod_a_sala_5.txt',
        './data/mod_a_sala_6.txt',
        './data/mod_a_sala_7.txt',
    ],
    'B': [
        './data/mod_b_sala_11.txt',
        './data/mod_b_sala_12.txt',
        './data/mod_b_sala_13.txt',
        './data/mod_b_sala_14.txt',
        './data/mod_b_sala_15.txt',
        './data/mod_b_extra.txt',
    ],
}

# Same shape as before: modality -> list with the raw text of each room file
files = {
    modality: [_read_text(path) for path in paths]
    for modality, paths in room_paths.items()
}

In [567]:
# Using regex to retrieve only the answer list

# Pattern for one marked answer, applied after newlines and spaces are removed:
#   .*? - Anything (lazily) until the next pattern is found
#   \d* - Any run of digits (e.g.: 1, 5, 20), indicating the question number
#   \.  - The dot character between the question number and the option marked
#   \w  - A single word character, indicating the option that the student marked
#   \+* - The plus sign, indicating it's the correct answer (asterisk because it might not be a right answer)
# Written as a raw string so the backslashes reach the regex engine untouched
# and Python does not warn about invalid escape sequences.
answer_pattern = re.compile(r'.*?(\d*\.\w\+*)')

answers = {'A': [], 'B': []}
for modality in files:
    # Index `files` itself: the previously used name `modalities` is not
    # defined in any visible cell, so the loop failed on a fresh kernel.
    for room in files[modality]:

        # Removing line breaks and spaces for an easier regex pattern
        string = room.replace('\n', '').replace(' ', '')

        # extend() instead of append() in order to add each individual element to the list
        answers[modality].extend(answer_pattern.findall(string))

In [568]:
# Counting, per question, how many marked answers are right ones, based on
# whether or not the answer carries a plus sign beside it.
# right_answers[modality][k] holds the count for question k + 1.

# Matches two groups: the question number and the plus sign
right_answer_pattern = re.compile(r'(\d*)\.\w(\+)*')

right_answers = {'A': [], 'B': []}
for modality in answers:
    counts = right_answers[modality]
    for answer in answers[modality]:

        matches = right_answer_pattern.match(answer)
        if matches is None:
            # Not shaped like "<number>.<option>[+]"; nothing to count
            continue
        number = matches.group(1)
        is_right = matches.group(2)

        # Only entries that have a question number AND are flagged as right count
        if not (number and is_right):
            continue
        question_number = int(number)
        if question_number < 1:
            # Index question_number - 1 would wrap around to the end of the list
            continue

        # Pad with zeros up to this question's slot before counting. Appending a
        # single 1 (as was done before) put the count in the next free slot, so
        # any question skipped earlier shifted all later counts to wrong questions.
        if len(counts) < question_number:
            counts.extend([0] * (question_number - len(counts)))
        counts[question_number - 1] += 1
                

In [570]:
# Organizing Data Frame, adding some new columns and exporting to CSV format

questions = pd.DataFrame(data=right_answers)

# Questions are numbered from 1; derive the index from the data instead of
# hard-coding the number of questions.
questions.index = pd.RangeIndex(1, len(questions) + 1, name='Question')

# Column-wise arithmetic addressed by label. The previous row-wise apply used
# x[0] / x[1], i.e. positional integer access on a label-indexed Series, which
# pandas has deprecated.
candidates = {'A': total_mod_A, 'B': total_mod_B}

# Share of the modality's candidates that got each question right, in percent
for modality, n_candidates in candidates.items():
    questions['%' + modality] = (questions[modality] / n_candidates * 100).round(1)

# Count scaled by the modality's most-answered question, so the maximum is 1.0
for modality in candidates:
    questions['normalized_' + modality] = (
        questions[modality] / questions[modality].max()
    ).round(1)

# One table per modality, with the same generic column names in both
mod_a = questions[['A', '%A', 'normalized_A']].copy()
mod_a = mod_a.rename(columns={'A': 'Total', '%A': '%', 'normalized_A': 'Normalized'})

mod_b = questions[['B', '%B', 'normalized_B']].copy()
mod_b = mod_b.rename(columns={'B': 'Total', '%B': '%', 'normalized_B': 'Normalized'})

mod_a.to_csv('./outputs/questions_mod_a.csv')
mod_b.to_csv('./outputs/questions_mod_b.csv')

In [611]:
# Preparing the scores DataFrame: for every score from 0 to 20, the number of
# candidates of each modality whose 'score' column equals that value.

scores = pd.DataFrame({
    'A': [int((mod_a_results.score == value).sum()) for value in range(21)],
    'B': [int((mod_b_results.score == value).sum()) for value in range(21)],
}).set_index([list(range(0,21))])

In [616]:
# Adding percentage and normalized columns to the scores DataFrame and
# splitting it into one table per modality.
scores.index.name = 'Score'

# Column-wise arithmetic addressed by label. The previous row-wise apply used
# x[0] / x[1], i.e. positional integer access on a label-indexed Series, which
# pandas has deprecated.
candidates = {'A': total_mod_A, 'B': total_mod_B}

# Share of the modality's candidates that obtained each score, in percent
for modality, n_candidates in candidates.items():
    scores['%' + modality] = (scores[modality] / n_candidates * 100).round(2)

# Count scaled by the modality's most frequent score, so the maximum is 1.0
for modality in candidates:
    scores['normalized_' + modality] = (
        scores[modality] / scores[modality].max()
    ).round(2)

# One table per modality, with the same generic column names in both
mod_a = scores[['A', '%A', 'normalized_A']].copy()
mod_a = mod_a.rename(columns={'A': 'Total', '%A': '%', 'normalized_A': 'Normalized'})

mod_b = scores[['B', '%B', 'normalized_B']].copy()
mod_b = mod_b.rename(columns={'B': 'Total', '%B': '%', 'normalized_B': 'Normalized'})

mod_a

# mod_a.to_csv('./outputs/scores_mod_a.csv')
# mod_b.to_csv('./outputs/scores_mod_b.csv')

Unnamed: 0_level_0,Total,%,Normalized
Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0.0,0.0
1,0,0.0,0.0
2,1,0.71,0.06
3,4,2.86,0.22
4,7,5.0,0.39
5,5,3.57,0.28
6,2,1.43,0.11
7,6,4.29,0.33
8,11,7.86,0.61
9,16,11.43,0.89
