In [35]:
import pandas as pd
import re

# Modalities A and B
Modalities A and B are structured very similarly, so it makes sense to process them the same way. Modality C requires a few adjustments, so we'll deal with it later.

<b>Objectives</b>
- Retrieve the number of right answers per question
- Retrieve the number of candidates for each score

In [563]:
# Load the full results CSVs; the files have no header row, so the
# column names ('code', 'score') are supplied explicitly.
mod_a_results = pd.read_csv(
    './data/mod_a.csv',
    names=['code', 'score'],
    index_col=False,
)
mod_b_results = pd.read_csv(
    './data/mod_b.csv',
    names=['code', 'score'],
    index_col=False,
)

# Number of candidates for each modality (one row per candidate)
total_mod_A = mod_a_results.shape[0]
total_mod_B = mod_b_results.shape[0]

140


In [382]:
# Reading individual rooms results

def _read_text(path):
    """Return the whole content of the UTF-8 text file at `path`.

    The file is opened in a `with` block so its handle is closed as soon as
    the content has been read, instead of being left open.
    """
    with open(path, encoding='utf-8') as handle:
        return handle.read()


# Result files of each room, grouped by modality
room_paths = {
    'A': [
        './data/mod_a_sala_4.txt',
        './data/mod_a_sala_5.txt',
        './data/mod_a_sala_6.txt',
        './data/mod_a_sala_7.txt',
    ],
    'B': [
        './data/mod_b_sala_11.txt',
        './data/mod_b_sala_12.txt',
        './data/mod_b_sala_13.txt',
        './data/mod_b_sala_14.txt',
        './data/mod_b_sala_15.txt',
        './data/mod_b_extra.txt',
    ],
}

# Maps each modality to the list of raw texts of its rooms (same order as above)
files = {
    modality: [_read_text(path) for path in paths]
    for modality, paths in room_paths.items()
}

In [383]:
# Using regex to retrieve only the answer list

# Pattern pieces (raw string, so the backslashes reach `re` untouched):
#   .*?  - Anything (non-greedy) until the next part of the pattern matches
#   \d*  - Any run of digits (e.g.: 1, 5, 20), indicating the question number
#   \.   - The dot character between the question number and the option marked
#   \w   - One word character (letter, digit or underscore): the option marked
#   \+*  - Zero or more plus signs; a plus indicates a right answer
answer_pattern = re.compile(r'.*?(\d*\.\w\+*)')

answers = {'A': [], 'B': []}
for modality in files:
    # `files[modality]` holds the raw text of every room of this modality
    for room in files[modality]:

        # Removing line breaks and spaces for an easier regex pattern
        string = room.replace('\n', '').replace(' ', '')

        # findall() returns only the captured group of each match
        matches = answer_pattern.findall(string)

        # extend() instead of append() in order to add each individual element to the list
        answers[modality].extend(matches)

In [510]:
# Checking which answers are right ones based on whether or not they have a plus sign beside them

# Two groups: the question number and (when present) the plus sign
_right_answer_pattern = re.compile(r'(\d*)\.\w(\+)*')


def _count_right_answers(answer_list):
    """Count the right answers per question.

    Parameters
    ----------
    answer_list : iterable of str
        Entries of the form '<number>.<option>', followed by '+' when the
        answer is right.

    Returns
    -------
    list of int
        Position ``i`` holds the number of right answers for question
        ``i + 1``. The list only grows up to the highest question number
        that received a right answer.
    """
    counts = []
    for answer in answer_list:
        matched = _right_answer_pattern.match(answer)
        if matched is None:
            continue

        number = matched.group(1)
        is_right = matched.group(2)

        # Skip entries without a question number or without the plus sign
        if not (number and is_right):
            continue

        question_number = int(number)
        if question_number < 1:
            # Question 0 would index position -1 (the last question)
            continue

        # Pad with zeros up to this question, so that a question with no
        # right answers so far does not shift later counts to the wrong slot
        while len(counts) < question_number:
            counts.append(0)
        counts[question_number - 1] += 1

    return counts


right_answers = {'A': [], 'B': []}
for modality in answers:
    right_answers[modality] = _count_right_answers(answers[modality])
                

In [547]:
# Organizing Data Frame, adding some new columns and exporting to CSV format

questions = pd.DataFrame(data=right_answers)

# Questions are numbered from 1; the index length follows the data instead of
# being hard-coded
questions.index = pd.Index(range(1, len(questions) + 1), name='Question')

# Percentage of the modality's candidates that got each question right.
# Column-wise arithmetic selects columns by label, avoiding the deprecated
# positional `x[0]` / `x[1]` lookups on label-indexed rows.
questions['%A'] = (questions['A'] / total_mod_A * 100).round(1)
questions['%B'] = (questions['B'] / total_mod_B * 100).round(1)

# Right answers relative to the most-answered question of the modality
questions['normalized_A'] = (questions['A'] / questions['A'].max()).round(1)
questions['normalized_B'] = (questions['B'] / questions['B'].max()).round(1)

# One table per modality, with modality-independent column names
mod_a = questions[['A', '%A', 'normalized_A']].copy()
mod_a = mod_a.rename(columns={'A': 'Total', '%A': '%', 'normalized_A': 'Normalized'})

mod_b = questions[['B', '%B', 'normalized_B']].copy()
mod_b = mod_b.rename(columns={'B': 'Total', '%B': '%', 'normalized_B': 'Normalized'})

mod_a.to_csv('./outputs/questions_mod_a.csv')
mod_b.to_csv('./outputs/questions_mod_b.csv')

In [557]:
# Load the Modality A results CSV; the column names ('code', 'score') are
# supplied explicitly.
mod_a_results = pd.read_csv(
    './data/mod_a.csv',
    index_col=False,
    names=['code', 'score'],
)

0       142431-G
1       630009-J
2       015216-G
3       707364-I
4       308075-E
5       238953-B
6       281544-F
7       721849-C
8       561787-F
9       566313-F
10      174382-G
11      274875-E
12      452122-H
13      249160-D
14      111261-D
15      238168-J
16      966975-D
17      801279-I
18      447908-D
19      743502-C
20      591426-I
21      248445-I
22      321182-I
23      020386-A
24      626487-E
25      874144-J
26      881182-J
27      514251-J
28      917637-E
29      521167-D
         ...    
110     860821-G
111     996087-A
112     469783-I
113     292816-J
114     444761-H
115     099332-H
116     534724-G
117     050471-I
118     790503-F
119     506710-A
120     927366-E
121     126944-H
122     814000-E
123     120795-F
124     570445-G
125     585826-F
126     789377-C
127     213199-G
128     479888-F
129     626919-E
130     079942-C
131     876855-A
132     490197-B
133     343254-C
134     848347-F
135     051758-H
136     293932-J
137     608839