In [1]:
import pandas as pd
import numpy as np

# Financial Cognitive questions

In [2]:
# Read the raw cognitive-item CSV in full; columns that are not needed are dropped in a later cell.
df_cog = pd.read_csv("./data/0_raw/cy07_msu_flt_cog.csv")

In [3]:
# (rows, columns) of the freshly loaded frame
df_cog.shape

(107174, 2103)

In [4]:
# Financial math questions: one question (column) id per line of the reference file.
with open("./data/reference/finance_math_ids.txt") as math_file:
    # Strip surrounding whitespace and skip blank lines so that an empty
    # string is never treated as a question id.
    fi_math_question_ids = [line.strip() for line in math_file if line.strip()]

In [5]:
# Financial reading questions: one question (column) id per line of the reference file.
with open("./data/reference/finance_reading_ids.txt") as reading_file:
    # Strip surrounding whitespace and skip blank lines so that an empty
    # string is never treated as a question id.
    fi_reading_question_ids = [line.strip() for line in reading_file if line.strip()]

In [6]:
# Financial literacy and information technology questions: one question
# (column) id per line of the reference file.
with open("./data/reference/finance_it_ids.txt") as fiit_file:
    # Strip surrounding whitespace and skip blank lines so that an empty
    # string is never treated as a question id.
    fi_it_question_ids = [line.strip() for line in fiit_file if line.strip()]

In [7]:
# Keep only the question columns plus the extra columns listed in useful_cols;
# every other column is dropped in place to free memory.
cog_cols = [*fi_math_question_ids, *fi_reading_question_ids, *fi_it_question_ids]
useful_cols = ["CNT", "CNTRYID", "CNTSCHID", "CNTSTUID", "STRATUM", "LANGTEST_COG"]
df_cog.drop(
    columns=df_cog.columns[~df_cog.columns.isin(cog_cols + useful_cols)],
    inplace=True,
)
df_cog.shape

(107174, 441)

In [8]:
def calculate_score_for_columns(row, question_ids):
    """Score one row over the given question columns.

    Each non-NaN value is counted as an answered question. The value is
    truncated to an integer and inspected as text:

    * exactly "1", or any code starting with "2" -> 10 points (valid answer)
    * any other code starting with "1"           -> 5 points (partial answer)
    * anything else                              -> 0 points

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Indexed by question id; values are numeric codes or NaN.
    question_ids : iterable of str
        Column ids of the questions to score.

    Returns
    -------
    tuple
        ``(total_score, answered_count)`` where ``total_score`` starts as a
        float and ``answered_count`` is an int.
    """
    total = 0.0
    answered = 0

    for qid in question_ids:
        raw = row[qid]
        if np.isnan(raw):
            # Missing value: the question is not counted as answered.
            continue

        code = str(int(raw))
        answered += 1

        leading_digit = code[:1]
        if code == "1" or leading_digit == "2":
            total += 10
        elif leading_digit == "1":
            total += 5  # partial answer, half points
    return total, answered

In [9]:
def calculate_test_score(row):
    """Compute the score and answered count for each question group of a row.

    The row is scored against the math, reading and IT question id lists
    (in that order) with ``calculate_score_for_columns``.

    Returns
    -------
    pd.Series
        Six values: math score, math answered count, reading score, reading
        answered count, IT score, IT answered count.
    """
    results = []
    for group_ids in (fi_math_question_ids, fi_reading_question_ids, fi_it_question_ids):
        # Each call yields a (score, answered_count) pair; keep them flat and in order.
        results.extend(calculate_score_for_columns(row, group_ids))
    return pd.Series(results)

In [10]:
# Calculate scores and answer counts, one row at a time; the six values
# returned per row are stored in the new columns named below, in order.
score_columns = [
    "fi_math_score",
    "fi_math_answered",
    "fi_reading_score",
    "fi_reading_answered",
    "fi_it_score",
    "fi_it_answered",
]
df_cog[score_columns] = df_cog.apply(calculate_test_score, axis=1)

In [11]:
# Final column set: the carried-over columns followed by the computed score columns
columns_to_keep = [*useful_cols, *score_columns]

In [12]:
# Retain only the columns named in columns_to_keep; all others are removed in place.
df_cog.drop(
    columns=df_cog.columns[~df_cog.columns.isin(columns_to_keep)],
    inplace=True,
)

In [13]:
# Spot-check five randomly chosen rows (no random_state is set, so the selection varies between runs)
df_cog.sample(5)

Unnamed: 0,CNTRYID,CNT,CNTSCHID,CNTSTUID,STRATUM,LANGTEST_COG,fi_math_score,fi_math_answered,fi_reading_score,fi_reading_answered,fi_it_score,fi_it_answered
63653,528.0,NLD,52800148.0,52856611.0,NLD0009,322.0,170.0,18.0,420.0,52.0,0.0,0.0
61724,528.0,NLD,52800096.0,52851755.0,NLD0003,322.0,150.0,24.0,320.0,59.0,0.0,0.0
67846,604.0,PER,60400117.0,60456890.0,PER0102,156.0,50.0,21.0,260.0,55.0,0.0,0.0
46373,380.0,ITA,38000285.0,38054268.0,ITA9797,200.0,20.0,16.0,260.0,48.0,0.0,0.0
99188,840.0,USA,84000093.0,84050811.0,USA0206,313.0,0.0,0.0,375.0,59.0,160.0,27.0


In [14]:
# Persist the scored frame as CSV; the DataFrame index is not written.
df_cog.to_csv(
    "./data/1_processed/financial_cognitive_scores.csv",
    index=False,
)

# Financial Questionnaire

- It appears that the financial questionnaire `cy07_msu_flt_qqq` is already included in the student questionnaire `cy07_msu_stu_qqq`, which is processed in `01_data_gathering_std`.