In [1]:
import pandas as pd
import numpy as np
import json

# Expore datasets

## School and Teacher Questionnaires

In [2]:
# columns I find interesting
column_lookup = {}
with open("./data/reference/column_lookup.txt") as text_file:
    for line in text_file:
        item = json.loads(line)
        column_lookup[item['column']] = item['meaning']

### Load teacher dataset

In [3]:
df_tch = pd.read_csv("./data/1_processed/teacher_questionnaire.csv")

In [4]:
df_tch.head(3)

Unnamed: 0,CNTRYID,CNT,CNTSCHID,CNTTCHID,TEACHERID,STRATUM,TC001Q01NA,TC002Q01NA,TC005Q01NA,TC007Q01NA,...,TC198Q07HA,TC198Q08HA,TC198Q09HA,TC198Q10HA,TC186Q01HA,TCSTAFFSHORT,TCEDUSHORT,SATJOB,SATTEACH,SEFFREL
0,8.0,ALB,800057.0,800001.0,5.0,ALB0203,1.0,40.0,1.0,10.0,...,3.0,3.0,4.0,4.0,1.0,-0.0939,0.8218,0.4224,0.3534,
1,8.0,ALB,800121.0,800002.0,5.0,ALB0107,2.0,26.0,1.0,5.0,...,3.0,3.0,3.0,3.0,1.0,-0.1418,-0.3556,-0.4864,-1.1557,
2,8.0,ALB,800140.0,800003.0,5.0,ALB0101,2.0,38.0,1.0,11.0,...,3.0,3.0,4.0,4.0,1.0,-1.2438,0.1843,0.9035,0.2613,


In [5]:
df_tch.shape

(107367, 24)

In [6]:
for column in df_tch.columns:
    print('{:<12} -> {}'.format(column, column_lookup[column]))

CNTRYID      -> Country Identifier
CNT          -> Country code 3-character
CNTSCHID     -> Intl. School ID
CNTTCHID     -> Intl. Teacher ID
TEACHERID    -> Teacher identification code
STRATUM      -> Stratum ID 7-character (cnt + region ID + original stratum ID)
TC001Q01NA   -> Are you female or male?
TC002Q01NA   -> How old are you?
TC005Q01NA   -> What is your current employment status as a teacher? My employment status at this school
TC007Q01NA   -> How many years of work experience do you have? Year(s) working as a teacher at this school
TC198Q03HA   -> Agree: I would like to change to another school if that were possible.
TC198Q04HA   -> Agree: I regret that I decided to become a teacher.
TC198Q05HA   -> Agree: I enjoy working at this school.
TC198Q06HA   -> Agree: I wonder whether it would have been better to choose another profession.
TC198Q07HA   -> Agree: I would recommend my school as a good place to work.
TC198Q08HA   -> Agree: I think that the teaching profession is valued

#### Possible insights from teacher questionnaire

- teacher age average of Stratum's
  - gender
  - employment status
  - their years in teaching
- among the teachers that satisfied with their job
  - how many years do they worked
  - do their school have staff/material shortage

#### Combined features for teacher questionnaire

- for a school 
  - we can create a mean teacher satisfaction score
  - experiance in years / age score
  - [-1,1] male / female

In [7]:
def calc_tch_satisfied(x):
    total_score = 0

    # negative
    # TC198Q03HA   -> Agree: I would like to change to another school if that were possible.
    # TC198Q04HA   -> Agree: I regret that I decided to become a teacher.
    # TC198Q06HA   -> Agree: I wonder whether it would have been better to choose another profession.

    # positive
    # TC198Q05HA   -> Agree: I enjoy working at this school.
    # TC198Q07HA   -> Agree: I would recommend my school as a good place to work.
    # TC198Q08HA   -> Agree: I think that the teaching profession is valued in society.
    # TC198Q09HA   -> Agree: I am satisfied with my performance in this school.
    # TC198Q10HA   -> Agree: All in all, I am satisfied with my job.

    # these columns all range between 1-4 (1 strongly disagree, 2 disagree, 3 agree, 4 strongly agree)
    # higher the total score worst the education is effected

    total_score += x.TC198Q05HA + x.TC198Q07HA + x.TC198Q08HA + x.TC198Q09HA + x.TC198Q10HA

    # 3 -> agree, removed 7 and get the minus to calc positives from neg questions
    # min total score you could get is 0
    neg_score = 0
    neg_score += (x.TC198Q03HA + x.TC198Q04HA + x.TC198Q06HA -7) * -1

    return total_score + neg_score

In [8]:
df_tch['tch_satisfied'] = df_tch.apply(lambda x: calc_tch_satisfied(x), axis=1)

In [9]:
df_tch['tch_satisfied'].describe()

count    87717.000000
mean        16.775437
std          3.855377
min          0.000000
25%         14.000000
50%         17.000000
75%         20.000000
max         24.000000
Name: tch_satisfied, dtype: float64

In [10]:
df_tch.shape

(107367, 25)

In [11]:
tch_columns_to_keep = ["CNT", "CNTRYID","CNTSCHID","CNTTCHID","TEACHERID","STRATUM","TC001Q01NA","TC002Q01NA","TC007Q01NA","TC186Q01HA","TC005Q01NA","tch_satisfied"]

In [12]:
df_tch.drop(columns=df_tch.columns.difference(tch_columns_to_keep), inplace=True)
df_tch.shape

(107367, 12)

In [13]:
df_tch.to_csv("./data/2_merged/teacher.csv",index=False)

### Load school dataset

In [14]:
df_sch = pd.read_csv("./data/1_processed/school_questionnaire.csv")

In [15]:
df_sch.shape

(21903, 34)

In [16]:
for column in df_sch.columns:
    print('{:<12} -> {}'.format(column, column_lookup[column]))

CNTRYID      -> Country Identifier
CNT          -> Country code 3-character
CNTSCHID     -> Intl. School ID
CYC          -> PISA Assessment Cycle (2 digits + 2 character Assessment type - MS/FT)
NatCen       -> National Centre 6-digit Code
Region       -> Region
STRATUM      -> Stratum ID 7-character (cnt + region ID + original stratum ID)
SC001Q01TA   -> Which of the following definitions best describes the community in which your school is located?
SC013Q01TA   -> Is your school a public or a private school?
SC017Q01NA   -> Schools instruction hindered by: A lack of teaching staff.
SC017Q02NA   -> Schools instruction hindered by: Inadequate or poorly qualified teaching staff.
SC017Q05NA   -> Schools instruction hindered by: A lack of educational material [...]
SC017Q07NA   -> Schools instruction hindered by: A lack of physical infrastructure [...]
SC017Q08NA   -> Schools instruction hindered by: Inadequate or poor quality physical infrastructure [...]
SC011Q01TA   -> Which of the fol

#### Possible insights from school questionnaire

- effect of 
  - student/teacher ratio
  - teacher qualifications
  - school problem score vs ovrall PISA score

#### Combined features for teacher questionnaire

- education obstacle score (resource problem)
  - by lack of equipment, staff, material, intrastructure.. (combination of 5 columns)
- student centered education hinderance score (student behaviour problem)
  - absent, not attentive ...
- qualified(master+phd) teacher avg to schools teacher count

In [17]:
def calc_education_obstacle(x):
    total_score = 0

    # SC017Q01NA   -> Schools instruction hindered by: A lack of teaching staff.
    # SC017Q02NA   -> Schools instruction hindered by: Inadequate or poorly qualified teaching staff.
    # SC017Q05NA   -> Schools instruction hindered by: A lack of educational material [...]
    # SC017Q07NA   -> Schools instruction hindered by: A lack of physical infrastructure [...]
    # SC017Q08NA   -> Schools instruction hindered by: Inadequate or poor quality physical infrastructure [...]

    # these columns all range between 1-4 (1 not at all, 2 little, 3 some extend, 4 alot)
    # higher the total score worst the education is effected

    total_score += x.SC017Q01NA + x.SC017Q02NA + x.SC017Q05NA + x.SC017Q07NA + x.SC017Q08NA

    return total_score

In [18]:
df_sch['education_obstacle'] = df_sch.apply(lambda x: calc_education_obstacle(x), axis=1)

In [19]:
df_sch['education_obstacle'].describe()

count    20620.000000
mean         9.825606
std          3.482010
min          5.000000
25%          7.000000
50%         10.000000
75%         12.000000
max         20.000000
Name: education_obstacle, dtype: float64

In [20]:
def calc_problematic_students(x):
    total_score = 0

    # SC061Q01TA   -> Extent to which student learning is hindered by: Student truancy
    # SC061Q02TA   -> Extent to which student learning is hindered by: Students skipping classes
    # SC061Q03TA   -> Extent to which student learning is hindered by: Students lacking respect for teachers
    # SC061Q04TA   -> Extent to which student learning is hindered by: Student use of alcohol or illegal drugs
    # SC061Q05TA   -> Extent to which student learning is hindered by: Students intimidating or bullying other students
    # SC061Q11HA   -> Extent to which student learning is hindered by: Students not being attentive

    # these columns all range between 1-4 (1 not at all, 2 little, 3 some extend, 4 alot)
    # higher the total score worst the student is

    total_score += x.SC061Q01TA + x.SC061Q02TA + x.SC061Q03TA + x.SC061Q04TA + x.SC061Q05TA + x.SC061Q11HA

    return total_score

In [21]:
df_sch['problematic_students'] = df_sch.apply(lambda x: calc_problematic_students(x), axis=1)

In [22]:
df_sch['problematic_students'].describe()

count    20623.000000
mean        12.911070
std          3.923851
min          6.000000
25%         10.000000
50%         12.000000
75%         15.000000
max         24.000000
Name: problematic_students, dtype: float64

In [23]:
def calc_degree_score_teachers(x):
    # SC018Q05NA01 -> Teachers with an <ISCED Level 5A Bachelor degree> qualification: Full-time
    # SC018Q05NA02 -> Teachers with an <ISCED Level 5A Bachelor degree> qualification: Part-time
    # SC018Q06NA01 -> Teachers with an <ISCED Level 5A Masters degree> qualification: Full-time
    # SC018Q06NA02 -> Teachers with an <ISCED Level 5A Masters degree> qualification: Part-time
    # SC018Q07NA01 -> Teachers with an <ISCED Level 6> qualification: Full-time
    # SC018Q07NA02 -> Teachers with an <ISCED Level 6> qualification: Part-time
    # SC018Q01TA01 -> Teachers in TOTAL: Full-time
    # SC018Q01TA02 -> Teachers in TOTAL: Part-time


    total_teachers = x.SC018Q01TA01 + x.SC018Q01TA02

    if total_teachers == 0:
        return 0
        
    teachers_with_degree = (x.SC018Q05NA01+ x.SC018Q05NA02) + (x.SC018Q06NA01 + x.SC018Q06NA02) + (x.SC018Q07NA01 + x.SC018Q07NA02)
    # fulltime x1.5, bachelor 1, master 3, phd 6
    degree_score = (x.SC018Q05NA01*1.5 + x.SC018Q05NA02) + 3*(x.SC018Q06NA01*1.5 + x.SC018Q06NA02) + 6*(x.SC018Q07NA01*1.5 + x.SC018Q07NA02)

    return degree_score/total_teachers

In [24]:
df_sch['degree_score_teachers'] = df_sch.apply(lambda x: calc_degree_score_teachers(x), axis=1)

In [25]:
df_sch['degree_score_teachers'].describe()

count    12836.000000
mean         2.511260
std          1.493949
min          0.000000
25%          1.527683
50%          2.136364
75%          3.600000
max         14.890110
Name: degree_score_teachers, dtype: float64

In [26]:
df_sch[df_sch.degree_score_teachers > 14][['CNT','SC018Q01TA01', 'SC018Q01TA02', 'SC018Q05NA01', 'SC018Q06NA01', 'SC018Q07NA01']]

Unnamed: 0,CNT,SC018Q01TA01,SC018Q01TA02,SC018Q05NA01,SC018Q06NA01,SC018Q07NA01
6966,FRA,41.0,9.0,41.0,41.0,41.0
10436,KAZ,24.0,4.0,24.0,24.0,24.0
17841,ESP,89.0,2.0,89.0,89.0,89.0
21833,QRT,18.0,3.0,18.0,18.0,18.0


In [27]:
df_sch.shape

(21903, 37)

In [28]:
sch_final_columns = ["CNT", "CNTRYID","CNTSCHID","CYC","NatCen","Region","STRATUM","SC001Q01TA","SC013Q01TA","SC011Q01TA","SC002Q01TA","SC002Q02TA","SC003Q01TA","degree_score_teachers","problematic_students","education_obstacle"]

In [29]:
df_sch.drop(columns=df_sch.columns.difference(sch_final_columns), inplace=True)
df_sch.shape

(21903, 16)

In [30]:
df_sch.to_csv("./data/2_merged/school.csv",index=False)

## Student cognitive tasks
- includes reading, math and science
- some questions has option to give partial credit
- merged into the student csv

In [31]:
df_cog = pd.read_csv("./data/1_processed/cognitive_scores.csv")

In [32]:
df_cog.head(3)

Unnamed: 0,CNTRYID,CNT,CNTSCHID,CNTSTUID,STRATUM,LANGTEST_COG,RCORE_PERF,RCO1S_PERF,math_score,math_answered,reading_score,reading_answered,science_score,science_answered
0,8.0,ALB,800115.0,800001.0,ALB0107,140.0,1.0,1.0,100.0,22.0,320.0,52.0,0.0,0.0
1,8.0,ALB,800300.0,800002.0,ALB0105,140.0,1.0,1.0,40.0,23.0,270.0,53.0,0.0,0.0
2,8.0,ALB,800088.0,800003.0,ALB0101,140.0,2.0,2.0,0.0,0.0,250.0,52.0,80.0,34.0


## Student Questionnaire
- also includes financial questionnaire
- original dataset is filtered for only columns I wanted to work with

In [33]:
df_std = pd.read_csv("./data/1_processed/student_questionnaire.csv")

In [34]:
# Merge student questionnaire with cognative task dataframe above
merge_on_common_cols = ['CNT','CNTRYID', 'CNTSCHID', 'CNTSTUID', 'STRATUM']
df_student = df_std.merge(df_cog, left_on=merge_on_common_cols, right_on=merge_on_common_cols, how='outer')
df_student.shape

(612004, 215)

In [35]:
for column in df_student.columns:
    if column in column_lookup:
        print('{:<12} -> {}'.format(column, column_lookup[column]))
    else:
        print(column)

CNTRYID      -> Country Identifier
CNT          -> Country code 3-character
CNTSCHID     -> Intl. School ID
CNTSTUID     -> Intl. Student ID
STRATUM      -> Stratum ID 7-character (cnt + region ID + original stratum ID)
OECD         -> OECD country
LANGTEST_QQQ -> Language of Questionnaire
ST001D01T    -> Student International Grade (Derived)
ST003D02T    -> Student (Standardized) Birth - Month
ST003D03T    -> Student (Standardized) Birth -Year
ST004D01T    -> Student (Standardized) Gender
ST005Q01TA   -> What is the <highest level of schooling> completed by your mother?
ST006Q01TA   -> Does your mother have this qualification? <ISCED level 6> (incl. higher qualifications at level 5A in some countries)
ST006Q02TA   -> Does your mother have this qualification? <ISCED level 5A> (excl. higher qualifications at level 5A in some countries)
ST006Q03TA   -> Does your mother have any of the following qualifications? <ISCED level 5B>
ST006Q04TA   -> Does your mother have any of the following qu

### Combining, merging and reshaping

- Using my intuition, I would like to combine some of the columns in the `df_student` dataframe.
- Coefficient of each column in the combination will be arbitrary

In [36]:
df_sample = df_student.sample(3000)

#### column: home_art (high means more)



In [37]:
def calc_home_art(x):
    total_score = 0

    # ST013Q01TA -> How many books are there in your home?  6 -> +3,  5 -> +2,  4 -> +1  
    if(x.ST013Q01TA == 6):
        total_score += 3
    if (x.ST013Q01TA == 5):
        total_score += 2
    if (x.ST013Q01TA == 4):
        total_score += 1

    # ST012Q09NA   -> How many in your home: Musical instruments (e.g. guitar, piano) 4 -> +2, 3 -> +1  
    if (x.ST012Q09NA == 4):
        total_score += 2
    if(x.ST012Q09NA == 3):
        total_score += 1

    # ST011Q07TA   -> In your home: Classic literature (e.g. <Shakespeare>) 1 -> +1 
    # ST011Q08TA   -> In your home: Books of poetry 1 -> +1  
    # ST011Q09TA   -> In your home: Works of art (e.g. paintings) 1 -> +1  
    # ST011Q16NA   -> In your home: Books on art, music, or design 1 -> +1  
    if (x.ST011Q07TA == 1):
        total_score += 1
    if (x.ST011Q08TA == 1):
        total_score += 1
    if (x.ST011Q09TA == 1):
        total_score += 1
    if (x.ST011Q16NA == 1):
        total_score += 1
    
    return total_score

In [38]:
df_sample['home_art'] = df_sample.apply(lambda x: calc_home_art(x), axis=1)

In [39]:
df_sample['home_art'].value_counts()

2    509
1    447
0    443
3    439
4    401
5    262
6    203
7    152
8     98
9     46
Name: home_art, dtype: int64

#### column: bad_home_conditions (higher the worst)


In [40]:
def calc_bad_home_conditions(x):
    total_score = 0

    # ST011Q02TA   -> In your home: A room of your own -> 2 (no) +1  
    if(x.ST011Q02TA == 2):
        total_score += 1

    # ST011Q03TA   -> In your home: A quiet place to study -> 2 (no) +2  
    if (x.ST011Q03TA == 2):
        total_score += 2

    # ST012Q03TA   -> How many in your home: Rooms with a bath or shower -> 1 (none) -> +2  
    if (x.ST012Q03TA == 1):
        total_score += 2

    # IC001Q01TA   -> Available for you to use at home: Desktop computer -> 3 (no) -> +1  
    if (x.IC001Q01TA == 3):
        total_score += 1
    
    # IC001Q02TA   -> Available for you to use at home: Portable laptop, or notebook -> 3 (no) -> +1  
    if (x.IC001Q02TA == 3):
        total_score += 1
    
    return total_score

In [41]:
df_sample['bad_home_conditions'] = df_sample.apply(lambda x: calc_bad_home_conditions(x), axis=1)

In [42]:
df_sample['bad_home_conditions'].value_counts()

0    1767
1     603
2     304
3     188
4      78
5      42
7      11
6       7
Name: bad_home_conditions, dtype: int64

#### column: school_bad_time (higher the score worse the student feels at school)

In [43]:
def calc_school_bad_time(x):
    total_score = 0
    
    # ST038Q03NA   -> During the past 12 months, how often: Other students left me out of things on purpose.  3 -> +1 , 4 -> +2  
    if(x.ST038Q03NA == 3):
        total_score += 1
    if(x.ST038Q03NA == 4):
        total_score += 2

    # ST034Q06TA   -> Thinking about your school: I feel lonely at school. 1-> +1  
    if(x.ST034Q06TA == 1):
        total_score += 1

    # ST034Q05TA   -> Thinking about your school: Other students seem to like me. 4-> +2  
    if(x.ST034Q05TA == 4):
        total_score += 2

    # ST034Q04TA   -> Thinking about your school: I feel awkward and out of place in my school. 1-> +1  
    if(x.ST034Q04TA == 1):
        total_score += 1

    # ST034Q03TA   -> Thinking about your school: I feel like I belong at school. 4-> +1  
    if(x.ST034Q03TA == 4):
        total_score += 1

    # ST034Q02TA   -> Thinking about your school: I make friends easily at school. 4-> +1  
    if(x.ST034Q02TA == 4):
        total_score += 1

    # ST038Q08NA   -> During the past 12 months, how often: Other students spread nasty rumours about me. 3 -> +1 , 4 -> +2  
    if(x.ST038Q08NA == 3):
        total_score += 1
    if(x.ST038Q08NA == 4):
        total_score += 2

    # ST038Q07NA   -> During the past 12 months, how often: I got hit or pushed around by other students. 2-> +2, 3 -> +3 , 4 -> +4  
    if(x.ST038Q07NA == 2):
        total_score += 2
    if(x.ST038Q07NA == 3):
        total_score += 3
    if(x.ST038Q07NA == 4):
        total_score += 4

    # ST038Q05NA   -> During the past 12 months, how often: I was threatened by other students.  3 -> +2 , 4 -> +3  
    if(x.ST038Q05NA == 3):
        total_score += 2
    if(x.ST038Q07NA == 4):
        total_score += 3

    # ST038Q04NA   -> During the past 12 months, how often: Other students made fun of me. 3 -> +1 , 4 -> +2  
    if(x.ST038Q04NA == 3):
        total_score += 1
    if(x.ST038Q04NA == 4):
        total_score += 2
    
    return total_score


In [44]:
df_sample['school_bad_time'] = df_sample.apply(lambda x: calc_school_bad_time(x), axis=1)

In [45]:
df_sample['school_bad_time'].value_counts()

0     2000
2      291
1      230
4      102
3       84
5       72
6       64
7       35
8       34
9       21
10      15
13      13
11      13
17       8
12       7
14       4
15       4
16       2
19       1
Name: school_bad_time, dtype: int64

In [46]:
df_sample.ST016Q01NA.value_counts()

10.0    590
8.0     414
9.0     362
7.0     295
5.0     223
6.0     186
4.0     106
3.0      77
0.0      62
2.0      53
1.0      41
Name: ST016Q01NA, dtype: int64

#### column: emotional_state (negative -1 bad, +1 is good)



In [47]:
def calc_emotional_status(x):
    total_score = 0
    # I calculated absence of emotion as positive of counterpart emotion (emotions are not counterpart of each other but I am looking for an overall score)

    # ST186Q05HA   -> Thinking about yourself and how you normally feel: how often do you feel as described below? Happy 1 -> -2 , 2 -> -1, 3 -> 1, 4-> 2  
    # ST186Q07HA   -> Thinking about yourself and how you normally feel: how often do you feel as described below? Lively 1 -> -2 , 2 -> -1, 3 -> 1, 4-> 2  
    # ST186Q09HA   -> Thinking about yourself and how you normally feel: how often do you feel as described below? Proud 1 -> -2 , 2 -> -1, 3 -> 1, 4-> 2  
    # ST186Q01HA   -> Thinking about yourself and how you normally feel: how often do you feel as described below? Joyful 1 -> -2 , 2 -> -1, 3 -> 1, 4-> 2  
    # ST186Q03HA   -> Thinking about yourself and how you normally feel: how often do you feel as described below? Cheerful 1 -> -2 , 2 -> -1, 3 -> 1, 4-> 2  


    # ST186Q06HA   -> Thinking about yourself and how you normally feel: how often do you feel as described below? Scared 1 -> 2, 2 -> 1, 3-> -1, 4-> -2  
    # ST186Q10HA   -> Thinking about yourself and how you normally feel: how often do you feel as described below? Miserable 1 -> 2, 2 -> 1, 3-> -1, 4-> -2  
    # ST186Q02HA   -> Thinking about yourself and how you normally feel: how often do you feel as described below? Afraid 1 -> 2, 2 -> 1, 3-> -1, 4-> -2  
    # ST186Q08HA   -> Thinking about yourself and how you normally feel: how often do you feel as described below? Sad 1 -> 2, 2 -> 1, 3-> -1, 4-> -2  

    positives = ['ST186Q05HA', 'ST186Q07HA', 'ST186Q09HA', 'ST186Q01HA', 'ST186Q03HA']
    negatives = ['ST186Q06HA', 'ST186Q10HA', 'ST186Q02HA', 'ST186Q08HA', ]

    for item in positives:
        if(x.item == 1):
            total_score += -2
        if(x.item == 2):
            total_score += -1
        if(x.item == 3):
            total_score += 1
        if(x.item == 4):
            total_score += 2

    for item in negatives:
        if(x.item == 1):
            total_score += 2
        if(x.item == 2):
            total_score += 1
        if(x.item == 3):
            total_score += -1
        if(x.item == 4):
            total_score += -2

    # ST016Q01NA   -> Overall, how satisfied are you with your life as a whole these days? 0-10 -> 0.0 -> 1.0 * 3
        total_score += x.ST016Q01NA - 5

    # -23 is min number
    # +23 is max number

    return ((total_score + 23) / 46)


In [48]:
df_sample['emotional_status'] = df_sample.apply(lambda x: calc_emotional_status(x), axis=1)

In [49]:
df_sample['emotional_status'].describe()

count    2409.000000
mean        0.699361
std         0.224628
min         0.065217
25%         0.586957
50%         0.760870
75%         0.847826
max         0.934783
Name: emotional_status, dtype: float64

#### column: family_hi_ed (higher number higher education)

In [50]:
def calc_family_hi_ed(x):
    total_score = 0

    # ST177Q02HA   -> How many languages [...] do you and your parents speak well enough to converse with others? Your mother 3-> +1, 4->+2  
    if(x.ST177Q02HA == 3):
        total_score += 1
    if(x.ST177Q02HA == 4):
        total_score += 2

    # ST177Q03HA   -> How many languages [...] do you and your parents speak well enough to converse with others? Your father 3-> +1, 4->+2  
    if(x.ST177Q03HA == 3):
        total_score += 1
    if(x.ST177Q03HA == 4):
        total_score += 2

    # ST006Q01TA   -> Does your mother have this qualification? <ISCED level 6> (incl. higher qualifications at level 5A in some countries) -> 1-> +3  
    if (x.ST006Q01TA == 1):
        total_score += 3
    
    # ST006Q02TA   -> Does your mother have this qualification? <ISCED level 5A> (excl. higher qualifications at level 5A in some countries) -> 1-> +2  
    if (x.ST006Q02TA == 1): 
        total_score += 2
    
    # ST006Q03TA   -> Does your mother have any of the following qualifications? <ISCED level 5B> 1-> +1  
    if (x.ST006Q03TA == 1):
        total_score += 1
    
    # ST008Q01TA   -> Does your father have this qualification? <ISCED level 6> (incl. higher qualifications at level 5A in some countries) -> 1->+3  
    if (x.ST008Q01TA == 1):
        total_score += 3

    # ST008Q02TA   -> Does your father have this qualification? <ISCED level 5A> (excl. higher qualifications at level 5A in some countries) -> 1-> +2 
    if (x.ST008Q02TA == 1):
        total_score += 2

    # ST008Q03TA   -> Does your father have any of the following qualifications? <ISCED level 5B> 1-> +1  
    if (x.ST008Q03TA == 1):
        total_score += 1

    return total_score

In [51]:
df_sample['family_hi_ed'] = df_sample.apply(lambda x: calc_family_hi_ed(x), axis=1)

In [52]:
df_sample['family_hi_ed'].describe()

count    3000.000000
mean        2.573333
std         2.888380
min         0.000000
25%         0.000000
50%         2.000000
75%         4.000000
max        16.000000
Name: family_hi_ed, dtype: float64

#### column: teacher_quality (higher the better)

In [53]:
def calc_quality_tch(x):
    total_score = 0

    # ST213Q01HA   -> Thinking of past two <test language lessons>: It was clear to me that the teacher liked teaching us. -> 1->+2, 2->+1  
    if(x.ST213Q01HA == 1):
        total_score += 2
    if(x.ST213Q01HA == 2):
        total_score += 1

    # ST213Q02HA   -> Thinking of past two <test language lessons>: The enthusiasm of the teacher inspired me. -> 1->+2, 2->+1  
    if(x.ST213Q02HA == 1):
        total_score += 2
    if(x.ST213Q02HA == 2):
        total_score += 1

    # ST213Q03HA   -> Thinking of past two <test language lessons>: It was clear that the teacher likes to deal with the topic of the lesson. -> 1->+2, 2->+1  
    if (x.ST213Q03HA == 1):
        total_score += 2
    if (x.ST213Q03HA == 2):
        total_score += 1

    # ST213Q04HA   -> Thinking of past two <test language lessons>: The teacher showed enjoyment in teaching. -> 1->+2, 2->+1   
    if (x.ST213Q04HA == 1): 
        total_score += 2
    if (x.ST213Q04HA == 2): 
        total_score += 1
       
    # ST102Q03TA   -> How often during <test language lessons>: [...] the teacher presents a short summary of the previous lesson. -> 1->+1  
    if (x.ST102Q03TA == 1):
        total_score += 1
    
    # ST097Q01TA   -> How often during <test language lessons>: Students dont listen to what the teacher says. -> 1->+1  
    if (x.ST097Q01TA == 1):
        total_score += 1

    # ST097Q02TA   -> How often during <test language lessons>: There is noise and disorder. -> 4->+1  
    if (x.ST097Q02TA == 4):
        total_score += 1

    # ST097Q03TA   -> How often during <test language lessons>: The teacher waits long for students to quiet down. -> 4->+1  
    if (x.ST097Q03TA == 4):
        total_score += 1

    # ST097Q05TA   -> How often during <test language lessons>: Students dont start working for a long time after the lesson begins. -> 4->+1  
    if (x.ST097Q05TA == 4):
        total_score += 1

    return total_score

In [54]:
df_sample['quality_tch'] = df_sample.apply(lambda x: calc_quality_tch(x), axis=1)

In [55]:
df_sample['quality_tch'].describe()

count    3000.000000
mean        2.509000
std         2.205797
min         0.000000
25%         1.000000
50%         2.000000
75%         4.000000
max        13.000000
Name: quality_tch, dtype: float64

#### column: reading_not_easy (finds reading hard. higher the harder)


In [56]:
def calc_reading_not_easy(x):
    total_score = 0

    # ST163Q02HA   -> In the PISA test, how do you feel about the reading tasks: There were many words I could not understand. -> 1->+2, 2->+1  
    if(x.ST163Q02HA == 1):
        total_score += 2
    if(x.ST163Q02HA == 2):
        total_score += 1

    # ST163Q03HA   -> In the PISA test, how do you feel about the reading tasks: Many texts were too difficult for me. 1->+2, 2->+1  
    if(x.ST163Q03HA == 1):
        total_score += 2
    if(x.ST163Q03HA == 2):
        total_score += 1

    # ST161Q02HA   -> Agree: I am able to understand difficult texts. 1->+2, 2->+1 (1 disagree, 4 agree)  
    if (x.ST161Q02HA == 1):
        total_score += 2
    if (x.ST161Q02HA == 2):
        total_score += 1

    # ST161Q06HA   -> Agree: I have always had difficulty with reading. 4->+2, 3->+1  
    if (x.ST161Q06HA == 4): 
        total_score += 2
    if (x.ST161Q06HA == 3): 
        total_score += 1
       
    # ST161Q07HA   -> Agree: I have to read a text several times before completely understanding it. 4->+1  
    if (x.ST161Q07HA == 4):
        total_score += 1
    
    # ST160Q04IA   -> How much do you agree or disagree? For me, reading is a waste of time. 4->+2, 3->+1
    if (x.ST160Q04IA == 3):
        total_score += 1

    # ST160Q02IA   -> How much do you agree or disagree? Reading is one of my favourite hobbies. 1->+1
    if (x.ST160Q02IA == 1):
        total_score += 1

    return total_score

In [57]:
df_sample['reading_not_easy'] = df_sample.apply(lambda x: calc_reading_not_easy(x), axis=1)

In [58]:
df_sample['reading_not_easy'].describe()

count    3000.0000
mean        2.9660
std         1.7268
min         0.0000
25%         2.0000
50%         3.0000
75%         4.0000
max        10.0000
Name: reading_not_easy, dtype: float64

In [59]:
def calculate_all(x):
    home_art = calc_home_art(x)
    bad_home_conditions = calc_bad_home_conditions(x)
    school_bad_time = calc_school_bad_time(x)
    emotional_status = calc_emotional_status(x)
    family_hi_ed = calc_family_hi_ed(x)
    quality_tch = calc_quality_tch(x)
    reading_not_easy = calc_reading_not_easy(x)

    return pd.Series([home_art, bad_home_conditions, school_bad_time, emotional_status, family_hi_ed, quality_tch, reading_not_easy])
     

In [60]:
df_student[['home_art', 'bad_home_conditions', 'school_bad_time', 'emotional_status', 'family_hi_ed', 'quality_tch', 'reading_not_easy']] = df_student.apply(lambda x: calculate_all(x), axis=1)

### Drop columns

In [61]:
final_columns = ["CNT", "PV1MATH","PV1READ","PV1SCIE","PV1RCLI","PV1RCUN","PV1RCER","WEALTH","ESCS","PA042Q01TA","CNTRYID","CNTSCHID",
"CNTSTUID","STRATUM","OECD","ST001D01T","ST003D02T","ST003D03T","ST004D01T","reading_not_easy","quality_tch","family_hi_ed",
"emotional_status","school_bad_time","bad_home_conditions","home_art","math_score","math_answered","reading_score","reading_answered","science_score","science_answered"]

In [62]:
df_student.shape

(612004, 222)

In [63]:
df_student.drop(columns=df_student.columns.difference(final_columns), inplace=True)
df_student.shape

(612004, 32)

In [64]:
df_student.head()

Unnamed: 0,CNTRYID,CNT,CNTSCHID,CNTSTUID,STRATUM,OECD,ST001D01T,ST003D02T,ST003D03T,ST004D01T,...,reading_answered,science_score,science_answered,home_art,bad_home_conditions,school_bad_time,emotional_status,family_hi_ed,quality_tch,reading_not_easy
0,8.0,ALB,800115.0,800001.0,ALB0107,0.0,10.0,10.0,2002.0,1.0,...,52.0,0.0,0.0,4.0,0.0,1.0,0.673913,6.0,3.0,4.0
1,8.0,ALB,800300.0,800002.0,ALB0105,0.0,10.0,6.0,2002.0,1.0,...,53.0,0.0,0.0,2.0,2.0,1.0,0.934783,0.0,1.0,2.0
2,8.0,ALB,800088.0,800003.0,ALB0101,0.0,9.0,4.0,2002.0,2.0,...,52.0,80.0,34.0,0.0,2.0,13.0,0.934783,5.0,3.0,2.0
3,8.0,ALB,800014.0,800004.0,ALB0109,0.0,10.0,3.0,2002.0,1.0,...,57.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
4,8.0,ALB,800294.0,800005.0,ALB0203,0.0,10.0,2.0,2002.0,1.0,...,57.0,170.0,39.0,2.0,1.0,0.0,0.5,3.0,3.0,4.0


In [65]:
df_student.to_csv("./data/2_merged/student.csv",index=False)