In [1]:
import pandas as pd
import pyreadstat


In [2]:
df_sch, meta_sch = pyreadstat.read_sav('data/PISA2022_SCH_QQQ.SAV')

In [3]:
import os

with open('data/openai.api.key', 'r') as filek: 
    openai_key = filek.read()
os.environ["OPENAI_API_KEY"] =  openai_key 

In [4]:
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

def meta2docs(spss_meta):
    excluded = ['CNTRYID']
    docs = []
    for col in  spss_meta.column_names:
        #if col in spss_meta.variable_value_labels:
        if col not in excluded:
            docs.append(
                Document(
                    page_content=spss_meta.column_names_to_labels[col],
                    metadata={"year": 2022, "original_col_name": col},
                ),
            )
    return docs
    
cols = meta2docs(meta_sch)

cols_vectorstore = Chroma.from_documents(cols, OpenAIEmbeddings())

In [5]:
cols

[Document(page_content='Country code 3-character', metadata={'year': 2022, 'original_col_name': 'CNT'}),
 Document(page_content='Intl. School ID', metadata={'year': 2022, 'original_col_name': 'CNTSCHID'}),
 Document(page_content='PISA Assessment Cycle (2 digits + 2 character Assessment type - MS/FT)', metadata={'year': 2022, 'original_col_name': 'CYC'}),
 Document(page_content='National Centre 6-digit Code', metadata={'year': 2022, 'original_col_name': 'NatCen'}),
 Document(page_content='Stratum ID 5-character (cnt + original stratum ID)', metadata={'year': 2022, 'original_col_name': 'STRATUM'}),
 Document(page_content='Adjudicated sub-region code 7-digit code (3-digit country code + region ID + stratum ID)', metadata={'year': 2022, 'original_col_name': 'SUBNATIO'}),
 Document(page_content='REGION', metadata={'year': 2022, 'original_col_name': 'REGION'}),
 Document(page_content='OECD country', metadata={'year': 2022, 'original_col_name': 'OECD'}),
 Document(page_content='Mode of Respon

In [6]:
cols_retriever = cols_vectorstore.as_retriever()

In [7]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(temperature=0)

In [8]:
def extract_column_name_hints(question):
    prompt1 = f"Please list the typical database column fields, that required to answer the following question: {question}"
    relevant_col_list_msg = llm.invoke(prompt1)
    relevant_col_list = relevant_col_list_msg.content
    return relevant_col_list

In [9]:
from langchain_community.document_transformers import EmbeddingsRedundantFilter

def match_column_names(hints_text, retriever):
    filter = EmbeddingsRedundantFilter(embeddings=OpenAIEmbeddings())
    res = []
    hints = hints_text.split('\n')
    for hint in hints:
        rel_col_docs = cols_retriever.invoke(hint)    
        rel_col_docs = filter.transform_documents(rel_col_docs)
        res = res+rel_col_docs
    return res

def docs2explanation(docs, meta):
    t = ''
    for idx, doc in enumerate(docs):
        col_name = doc.metadata['original_col_name']
        if meta.variable_measure[col_name] != 'unknown':
            scale = meta.variable_measure[col_name]            
            if scale == 'scale':
                scale = 'interval'                
            measure = ' A ' + meta.readstat_variable_types[col_name] + ' variable with ' +scale  + ' scale measure.'
        else:
            measure = ''
        t =  t + str(idx+1)+ '. ' + col_name + " : " + doc.page_content +'.' + measure +'\n'
    t = t + ''
    return t

In [171]:
def gen_code(question, rel_col_docs, meta_sch):
    data_explanation = docs2explanation(rel_col_docs, meta_sch)
    columns = [i.metadata['original_col_name'] for i in rel_col_docs]
    prompt2 = f"Given a dataframe with the following columns {columns}, column meaning: {data_explanation}, can you generate a python code, without sample data, which can answer the following question? the code must contain only one function called 'run', that returns an exact number of type 'float'. \nQuestion: {question}"
    res = llm.invoke(prompt2)
    print(res)
    code = res.content.replace('```python','').replace('```','')
    return code

def exec_code(code, df):        
    df2 = df.dropna()
    loc = locals()
    exec(code + "\nr = run(df2)\n", globals(), loc)
    return loc['r']


In [162]:
def pipeline(question, df, meta, col_retriever):
    col_hints = extract_column_name_hints(question)
    print(col_hints)
    rel_col_docs = match_column_names(col_hints, col_retriever)    
    print([i.page_content for i in rel_col_docs])
    code =  gen_code(question, rel_col_docs, meta_sch)    
    print(code)
    res = exec_code(code, df)    
    return {'question': question, 'result': res, 'used_cols': [i.metadata['original_col_name'] for i in rel_col_docs]}

In [163]:
test_data = [
    {'question':'Which country has the highest average  teacher student ratio? How much is it? ','expected_columns': ['STRATIO', 'CNT'], 'expected_answer': 28.95},
    {'question':'Which country has the highest average class sizes? How much is it?','expected_columns': ['CLSIZE', 'CNT'], 'expected_answer': 42.65 },    
    {'question':'In how many languages ​​was the questionnaire filled out?', 'expected_columns': ['LANGTEST_QQQ'], 'expected_answer': 54},
    {'question':'What is Hungary average teacher-student ratio?', 'expected_columns': ['STRATIO', 'CNT'], 'expected_answer': 10.83},
    {'question':'What is OECD average teacher-student ratio?', 'expected_columns': ['STRATIO', 'OECD'], 'expected_answer':  12.551608},
    {'question':'Where is average teacher-student ratio in Hungary compared to the OECD average in terms of average teacher-student ratio? Give the difference as number.','expected_columns': ['STRATIO', 'OECD', 'CNT'], 'expected_answer': -1.721608 },    
    #{'question':'how many education systems were involved in the survey? ','expected_columns': ['STRATIO', 'CNT'], 'expected_answer': 28.95},
    {'question':'how many countries were involved in the survey?','expected_columns': ['CNT'], 'expected_answer': 80},
    {'question':'how many countries that are OECD memeber were involved in the survey? ','expected_columns': ['OECD', 'CNT'], 'expected_answer': 37},
    {'question':'how many schools were involved in the survey?','expected_columns': ['CNTSCHID'], 'expected_answer': 21629}
]

def evaluate(test_data, df, meta, cols_retriever):
    eval_res = []
    for test in test_data:
        t2 = test
        
        answer = pipeline(test['question'], df, meta, cols_retriever)
        
        found_cols = []
        for expected_column in test['expected_columns']:
            if expected_column in answer['used_cols']:
                found_cols.append(expected_column)
        t2['found_cols'] = found_cols
        
        found_cols_ratio = len(found_cols) / len(test['expected_columns'])
        t2['found_cols_ratio'] = found_cols_ratio

        t2['pipeline_result'] = answer['result']
        
        t2['used_cols'] = answer['used_cols']

        r = answer['result']
        
        if type(answer['result']) is tuple:
            print('finding float')
            for i in answer['result']:
                print(type(i))
                if type(i) is float or type(i) is float64:
                    print('found')
                    r = i
                    break
            
        t2['error'] = r - test['expected_answer']

        eval_res.append(t2)
        
    return eval_res

In [111]:
eval_res = evaluate(test_data, df_sch, meta_sch, cols_retriever)


1. Country
2. Teacher student ratio
['REGION', 'OECD country', 'Senate Weight (sum of 5000 per country)', 'Country code 3-character', 'Student-teacher ratio', 'Student-mathematics teacher ratio', 'Proportion of mathematics teachers at school', 'Proportion of school management personnel']

import pandas as pd

def run(data):
    highest_avg_ratio = data.groupby('CNT')['STRATIO'].mean().max()
    country_highest_avg_ratio = data.groupby('CNT')['STRATIO'].mean().idxmax()
    
    return highest_avg_ratio

1. Country
2. Class size
['REGION', 'OECD country', 'Senate Weight (sum of 5000 per country)', 'Country code 3-character', 'Math class size', 'Class size (test language class)', 'School size (Sum)', 'Student-teacher ratio']

import pandas as pd

def run(data):
    avg_class_sizes = data.groupby('CNT')['CLSIZE'].mean()
    country_highest_avg_class_size = avg_class_sizes.idxmax()
    highest_avg_class_size = avg_class_sizes.max()
    
    return float(highest_avg_class_size)

1. Responden

In [133]:
df = pd.DataFrame(eval_res)
df

Unnamed: 0,question,expected_columns,expected_answer,found_cols,found_cols_ratio,pipeline_result,used_cols,error
0,Which country has the highest average teacher...,"[STRATIO, CNT]",28.95,"[STRATIO, CNT]",1.0,28.952088,"[REGION, OECD, SENWT, CNT, STRATIO, SMRATIO, P...",0.002087701
1,Which country has the highest average class si...,"[CLSIZE, CNT]",42.65,"[CLSIZE, CNT]",1.0,42.65,"[REGION, OECD, SENWT, CNT, MCLSIZE, CLSIZE, SC...",0.0
2,In how many languages ​​was the questionnaire ...,[LANGTEST_QQQ],54.0,[LANGTEST_QQQ],1.0,54.0,"[ADMINMODE, CNTSCHID, LANGTEST_QQQ, STRATUM, L...",0.0
3,What is Hungary average teacher-student ratio?,"[STRATIO, CNT]",10.83,"[STRATIO, CNT]",1.0,10.829132,"[REGION, OECD, SENWT, CNT, SCHLTYPE, SCHSIZE, ...",-0.0008680556
4,What is OECD average teacher-student ratio?,"[STRATIO, OECD]",12.551608,"[STRATIO, OECD]",1.0,12.551608,"[REGION, OECD, SENWT, CNT, STRATIO, SMRATIO, P...",2.45737e-07
5,Where is average teacher-student ratio in Hung...,"[STRATIO, OECD, CNT]",-1.721608,"[STRATIO, OECD, CNT]",1.0,-1.722476,"[OECD, REGION, SENWT, CNT, STRATIO, SMRATIO, P...",-0.0008683013
6,how many countries were involved in the survey?,[CNT],80.0,[CNT],1.0,80.0,"[CNT, REGION, OECD, NatCen, CNT, NatCen, SUBNA...",0.0
7,how many countries that are OECD memeber were ...,"[OECD, CNT]",37.0,"[OECD, CNT]",1.0,37.0,"[CNT, REGION, OECD, NatCen, OECD, SC012Q11JA, ...",0.0
8,how many schools were involved in the survey?,[CNTSCHID],21629.0,[CNTSCHID],1.0,21629.0,"[CNTSCHID, SCHLTYPE, SC037Q03TA, SCHSIZE, SCHL...",0.0


In [113]:
summary(eval_res)

{'number_of_tests': 9,
 'found_cols_ratio_avg': 1.0,
 'success_rate': 1.0,
 'rmse': 0.0008073248805889901}

In [114]:
import math

def summary(eval_res):
    df = pd.DataFrame(eval_res)
    return {
        'number_of_tests': len(df),
        'found_cols_ratio_avg': df['found_cols_ratio'].mean(),
        'success_rate': len(df[df['error'] < 0.01]) / len(df),
        'rmse': math.sqrt((df.error * df.error).mean())
    }
summary(eval_res)

{'number_of_tests': 9,
 'found_cols_ratio_avg': 1.0,
 'success_rate': 1.0,
 'rmse': 0.0008073248805889901}

In [115]:
pipeline('question okosp sop sop skopk', df_sch, meta_sch, cols_retriever)

1. Question ID
2. Question text
3. OKOSP (Option Key for Option Selected by Participant)
4. SOP (Score for Option Selected by Participant)
5. SKOPK (Score Key for Option)
['Language of Questionnaire', 'Intl. School ID', '[Stem] At your school, what is the total number of students in the [national modal grade for 15-year-olds]?', 'Mode of Respondent', 'Language of Questionnaire', '[Stem] At your school, what is the total number of students in the [national modal grade for 15-year-olds]?', 'Extent a problem in your school: Intimidation or verbal abuse among students (including texting, emailing, etc.)', 'Mode of Respondent', 'School selectivity', 'Language of Questionnaire', 'Creative extra-curricular activities (3 activities)', 'School type', 'Exist at your school: Written specification of student performance standards', 'School selectivity', 'School size (Sum)', "During admission, how often consider: Student's record of academic performance (including placement tests)", 'School size (S

{'question': 'question okosp sop sop skopk',
 'result': 42.0,
 'used_cols': ['LANGTEST_QQQ',
  'CNTSCHID',
  'SC004Q01TA',
  'ADMINMODE',
  'LANGTEST_QQQ',
  'SC004Q01TA',
  'SC172Q05JA',
  'ADMINMODE',
  'SCHSEL',
  'LANGTEST_QQQ',
  'CREACTIV',
  'SCHLTYPE',
  'SC037Q04TA',
  'SCHSEL',
  'SCHSIZE',
  'SC012Q01TA',
  'SCHSIZE',
  'SCHSEL',
  'SC188Q10JA',
  'SC004Q01TA']}

In [119]:
pipeline('what is the relation between school autonomy  and teacher participation in Hungarian schools? Please give standardized coefficient as reasul.', df_sch, meta_sch, cols_retriever)

1. School ID
2. School autonomy level
3. Teacher participation level
4. Standardized coefficient

The standardized coefficient is a measure of the strength and direction of the relationship between school autonomy and teacher participation in Hungarian schools. It is calculated as part of a regression analysis to determine the extent to which changes in school autonomy predict changes in teacher participation.
['Intl. School ID', 'School type', "Exist at your school: Written specification of the school's curricular profile and educational goals", 'School size (Sum)', 'School autonomy (WLE)', "Extent structures your school's math programme: Discretion of individual teachers", 'School type', 'Proportion of school administrative personnel', 'Teacher participation (WLE)', 'How often you/others in [school management team]: Taking actions to support co-operation among teachers to develop new teaching practices', "How often you/others in [school management team]: Taking actions to ensure that

NameError: name 'LinearRegression' is not defined

In [None]:
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

In [173]:
pipeline('what is the relation between school autonomy  and teacher participation? Please give linear regression coefficient as result.', df_sch, meta_sch, cols_retriever)

1. School ID
2. School autonomy level
3. Teacher participation level

Linear regression coefficient: 0.75
['Intl. School ID', 'School type', "Exist at your school: Written specification of the school's curricular profile and educational goals", 'School size (Sum)', 'School autonomy (WLE)', "Extent structures your school's math programme: Discretion of individual teachers", 'School type', 'Proportion of school administrative personnel', 'Teacher participation (WLE)', 'How often you/others in [school management team]: Taking actions to support co-operation among teachers to develop new teaching practices', "How often you/others in [school management team]: Taking actions to ensure that teachers feel responsible for their students' learning outcomes", 'School Closure Support from Education Authorities', 'School\xa0Preparation for Remote Instruction, In Response to Pandemic\xa0(WLE)', 'Extent a problem in your school: Vandalism', 'Student-mathematics teacher ratio', 'Student-teacher ratio'

{'question': 'what is the relation between school autonomy  and teacher participation? Please give linear regression coefficient as result.',
 'result': 0.22163275642916797,
 'used_cols': ['CNTSCHID',
  'SCHLTYPE',
  'SC037Q03TA',
  'SCHSIZE',
  'SCHAUTO',
  'SC188Q09JA',
  'SCHLTYPE',
  'PROADMIN',
  'TCHPART',
  'SC201Q04JA',
  'SC201Q06JA',
  'SCSUPRTED',
  'SCPREPAP',
  'SC172Q03JA',
  'SMRATIO',
  'STRATIO',
  'PROPMATH',
  'SC178Q02JA']}

In [158]:

import pandas as pd
import statsmodels.api as sm

def run2(data):
    X = data[['SCHAUTO']]
    y = data['TCHPART']
    
    X = sm.add_constant(X)
    
    model = sm.OLS(y, X).fit()
    print(model.summary())
    return float(model.params['SCHAUTO'])


# Example usage:
# result = run(df)
# print(result)

In [175]:
df = df_sch[df_sch['SCHAUTO'].notna()]
df = df[df['TCHPART'].notna()]
run2(df)


                            OLS Regression Results                            
Dep. Variable:                TCHPART   R-squared:                       0.147
Model:                            OLS   Adj. R-squared:                  0.147
Method:                 Least Squares   F-statistic:                     2116.
Date:                Tue, 09 Jul 2024   Prob (F-statistic):               0.00
Time:                        21:36:03   Log-Likelihood:                -17834.
No. Observations:               12243   AIC:                         3.567e+04
Df Residuals:                   12241   BIC:                         3.569e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1253      0.009     13.192      0.0

0.3465848737050316

In [166]:
loc = {}
#exec('import statsmodels.api as sm\nprint(sm.OLS([1,2,3],[1,2,3]))')
exec('def run(df):\n\tprint(sm.OLS([1,2,3],[1,2,3]))'+ "\nr = run(df)\n", globals(), loc)

<statsmodels.regression.linear_model.OLS object at 0x7f9e47386a40>
