# Run Regressions.ipynb

Logistic regressions of GSM8K accuracy on length and complexity variables, quadratic terms, clustering by identity of the question and method (task, conversation_id, method).

Regression of CW scores similarly.

Add complexity of provided answers as an interaction term in GSM8K regressions.

Add model as an interaction term.

Model and question (conversation_number by task) controls can soak up additional variation.

In [158]:
from stargazer.stargazer import Stargazer
import statsmodels.api as sm
import pandas as pd
import statsmodels.formula.api as smf


In [159]:
# Load Combined_Data.xlsx
df = pd.read_excel('Combined_Data.xlsx')

print(df.columns)

df


Index(['model_task_method', 'conversation_number',
       'coherence_1_incoherent_10_very_coherent', 'compliance_OLD',
       'ease_of_review_1_easy_10_hard', 'correct',
       'Prediction_Based_On_First_10', 'Prediction_Based_On_Last_10',
       'Aggregated_Prediction', 'Prediction_Based_On_First_10_LP',
       'response_Based_On_First_10_LP', 'Prediction_Based_On_Last_10_LP',
       'response_Based_On_Last_10_LP', 'response_LP',
       'Aggregated_Prediction_LP', 'Prediction_Based_On_First_50_LP',
       'response_Based_On_First_50_LP', 'Prediction_Based_On_Last_50_LP',
       'response_Based_On_Last_50_LP', 'Aggregated_Prediction_50_LP',
       'Prediction_Based_On_random_50_LP_1',
       'response_Based_On_random_50_LP_1',
       'Prediction_Based_On_random_50_LP_2',
       'response_Based_On_random_50_LP_2',
       'Aggregated_Prediction_random_50_LP', 'Unnamed: 0_x', 'response_x',
       'replace_slash_n_slash_n_with_newline_x',
       'replace_slash_n_slash_n_with_newline_values

Unnamed: 0,model_task_method,conversation_number,coherence_1_incoherent_10_very_coherent,compliance_OLD,ease_of_review_1_easy_10_hard,correct,Prediction_Based_On_First_10,Prediction_Based_On_Last_10,Aggregated_Prediction,Prediction_Based_On_First_10_LP,...,num_linebreaks_prompts_diff,num_sentences_prompts_diff,num_step_i_prompts_diff,num_1_dot_etc_prompts_diff,sentence_length_prompts_diff,fres_prompts_diff,num_linebreaks_provided_diff,num_sentences_provided_diff,num_step_i_provided_diff,num_1_dot_etc_provided_diff
0,td3_cw_direct_prompting_responses,1,1.0,1.0,1.0,,,1.0,1.0,,...,-1,-2,0,2,-6.571429,12.31,,,,
1,td3_cw_direct_prompting_responses,2,7.0,0.0,1.0,,,7.0,7.0,,...,-1,-3,0,2,-9.350000,-1.05,,,,
2,td3_cw_direct_prompting_responses,3,1.0,1.0,1.0,,,1.0,1.0,,...,-1,-4,0,2,-4.911111,-6.64,,,,
3,td3_cw_direct_prompting_responses,4,10.0,1.0,1.0,,,7.0,7.0,,...,-1,-1,0,2,-10.500000,-9.31,,,,
4,td3_cw_direct_prompting_responses,5,4.0,1.0,1.0,,,1.0,1.0,,...,-1,-2,0,2,-8.428571,13.64,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3195,gpt4_gsm8k_manual_cot_responses,96,,,,1.0,,,,,...,16,39,0,17,,,4.0,-1.0,0.0,-2.0
3196,gpt4_gsm8k_manual_cot_responses,97,,,,1.0,,,,,...,16,43,0,19,,,2.0,0.0,0.0,-1.0
3197,gpt4_gsm8k_manual_cot_responses,98,,,,1.0,,,,,...,16,44,0,20,,,3.0,1.0,0.0,0.0
3198,gpt4_gsm8k_manual_cot_responses,99,,,,1.0,,,,,...,15,44,0,19,,,1.0,0.0,0.0,-1.0


In [160]:
# Transformation - divide conversation length by 1000 to get effect per 1000K tokens
df['conversation_length_thousands'] = df['conversation_length']/1000


In [161]:
# Create quadratics
df['conversation_length_thousands_2'] = df['conversation_length_thousands']**2
df['consolidated_num_steps_ideas_2'] = df['consolidated_num_steps_ideas']**2
df['fres_2'] = df['fres']**2


In [162]:
# Create clustering variable
# Concatenate task, conversation_number, method
df['task_conversation_method'] = df['task'].astype(str) + "_" + df['conversation_number'].astype(str) + "_" + df['method'].astype(str)


In [163]:
# Task by conversation variable as a control
df['task_conversation'] = df['task'].astype(str) + "_" + df['conversation_number'].astype(str)


In [164]:
# Split data
gsm8k_data = df[df['task'] == 'gsm8k']
cw_data = df[df['task'] == 'cw']


### Some Checks

In [165]:
# Get values of correct in gsm8k_data
gsm8k_correct = gsm8k_data['correct'].values
print(set(gsm8k_correct))

# Print cases where correct is not 0 or 1
#print(gsm8k_data[gsm8k_data['correct'] != 0 & gsm8k_data['correct'] != 1])


{0.0, 1.0}


### GSM8K Regressions

#### Logistic Regression (no clustering)

In [166]:
# Define the logistic regression model
logit_no_clustering_gsm8k = smf.logit('correct ~ conversation_length_thousands + consolidated_num_steps_ideas + conversation_length_thousands_2 + consolidated_num_steps_ideas_2 + model', data=gsm8k_data).fit(cov_type='HC3')

# Display the summary
print(logit_no_clustering_gsm8k.summary())

# Marginal effects
logit_no_clustering_gsm8k_marginal_effects = logit_no_clustering_gsm8k.get_margeff(at='overall')
print(logit_no_clustering_gsm8k_marginal_effects.summary())


Optimization terminated successfully.
         Current function value: 0.576353
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                correct   No. Observations:                 1600
Model:                          Logit   Df Residuals:                     1594
Method:                           MLE   Df Model:                            5
Date:                Tue, 05 Dec 2023   Pseudo R-squ.:                  0.1496
Time:                        00:08:55   Log-Likelihood:                -922.17
converged:                       True   LL-Null:                       -1084.4
Covariance Type:                  HC3   LLR p-value:                 5.423e-68
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                           1.2184      0.179      6.792  

#### Linear Probability Model with Clustering

In [167]:
# Define and fit the OLS model with clustered standard errors
lpm_with_clustering_gsm8k = smf.ols('correct ~ conversation_length_thousands + consolidated_num_steps_ideas + conversation_length_thousands_2 + consolidated_num_steps_ideas_2 + task_conversation + model', data=gsm8k_data).fit(cov_type='cluster', cov_kwds={'groups': gsm8k_data['task_conversation_method']})

# Print the model summary
print(lpm_with_clustering_gsm8k.summary())



                            OLS Regression Results                            
Dep. Variable:                correct   R-squared:                       0.348
Model:                            OLS   Adj. R-squared:                  0.303
Method:                 Least Squares   F-statistic:                     32.04
Date:                Tue, 05 Dec 2023   Prob (F-statistic):          1.17e-222
Time:                        00:08:55   Log-Likelihood:                -793.62
No. Observations:                1600   AIC:                             1797.
Df Residuals:                    1495   BIC:                             2362.
Df Model:                         104                                         
Covariance Type:              cluster                                         
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept 

### CW Regressions

In [168]:
# Check
# Is avg_inter_paragraph_cosine_sim always present
#print(cw_data[cw_data['avg_inter_paragraph_cosine_sim'].isnull()])

# Check length
#print(len(cw_data['avg_inter_paragraph_cosine_sim']))
#print(len(cw_data['fres'].dropna()))

# Limit cw_data to rows where avg_inter_paragraph_cosine_sim is not null
cw_data = cw_data[cw_data['avg_inter_paragraph_cosine_sim'].notnull()]


#### Preferred cosine similarity measure

In [169]:
# Define and fit the OLS model with clustered standard errors
reg_with_clustering_cw = smf.ols("avg_inter_paragraph_cosine_sim ~ conversation_length_thousands + consolidated_num_steps_ideas + conversation_length_thousands_2 + consolidated_num_steps_ideas_2 + fres + fres_2 + task_conversation + model",
                                 data=cw_data).fit(cov_type='cluster', cov_kwds={'groups': cw_data['task_conversation_method']})

# Print the model summary
print(reg_with_clustering_cw.summary())


                                  OLS Regression Results                                  
Dep. Variable:     avg_inter_paragraph_cosine_sim   R-squared:                       0.427
Model:                                        OLS   Adj. R-squared:                  0.381
Method:                             Least Squares   F-statistic:                     20.48
Date:                            Tue, 05 Dec 2023   Prob (F-statistic):          2.79e-163
Time:                                    00:08:55   Log-Likelihood:                 927.69
No. Observations:                            1434   AIC:                            -1641.
Df Residuals:                                1327   BIC:                            -1078.
Df Model:                                     106                                         
Covariance Type:                          cluster                                         
                                      coef    std err          z      P>|z|      [0.025   

#### Check task compliance as well

In [170]:
print(list(cw_data.columns))


['model_task_method', 'conversation_number', 'coherence_1_incoherent_10_very_coherent', 'compliance_OLD', 'ease_of_review_1_easy_10_hard', 'correct', 'Prediction_Based_On_First_10', 'Prediction_Based_On_Last_10', 'Aggregated_Prediction', 'Prediction_Based_On_First_10_LP', 'response_Based_On_First_10_LP', 'Prediction_Based_On_Last_10_LP', 'response_Based_On_Last_10_LP', 'response_LP', 'Aggregated_Prediction_LP', 'Prediction_Based_On_First_50_LP', 'response_Based_On_First_50_LP', 'Prediction_Based_On_Last_50_LP', 'response_Based_On_Last_50_LP', 'Aggregated_Prediction_50_LP', 'Prediction_Based_On_random_50_LP_1', 'response_Based_On_random_50_LP_1', 'Prediction_Based_On_random_50_LP_2', 'response_Based_On_random_50_LP_2', 'Aggregated_Prediction_random_50_LP', 'Unnamed: 0_x', 'response_x', 'replace_slash_n_slash_n_with_newline_x', 'replace_slash_n_slash_n_with_newline_values_x', 'replace_slash_n_with_newline_x', 'replace_slash_n_with_newline_values_x', 'avg_cosine_sim', 'num_sentences_x', '

In [171]:
# Define the logistic regression model
logit_no_clustering_cw_compliance = smf.logit('compliance ~ conversation_length_thousands + consolidated_num_steps_ideas + conversation_length_thousands_2 + consolidated_num_steps_ideas_2 + fres + fres_2 + model', data=cw_data).fit(cov_type='HC3')

print('logit no clustering cw compliance')
# Display the summary
print(logit_no_clustering_cw_compliance.summary())

# Marginal effects
logit_no_clustering_cw_compliance_marginal_effects = logit_no_clustering_cw_compliance.get_margeff(at='overall')
print(logit_no_clustering_cw_compliance_marginal_effects.summary())


Optimization terminated successfully.
         Current function value: 0.654309
         Iterations 5
logit no clustering cw compliance
                           Logit Regression Results                           
Dep. Variable:             compliance   No. Observations:                 1434
Model:                          Logit   Df Residuals:                     1426
Method:                           MLE   Df Model:                            7
Date:                Tue, 05 Dec 2023   Pseudo R-squ.:                 0.05297
Time:                        00:08:56   Log-Likelihood:                -938.28
converged:                       True   LL-Null:                       -990.76
Covariance Type:                  HC3   LLR p-value:                 1.019e-19
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                       

In [172]:
# Define and fit the OLS model with clustered standard errors
lpm_with_clustering_cw_compliance = smf.ols('compliance ~ conversation_length_thousands + consolidated_num_steps_ideas + conversation_length_thousands_2 + consolidated_num_steps_ideas_2 + fres + fres_2 + task_conversation + model', data=cw_data).fit(cov_type='cluster', cov_kwds={'groups': cw_data['task_conversation_method']})

# Print the model summary
print('lpm with clustering cw compliance')
print(lpm_with_clustering_cw_compliance.summary())


lpm with clustering cw compliance
                            OLS Regression Results                            
Dep. Variable:             compliance   R-squared:                       0.203
Model:                            OLS   Adj. R-squared:                  0.140
Method:                 Least Squares   F-statistic:                     12.22
Date:                Tue, 05 Dec 2023   Prob (F-statistic):          5.08e-108
Time:                        00:08:56   Log-Likelihood:                -874.55
No. Observations:                1434   AIC:                             1963.
Df Residuals:                    1327   BIC:                             2527.
Df Model:                         106                                         
Covariance Type:              cluster                                         
                                      coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------

### GSM8K Regression with Provided Answer Complexity Interaction

In [173]:
print(list(gsm8k_data.columns))

# Length_provided in thousands
gsm8k_data['length_provided_thousands'] = gsm8k_data['length_provided']/1000

# Squared length_provided in thousands
gsm8k_data['length_provided_thousands_2'] = gsm8k_data['length_provided_thousands']**2


['model_task_method', 'conversation_number', 'coherence_1_incoherent_10_very_coherent', 'compliance_OLD', 'ease_of_review_1_easy_10_hard', 'correct', 'Prediction_Based_On_First_10', 'Prediction_Based_On_Last_10', 'Aggregated_Prediction', 'Prediction_Based_On_First_10_LP', 'response_Based_On_First_10_LP', 'Prediction_Based_On_Last_10_LP', 'response_Based_On_Last_10_LP', 'response_LP', 'Aggregated_Prediction_LP', 'Prediction_Based_On_First_50_LP', 'response_Based_On_First_50_LP', 'Prediction_Based_On_Last_50_LP', 'response_Based_On_Last_50_LP', 'Aggregated_Prediction_50_LP', 'Prediction_Based_On_random_50_LP_1', 'response_Based_On_random_50_LP_1', 'Prediction_Based_On_random_50_LP_2', 'response_Based_On_random_50_LP_2', 'Aggregated_Prediction_random_50_LP', 'Unnamed: 0_x', 'response_x', 'replace_slash_n_slash_n_with_newline_x', 'replace_slash_n_slash_n_with_newline_values_x', 'replace_slash_n_with_newline_x', 'replace_slash_n_with_newline_values_x', 'avg_cosine_sim', 'num_sentences_x', '

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gsm8k_data['length_provided_thousands'] = gsm8k_data['length_provided']/1000
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gsm8k_data['length_provided_thousands_2'] = gsm8k_data['length_provided_thousands']**2


In [174]:
# Define the logistic regression model
logit_no_clustering_gsm8k_provided_interaction = smf.logit('correct ~ length_provided_thousands * conversation_length_thousands + length_provided_thousands * consolidated_num_steps_ideas + length_provided_thousands * conversation_length_thousands_2 + length_provided_thousands * consolidated_num_steps_ideas_2 + length_provided_thousands * model + length_provided_thousands_2', data=gsm8k_data).fit(cov_type='HC3')

# Display the summary
print(logit_no_clustering_gsm8k_provided_interaction.summary())

# Marginal effects
logit_no_clustering_gsm8k_provided_interaction_marginal_effects = logit_no_clustering_gsm8k_provided_interaction.get_margeff(at='overall')
print(logit_no_clustering_gsm8k_provided_interaction_marginal_effects.summary())


Optimization terminated successfully.
         Current function value: 0.541507
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                correct   No. Observations:                 1600
Model:                          Logit   Df Residuals:                     1587
Method:                           MLE   Df Model:                           12
Date:                Tue, 05 Dec 2023   Pseudo R-squ.:                  0.2010
Time:                        00:08:56   Log-Likelihood:                -866.41
converged:                       True   LL-Null:                       -1084.4
Covariance Type:                  HC3   LLR p-value:                 8.873e-86
                                                                coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------------------------------
Intercept     

In [175]:
# Define and fit the OLS model with clustered standard errors
lpm_with_clustering_gsm8k_provided_interaction = smf.ols('correct ~ length_provided_thousands * conversation_length_thousands + length_provided_thousands * consolidated_num_steps_ideas + length_provided_thousands * conversation_length_thousands_2 + length_provided_thousands * consolidated_num_steps_ideas_2 + length_provided_thousands * task_conversation + length_provided_thousands * model + length_provided_thousands_2', data=gsm8k_data).fit(cov_type='cluster', cov_kwds={'groups': gsm8k_data['task_conversation_method']})

# Print the model summary
print(lpm_with_clustering_gsm8k_provided_interaction.summary())




                            OLS Regression Results                            
Dep. Variable:                correct   R-squared:                       0.407
Model:                            OLS   Adj. R-squared:                  0.319
Method:                 Least Squares   F-statistic:                     71.75
Date:                Tue, 05 Dec 2023   Prob (F-statistic):               0.00
Time:                        00:08:56   Log-Likelihood:                -718.03
No. Observations:                1600   AIC:                             1852.
Df Residuals:                    1392   BIC:                             2971.
Df Model:                         207                                         
Covariance Type:              cluster                                         
                                                                coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------

### Add model interaction terms

In [176]:
# Define the logistic regression model
logit_no_clustering_gsm8k_model_interaction = smf.logit('correct ~ conversation_length_thousands * model + consolidated_num_steps_ideas * model + conversation_length_thousands_2 * model + consolidated_num_steps_ideas_2 * model', data=gsm8k_data).fit(cov_type='HC3')

# Display the summary
print(logit_no_clustering_gsm8k_model_interaction.summary())

# Marginal effects
logit_no_clustering_gsm8k_marginal_effects_model_interaction = logit_no_clustering_gsm8k_model_interaction.get_margeff(at='overall')
print(logit_no_clustering_gsm8k_marginal_effects_model_interaction.summary())


Optimization terminated successfully.
         Current function value: 0.566818
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                correct   No. Observations:                 1600
Model:                          Logit   Df Residuals:                     1590
Method:                           MLE   Df Model:                            9
Date:                Tue, 05 Dec 2023   Pseudo R-squ.:                  0.1637
Time:                        00:08:56   Log-Likelihood:                -906.91
converged:                       True   LL-Null:                       -1084.4
Covariance Type:                  HC3   LLR p-value:                 5.343e-71
                                                   coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------------------
Intercept                               

In [177]:
# Define and fit the OLS model with clustered standard errors
lpm_with_clustering_gsm8k_model_interaction = smf.ols('correct ~ conversation_length_thousands * model + consolidated_num_steps_ideas * model + conversation_length_thousands_2 * model + consolidated_num_steps_ideas_2 * model + task_conversation', data=gsm8k_data).fit(cov_type='cluster', cov_kwds={'groups': gsm8k_data['task_conversation_method']})

# Print the model summary
print(lpm_with_clustering_gsm8k_model_interaction.summary())


                            OLS Regression Results                            
Dep. Variable:                correct   R-squared:                       0.361
Model:                            OLS   Adj. R-squared:                  0.314
Method:                 Least Squares   F-statistic:                     28.54
Date:                Tue, 05 Dec 2023   Prob (F-statistic):          1.64e-210
Time:                        00:08:56   Log-Likelihood:                -778.49
No. Observations:                1600   AIC:                             1775.
Df Residuals:                    1491   BIC:                             2361.
Df Model:                         108                                         
Covariance Type:              cluster                                         
                                                   coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------

In [178]:
# Define and fit the OLS model with clustered standard errors
reg_with_clustering_cw_model_interaction = smf.ols("avg_inter_paragraph_cosine_sim ~ conversation_length_thousands * model + consolidated_num_steps_ideas * model + conversation_length_thousands_2 * model + consolidated_num_steps_ideas_2 * model + fres * model + fres_2 * model + task_conversation",
                                 data=cw_data).fit(cov_type='cluster', cov_kwds={'groups': cw_data['task_conversation_method']})

# Print the model summary
print(reg_with_clustering_cw_model_interaction.summary())


                                  OLS Regression Results                                  
Dep. Variable:     avg_inter_paragraph_cosine_sim   R-squared:                       0.438
Model:                                        OLS   Adj. R-squared:                  0.391
Method:                             Least Squares   F-statistic:                     20.59
Date:                            Tue, 05 Dec 2023   Prob (F-statistic):          4.44e-168
Time:                                    00:08:56   Log-Likelihood:                 941.60
No. Observations:                            1434   AIC:                            -1657.
Df Residuals:                                1321   BIC:                            -1062.
Df Model:                                     112                                         
Covariance Type:                          cluster                                         
                                                   coef    std err          z      P>|z|  

In [179]:
# Define the logistic regression model
logit_no_clustering_cw_compliance_model_interaction = smf.logit('compliance ~ conversation_length_thousands * model + consolidated_num_steps_ideas * model + conversation_length_thousands_2 * model + consolidated_num_steps_ideas_2 * model + fres * model + fres_2 * model', data=cw_data).fit(cov_type='HC3')

print('logit no clustering cw compliance')
# Display the summary
print(logit_no_clustering_cw_compliance_model_interaction.summary())

# Marginal effects
logit_no_clustering_cw_compliance_marginal_effects_model_interaction = logit_no_clustering_cw_compliance_model_interaction.get_margeff(at='overall')
print(logit_no_clustering_cw_compliance_marginal_effects_model_interaction.summary())


Optimization terminated successfully.
         Current function value: 0.650621
         Iterations 5
logit no clustering cw compliance
                           Logit Regression Results                           
Dep. Variable:             compliance   No. Observations:                 1434
Model:                          Logit   Df Residuals:                     1420
Method:                           MLE   Df Model:                           13
Date:                Tue, 05 Dec 2023   Pseudo R-squ.:                 0.05831
Time:                        00:08:56   Log-Likelihood:                -932.99
converged:                       True   LL-Null:                       -990.76
Covariance Type:                  HC3   LLR p-value:                 1.531e-18
                                                   coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------------------
Interc

In [180]:
# Define and fit the OLS model with clustered standard errors
lpm_with_clustering_cw_compliance_model_interaction = smf.ols('compliance ~ conversation_length_thousands * model + consolidated_num_steps_ideas * model + conversation_length_thousands_2 * model + consolidated_num_steps_ideas_2 * model + fres * model + fres_2 * model + task_conversation * model', data=cw_data).fit(cov_type='cluster', cov_kwds={'groups': cw_data['task_conversation_method']})

# Print the model summary
print('lpm with clustering cw compliance')
print(lpm_with_clustering_cw_compliance_model_interaction.summary())


lpm with clustering cw compliance
                            OLS Regression Results                            
Dep. Variable:             compliance   R-squared:                       0.321
Model:                            OLS   Adj. R-squared:                  0.204
Method:                 Least Squares   F-statistic:                     67.99
Date:                Tue, 05 Dec 2023   Prob (F-statistic):               0.00
Time:                        00:08:56   Log-Likelihood:                -759.90
No. Observations:                1434   AIC:                             1944.
Df Residuals:                    1222   BIC:                             3061.
Df Model:                         211                                         
Covariance Type:              cluster                                         
                                                   coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------