In [1]:
# load relevant packages
from pypdf import PdfReader
import os
import re
import pandas as pd
import numpy as np
from autocorrect import spell 

# Set option to display all rows
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Questionnaire Parsing and Formatting

In [2]:
# read in pdf questionnaire and convert it to txt

# creating a pdf reader object 
reader = PdfReader('World Value Survey/WVS_7_Codebook_Core_Variables.pdf') 
  
# printing number of pages in pdf file 
print("Length of Questionnaire: ", len(reader.pages)) 
  
# getting a specific page from the pdf file 
page = reader.pages[0] 

content = "------------- Content of Questionnaire -------------"
# extracting text from page 
for i in range(0,len(reader.pages)):
    content = content + "\n" + reader.pages[i].extract_text() 
    
print(content)
 

Length of Questionnaire:  74
------------- Content of Questionnaire -------------
 
 
10 
 
The WORLD VALUES SURVEY ASSOCIATION 
www.worldvaluessurvey.org  
 
Core Variables 
Social Values, Norms, Stereotypes (Q1-Q45) 
Q1 Important in life: Family  
For each of the following aspects, indicate how imp ortant it is in your life. Would you 
say it is very important, rather important, not ver y important or not important at all? – 
Family  
1.- Very important 
2.- Rather important 
3.- Not very important 
4.- Not at all important 
-1-.- Don´t know 
-2-.- No answer 
-4-.- Not asked in this country 
-5-.- Missing; Not available  
Q2 Important in life: Friends  
For each of the following aspects, indicate how imp ortant it is in your life. Would you 
say it is very important, rather important, not ver y important or not important at all? – 
Friends  
1.- Very important 
2.- Rather important 
3.- Not very important 
4.- Not at all important 
-1-.- Don´t know 
-2-.- No answer 
-4-.- Not asked i

### Extraction Method: Regular Expression

In [3]:
# extract relevant information from questions Q1 - Q290 in WVS

# Define the pattern using regular expression
pattern = re.compile(r'Q\d+.*?(?=Q\d+|$)', re.DOTALL)


# Find all matches
matches = pattern.findall(content)
filtered_matches = [match.strip() for match in matches if len(match.strip()) >= 10]

In [4]:
# create dataframe
questionnaire = pd.DataFrame({
    "question": [np.nan] * len(filtered_matches), 
    "answer options": [np.nan] * len(filtered_matches),
    "list_answer_options": [np.nan] * len(filtered_matches)
    })

# irrelevant substrings
substring = r'\d+\s+The WORLD VALUES SURVEY ASSOCIATION \nwww.worldvaluessurvey.org'

for i,k in enumerate(filtered_matches):
    k = re.sub(substring, "", k)
    split_string = re.split("(\n1|\n2|\n 0.|\n 1.-|\n 10.-)", k, maxsplit=1)
    try:
        questionnaire["question"][i] = split_string[0]
        questionnaire["answer options"][i] = split_string[1] + split_string[2]
    except:
        print("An exception occurred. This might be a follow up question which was parsed incorrectly.")
        
questionnaire["answer options"] = questionnaire["answer options"].str.replace("\n", "")
questionnaire["question"] = questionnaire["question"].str.replace("\n", "")

# Extract the question ID using regex and create a new column
questionnaire['question_ID'] = questionnaire['question'].str.extract(r'(Q\d+)')
questionnaire['full_question_ID'] = questionnaire['question'].str.extract(r'(Q\d+\w*)')

questionnaire['question'] = questionnaire['question'].str.replace(r'(Q\d+\w*)\s* ', '', regex=True)

An exception occurred. This might be a follow up question which was parsed incorrectly.
An exception occurred. This might be a follow up question which was parsed incorrectly.
An exception occurred. This might be a follow up question which was parsed incorrectly.
An exception occurred. This might be a follow up question which was parsed incorrectly.
An exception occurred. This might be a follow up question which was parsed incorrectly.
An exception occurred. This might be a follow up question which was parsed incorrectly.
An exception occurred. This might be a follow up question which was parsed incorrectly.
An exception occurred. This might be a follow up question which was parsed incorrectly.
An exception occurred. This might be a follow up question which was parsed incorrectly.
An exception occurred. This might be a follow up question which was parsed incorrectly.
An exception occurred. This might be a follow up question which was parsed incorrectly.
An exception occurred. This migh

For each of the following aspects, indicate how imp ortant it is in your life. Would you 
say it is very important, rather important, not ver y important or not important at all? – 
Family  ' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  questionnaire["question"][i] = split_string[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questionnaire["answer options"][i] = split_string[1] + split_string[2]
1.- Very important 
2.- Rather important 
3.- Not very important 
4.- Not at all important 
-1-.- Don´t know 
-2-.- No answer 
-4-.- Not asked in this country 
-5-.- Missing; Not available' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  questionnaire["answer options"][i] = split_string[1] + split_string[2]


#### Autocorrection for questions

In [5]:
import re
from symspellpy.symspellpy import SymSpell, Verbosity

# Initialize SymSpell object
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Load the frequency dictionary
dictionary_path = "frequency_dictionary_en_82_765.txt"
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

# Load the bigram dictionary
bigram_path = "frequency_bigramdictionary_en_243_342.txt"
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

split_pattern = re.compile(r'([^\w\s]+)')
list_delimiters = [",", ":", ".", "?", "!", ";"]

def correct_sentence(sentence):
    # Split the sentence by non-character delimiters while keeping the delimiters
    tokens = split_pattern.split(sentence)
    #print(tokens)
    corrected_sentence = []
    
    for token in tokens:
        if token in list_delimiters:
            corrected_sentence.append(token + " ")
        else:
            suggestions = sym_spell.lookup_compound(token, max_edit_distance=2,transfer_casing=True)
            corrected_sentence.append(suggestions[0].term if suggestions else token)
    
    return "".join(corrected_sentence)


In [6]:
# correct question column

for i,k in enumerate(questionnaire["question"]):
    corrected_sentence = correct_sentence(k)
    questionnaire["question"][i] = corrected_sentence
    
# create answer option lists



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questionnaire["question"][i] = corrected_sentence
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questionnaire["question"][i] = corrected_sentence
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questionnaire["question"][i] = corrected_sentence
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questionnaire["que

In [15]:
def extract_content_with_delimiters(text):
    pattern = r'(-?\d+)(.*?)(?=-?\d|$)'
    matches = re.findall(pattern, text)
    results = []

    for match in matches:
        current_number = match[0]
        content = match[1]
        results.append(f'{current_number}{content}')

    return results


for i,k in enumerate(questionnaire["answer options"]):
    try:
        questionnaire["list_answer_options"][i] = extract_content_with_delimiters(k)
    except:
        print(f"An error occured with parsing the content of row {i} with content {k}")



An error occured with parsing the content of row 114 with content nan
An error occured with parsing the content of row 117 with content nan
An error occured with parsing the content of row 120 with content nan
An error occured with parsing the content of row 123 with content nan
An error occured with parsing the content of row 126 with content nan
An error occured with parsing the content of row 129 with content nan
An error occured with parsing the content of row 132 with content nan
An error occured with parsing the content of row 135 with content nan
An error occured with parsing the content of row 138 with content nan
An error occured with parsing the content of row 141 with content nan
An error occured with parsing the content of row 144 with content nan
An error occured with parsing the content of row 147 with content nan
An error occured with parsing the content of row 216 with content nan
An error occured with parsing the content of row 268 with content nan
An error occured wit

In [21]:
print("The uncleaned questionnaire contains: ", questionnaire.shape[0], " rows.")

# we have to remove nan cells due to incorrect parsing and because the questions are not relevant
questionnaire = questionnaire.dropna(subset=['question', 'answer options'])
print("The uncleaned questionnaire contains: ", questionnaire.shape[0], " rows.")

The uncleaned questionnaire contains:  292  rows.


### Extraction Method: ChatGPT Extraction

In [17]:
%pip uninstall -y transformer-engine
%pip install torch==2.2.0
%pip install transformers --upgrade
%pip install flash-attn==2.2.0


^C
Note: you may need to restart the kernel to use updated packages.




Note: you may need to restart the kernel to use updated packages.


#### Llama3

In [8]:
import transformers
import torch

In [9]:
model_id = "meta-llama/Meta-Llama-3-8B"

pipeline = transformers.pipeline(
        "text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto"
)
pipeline("Hey how are you doing today?")


RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
cannot import name 'formatargspec' from 'inspect' (c:\Users\ruppr\anaconda3\Lib\inspect.py)

In [6]:
output

{'error': 'Authorization header is correct, but the token seems invalid'}

#### Mistral-8x7B

In [51]:
import autogen

config_list = [
    {
        # Choose your model name.
        #"api_type": "open_ai",
        "base_url": "http://localhost:1234/v1",
        # You need to provide your API key here.
        "api_key": "NULL",
    }
]

llm_config = {
    #"request_timeout": 10,
    "seed": 42,
    "config_list": config_list,
    "temperature": 0
}

assistent = autogen.AssistantAgent(
    name = "assistent",
    system_message="You are a bot trying to create a python dataframe out of a questionnaire in text format.",
    llm_config=llm_config
)

user_proxy = autogen.UserProxyAgent(
    name = "user_proxy",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=10,
    is_termination_msg=lambda x: x.get("content", "").rstrip().endswith("TERMINATE"),
    code_execution_config={"work_dir": "web"},
    llm_config=llm_config,
    system_message="Reply TERMINATE if the task has been successfully solved. Otherwise reply CONTINUE."
)

task = f"""Write a python dataframe which contains three columns with information extracted from the given text. The three columns
are 'question label', 'question' and 'answer options'. This is the text the information should be extracted from: """ 


In [52]:
user_proxy.initiate_chat(
    assistent,
    message = task
)

[33muser_proxy[0m (to assistent):

Write a python dataframe which contains three columns with information extracted from the given text. The three columns
are 'question label', 'question' and 'answer options'. This is the text the information should be extracted from: 

--------------------------------------------------------------------------------


BadRequestError: Error code: 400 - {'error': '<LM Studio error> Exit code -1073740791.. Error Data: n/a, Additional Data: n/a'}

In [11]:
from pathlib import Path

from autogen import AssistantAgent, UserProxyAgent
from autogen.coding import LocalCommandLineCodeExecutor

# Setting up the code executor.
workdir = Path("coding")
workdir.mkdir(exist_ok=True)
code_executor = LocalCommandLineCodeExecutor(work_dir=workdir)

# Setting up the agents.
user_proxy_agent = UserProxyAgent(
    name="User",
    code_execution_config={"executor": code_executor},
    is_termination_msg=lambda msg: "TERMINATE" in msg.get("content"),
)

assistant_agent = AssistantAgent(
    name="Mistral Assistant",
    llm_config={"config_list": config_list},
)

ModuleNotFoundError: No module named 'autogen.coding'

# Descriptive Statistics of WVS questionnaire
In this section I am exploring the structure of the WVS questionnaire and the information I extracted in the section above. This is needed and helpful for creating the perturbations reliably.

In [122]:
import pdfplumber
pdf = pdfplumber.open("World Value Survey/WVS7_Explanatory_note_on_scales_recoding.pdf")
levels = []

for i in range(1, len(pdf.pages)):
    page =  pdf.pages[i]
    table = page.extract_table()
    levels.append(table)
    
cleaned_levels = []
for p in range(0,len(levels)):
    for i in levels[p]:
        if i[0] == "":
            print("Header detected and not added.")
        else:
            try: 
                new_list = [x for x in i if x not in ["", None]]
                cleaned_levels.append(new_list)
                print("List appended.")
            except: "List was already deleted."
        
    
#cleaned_levels

Header detected and not added.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
Header detected and not added.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
Header detected and not added.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
Header detected and not added.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
Header detected and not added.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
List appended.
Header detected and not added.
List appended.
List appended.
List

In [123]:
columns = ["ID", "Variables","Original scale", "Recoded “positive” scale", ""]
scales = pd.DataFrame(cleaned_levels, columns=columns)
for column in columns:
    scales[column] = scales[column].str.replace("\n", " ")
scales = scales.iloc[:,:4]

In [124]:
scales["list_answer_options"] = np.nan * scales.shape[0]


for i,k in enumerate(scales["Original scale"]):
    try:
        scales["list_answer_options"][i] = extract_content_with_delimiters(k)
    except:
        print(f"An error occured with parsing the content of row {i} with content {k}")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scales["list_answer_options"][i] = extract_content_with_delimiters(k)
  scales["list_answer_options"][i] = extract_content_with_delimiters(k)


In [125]:
scales["applies_to"] = np.nan * scales.shape[0]


for i,k in enumerate(scales["ID"]):
    try:
        k = k[1:]
        prefix, range_str = k.split("-")
        start = int(prefix)
        end = int(range_str)
        scales["applies_to"][i] = ["Q" + str(i) for i in range(start, end + 1)]
    except:
        scales["applies_to"][i] = "Q" + k
        #print(f"Applies to a single question.")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scales["applies_to"][i] = ["Q" + str(i) for i in range(start, end + 1)]
  scales["applies_to"][i] = ["Q" + str(i) for i in range(start, end + 1)]


# Perturbation

## Perturbation 1: Option Ordering
Here I are changing the order of the answer options. Primarily I am interested in changing the order of the options so that the prior first option becomes the last and vice versa.

In [126]:
df_option_order = pd.DataFrame(scales)
df_option_order["list_answer_options_recoded"] = np.nan * df_option_order.shape[0]


for i,k in enumerate(df_option_order["Recoded “positive” scale"]):
    try:
        df_option_order["list_answer_options_recoded"][i] = extract_content_with_delimiters(k)
    except:
        print(f"An error occured with parsing the content of row {i} with content {k}")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_option_order["list_answer_options_recoded"][i] = extract_content_with_delimiters(k)
  df_option_order["list_answer_options_recoded"][i] = extract_content_with_delimiters(k)


In [127]:
# include refusal categories in every list
# answer option "-3-.- Not applicable (no first answer)" is left out because it occurs just 4 times
refusals = ["-1=Don´t know", "-2=No answer", "-4=Not asked in this country", "-5=Missing; Not available"]

for index, row in df_option_order.iterrows():
    row['list_answer_options'].extend(refusals)
    row["list_answer_options_recoded"].extend(refusals)


In [128]:
df_option_order.head()

Unnamed: 0,ID,Variables,Original scale,Recoded “positive” scale,list_answer_options,applies_to,list_answer_options_recoded
0,Q1-6,"Importance of family, friends, work, politics, religion",1=Very important 2=Rather important 3=Not very important 4=Not important at all,4=Very important 3=Rather important 2=Not very important 1=Not important at all,"[1=Very important , 2=Rather important , 3=Not very important , 4=Not important at all, -1=Don´t know, -2=No answer, -4=Not asked in this country, -5=Missing; Not available]","[Q1, Q2, Q3, Q4, Q5, Q6]","[4=Very important , 3=Rather important , 2=Not very important , 1=Not important at all, -1=Don´t know, -2=No answer, -4=Not asked in this country, -5=Missing; Not available]"
1,Q7-17,Important child qualities,1=Mentioned 2=Not mentioned,1=Mentioned 0=Not mentioned,"[1=Mentioned , 2=Not mentioned, -1=Don´t know, -2=No answer, -4=Not asked in this country, -5=Missing; Not available]","[Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15, Q16, Q17]","[1=Mentioned , 0=Not mentioned, -1=Don´t know, -2=No answer, -4=Not asked in this country, -5=Missing; Not available]"
2,Q18-26,Not acceptable as neighbours,1=Mentioned 2=Not mentioned,1=Mentioned 0=Not mentioned,"[1=Mentioned , 2=Not mentioned, -1=Don´t know, -2=No answer, -4=Not asked in this country, -5=Missing; Not available]","[Q18, Q19, Q20, Q21, Q22, Q23, Q24, Q25, Q26]","[1=Mentioned , 0=Not mentioned, -1=Don´t know, -2=No answer, -4=Not asked in this country, -5=Missing; Not available]"
3,Q27-32,Set of statements on social attitudes,1=Strongly agree 2=Agree 3=Disagree 4=Strongly disagree,4=Strongly agree 3=Agree 2=Disagree 1=Strongly disagree,"[1=Strongly agree , 2=Agree , 3=Disagree , 4=Strongly disagree, -1=Don´t know, -2=No answer, -4=Not asked in this country, -5=Missing; Not available]","[Q27, Q28, Q29, Q30, Q31, Q32]","[4=Strongly agree , 3=Agree , 2=Disagree , 1=Strongly disagree, -1=Don´t know, -2=No answer, -4=Not asked in this country, -5=Missing; Not available]"
4,Q33-41,Set of statements on social attitudes,"1=Strongly agree 2=Agree 3=Neither agree, nor disagree 4=Disagree 5=Strongly disagree","5=Strongly agree 4=Agree 3=Neither agree, nor disagree 2=Disagree 1=Strongly disagree","[1=Strongly agree , 2=Agree , 3=Neither agree, nor disagree , 4=Disagree , 5=Strongly disagree, -1=Don´t know, -2=No answer, -4=Not asked in this country, -5=Missing; Not available]","[Q33, Q34, Q35, Q36, Q37, Q38, Q39, Q40, Q41]","[5=Strongly agree , 4=Agree , 3=Neither agree, nor disagree , 2=Disagree , 1=Strongly disagree, -1=Don´t know, -2=No answer, -4=Not asked in this country, -5=Missing; Not available]"


## Perturbation 2: Refusal Category
Here I create a dataframe which does not contain any refusal categories.

## Perturbation 3: Middle Category
Here I check whether the answer options contain a middle category and create a dataframe which avoids middle categories.