<a href="https://colab.research.google.com/github/jgbrenner/psychometrics/blob/main/Copy_of_Final_model_PsychC9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Generating Psychometric Scale Items in Polish Measuring Perfectionism Using Generative AI and Conducting Exploratory Graph Analysis**



In [1]:
# Install necessary Python packages
!pip install openai groq rpy2

Collecting groq
  Downloading groq-0.12.0-py3-none-any.whl.metadata (13 kB)
Downloading groq-0.12.0-py3-none-any.whl (108 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.9/108.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.12.0


In [2]:
# Import necessary Python libraries
import os
import openai
import pandas as pd
import numpy as np
import json
import requests
import re
from google.colab import drive, userdata

In [3]:
# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Define the path where the R library will be saved in Google Drive
library_path = '/content/drive/MyDrive/R_libraries'

# Create the directory if it doesn't exist
if not os.path.exists(library_path):
    os.makedirs(library_path)

Mounted at /content/drive


In [4]:
# Verify rpy2 version
import rpy2
print(f"rpy2 version: {rpy2.__version__}")

rpy2 version: 3.4.2


In [5]:
# Fetching the OpenAI API key from Colab secrets
openai_api_key = userdata.get('OPENAI_API_KEY')
if openai_api_key:
    os.environ["OPENAI_API_KEY"] = openai_api_key
else:
    raise ValueError("OPENAI_API_KEY is not set or invalid.")

# Import OpenAI Client
from openai import Client
client = Client()

In [6]:
# Prompting GPT-4o-mini to generate items
messages = [
    {
        "role": "system",
        "content": "Jesteś ekspertem w psychometrii, który tworzy pytania testowe."
    },
    {
        "role": "user",
        "content": (
            "Wygeneruj po 3 stwierdzenia dla każdego z następujących konstruktów: "
            "Perfekcjonizm skierowany na siebie: Tendencja do wymagania doskonałości od siebie samego, "
            "Perfekcjonizm skierowany na innych: Stawianie wysokich oczekiwań wobec innych i krytyczna ocena ich osiągnięć, "
            "Perfekcjonizm społecznie narzucony: Przekonanie, że inni oczekują od nas doskonałości. "
            "Niech stwierdzenia będą zwięzłe i jasne,  odpowiednie do oceny w skali Likerta. "
            "Provide the output ONLY in JSON format as a list of dictionaries, "
            "without any additional text or explanation. "
            "Each dictionary should have keys 'construct' and 'item'."
        )
    }
]

try:
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0.7,
        max_tokens=8048,
        top_p=1,
        stream=False
    )
    response_content = completion.choices[0].message.content
    print("\nGenerated Items:\n", response_content)
except Exception as e:
    print(f"An error occurred: {e}")
    response_content = ""


Generated Items:
 ```json
[
    {"construct": "Perfekcjonizm skierowany na siebie", "item": "Czuję, że muszę osiągnąć doskonałość we wszystkim, co robię."},
    {"construct": "Perfekcjonizm skierowany na siebie", "item": "Czuję się źle, gdy nie spełniam własnych wysokich oczekiwań."},
    {"construct": "Perfekcjonizm skierowany na siebie", "item": "Zdarza mi się marnować czas na poprawianie drobnych błędów."},
    
    {"construct": "Perfekcjonizm skierowany na innych", "item": "Oczekuję, że inni będą zawsze osiągać perfekcję."},
    {"construct": "Perfekcjonizm skierowany na innych", "item": "Krytykuję innych, gdy nie spełniają moich wysokich standardów."},
    {"construct": "Perfekcjonizm skierowany na innych", "item": "Często porównuję osiągnięcia innych do moich oczekiwań."},
    
    {"construct": "Perfekcjonizm społecznie narzucony", "item": "Czuję, że społeczeństwo oczekuje ode mnie doskonałości."},
    {"construct": "Perfekcjonizm społecznie narzucony", "item": "Obawiam się, ż

In [7]:
# Attempt to parse the JSON response
try:
    generated_items = json.loads(response_content)
    items_df = pd.DataFrame(generated_items)
except json.JSONDecodeError as e:
    print(f"JSON parsing failed: {e}")
    # Attempt to extract valid JSON content from the response
    json_match = re.search(r'\[.*\]', response_content, re.DOTALL)
    if json_match:
        json_str = json_match.group(0)
        print("Extracted JSON String:")
        print(json_str)
        try:
            generated_items = json.loads(json_str)
            items_df = pd.DataFrame(generated_items)
        except json.JSONDecodeError as e2:
            print(f"Second JSON decoding attempt failed: {e2}")
            items_df = None
    else:
        print("No valid JSON found in the response.")
        items_df = None

JSON parsing failed: Expecting value: line 1 column 1 (char 0)
Extracted JSON String:
[
    {"construct": "Perfekcjonizm skierowany na siebie", "item": "Czuję, że muszę osiągnąć doskonałość we wszystkim, co robię."},
    {"construct": "Perfekcjonizm skierowany na siebie", "item": "Czuję się źle, gdy nie spełniam własnych wysokich oczekiwań."},
    {"construct": "Perfekcjonizm skierowany na siebie", "item": "Zdarza mi się marnować czas na poprawianie drobnych błędów."},
    
    {"construct": "Perfekcjonizm skierowany na innych", "item": "Oczekuję, że inni będą zawsze osiągać perfekcję."},
    {"construct": "Perfekcjonizm skierowany na innych", "item": "Krytykuję innych, gdy nie spełniają moich wysokich standardów."},
    {"construct": "Perfekcjonizm skierowany na innych", "item": "Często porównuję osiągnięcia innych do moich oczekiwań."},
    
    {"construct": "Perfekcjonizm społecznie narzucony", "item": "Czuję, że społeczeństwo oczekuje ode mnie doskonałości."},
    {"construct": "P

In [8]:
# Save the items to CSV if parsing succeeds
if items_df is not None and not items_df.empty:
    items_df.to_csv("generated_items_pool.csv", index=False)
    print("Generated items saved to 'generated_items_pool.csv'.")
else:
    print("No items to save.")
    items_df = None

Generated items saved to 'generated_items_pool.csv'.


In [9]:
# Prepare the list of items
if items_df is not None:
    # Clean and validate item texts
    items_df['item'] = items_df['item'].astype(str).str.strip()
    # Filter out empty items
    items_df = items_df[items_df['item'] != '']
    # Reset index after filtering
    items_df.reset_index(drop=True, inplace=True)

    # Now prepare item_texts for embeddings
    item_texts = items_df["item"].tolist()

    # Set up the API endpoint and headers for OpenAI embeddings
    embedding_endpoint = "https://api.openai.com/v1/embeddings"
    embedding_model = "text-embedding-3-small"

    headers = {
        "Authorization": f"Bearer {openai_api_key}",
        "Content-Type": "application/json"
    }

    # Prepare the data payload
    data = {
        "model": embedding_model,
        "input": item_texts
    }

    # Make the API request to generate embeddings
    try:
        response = requests.post(embedding_endpoint, headers=headers, json=data)
        if response.status_code != 200:
            print(f"Request failed with status code {response.status_code}: {response.text}")
            embeddings_array = None
        else:
            response_data = response.json()
            # Extract embeddings
            embeddings = [item['embedding'] for item in response_data['data']]
            embeddings_array = np.array(embeddings)
            print(f"Embeddings generated successfully. Shape: {embeddings_array.shape}")
            # Save the embeddings
            np.save("embeddings.npy", embeddings_array)
            print("Embeddings saved as 'embeddings.npy'.")
    except Exception as e:
        print(f"An error occurred during embedding generation: {e}")
        embeddings_array = None
else:
    print("Error: items_df is not defined. Cannot proceed with embedding generation.")
    embeddings_array = None

Embeddings generated successfully. Shape: (9, 1536)
Embeddings saved as 'embeddings.npy'.


In [10]:

# Ensure that items_df and embeddings_array have the same number of items
if embeddings_array is not None and items_df is not None:
    if embeddings_array.shape[0] != len(items_df):
        print(f"Embeddings count ({embeddings_array.shape[0]}) does not match items count ({len(items_df)}). Adjusting items_df.")
        # Truncate items_df to match embeddings_array size
        items_df = items_df.iloc[:embeddings_array.shape[0]]
        items_df.reset_index(drop=True, inplace=True)


In [11]:
# Define custom abbreviations for constructs
construct_abbreviations = {
    "Perfekcjonizm skierowany na siebie": "PSS",
    "Perfekcjonizm społecznie narzucony": "PSP",
    "Perfekcjonizm skierowany na innych": "PSI"
}

In [12]:
# Apply custom labels with error handling for unexpected constructs
if items_df is not None:
    items_df['item_label'] = items_df.apply(
        lambda row: f"{construct_abbreviations.get(row['construct'], 'UNK')}{row.name+1}", axis=1
    )

    # Verify the generated labels
    item_labels = items_df['item_label'].tolist()
    constructs = items_df['construct'].tolist()

    # Export labeled items for R
    items_df.to_csv("psychometric_items.csv", index=False)
    print("Items exported to 'psychometric_items.csv'.")
else:
    print("No items to label or export.")

Items exported to 'psychometric_items.csv'.


In [13]:
# Prepare data for R analysis
if embeddings_array is not None and items_df is not None:
    # Pass variables from Python to R
    # Install rpy2 for R and Python integration
    %load_ext rpy2.ipython
    %R -i embeddings_array -i item_labels -i constructs -i library_path
else:
    print("Cannot proceed to R analysis due to missing data.")

In [14]:
%%R

# Set the custom library path
.libPaths(library_path)

# List of required packages
required_packages <- c("EGAnet", "aricode", "psych", "dplyr", "stringr", "readr", "qgraph")

# Install and load necessary R packages
for (pkg in required_packages) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
        install.packages(pkg, lib = library_path, dependencies = TRUE)
    }
    library(pkg, character.only = TRUE)
}


EGAnet (version 2.1.0)[0m[0m 

For help getting started, see <https://r-ega.net> 

For bugs and errors, submit an issue to <https://github.com/hfgolino/EGAnet/issues>

Attaching package: ‘dplyr’



    filter, lag



    intersect, setdiff, setequal, union




In [16]:
%%R

# Verify dimensions of embeddings_array and item_labels
print(dim(embeddings_array))  # Should return (number of items, embedding dimensions)
print(length(item_labels))    # Should match the number of items in embeddings_array

# Ensure the dimensions match
if (nrow(embeddings_array) != length(item_labels)) {
    stop("The number of rows in embeddings_array does not match the length of item_labels.")
}

[1]    9 1536
[1] 9


In [17]:
%%R

#Convert embeddings_array to matrix and assign row names
embeddings_matrix <- as.matrix(embeddings_array)
rownames(embeddings_matrix) <- item_labels

# Compute the correlation matrix
cor_matrix <- cor(t(embeddings_matrix))

# Handle potential NA values in the correlation matrix
cor_matrix[is.na(cor_matrix)] <- 0

# Print the correlation matrix dimensions for verification
print(dim(cor_matrix))


[1] 9 9


In [18]:
%%R

# Apply Unique Variable Analysis (UVA) to identify redundant items
uva_result <- UVA(
    data = cor_matrix,
    n = nrow(cor_matrix),
    method = "wTO",
    threshold = 0.20  # Adjusted threshold as per AI-GENIE method
)

# Items to remove (redundant items)
redundant_items <- uva_result$redundant

# Print redundant items
print("Redundant Items:")
print(redundant_items)

[1] "Redundant Items:"
NULL


In [21]:
%%R

# Remove redundant items from embeddings and correlation matrix
embeddings_matrix_reduced <- embeddings_matrix[!rownames(embeddings_matrix) %in% redundant_items, ]
cor_matrix_reduced <- cor_matrix[!rownames(cor_matrix) %in% redundant_items, !colnames(cor_matrix) %in% redundant_items]

# Update item_labels and constructs
item_labels_reduced <- item_labels[!item_labels %in% redundant_items]
constructs_reduced <- constructs[!item_labels %in% redundant_items]

# Ensure constructs_reduced and item_labels_reduced are vectors
constructs_reduced <- as.vector(constructs_reduced)
item_labels_reduced <- as.vector(item_labels_reduced)

# Ensure alignment between constructs_reduced and item_labels_reduced
names(constructs_reduced) <- item_labels_reduced


In [26]:
%%R
# Print the structure of known_communities
print("Structure of known_communities:")
str(known_communities)

# Convert known_communities to a vector
known_communities_vector <- unlist(known_communities, use.names = FALSE)

# Similarly, ensure constructs_reduced is a vector
constructs_reduced_vector <- unlist(constructs_reduced, use.names = FALSE)

# Print the structures to confirm
print("Structure of known_communities_vector:")
str(known_communities_vector)
print("Structure of constructs_reduced_vector:")
str(constructs_reduced_vector)

[1] "Structure of known_communities:"
List of 9
 $ PSS1: chr "Perfekcjonizm skierowany na siebie"
 $ PSS2: chr "Perfekcjonizm skierowany na siebie"
 $ PSS3: chr "Perfekcjonizm skierowany na siebie"
 $ PSI4: chr "Perfekcjonizm skierowany na innych"
 $ PSI5: chr "Perfekcjonizm skierowany na innych"
 $ PSI6: chr "Perfekcjonizm skierowany na innych"
 $ PSP7: chr "Perfekcjonizm społecznie narzucony"
 $ PSP8: chr "Perfekcjonizm społecznie narzucony"
 $ PSP9: chr "Perfekcjonizm społecznie narzucony"
[1] "Structure of known_communities_vector:"
 chr [1:9] "Perfekcjonizm skierowany na siebie" ...
[1] "Structure of constructs_reduced_vector:"
 chr [1:9] "Perfekcjonizm skierowany na siebie" ...


In [29]:


%%R

library(EGAnet)
library(aricode)

# Check structures before starting
print("Structure of known_communities:")
str(known_communities)
print("Structure of constructs_reduced:")
str(constructs_reduced)

# Convert known_communities to a data frame
known_df <- data.frame(
    item_label = names(known_communities),
    known_community = unlist(known_communities, use.names = FALSE),
    stringsAsFactors = FALSE
)

# Initialize NMI values
nmi_values <- numeric(10)

# Debugging: Check the structure of cor_matrix_reduced
print("Dimensions of cor_matrix_reduced:")
print(dim(cor_matrix_reduced))
print("First 5x5 of cor_matrix_reduced:")
print(cor_matrix_reduced[1:5, 1:5])

# Loop through step sizes to compute NMI
for (step_size in 1:10) {
    ega_result <- EGA(
        data = cor_matrix_reduced,
        corr = NULL,  # NULL because data is already a correlation matrix
        n = nrow(cor_matrix_reduced),
        model = "glasso",
        algorithm = "walktrap",
        steps = step_size,
        plot.EGA = FALSE
    )

    # Check if EGA result contains valid communities
    if (is.null(ega_result$wc) || length(ega_result$wc) == 0) {
        warning("EGA did not find any communities at step size ", step_size)
        nmi_values[step_size] <- NA
        next
    }

    # Detected communities
    detected_communities <- ega_result$wc

    # Convert detected_communities to a data frame
    detected_df <- data.frame(
        item_label = names(detected_communities),
        detected_community = unlist(detected_communities, use.names = FALSE),
        stringsAsFactors = FALSE
    )

    # Merge known and detected communities on item_label
    merged_df <- merge(known_df, detected_df, by = "item_label", all = FALSE)

    # Ensure we have matching data
    if (nrow(merged_df) == 0) {
        warning("No matching items between known and detected communities at step size ", step_size)
        nmi_values[step_size] <- NA
        next
    }

    # Convert communities to factors
    known_communities_factor <- as.factor(merged_df$known_community)
    detected_communities_factor <- as.factor(merged_df$detected_community)

    # Check lengths
    length_known <- length(known_communities_factor)
    length_detected <- length(detected_communities_factor)
    cat("Length of known communities:", length_known, "\n")
    cat("Length of detected communities:", length_detected, "\n")

    # Remove NA values
    valid_indices <- !is.na(known_communities_factor) & !is.na(detected_communities_factor)
    if (sum(valid_indices) == 0) {
        warning("No valid indices for NMI calculation at step size ", step_size)
        nmi_values[step_size] <- NA
        next
    }

    # Compute NMI
    nmi <- NMI(
        known_communities_factor[valid_indices],
        detected_communities_factor[valid_indices]
    )
    nmi_values[step_size] <- nmi
}

# Identify optimal step size
valid_nmi_indices <- which(!is.na(nmi_values))
if (length(valid_nmi_indices) == 0) {
    stop("No valid NMI values found across all step sizes.")
}
optimal_step <- valid_nmi_indices[which.max(nmi_values[valid_nmi_indices])]

# Output the results
cat("Optimal step size:", optimal_step, "with NMI:", nmi_values[optimal_step], "\n")




[1] "Structure of known_communities:"
List of 9
 $ PSS1: chr "Perfekcjonizm skierowany na siebie"
 $ PSS2: chr "Perfekcjonizm skierowany na siebie"
 $ PSS3: chr "Perfekcjonizm skierowany na siebie"
 $ PSI4: chr "Perfekcjonizm skierowany na innych"
 $ PSI5: chr "Perfekcjonizm skierowany na innych"
 $ PSI6: chr "Perfekcjonizm skierowany na innych"
 $ PSP7: chr "Perfekcjonizm społecznie narzucony"
 $ PSP8: chr "Perfekcjonizm społecznie narzucony"
 $ PSP9: chr "Perfekcjonizm społecznie narzucony"
[1] "Structure of constructs_reduced:"
List of 9
 $ PSS1: chr "Perfekcjonizm skierowany na siebie"
 $ PSS2: chr "Perfekcjonizm skierowany na siebie"
 $ PSS3: chr "Perfekcjonizm skierowany na siebie"
 $ PSI4: chr "Perfekcjonizm skierowany na innych"
 $ PSI5: chr "Perfekcjonizm skierowany na innych"
 $ PSI6: chr "Perfekcjonizm skierowany na innych"
 $ PSP7: chr "Perfekcjonizm społecznie narzucony"
 $ PSP8: chr "Perfekcjonizm społecznie narzucony"
 $ PSP9: chr "Perfekcjonizm społecznie narzucony"
[1]

  Input into 'argument' is 'NULL' type. Input is expected to be 'character' or 'function' type

 For more details on how to fix this error, see:
https://r-ega.net/articles/errors.html#typeof-error




Error in "set_default" : 
  Input into 'argument' is 'NULL' type. Input is expected to be 'character' or 'function' type

 For more details on how to fix this error, see:
https://r-ega.net/articles/errors.html#typeof-error
