In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import os


These are the fundamental packages we need in order to transform the data from the original format (although many questions have already been re-scaled in Google Sheets) to the final usable format, where we apply Min-Max Scaling to the composite variables.

In [9]:
# Location of the survey export. This is a machine-specific absolute path:
# update it when running the notebook anywhere else.
file_path = r"C:\Users\12012\Downloads\JN Version CSV - Sheet1 (2).csv"

# Stop immediately when the file is missing. Merely printing a warning would
# leave `df` undefined (NameError on the next statement) on a fresh kernel, or
# silently keep a stale `df` from an earlier run of this cell.
if not os.path.exists(file_path):
    raise FileNotFoundError(
        f"❌ File not found. Check the filename and path: {file_path}"
    )

df = pd.read_csv(file_path)
print("✅ Data Loaded Successfully!")

# Display the first few rows
df.head()


✅ Data Loaded Successfully!


Unnamed: 0,Respondent ID,Respondant Zip Code,Urban or Rural Re-scaled,Age,Gender,Ethnicity,Income Re-scaled,Education Re-scaled,Household size Re-scaled,Marital,...,Crisis Composit Score,Systems Thinking Score,Trust Score,Conspiracy,Complexity,Openness,Conscienciousness,Extroversion,Agreeableness,Neuroticism
0,1447,86406,1,72,Male,White/Caucasian,8,4,2,Married or Partnered,...,3,27,4.333334,10,11,19,19,17,18,4
1,1448,85260,1,64,Male,White/Caucasian,7,5,1,Single,...,0,13,3.333333,8,11,14,20,18,19,3
2,1449,85741,1,35,Female,Hispanic,3,2,2,Single,...,2,15,4.333334,8,15,9,16,9,13,5
3,1450,85042,0,44,Female,White/Caucasian,1,1,6,Married or Partnered,...,11,20,2.666667,15,12,8,9,9,12,21
4,1451,85212,1,66,Female,White/Caucasian,5,2,3,Married or Partnered,...,3,27,2.666667,8,6,16,19,14,18,12


# Test 1: Ensuring all rows of respondents are present
### We know that there should be 1042 responses in total, representing the 1042 AZ-based survey respondents. (There were a similar number of Austrian respondents, but due to variations in their data, we will only study the Arizona cohort here).

In [18]:
# Test 1: the loaded frame must hold one row per surveyed respondent
expected_rows = 1042
actual_rows = len(df)
assert actual_rows == expected_rows, f"❌ Error: Expected {expected_rows} rows, but found {actual_rows}!"
print(f"✅ Test Passed: All {expected_rows} respondents are included.")


✅ Test Passed: All 1042 respondents are included.


In [11]:
# List every column label of the loaded frame
column_labels = df.columns
print(column_labels)




Index(['Respondent ID', 'Respondant Zip Code', 'Urban or Rural Re-scaled',
       'Age', 'Gender', 'Ethnicity', 'Income Re-scaled', 'Education Re-scaled',
       'Household size Re-scaled', 'Marital', 'Employment Simple',
       'Elections: I think elections are fair and reliable Re-scaled',
       'Elections: During the last (2022) election, I voted:',
       'Elections: I understand the term 'democracy' to mean:',
       'Elections: How would you define the political system in which you live? ',
       'Political System Oppinions: Which of these two political systems do you think is the most effective and functional for making complex decisions? ',
       'Political System Oppinions: Which of these two political systems would you prefer to live in?',
       'Economic Instability Re-scaled', 'Energy Crisis Re-scaled',
       'Climate Change Re-scaled', 'Lack of Natural Resources Re-scaled',
       'Militarty Conflicts Re-scaled', 'Interstate Conflicts Re-scaled',
       'Employment an

In [13]:
# MinMaxScaler comes from the notebook's import cell at the top; it is not
# re-imported here so that all dependencies stay declared in one place.

# Define the composite score columns: 
# these columns existed in the original survey result data, 
# but the individual scales and questions that the scores emerged from
# were removed prior to the selection of the final variables for this exploration.

composite_columns = [
    "Systems Thinking Score", 
    "Trust Score", 
    "Conspiracy", 
    "Complexity", 
    "Openness", 
    "Conscienciousness", 
    "Extroversion", 
    "Agreeableness", 
    "Neuroticism"
]

# Initialize the scaler
scaler = MinMaxScaler()

# Apply MinMax scaling to the composite columns defined above.
# Each column is rescaled independently using the minimum and maximum observed
# in this sample, so the smallest observed score maps to 0 and the largest to 1.
# The scaled values overwrite the original scores in `df`.
df[composite_columns] = scaler.fit_transform(df[composite_columns])

# Display the first few rows after scaling
df.head()


Unnamed: 0,Respondent ID,Respondant Zip Code,Urban or Rural Re-scaled,Age,Gender,Ethnicity,Income Re-scaled,Education Re-scaled,Household size Re-scaled,Marital,...,Crisis Composit Score,Systems Thinking Score,Trust Score,Conspiracy,Complexity,Openness,Conscienciousness,Extroversion,Agreeableness,Neuroticism
0,1447,86406,1,72,Male,White/Caucasian,8,4,2,Married or Partnered,...,3,0.75,0.833333,0.333333,0.4,0.888889,0.888889,0.777778,0.833333,0.055556
1,1448,85260,1,64,Male,White/Caucasian,7,5,1,Single,...,0,0.361111,0.583333,0.2,0.4,0.611111,0.944444,0.833333,0.888889,0.0
2,1449,85741,1,35,Female,Hispanic,3,2,2,Single,...,2,0.416667,0.833333,0.2,0.666667,0.333333,0.722222,0.333333,0.555556,0.111111
3,1450,85042,0,44,Female,White/Caucasian,1,1,6,Married or Partnered,...,11,0.555556,0.416667,0.666667,0.466667,0.277778,0.333333,0.333333,0.5,1.0
4,1451,85212,1,66,Female,White/Caucasian,5,2,3,Married or Partnered,...,3,0.75,0.416667,0.2,0.066667,0.722222,0.888889,0.611111,0.833333,0.5


After applying the MinMax scaler to the composite variable columns, we have a new data set. Let's continue to use this one for further examination of the data. The following code block will create a CSV file and export the MinMax Scaled data.

In [12]:
# Guard the export: the file is named "scaled_data.csv", so refuse to write it
# unless the composite columns really are in the [0, 1] range. This protects
# against running this cell before (or without) the MinMax scaling cell.
export_scores = df[composite_columns]
scores_in_unit_range = (
    export_scores.min().min() >= 0 and export_scores.max().max() <= 1
)
if not scores_in_unit_range:
    raise ValueError(
        "❌ Error: Composite columns are not scaled to [0, 1]. "
        "Run the MinMax scaling cell before exporting."
    )

# Save the cleaned and scaled dataset (the row index is not written to the file)
df.to_csv("scaled_data.csv", index=False)
print("✅ Scaled dataset saved as 'scaled_data.csv'")


✅ Scaled dataset saved as 'scaled_data.csv'


# Test 2: Ensuring re-scaling has occurred
### The columns defined as "Composite Columns" are "scores" based on scales, made up of multiple questions. The questions are less important individually than the composite scores are, as a way of comparing and studying respondents.
### When MinMax scaling, the goal is to take column data, and re-scale it to between 0 and 1.
### Scores that were once between 3 and 21 should now be more easily compared to scores in the 5 to 21 scale.
### The test below will ensure that the columns that *should* be re-scaled *are* re-scaled, and that there are no values less than 0 or more than 1.


In [21]:
# TEST 2: every composite score must lie inside the closed interval [0, 1]
scaled_scores = df[composite_columns]
smallest_value = scaled_scores.min().min()  # lowest value across all composite columns
largest_value = scaled_scores.max().max()   # highest value across all composite columns
assert smallest_value >= 0, "❌ Error: Min value is below 0!"
assert largest_value <= 1, "❌ Error: Max value is above 1!"
print("✅ Test 2 Passed: All values are between 0 and 1.")


✅ Test 2 Passed: All values are between 0 and 1.


# Test 3: Ensure that there are no missing values in the data
## With so many lines of data, any errors due to missing values might misalign data in the larger data frame. Looking for missing data values before running analysis at a larger scale will ensure that we can proceed confidently without worrying about NaN values.


In [23]:
# Test 3: count NaN entries per composite column, then require the grand total to be zero.
# Only the composite score columns are inspected here.
missing_per_column = df[composite_columns].isna().sum()
total_missing = missing_per_column.sum()
assert total_missing == 0, "❌ Error: There are missing values in the dataset!"
print("✅ Test 3 Passed: No missing values in the dataset.")


✅ Test 3 Passed: No missing values in the dataset.
