In [14]:
# Use this helper to generate a list of column IDs to remove from a dataframe.
# To change which columns are removed, add the # REMOVE tag to the relevant line in the annotate_this_map.py file.
# The script extracts the tagged column IDs and writes them to a file called drop_list.py.

def extract_removed_pairs(file_path, output_file_path):
    """Collect the keys of lines tagged ``# REMOVE`` and export them as a list.

    Every line of ``file_path`` is echoed to stdout. For each line that
    contains ``# REMOVE`` and has a ``:``, the text before the first ``:``
    is taken as the key, stripped of whitespace and quote characters, and
    lowercased. Keys that then start with ``qn`` get that prefix uppercased
    (``"QN4A"`` becomes ``QN4a``). Duplicates are collapsed.

    The sorted keys are written to ``output_file_path`` as a single Python
    assignment, ``dropped = [...]``. The file is always rewritten, so a run
    that finds no tagged lines leaves ``dropped = []`` rather than a stale
    list from an earlier run.

    Args:
        file_path: Path of the annotated text file to scan.
        output_file_path: Path of the file that receives the ``dropped`` list.

    Raises:
        OSError: If either file cannot be opened.
    """
    removed_pairs = set()

    # Both files are Python source, whose default encoding is UTF-8; do not
    # depend on the platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_num, line in enumerate(file, 1):
            line = line.strip()
            print(f"Processing line {line_num}: {line}")

            if '# REMOVE' not in line:
                continue

            # Only the first ':' separates key from value; later colons
            # belong to the question text or to the comment.
            raw_key, separator, _ = line.partition(':')
            if not separator:
                continue

            # Accept keys quoted with either quote character.
            key = raw_key.strip().replace('"', '').replace("'", '').lower()
            if key.startswith("qn"):
                key = "QN" + key[2:]  # Only the "QN" prefix is uppercase
            removed_pairs.add(key)
            print(f"Added key: {key}")

    column_ids = sorted(removed_pairs)

    # repr() of a list of str yields "['a', 'b']" and escapes any unusual
    # characters, so the output is always a valid Python literal.
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(f'dropped = {column_ids!r}')

    print(f"Collected column IDs: {column_ids}")
    print(f"Unique column IDs of questions tagged with # REMOVE have been exported to {output_file_path} as a list.")

# Example run: scan the annotated map and emit the drop list.
# Point file_path at your own annotated file if it lives elsewhere.
file_path, output_file_path = 'map_annotated.py', 'drop_list.py'
extract_removed_pairs(file_path=file_path, output_file_path=output_file_path)





Processing line 1: questions = {
Processing line 2: "Artificial_id": "Artificial Unique Identifier",
Processing line 3: "LOCATION": "Where are you currently taking this survey?", # -> Categorical
Processing line 4: "QN1": "How old are you?", # -> Categorical: 0-13, 14-18, 19+
Processing line 5: "QN2": "What is your sex?", # -> Categorical
Processing line 6: "QN3": "What grade are you in?", # -> Categorical: Middle School or High School
Processing line 7: "QN4A": "Are you Hispanic, Latino, Latina, or of Spanish origin? (No, not of Hispanic, Latino, Latina, or Spanish origin)", # REMOVE: to avoid Dummy Variable Trap
Added key: QN4a
Processing line 8: "QN4B": "Are you Hispanic, Latino, Latina, or of Spanish origin? (Yes, Mexican, Mexican American, Chicano, or Chicana)",
Processing line 9: "QN4C": "Are you Hispanic, Latino, Latina, or of Spanish origin? (Yes, Puerto Rican)",
Processing line 10: "QN4D": "Are you Hispanic, Latino, Latina, or of Spanish origin? (Yes, Cuban)",
Processing line 