In [40]:
# library imports
import pandas as pd
import numpy as np
import os
import glob
import csv

In [41]:
def delete_specific_files(file_paths):
    """
    Deletes specific files given their paths.

    Parameters:
        file_paths (list): A list of file paths to delete.

    Returns:
        dict: A dictionary with the status of each file.
              Format: {file_path: "Deleted" or "Error: <reason>"}
    """
    result = {}

    for file_path in file_paths:
        try:
            if os.path.exists(file_path):
                os.remove(file_path)
                result[file_path] = "Deleted"
            else:
                result[file_path] = "Error: File does not exist"
        except Exception as e:
            result[file_path] = f"Error: {str(e)}"

    return result

In [42]:
def drop_column_in_csv(input_file, output_file, column_to_drop):
    """
    Drops a specific column from a CSV file.

    Parameters:
        input_file (str): Path to the input CSV file.
        output_file (str): Path to the output CSV file with the column removed.
        column_to_drop (str): The name of the column to be dropped.

    Returns:
        None
    """
    try:
        # Read the input CSV
        with open(input_file, 'r', newline='', encoding='utf-8') as infile:
            reader = csv.DictReader(infile)
            fieldnames = [col for col in reader.fieldnames if col != column_to_drop]
            
            if column_to_drop not in reader.fieldnames:
                print(f"Column '{column_to_drop}' not found in the CSV.")
                return

            # Write to the output CSV
            with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
                writer = csv.DictWriter(outfile, fieldnames=fieldnames)
                writer.writeheader()
                for row in reader:
                    del row[column_to_drop]
                    writer.writerow(row)

        print(f"Column '{column_to_drop}' has been removed and saved to '{output_file}'.")
    
    except FileNotFoundError:
        print(f"Error: File '{input_file}' not found.")
    except Exception as e:
        print(f"Error: {str(e)}")

In [43]:
def check_duplicate_cols(file_path):
    '''
    Input: csv file path
    Checks if the csv file have duplicate column names.
    '''
    df = pd.read_csv(file_path,encoding="utf-8")

    # Get column names and find duplicates
    column_names = df.columns
    duplicate_columns = column_names[column_names.duplicated()].tolist()

    if duplicate_columns:
        print(f"WARNING! Duplicate column titles found: {duplicate_columns}")
    else:
        print("\nNice! No duplicate column titles found.\n")

In [44]:
def get_columns_from_formatted_codebook(file_path_formatted_codebook, verbose):
    '''
    input: formatted codebook filepath
    Read the formatted codebook csv to get the questions and their color categories
    '''
    # Read the dataset from file
    df = pd.read_csv(file_path_formatted_codebook)

    # Creating no_color_questions for rows where 'color' is empty
    no_color_questions = df.loc[df['Color_Category'].isnull(), 'Custom_variable_name'].tolist()

    # Creating lists for each color, where grey_questions=question having sub categories.
    grey_questions  = df.loc[df['Color_Category'] == 'Grey', 'Custom_variable_name'].tolist()
    yellow_questions = df.loc[df['Color_Category'] == 'Yellow', 'Custom_variable_name'].tolist()
    green_questions  = df.loc[df['Color_Category'] == 'Green', 'Custom_variable_name'].tolist()
    
    question_with_characteristics= df.loc[df['Characteristic'] != '-', 'Custom_variable_name'].tolist()

    # Output the results if needed
    if(verbose):
        print("::::get_columns_from_formatted_codebook:::::Returns")
        print("no_color_questions:", no_color_questions)
        print("grey_questions(question having sub categories):", grey_questions )
        print("yellow_questions:", yellow_questions )
        print("green_questions:", green_questions )
        print("green_questions:", green_questions )
    return no_color_questions, grey_questions,yellow_questions, green_questions,question_with_characteristics

In [45]:
def filter_datasets_for_required_col(file_path_dataset,file_path_formatted_sosec_code_book, columns_to_keep, output_folder_path=r"../data/1_preprocess/",verbose=False):
    '''
    input:codebook file path and sosec dataset file path
    Only Keeps the columns in the dataset which are also in the codebook and saves the files
    also Saves a file for the columns in the code book which are not in the dataset
    '''
    # Read the dataset CSV file
    df = pd.read_csv(file_path_dataset)

    # Check which columns are not found
    not_found_columns = [col for col in columns_to_keep if col not in df.columns]

    if not_found_columns:
        # Read the formatted_sosec_code_book CSV file
        file_path_2 = file_path_formatted_sosec_code_book
        df2 = pd.read_csv(file_path_2,encoding="utf-8")
        
        # Filter rows where Custom_variable_name matches values in not_found_columns
        df_no_matches = df2[df2['Custom_variable_name'].isin(not_found_columns)]
        
        # Select the corresponding Text values
        result_df = df_no_matches[['Custom_variable_name', 'Text']]
        
        # Save the result as a new CSV
        output_file = r'1_codebook_no_matching_columns_in_dataset.csv'
        output_file_path = os.path.join(output_folder_path, output_file)
        result_df.to_csv(output_file_path, index=False)
        if(verbose):
            print(f"Not Matched columns saved to: {output_file_path}")

    # Filter the DataFrame to keep only the columns that exist in the DataFrame
    df_filtered = df[[col for col in columns_to_keep if col in df.columns]]
    
    output_file = r'1_df_dataset_with_codebook_columns_full_no_processing.csv'
    output_file_path = os.path.join(output_folder_path, output_file)

    df_filtered.to_csv(output_file_path, index=False)

    if(verbose):
        print(df_filtered.head())
    
    return output_file_path+""

In [46]:
def get_row_count(file_path_csv):
    '''
    input filepath of csv
    Returns the number of rows in the given CSV file.
    '''
    # Load the dataset
    df = pd.read_csv(file_path_csv)

    # Get the number of rows
    row_count = len(df)

    print(f"Number of rows in the csv: {row_count}")
    return row_count

In [47]:
def count_columns_in_csv(file_path_csv):
    """
    Counts and returns the number of columns in a CSV file.
    
    Parameters:
    file_path (str): The path to the CSV file.
    
    Returns:
    int: The number of columns in the CSV file.
    """
    # Load the CSV file into a DataFrame
    df = pd.read_csv(file_path_csv)
    
    # Get the number of col
    columns_count = len(df.columns)
    
    
    print(f"Number of columns in the CSV: {columns_count}")
    # Return the number of columns
    return columns_count

In [48]:
#Excluded


def perform_filter_on_dataset_F2A3(file_path_dataset_with_codebook_columns,output_folder_path=r"../data/1_preprocess/"):
    '''
    input the datafile path with only codebook columns
    replace 6 and 0 by blank
    F2A3 = The possibility of losing your job (leave this empty if you do not work)
    '''
    # Load the dataset
    df = pd.read_csv(file_path_dataset_with_codebook_columns)

    # Replace 6 and 0 in the 'F2A3' column with  empty
    df['F2A3'] = df['F2A3'].replace({6: "", 0: ""})

    # Save the updated DataFrame back to a CSV file
    output_file = r'2_df_dataset_with_codebook_columns_full_F2A3.csv'
    output_file_path = os.path.join(output_folder_path, output_file)
    df.to_csv(output_file_path, index=False)

    print(f"Updated dataset saved to: {output_file_path}")
    return output_file_path+""

In [49]:
#Excluded

def perform_filter_on_dataset_F7mA1(file_path_dataset_with_codebook_columns,output_folder_path=r"../data/1_preprocess/"):
    '''
    input the datafile path with only codebook columns
    replace 0 by blank
    F7mA1 = job category (leave this empty if you do not work)
    '''
    # Load the dataset
    df = pd.read_csv(file_path_dataset_with_codebook_columns)

    # Replace 0 in the 'F7mA1' column with  empty
    df['F7mA1'] = df['F7mA1'].replace({0: ""})

    # Save the updated DataFrame back to a CSV file
    output_file = r'3_df_dataset_with_codebook_columns_full_F7mA1.csv'
    output_file_path = os.path.join(output_folder_path, output_file)    
    df.to_csv(output_file_path, index=False)

    print(f"Updated dataset saved to: {output_file_path}")
    return output_file_path+""

In [50]:
#EXCLUDED:
def delete_rows_for_out_of_range_data(sosec_data_path,file_path_dataset_with_codebook_columns,output_folder_path=r"../data/1_preprocess/",verbose=False):
    '''
    input: sosec_datafile path and sosec dataset with only codebook columns
    Deletes the rows for all the columns for with the values are out of range. and saves the csv
    '''
    file1 = pd.read_csv(sosec_data_path)

    file2 = pd.read_csv(file_path_dataset_with_codebook_columns)

    # Create a dictionary from file2 with ranges
    ranges = {}
    for _, row in file2.iterrows():
        col = row['Custom_variable_name']
        range_str = row['Characteristic']
        
        # Check if range_str is a valid string before splitting
        if isinstance(range_str, str) and range_str and range_str != '-':  # Valid range and not 'F'
            # If there's a valid range, split it into a list of integers
            ranges[col] = list(map(int, range_str.split(',')))

        else:
            # If no valid range is provided (empty or 'F'), set the range to None
            ranges[col] = None

    # Validate the data in file1 against the ranges
    def validate_data(file1, ranges):
        errors = []
        valid_rows = file1.copy()  # Copy of the original DataFrame to modify
        
        # Loop through each column and validate values
        for col in file1.columns:
            if col in ranges:
                valid_range = ranges[col]
                if valid_range is not None:  # Only check if a valid range exists
                    # Create a boolean mask for invalid rows
                    invalid_rows = ~(valid_rows[col].isin(valid_range) | valid_rows[col].isna())      

                    # Track errors for rows with out-of-range data
                    for index, value in valid_rows[invalid_rows][col].dropna().items():
                        errors.append(f"Out of range: {col} at row {index + 1} with value {value}")
                    
                    # Remove rows with invalid data
                    valid_rows = valid_rows[~invalid_rows]
                else:
                    # If no range is provided, assume all values are valid for that column
                    continue
        
        return valid_rows, errors

    # Get valid rows and errors
    valid_rows, errors = validate_data(file1, ranges)

    # Save the valid rows to a CSV file
    output_file = r"4_df_dataset_with_codebook_columns_filtered_outofrange.csv"
    output_file_path = os.path.join(output_folder_path, output_file)
    valid_rows.to_csv(output_file_path, index=False)

    if(verbose):
        # Output validation errors
        if errors:
            print("Validation Errors:")
            for error in errors:
                print(error)
        else:
            print("All data is within valid ranges.")

    # Print where the cleaned data has been saved
    print(f"Cleaned data saved to '{output_file_path}'.")

    return output_file_path+""

In [51]:
#26-01-2025: updated method.
def handle_invalid_values(sosec_data_path, file_path_dataset_with_codebook_columns, output_folder_path=r"../data/1_preprocess/", verbose=False):
    '''
    input: sosec_datafile path and sosec dataset with only codebook columns
    Replaces out-of-range values with "" (null) instead of deleting rows and saves the CSV.
    '''
    file1 = pd.read_csv(sosec_data_path)
    file2 = pd.read_csv(file_path_dataset_with_codebook_columns)

    # Create a dictionary from file2 with ranges
    ranges = {}
    for _, row in file2.iterrows():
        col = row['Custom_variable_name']
        range_str = row['Characteristic']
        
        # Check if range_str is a valid string before splitting
        if isinstance(range_str, str) and range_str and range_str != '-':  # Valid range and not 'F'
            # If there's a valid range, split it into a list of integers
            ranges[col] = list(map(int, range_str.split(',')))
        else:
            # If no valid range is provided (empty or 'F'), set the range to None
            ranges[col] = None

    # Validate the data in file1 against the ranges
    def validate_data(file1, ranges):
        errors = []
        valid_data = file1.copy()  # Copy of the original DataFrame to modify

        # Loop through each column and validate values
        for col in file1.columns:
            if col in ranges:
                valid_range = ranges[col]
                if valid_range is not None:  # Only check if a valid range exists
                    # Create a boolean mask for invalid values
                    invalid_mask = ~(valid_data[col].isin(valid_range) | valid_data[col].isna())

                    # Track errors for rows with out-of-range data
                    for index, value in valid_data[invalid_mask][col].dropna().items():
                        errors.append(f"Out of range: {col} at row {index + 1} with value {value}")

                    # Replace invalid values with an empty string
                    #valid_data.loc[invalid_mask, col] = ""
                    valid_data.loc[invalid_mask, col] = np.nan  # Assign NaN for invalid values
                else:
                    # If no range is provided, assume all values are valid for that column
                    continue
        
        return valid_data, errors

    # Get valid data and errors
    valid_data, errors = validate_data(file1, ranges)

    # Save the modified data to a CSV file
    output_file = r"4_df_dataset_with_codebook_columns_filtered_outofrange.csv"
    output_file_path = os.path.join(output_folder_path, output_file)
    valid_data.to_csv(output_file_path, index=False)

    if verbose:
        # Output validation errors
        if errors:
            print("Validation Errors:")
            for error in errors:
                print(error)
        else:
            print("All data is within valid ranges.")

    # Print where the cleaned data has been saved
    print(f"Cleaned data saved to '{output_file_path}'.")

    return output_file_path + ""

# Example usage:
# handle_invalid_values("sosec_data.csv", "codebook_columns.csv", verbose=True)


In [52]:
#Old method: not in use.

def perform_filter_on_dataset_F7cA1(file_path_dataset_with_codebook_columns,min_yob,max_yob,output_folder_path=r"../data/1_preprocess/"):
    '''
    filter rows based on range for F7cA1(Yob).
    '''
    # Load the dataset
    df = pd.read_csv(file_path_dataset_with_codebook_columns)

    # Filter out rows where 'F7cA1' is not in range
    df = df[(df['F7cA1'] >= min_yob) & (df['F7cA1'] <= max_yob)]

    # Save the updated DataFrame back to a CSV file
    output_file = r'5_df_dataset_with_codebook_columns_filtered_F7cA1-yob.csv'
    output_file_path = os.path.join(output_folder_path, output_file)
    df.to_csv(output_file_path, index=False)

    print(f"Updated dataset saved to: {output_file_path}")
    return output_file_path


In [53]:
#MODIFIED: 26-01-2025:  invalid values are replaced by null and are not deleted.

def perform_filter_on_dataset_F7cA1(file_path_dataset_with_codebook_columns, min_yob, max_yob, output_folder_path=r"../data/1_preprocess/"):
    '''
    Replace out-of-range values with "" (null) for F7cA1 (Year of Birth).
    '''
    # Load the dataset
    df = pd.read_csv(file_path_dataset_with_codebook_columns)

    # Replace values in 'F7cA1' that are out of range with "" (null)
    out_of_range_mask = (df['F7cA1'] < min_yob) | (df['F7cA1'] > max_yob)
    #df.loc[out_of_range_mask, 'F7cA1'] = ""
    df.loc[out_of_range_mask, 'F7cA1'] = np.nan  # Assign NaN for invalid values

    # Save the updated DataFrame back to a CSV file
    output_file = r'5_df_dataset_with_codebook_columns_filtered_F7cA1-yob.csv'
    output_file_path = os.path.join(output_folder_path, output_file)
    df.to_csv(output_file_path, index=False)

    print(f"Updated dataset saved to: {output_file_path}")
    return output_file_path


In [54]:
def update_codefile_YOB_char_and_label(file_path, custom_variable_name, yob_start, yob_end, output_file):
    """
    Updates the Characteristic and Value_labels in the CSV file for a specific Custom_variable_name
    and saves the updated data to a new file.

    Parameters:
        file_path (str): Path to the input CSV file.
        custom_variable_name (str): The Custom_variable_name to search for.
        yob_start (int): Start year for Characteristic.
        yob_end (int): End year for Characteristic.
        output_file (str): Path to save the updated CSV file.

    Returns:
        None
    """
    updated_rows = []
    updated_characteristic = ", ".join(str(year) for year in range(yob_start, yob_end + 1))
    updated_value_labels = f"{yob_start} to {yob_end}"

    try:
        # Read the CSV file
        with open(file_path, 'r', newline='', encoding='utf-8') as infile:
            reader = csv.DictReader(infile)
            fieldnames = reader.fieldnames
            
            # Ensure the required columns exist
            if "Custom_variable_name" not in fieldnames or "Characteristic" not in fieldnames or "Value_labels" not in fieldnames:
                print("Error: Required columns (Custom_variable_name, Characteristic, Value_labels) are missing in the CSV.")
                return

            # Process each row
            for row in reader:
                if row["Custom_variable_name"] == custom_variable_name:
                    row["Characteristic"] = updated_characteristic
                    row["Value_labels"] = updated_value_labels
                updated_rows.append(row)

        # Write to the new output CSV file
        with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(updated_rows)

        print(f"Updated CSV file has been saved as '{output_file}'.")

    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except Exception as e:
        print(f"Error: {str(e)}")




In [55]:
def drop_high_null_columns(FILE_PATH_DATASET,file_path_formatted_sosec_code_book ,col_to_exclude, threshold=0.7 ,output_folder_path=r"../data/1_preprocess/"):
    '''
    Deletes columns with more than 70% null values from file_name_A, excluding from list of columns and saves the dataset.
    Prints removed columns and saves them to a CSV file.

    Parameters:
        file_name_A (str): Path to the input CSV file.
        col_to_exclude(list): Name of the column to exclude from deletion.
        threshold (float): Proportion of nulls above which columns are dropped.
        output_file (str): Path to save the filtered dataset.
        removed_columns_file (str): Path to save the list of removed columns.
    '''

    output_file_csv = r"6_df_dataset_with_codebook_columns_filtered_lessdata.csv"
    output_file = os.path.join(output_folder_path, output_file_csv)
    # Load the dataset
    df_A = pd.read_csv(FILE_PATH_DATASET)
    
    # Calculate the threshold for null values
    null_threshold = threshold * len(df_A)
    
    # Identify columns to keep based on null percentage and exceptions
    cols_to_keep = [col for col in df_A.columns 
                    if (df_A[col].isna().sum() <= null_threshold) or col in col_to_exclude]
    
    # Identify the columns to remove
    cols_to_remove = [col for col in df_A.columns if col not in cols_to_keep]
    
    # Print the removed columns
    print(f"Removed columns: {cols_to_remove}")
    

    if cols_to_remove:
        # Read the formatted_sosec_code_book CSV file
        file_path_2 = file_path_formatted_sosec_code_book
        df2 = pd.read_csv(file_path_2,encoding="utf-8")
        
        # Filter rows where Custom_variable_name matches values in not_found_columns
        df_no_matches = df2[df2['Custom_variable_name'].isin(cols_to_remove)]
        
        # Select the corresponding Text values
        result_df = df_no_matches[['Custom_variable_name', 'Text']]
        
        # Save the result as a new CSV
        output_file_csv= r"6_columns_having_70percent_empty_values.csv"
        removed_columns_file = os.path.join(output_folder_path, output_file_csv)
        result_df.to_csv(removed_columns_file, index=False)
    
        print(f"Not Matched columns saved to: {removed_columns_file}")


    # Filter the DataFrame to keep only the selected columns
    df_filtered = df_A[cols_to_keep]

    # Save the filtered dataset
    df_filtered.to_csv(output_file, index=False)
    print(f"Filtered dataset saved to: {output_file}")
    print(f"List of removed columns saved to: {removed_columns_file}")
    
    #Note: filter not performed. 
    return FILE_PATH_DATASET+"" #output_file+""

In [56]:
def remove_below_percentile(file_path_sosec_dataset, percentile=10,output_folder_path=r"../data/1_preprocess/"):
    '''
    performs filter on i_TIME using the given percentile value
    input: csv of sosec dataset.
    percentile value
    output: filtered dataset as csv
    '''
    # Load the dataset
    df = pd.read_csv(file_path_sosec_dataset)

    # Calculate the 10th percentile of the 'i_TIME' column
    percentile_value = df['i_TIME'].quantile(percentile / 100.0)

    # Filter the DataFrame to keep only rows where 'i_TIME' is greater than or equal to the 10th percentile
    df_filtered = df[df['i_TIME'] >= percentile_value]

    # Save the updated DataFrame back to a CSV file
    output_file = r"7_df_dataset_with_codebook_columns_filtered_itime.csv"
    output_file_path = os.path.join(output_folder_path, output_file)
    df_filtered.to_csv(output_file_path, index=False)

    print(f"Updated dataset saved to: {output_file_path}")
    return output_file_path

In [57]:
# Function to load CSV and one-hot encode specific columns
def one_hot_encode_csv(df, columns_to_encode,output_folder_path=r"../data/1_combined_preprocess/"):
    '''
    input: csv of sosec dataset.
    list of columns to encode.
    deleted the columns_to_encode from the data.
    save the csv

    '''
    
    # Select only available columns from the list
    available_columns = [col for col in columns_to_encode if col in df.columns]

    # One-hot encode the available columns
    df_encoded = pd.get_dummies(df, columns=available_columns)

    output_file = r"8_df_dataset_with_codebook_columns_filtered_hotencoding.csv"
    output_file_path = os.path.join(output_folder_path, output_file)
    # Save the encoded DataFrame to a new CSV file
    df_encoded.to_csv(output_file_path, index=False)

    print(f"One-hot encoded CSV saved to {output_file_path}")
    return output_file_path+""



In [58]:
def delete_all_csv_files(folder_path):
    """
    Deletes all CSV files in the specified folder.

    Parameters:
        folder_path (str): Relative or absolute path to the folder.
    """
    # Get the full path of all CSV files in the folder
    csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

    if not (csv_files):
        print(f"No csv files to delete in {folder_path}")
    else:
        for file_path in csv_files:
            try:
                os.remove(file_path)
                print(f"Deleted: {file_path}")
            except Exception as e:
                print(f"Failed to delete {file_path}: {e}")
    print("\n")

In [59]:
def prepare_for_personas_format1(FILE_PATH_DATASET,FILE_PATH_REFORMATED_SOSEC_CODE_BOOK,output_folder_path=r"../data/1_preprocess/"):
    # Load the files
    file1_path = FILE_PATH_REFORMATED_SOSEC_CODE_BOOK
    file2_path = FILE_PATH_DATASET

    file1 = pd.read_csv(file1_path)
    file2 = pd.read_csv(file2_path)

    # Get the column names from file2 (excluding i_TIME)
    file2.rename(columns={"i_TIME": "i_TIME (Time taken in seconds to fill the survey)"}, inplace=True)
    file2_columns = file2.columns[0:]

    # Check which columns in file2 exist in the Custom_variable_name of file1
    common_columns = file1[file1['Custom_variable_name'].isin(file2_columns)]

    # Extract the required values
    text_row = common_columns.set_index('Custom_variable_name').reindex(file2_columns)['Text'].fillna('').values.tolist()
    characteristic_row = common_columns.set_index('Custom_variable_name').reindex(file2_columns)['Characteristic'].fillna('').values.tolist()
    value_labels_row = common_columns.set_index('Custom_variable_name').reindex(file2_columns)['Value_labels'].fillna('').values.tolist()

    # Insert these rows into file2
    file2 = pd.DataFrame(file2) 
    new_rows = pd.DataFrame([text_row, characteristic_row, value_labels_row], columns=file2.columns)

    # Combine the new rows with the original data
    file2 = pd.concat([new_rows, file2], ignore_index=True)

    # Save the updated file2 to a new CSV
    output_file = r"9_processed_data_for_personas_Format_1.csv"
    FILE_PATH_PERSONA_FORMAT1 = os.path.join(output_folder_path, output_file)
    file2.to_csv(FILE_PATH_PERSONA_FORMAT1, index=False)

    print(f"Updated file2 saved to {FILE_PATH_PERSONA_FORMAT1}")
    return FILE_PATH_PERSONA_FORMAT1


In [60]:
def prepare_for_personas_format2(FILE_PATH_PERSONA_FORMAT1,output_folder_path=r"../data/1_preprocess/"):
    # Load the files
    df= pd.read_csv(FILE_PATH_PERSONA_FORMAT1, header=None) 

    # Save the updated file1
    output_file = r"9_processed_data_for_personas_Format_2.csv"
    output_file_path = os.path.join(output_folder_path, output_file)

    # Remove the first row (the old header row)
    df = df.iloc[1:].reset_index(drop=True)

    # Save the updated DataFrame back to a CSV file without the header
    df.to_csv(output_file_path, header=False, index=False)



In [61]:
def check_duplicates_and_matches(file1, file2, output_duplicates_file1='duplicates_file1.csv',
                                 output_duplicates_file2='duplicates_file2.csv', output_matches='matching_rows.csv'):
    """
    Check for duplicates in two CSV files and identify matching rows between them.

    Parameters:
    - file1: str - Path to the first CSV file.
    - file2: str - Path to the second CSV file.
    - output_duplicates_file1: str - Path to save duplicate rows from file1 (default: 'duplicates_file1.csv').
    - output_duplicates_file2: str - Path to save duplicate rows from file2 (default: 'duplicates_file2.csv').
    - output_matches: str - Path to save matching rows between file1 and file2 (default: 'matching_rows.csv').

    Returns:
    - None: Saves duplicates and matching rows to the specified files.
    """

    # Load the first file
    print("\nLoading file1...")
    df1 = pd.read_csv(file1)
    print(f"File1 loaded: {len(df1)} rows.")

    # Load the second file
    print("\nLoading file2...")
    df2 = pd.read_csv(file2)
    print(f"File2 loaded: {len(df2)} rows.")

    # Step 1: Check for duplicates in file1
    print("\n\nChecking for duplicates in file1...")
    duplicates_file1 = df1[df1.duplicated(keep=False)]
    if not duplicates_file1.empty:
        print(f"Found {len(duplicates_file1)} duplicate rows in file1.")
        duplicates_file1.to_csv(output_duplicates_file1, index=False)
        print(f"Duplicate rows saved to {output_duplicates_file1}.")
    else:
        print("No duplicate rows found in file1.")

    # Step 2: Check for duplicates in file2
    print("\n\nChecking for duplicates in file2...")
    duplicates_file2 = df2[df2.duplicated(keep=False)]
    if not duplicates_file2.empty:
        print(f"Found {len(duplicates_file2)} duplicate rows in file2.")
        duplicates_file2.to_csv(output_duplicates_file2, index=False)
        print(f"Duplicate rows saved to {output_duplicates_file2}.")
    else:
        print("No duplicate rows found in file2.")

    # Step 3: Check for matching rows between file1 and file2
    print("\n\nChecking for matching rows between file1 and file2...")
    df1_set = set(tuple(row) for row in df1.values)
    df2_set = set(tuple(row) for row in df2.values)

    # Find intersections (matching rows)
    matching_rows = df1_set.intersection(df2_set)

    if matching_rows:
        print(f"Found {len(matching_rows)} matching row(s) between file1 and file2.")
        matching_rows_df = pd.DataFrame(list(matching_rows), columns=df1.columns)
        matching_rows_df.to_csv(output_matches, index=False)
        print(f"Matching rows saved to {output_matches}.")
    else:
        print("No matching rows found between file1 and file2.")


In [62]:
file1 = r"..\data\0_SOSEC Data RCS\data_sample_35_SOSEC_dataset_us.csv"
file2 = r"..\data\0_SOSEC Data RCS\new_data_sample.csv"

check_duplicates_and_matches(
    file1=file1,
    file2=file2,
    output_duplicates_file1=r"..\data\0_preprocess\duplicates_file1.csv",
    output_duplicates_file2=r"..\data\0_preprocess\duplicates_file2.csv",
    output_matches=r"..\data\0_preprocess\matching_rows.csv"
)


Loading file1...


  df1 = pd.read_csv(file1)


File1 loaded: 36781 rows.

Loading file2...


  df2 = pd.read_csv(file2)


File2 loaded: 33513 rows.


Checking for duplicates in file1...
No duplicate rows found in file1.


Checking for duplicates in file2...
No duplicate rows found in file2.


Checking for matching rows between file1 and file2...
No matching rows found between file1 and file2.


In [63]:
def merge_files(df1, df2, output_file, remove_duplicates=True):
    """
    Merge rows from file2 into file1 and save as a new CSV file.

    Parameters:
    - df1: str - Path to the first CSV df.
    - df2: str - Path to the second CSV df.
    - output_file: str - Path to save the merged file.
    - remove_duplicates: bool - Whether to remove duplicate rows after merging (default: True).

    Returns:
    - None: Saves the merged rows to the specified file.
    """
    # Merge rows
    print("Merging df...")
    merged_df = pd.concat([df1, df2], ignore_index=True)
    print(f"Files merged: {len(merged_df)} rows before removing duplicates.")

    # Remove duplicates if required
    if remove_duplicates:
        merged_df = merged_df.drop_duplicates()
        print(f"Duplicates removed: {len(merged_df)} rows after removing duplicates.")

    # Save to output file
    merged_df.to_csv(output_file, index=False)
    print(f"Merged file saved to {output_file}.")


In [64]:
file1 =  r"..\data\0_SOSEC Data RCS\data_sample_35_SOSEC_dataset_us.csv"
file2  = r"..\data\0_SOSEC Data RCS\new_data_sample.csv"


output_file = r"..\data\0_SOSEC Data RCS\0_combined_data.csv"




df1 = pd.read_csv(file1)  
df2 = pd.read_csv(file2, skiprows=[0])  # Skip rows 0


merge_files(df1, df2, output_file,remove_duplicates=False)





  df1 = pd.read_csv(file1)
  df2 = pd.read_csv(file2, skiprows=[0])  # Skip rows 0


Merging df...
Files merged: 70293 rows before removing duplicates.
Merged file saved to ..\data\0_SOSEC Data RCS\0_combined_data.csv.


## FIRST SAMPLE FILE

In [65]:
# Load the required data files.
FILE_PATH_DATASET = r"..\data\0_SOSEC Data RCS\data_sample_35_SOSEC_dataset_us.csv"

#FILE_PATH_DATASET = r"..\data\0_SOSEC Data RCS\0_combined_data.csv"

#FILE_PATH_DATASET = r"..\data\0_SOSEC Data RCS\new_data_sample.csv"



FILE_PATH_REFORMATED_SOSEC_CODE_BOOK = r'..\data\0_Reformated_SOSEC_Code-book_US_November_Reformulated_Questions_For_Dict_All_Columns.csv' 
OUTPUT_FOLDER_PATH=r"../data/1a_preprocess/"

VERBOSE = False

# Call the function with the relative path to delete all csv files in 1_preprocess folder.
delete_all_csv_files(r"../data/1a_preprocess/")

#Get list of columns from formatted sosec code book
#no_color_questions, grey_questions,yellow_questions, green_questions,question_with_characteristics
_i, _j, _k, _l,required_columns = get_columns_from_formatted_codebook(FILE_PATH_REFORMATED_SOSEC_CODE_BOOK,VERBOSE)

# list of column names to keep
columns_to_keep =  ['i_TIME'] + required_columns 

#Only keep the columns as per sosec dataset
output_file_path = filter_datasets_for_required_col(FILE_PATH_DATASET,FILE_PATH_REFORMATED_SOSEC_CODE_BOOK,columns_to_keep,OUTPUT_FOLDER_PATH,VERBOSE)

#Check for duplicate columns in the filtered dataset
check_duplicate_cols(output_file_path)

#Changed the 0 to 6 in the codebook.
#replace 6 and 0 in column F2A3 by empty cells
#output_file_path = perform_filter_on_dataset_F2A3(output_file_path,OUTPUT_FOLDER_PATH)
#print("\nNo of rows and columns in original dataset after setting 6,0 to empty in F2A3")
#no_of_rows = get_row_count(output_file_path)
#no_of_columns = count_columns_in_csv(output_file_path)

#replace 0 in column F7mA1 (job category) by empty cells
#output_file_path = perform_filter_on_dataset_F7mA1(output_file_path,OUTPUT_FOLDER_PATH)
#print("\nNo of rows and columns in original dataset after setting 0 to empty in F7mA1")
#no_of_rows = get_row_count(output_file_path)
#no_of_columns = count_columns_in_csv(output_file_path)

#MODIFIED:(01-26-2025: change out of range data to null instead of deleting/removing the rows)
#output_file_path = delete_rows_for_out_of_range_data(output_file_path,FILE_PATH_REFORMATED_SOSEC_CODE_BOOK,OUTPUT_FOLDER_PATH,False)
#print("\nNo of data after removing out of invalid range values from dataset")
#no_of_rows = get_row_count(output_file_path)
#no_of_columns = count_columns_in_csv(output_file_path)
output_file_path = handle_invalid_values(output_file_path,FILE_PATH_REFORMATED_SOSEC_CODE_BOOK,OUTPUT_FOLDER_PATH,False)
print("\nNo of data after HANDLING out of invalid range values from dataset: NOTE SHOULD BE SAME")
no_of_rows = get_row_count(output_file_path)
no_of_columns = count_columns_in_csv(output_file_path)

#MODIFIED:(01-26-2025: change out of range data to null instead of deleting/removing the rows)
#filter rows based on F7cA1 (YOB) values, 01-26-2025:DOES NOT remove where out of reasonable age range.
yob_start = 1959
yob_end = 2004
output_file_path = perform_filter_on_dataset_F7cA1(output_file_path,1959,2004,OUTPUT_FOLDER_PATH)
print(f"\nNo of data rows after FILTER rows having out of range F7cA1 (YOB:{yob_start}-{yob_end}) values: NOTE SHOULD BE SAME")
no_of_rows = get_row_count(output_file_path)
no_of_columns = count_columns_in_csv(output_file_path)



#Correct YOB range in code book for later use as temp file
file_path = r"../data/0_Reformated_SOSEC_Code-book_US_November_Reformulated_Questions_For_Dict_All_Columns.csv"
SOSEC_Codebook_path = r"../data/SOSEC_Code-book_Current.csv"
custom_variable_name = "F7cA1"
yob_start = 1959
yob_end = 2004
update_codefile_YOB_char_and_label(file_path, custom_variable_name, yob_start, yob_end, SOSEC_Codebook_path)


FILE_PATH_REFORMATED_SOSEC_CODE_BOOK = SOSEC_Codebook_path






#27-01-2025: removal of columns with null>70%  not applied. 
#Delete columns having more than 70% null with exceptions of some columns
#col_to_exclude =["F6a_DemPartyA2","F6a_RepPartyA2","F6b_DemPartyA2","F6b_RepPartyA2"]
#output_file_path = drop_high_null_columns(output_file_path,FILE_PATH_REFORMATED_SOSEC_CODE_BOOK, col_to_exclude,0.7,OUTPUT_FOLDER_PATH)
#print(f"\nNo of data rows after Delete columns having more than 70% null with exceptions of columns: {col_to_exclude} ")
#print("NOTE: 70% NULL NOT REMOVED FROM THE FILE")
#no_of_rows = get_row_count(output_file_path)
#no_of_columns = count_columns_in_csv(output_file_path)

#Delete data which is filled in rapidly without reading by using i_time column
percentile_val = 10
output_file_path = remove_below_percentile(output_file_path,10,OUTPUT_FOLDER_PATH)
print(f"\nNo of data rows and col which is filled having time of i_TIME more then {percentile_val}th percentile")
no_of_rows = get_row_count(output_file_path)
no_of_columns = count_columns_in_csv(output_file_path)

#Drop i_time
input_csv=output_file_path
output_file_path=r"../data/1a_preprocess/7a_df_dataset_with_codebook_columns_filtered_itime.csv"
column_to_remove='i_TIME'
drop_column_in_csv(input_csv, output_file_path, column_to_remove)


#Add info as rows for creating personas.
FILE_PATH_PERSONA_FORMAT1= prepare_for_personas_format1(output_file_path,FILE_PATH_REFORMATED_SOSEC_CODE_BOOK,OUTPUT_FOLDER_PATH)


#Rename all first row of row labels.
prepare_for_personas_format2(FILE_PATH_PERSONA_FORMAT1,OUTPUT_FOLDER_PATH)


#Delete generated extra files.
file_paths = [
    #r"../data/1a_preprocess/1_codebook_no_matching_columns_in_dataset.csv",
    r"../data/1a_preprocess/1_df_dataset_with_codebook_columns_full_no_processing.csv",
    r"../data/1a_preprocess/2_df_dataset_with_codebook_columns_full_F2A3.csv",
    r"../data/1a_preprocess/3_df_dataset_with_codebook_columns_full_F7mA1.csv",
    r"../data/1a_preprocess/4_df_dataset_with_codebook_columns_filtered_outofrange.csv",
    r"../data/1a_preprocess/5_df_dataset_with_codebook_columns_filtered_F7cA1-yob.csv",
    r"../data/1a_preprocess/6_df_dataset_with_codebook_columns_filtered_lessdata.csv",
    #r"../data/1a_preprocess/6_removed_columns_due_to_lessdata.csv",
    r"../data/1a_preprocess/7_df_dataset_with_codebook_columns_filtered_itime.csv",
    r"../data/1a_preprocess/7a_df_dataset_with_codebook_columns_filtered_itime.csv"]

status = delete_specific_files(file_paths)
for file, message in status.items():
    print(f"{file}: {message}")

Deleted: ../data/1a_preprocess\1_codebook_no_matching_columns_in_dataset.csv
Deleted: ../data/1a_preprocess\9_processed_data_for_personas_Format_1.csv
Deleted: ../data/1a_preprocess\9_processed_data_for_personas_Format_2.csv




  df = pd.read_csv(file_path_dataset)



Nice! No duplicate column titles found.

Cleaned data saved to '../data/1a_preprocess/4_df_dataset_with_codebook_columns_filtered_outofrange.csv'.

No of data after HANDLING out of invalid range values from dataset: NOTE SHOULD BE SAME
Number of rows in the csv: 36781
Number of columns in the CSV: 133
Updated dataset saved to: ../data/1a_preprocess/5_df_dataset_with_codebook_columns_filtered_F7cA1-yob.csv

No of data rows after FILTER rows having out of range F7cA1 (YOB:1959-2004) values: NOTE SHOULD BE SAME
Number of rows in the csv: 36781
Number of columns in the CSV: 133
Updated CSV file has been saved as '../data/SOSEC_Code-book_Current.csv'.
Updated dataset saved to: ../data/1a_preprocess/7_df_dataset_with_codebook_columns_filtered_itime.csv

No of data rows and col which is filled having time of i_TIME more then 10th percentile
Number of rows in the csv: 33118
Number of columns in the CSV: 133
Column 'i_TIME' has been removed and saved to '../data/1a_preprocess/7a_df_dataset_wit

  df= pd.read_csv(FILE_PATH_PERSONA_FORMAT1, header=None)


../data/1a_preprocess/1_df_dataset_with_codebook_columns_full_no_processing.csv: Deleted
../data/1a_preprocess/2_df_dataset_with_codebook_columns_full_F2A3.csv: Error: File does not exist
../data/1a_preprocess/3_df_dataset_with_codebook_columns_full_F7mA1.csv: Error: File does not exist
../data/1a_preprocess/4_df_dataset_with_codebook_columns_filtered_outofrange.csv: Deleted
../data/1a_preprocess/5_df_dataset_with_codebook_columns_filtered_F7cA1-yob.csv: Deleted
../data/1a_preprocess/6_df_dataset_with_codebook_columns_filtered_lessdata.csv: Error: File does not exist
../data/1a_preprocess/7_df_dataset_with_codebook_columns_filtered_itime.csv: Deleted
../data/1a_preprocess/7a_df_dataset_with_codebook_columns_filtered_itime.csv: Deleted


## NEW SAMPLE FILE

In [66]:
# Load the required data files.
FILE_PATH_DATASET = r"..\data\0_SOSEC Data RCS\new_data_sample.csv"
FILE_PATH_REFORMATED_SOSEC_CODE_BOOK = r'..\data\0_Reformated_SOSEC_Code-book_US_November_Reformulated_Questions_For_Dict_All_Columns.csv' 
OUTPUT_FOLDER_PATH=r"../data/1b_preprocess/"

VERBOSE = False

# Call the function with the relative path to delete all csv files in 1_preprocess folder.
delete_all_csv_files(r"../data/1b_preprocess/")

#Get list of columns from formatted sosec code book
#no_color_questions, grey_questions,yellow_questions, green_questions,question_with_characteristics
_i, _j, _k, _l,required_columns = get_columns_from_formatted_codebook(FILE_PATH_REFORMATED_SOSEC_CODE_BOOK,VERBOSE)

# list of column names to keep
columns_to_keep =  ['i_TIME'] + required_columns 

#Only keep the columns as per sosec dataset
output_file_path = filter_datasets_for_required_col(FILE_PATH_DATASET,FILE_PATH_REFORMATED_SOSEC_CODE_BOOK,columns_to_keep,OUTPUT_FOLDER_PATH,VERBOSE)

#Check for duplicate columns in the filtered dataset
check_duplicate_cols(output_file_path)

#Changed the 0 to 6 in the codebook.
#replace 6 and 0 in column F2A3 by empty cells
#output_file_path = perform_filter_on_dataset_F2A3(output_file_path,OUTPUT_FOLDER_PATH)
#print("\nNo of rows and columns in original dataset after setting 6,0 to empty in F2A3")
#no_of_rows = get_row_count(output_file_path)
#no_of_columns = count_columns_in_csv(output_file_path)

#replace 0 in column F7mA1 (job category) by empty cells
#output_file_path = perform_filter_on_dataset_F7mA1(output_file_path,OUTPUT_FOLDER_PATH)
#print("\nNo of rows and columns in original dataset after setting 0 to empty in F7mA1")
#no_of_rows = get_row_count(output_file_path)
#no_of_columns = count_columns_in_csv(output_file_path)

#MODIFIED:(01-26-2025: change out of range data to null instead of deleting/removing the rows)
#output_file_path = delete_rows_for_out_of_range_data(output_file_path,FILE_PATH_REFORMATED_SOSEC_CODE_BOOK,OUTPUT_FOLDER_PATH,False)
#print("\nNo of data after removing out of invalid range values from dataset")
#no_of_rows = get_row_count(output_file_path)
#no_of_columns = count_columns_in_csv(output_file_path)
output_file_path = handle_invalid_values(output_file_path,FILE_PATH_REFORMATED_SOSEC_CODE_BOOK,OUTPUT_FOLDER_PATH,False)
print("\nNo of data after HANDLING out of invalid range values from dataset: NOTE SHOULD BE SAME")
no_of_rows = get_row_count(output_file_path)
no_of_columns = count_columns_in_csv(output_file_path)

#MODIFIED:(01-26-2025: change out of range data to null instead of deleting/removing the rows)
#filter rows based on F7cA1 (YOB) values, 01-26-2025:DOES NOT remove where out of reasonable age range.
yob_start = 1959
yob_end = 2004
output_file_path = perform_filter_on_dataset_F7cA1(output_file_path,1959,2004,OUTPUT_FOLDER_PATH)
print(f"\nNo of data rows after FILTER rows having out of range F7cA1 (YOB:{yob_start}-{yob_end}) values: NOTE SHOULD BE SAME")
no_of_rows = get_row_count(output_file_path)
no_of_columns = count_columns_in_csv(output_file_path)

#Correct YOB range in code book for later use as temp file
file_path = r"../data/0_Reformated_SOSEC_Code-book_US_November_Reformulated_Questions_For_Dict_All_Columns.csv"
SOSEC_Codebook_path = r"../data/SOSEC_Code-book_Current.csv"
custom_variable_name = "F7cA1"
yob_start = 1959
yob_end = 2004
update_codefile_YOB_char_and_label(file_path, custom_variable_name, yob_start, yob_end, SOSEC_Codebook_path)

FILE_PATH_REFORMATED_SOSEC_CODE_BOOK = SOSEC_Codebook_path

#27-01-2025: removal of columns with null>70%  not applied. 
#Delete columns having more than 70% null with exceptions of some columns
#col_to_exclude =["F6a_DemPartyA2","F6a_RepPartyA2","F6b_DemPartyA2","F6b_RepPartyA2"]
#output_file_path = drop_high_null_columns(output_file_path,FILE_PATH_REFORMATED_SOSEC_CODE_BOOK, col_to_exclude,0.7,OUTPUT_FOLDER_PATH)
#print(f"\nNo of data rows after Delete columns having more than 70% null with exceptions of columns: {col_to_exclude} ")
#print("NOTE: 70% NULL NOT REMOVED FROM THE FILE")
#no_of_rows = get_row_count(output_file_path)
#no_of_columns = count_columns_in_csv(output_file_path)

#Delete data which is filled in rapidly without reading by using i_time column
percentile_val = 10
output_file_path = remove_below_percentile(output_file_path,10,OUTPUT_FOLDER_PATH)
print(f"\nNo of data rows and col which is filled having time of i_TIME more then {percentile_val}th percentile")
no_of_rows = get_row_count(output_file_path)
no_of_columns = count_columns_in_csv(output_file_path)

#Drop i_time
input_csv=output_file_path
output_file_path=r"../data/1b_preprocess/7a_df_dataset_with_codebook_columns_filtered_itime.csv"
column_to_remove='i_TIME'
drop_column_in_csv(input_csv, output_file_path, column_to_remove)

#Add info as rows for creating personas.
FILE_PATH_PERSONA_FORMAT1= prepare_for_personas_format1(output_file_path,FILE_PATH_REFORMATED_SOSEC_CODE_BOOK,OUTPUT_FOLDER_PATH)

#Rename all first row of row labels.
prepare_for_personas_format2(FILE_PATH_PERSONA_FORMAT1,OUTPUT_FOLDER_PATH)

#Delete generated extra files.
file_paths = [
    #r"../data/1a_preprocess/1_codebook_no_matching_columns_in_dataset.csv",
    r"../data/1b_preprocess/1_df_dataset_with_codebook_columns_full_no_processing.csv",
    r"../data/1b_preprocess/2_df_dataset_with_codebook_columns_full_F2A3.csv",
    r"../data/1b_preprocess/3_df_dataset_with_codebook_columns_full_F7mA1.csv",
    r"../data/1b_preprocess/4_df_dataset_with_codebook_columns_filtered_outofrange.csv",
    r"../data/1b_preprocess/5_df_dataset_with_codebook_columns_filtered_F7cA1-yob.csv",
    r"../data/1b_preprocess/6_df_dataset_with_codebook_columns_filtered_lessdata.csv",
    #r"../data/1b_preprocess/6_removed_columns_due_to_lessdata.csv",
    r"../data/1b_preprocess/7_df_dataset_with_codebook_columns_filtered_itime.csv",
    r"../data/1b_preprocess/7a_df_dataset_with_codebook_columns_filtered_itime.csv"
    ]

status = delete_specific_files(file_paths)
for file, message in status.items():
    print(f"{file}: {message}")

Deleted: ../data/1b_preprocess\1_codebook_no_matching_columns_in_dataset.csv
Deleted: ../data/1b_preprocess\9_processed_data_for_personas_Format_1.csv
Deleted: ../data/1b_preprocess\9_processed_data_for_personas_Format_2.csv




  df = pd.read_csv(file_path_dataset)



Nice! No duplicate column titles found.

Cleaned data saved to '../data/1b_preprocess/4_df_dataset_with_codebook_columns_filtered_outofrange.csv'.

No of data after HANDLING out of invalid range values from dataset: NOTE SHOULD BE SAME
Number of rows in the csv: 33513
Number of columns in the CSV: 133
Updated dataset saved to: ../data/1b_preprocess/5_df_dataset_with_codebook_columns_filtered_F7cA1-yob.csv

No of data rows after FILTER rows having out of range F7cA1 (YOB:1959-2004) values: NOTE SHOULD BE SAME
Number of rows in the csv: 33513
Number of columns in the CSV: 133
Updated CSV file has been saved as '../data/SOSEC_Code-book_Current.csv'.
Updated dataset saved to: ../data/1b_preprocess/7_df_dataset_with_codebook_columns_filtered_itime.csv

No of data rows and col which is filled having time of i_TIME more then 10th percentile
Number of rows in the csv: 30168
Number of columns in the CSV: 133
Column 'i_TIME' has been removed and saved to '../data/1b_preprocess/7a_df_dataset_wit

  df= pd.read_csv(FILE_PATH_PERSONA_FORMAT1, header=None)


../data/1b_preprocess/1_df_dataset_with_codebook_columns_full_no_processing.csv: Deleted
../data/1b_preprocess/2_df_dataset_with_codebook_columns_full_F2A3.csv: Error: File does not exist
../data/1b_preprocess/3_df_dataset_with_codebook_columns_full_F7mA1.csv: Error: File does not exist
../data/1b_preprocess/4_df_dataset_with_codebook_columns_filtered_outofrange.csv: Deleted
../data/1b_preprocess/5_df_dataset_with_codebook_columns_filtered_F7cA1-yob.csv: Deleted
../data/1b_preprocess/6_df_dataset_with_codebook_columns_filtered_lessdata.csv: Error: File does not exist
../data/1b_preprocess/7_df_dataset_with_codebook_columns_filtered_itime.csv: Deleted
../data/1b_preprocess/7a_df_dataset_with_codebook_columns_filtered_itime.csv: Deleted


## MERGE RESULTS (FORMAT-1)

In [67]:
file1 = r"..\data\1a_preprocess\9_processed_data_for_personas_Format_1.csv"
file2 = r"..\data\1b_preprocess\9_processed_data_for_personas_Format_1.csv"
output_file = r"..\data\1_combined_preprocess\9_processed_data_for_personas_Format_1.csv"


df1 = pd.read_csv(file1)  
df2 = pd.read_csv(file2, skiprows=[1, 2, 3])  # Skip rows 0, 1, 2

merge_files(df1, df2, output_file,remove_duplicates=False)

no_of_rows = get_row_count(output_file_path)
no_of_columns = count_columns_in_csv(output_file_path)

  df1 = pd.read_csv(file1)


Merging df...
Files merged: 63289 rows before removing duplicates.
Merged file saved to ..\data\1_combined_preprocess\9_processed_data_for_personas_Format_1.csv.


FileNotFoundError: [Errno 2] No such file or directory: '../data/1b_preprocess/7a_df_dataset_with_codebook_columns_filtered_itime.csv'

## Hot encoding on Format #1

In [None]:
#Not applied: hot-encooding.
# Enabled : again on 02-03-25 with all columns.


#Encode the columns which are based on categorical values. 
#F7a:Gender
#F7bA1	Enter a 5-digit Zip number.:
#F7d	Were you born in the US?
#F7e	Was your mother born in the US?
#F7f	Was your father born in the US?
#F7g:  educational level
#F7h: employment status?
#F7i	What is your marital status?
#F7lA1	Which religious community do you belong to?
#F7mA1	To which of the following occupational groups do you belong?
#F7n	Which ethnic group do you belong to?
input_file_path = output_file
columns_to_encode = ['F7a', 'F7bA1','F7d','F7e','F7f','F7g', 'F7h', 'F7i', 'F7lA1', 'F7mA1','F7n']  
dfx = pd.read_csv(input_file_path, skiprows=[1, 2, 3])  # Skip rows 1, 2, 3
columns_to_encode = dfx.columns.tolist()  
output_folder_path=r"../data/1_combined_preprocess/"

# Perform one-hot encoding
output_file_path = one_hot_encode_csv(dfx, columns_to_encode,output_folder_path)
print("\nNo of data rows and columns after encoding and removing the encoded columns.")
no_of_rows = get_row_count(output_file_path)
no_of_columns = count_columns_in_csv(output_file_path)


One-hot encoded CSV saved to ../data/1_combined_preprocess/8_df_dataset_with_codebook_columns_filtered_hotencoding.csv

No of data rows and columns after encoding and removing the encoded columns.
Number of rows in the csv: 63286
Number of columns in the CSV: 959


## MERGE RESULTS (FORMAT-2)

In [39]:
file1 = r"..\data\1a_preprocess\9_processed_data_for_personas_Format_2.csv"
file2 = r"..\data\1b_preprocess\9_processed_data_for_personas_Format_2.csv"
output_file = r"..\data\1_combined_preprocess\9_processed_data_for_personas_Format_2.csv"

df1 = pd.read_csv(file1)  
df2 = pd.read_csv(file2, skiprows=[0, 1, 2])  # Skip rows 0, 1, 2


merge_files(df1, df2, output_file,remove_duplicates=False)

no_of_rows = get_row_count(output_file_path)
no_of_columns = count_columns_in_csv(output_file_path)

  df1 = pd.read_csv(file1)


Merging df...
Files merged: 63287 rows before removing duplicates.
Merged file saved to ..\data\1_combined_preprocess\9_processed_data_for_personas_Format_2.csv.
Number of rows in the csv: 63286
Number of columns in the CSV: 959
