# Count the CSV files that have misplaced columns

In [15]:
import os
import pandas as pd

# Folder holding the CSV part-files to inspect
source_directory = '/Users/jiaruisong/Documents/Coding/INFO 288 Big Data and Development_Data/drug_related_full_dataset_type_amount_cleaned_penalty'

# Tallies: files with 20 or more columns (i.e. more than 19), and all CSV files seen
count_files_with_more_than_20_columns = 0
number_of_files = 0

# Walk every entry in the folder, skipping anything that is not a CSV
for filename in os.listdir(source_directory):
    if not filename.endswith('.csv'):
        continue

    number_of_files += 1
    file_path = os.path.join(source_directory, filename)

    # A file that cannot be parsed is reported, but it still counts towards number_of_files
    try:
        df = pd.read_csv(file_path)
        # Flag the file when its column count is 20 or higher
        if len(df.columns) >= 20:
            count_files_with_more_than_20_columns += 1
    except Exception as e:
        print(f'Failed to process {filename}: {e}')

# Report both tallies
print(f"Total CSV files with more than 19 columns: {count_files_with_more_than_20_columns}")
print(f"Total number of CSV files: {number_of_files}")


Total CSV files with more than 19 columns: 0
Total number of CSV files: 1980


# Clean type-misplaced columns - most likely not going to use this

In [None]:
import os
import shutil
import pandas as pd

# Define the source and destination directories
source_directory = 'path/to/source_directory'
destination_directory = 'path/to/destination_directory'

# Create the destination directory if it does not exist.
# NOTE(review): nothing below writes into destination_directory -- every file is
# saved back over its source path. Confirm whether the in-place overwrite is intended.
os.makedirs(destination_directory, exist_ok=True)

# Loop through each file in the source directory
for filename in os.listdir(source_directory):
    # Only CSV files are processed
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(source_directory, filename)
    df = pd.read_csv(file_path)

    # Replace sentence descriptions containing '无期' with 9998 and '死刑' with 9999.
    # pandas infers a numeric dtype when the column holds only numbers or is empty,
    # and the .str accessor raises AttributeError on such columns, so the match is
    # done on a string view of the column. The view is rebuilt for each marker so a
    # row already replaced by 9998 is not matched again by the second marker.
    for marker, code in (('无期', 9998), ('死刑', 9999)):
        hits = df['total_imprisonment_a'].astype(str).str.contains(marker, na=False)
        df.loc[hits, 'total_imprisonment_a'] = code

    # Save the modified DataFrame back to CSV (overwrites the source file)
    df.to_csv(file_path, index=False)

    print("Modifications are complete and the file has been saved.")

print("Processing complete.")


# Clean 无期徒刑 (life imprisonment) and 死刑 (death penalty) values, remove empty values, remove temp columns, and write back to the original location. Tested.

In [25]:
import os
import pandas as pd

# Path to the directory containing the CSV files
directory_path = '/Users/jiaruisong/Documents/Coding/INFO 288 Big Data and Development_Data/drug_related_full_dataset_type_amount_cleaned_penalty'

# Overall counters: rows replaced because they contained '无期' / '死刑'
total_count_wuqi = 0
total_count_sixing = 0

# Clean every CSV in the directory in place:
#   1. drop the temporary columns 'ResponseText' and 'TrimmedType' if present,
#   2. drop rows whose 'total_imprisonment_a' is NaN, empty, or whitespace only,
#   3. replace values containing '无期' with 9998 and '死刑' with 9999,
#   4. rewrite the file only when something actually changed.
for filename in os.listdir(directory_path):
    if not filename.endswith('.csv'):  # Only CSV files are processed
        continue

    file_path = os.path.join(directory_path, filename)
    df = pd.read_csv(file_path)

    # Snapshot of the file as read, so changes can be detected later
    # without reading the file from disk a second time
    original_columns = set(df.columns)
    original_count = df.shape[0]

    # Every step below needs 'total_imprisonment_a'. Report the file and leave it
    # untouched instead of failing with a KeyError further down.
    if 'total_imprisonment_a' not in df.columns:
        print(f"Column 'total_imprisonment_a' not found in {filename}")
        continue

    # Drop the temporary columns if they exist in the DataFrame
    df = df.drop(columns=['ResponseText', 'TrimmedType'], errors='ignore')

    # Drop rows where 'total_imprisonment_a' is NaN
    df = df.dropna(subset=['total_imprisonment_a'])

    # Convert to string so the string operations work on numeric columns too,
    # then remove rows where the value is empty or whitespace only.
    # .copy() makes the filtered frame independent so the assignments below
    # do not trigger a chained-assignment warning.
    df['total_imprisonment_a'] = df['total_imprisonment_a'].astype(str)
    df = df[df['total_imprisonment_a'].str.strip() != ''].copy()

    # Both masks are computed before any replacement, so a value containing both
    # substrings is counted in both tallies and ends up as 9999.
    contains_wuqi = df['total_imprisonment_a'].str.contains('无期', na=False)
    contains_sixing = df['total_imprisonment_a'].str.contains('死刑', na=False)
    count_wuqi = int(contains_wuqi.sum())
    count_sixing = int(contains_sixing.sum())

    if count_wuqi:
        df.loc[contains_wuqi, 'total_imprisonment_a'] = 9998

    if count_sixing:
        df.loc[contains_sixing, 'total_imprisonment_a'] = 9999

    # Save the modified DataFrame back to CSV only if changes were made
    dropped_columns = original_columns - set(df.columns)
    columns_dropped_flag = len(dropped_columns) > 0

    if count_wuqi > 0 or count_sixing > 0 or df.shape[0] != original_count or columns_dropped_flag:
        df.to_csv(file_path, index=False)
        print(f"Modified {filename}: '无期' changes: {count_wuqi}, '死刑' changes: {count_sixing}, Rows dropped: {original_count - df.shape[0]}")

    # Update total counts
    total_count_wuqi += count_wuqi
    total_count_sixing += count_sixing

# Print the total modifications after all files have been processed
print(f"Total modifications across all files: '无期' = {total_count_wuqi}, '死刑' = {total_count_sixing}")


Modified part-00043-tid-3342515310755238742-2e94fd7b-2d66-4cf3-b445-b2f56643ce20-3898-1-c000.csv: '无期' changes: 0, '死刑' changes: 0, Rows dropped: 0
Modified part-00033-tid-8870043383557983114-473032ac-c43b-4b65-9fb7-8c170157e0c9-4335-1-c000.csv: '无期' changes: 0, '死刑' changes: 0, Rows dropped: 0
Modified part-00004-tid-5337073424345691499-209e3284-258c-4dd6-ba04-b7d62b0bf4ad-4965-1-c000.csv: '无期' changes: 0, '死刑' changes: 0, Rows dropped: 0
Modified part-00001-tid-8224862644505304205-87b17c42-1585-49e0-8711-f64fb7eefeeb-1277-1-c000.csv: '无期' changes: 0, '死刑' changes: 0, Rows dropped: 0
Modified part-00010-tid-3342515310755238742-2e94fd7b-2d66-4cf3-b445-b2f56643ce20-3865-1-c000.csv: '无期' changes: 0, '死刑' changes: 0, Rows dropped: 0
Modified part-00036-tid-1014278928818658117-550dace3-6c04-47ce-979a-66dd407a72b9-3736-1-c000.csv: '无期' changes: 0, '死刑' changes: 0, Rows dropped: 0
Modified part-00017-tid-1014278928818658117-550dace3-6c04-47ce-979a-66dd407a72b9-3717-1-c000.csv: '无期' changes: 

# Check whether the columns were successfully dropped - each file should have 17 columns now.

In [26]:
import os
import pandas as pd

# Folder holding the cleaned CSV part-files to verify
source_directory = '/Users/jiaruisong/Documents/Coding/INFO 288 Big Data and Development_Data/drug_related_full_dataset_type_amount_cleaned_penalty'

# Tallies: files whose column count is not 17, and all CSV files seen
count_files_wrong_columns = 0
number_of_files = 0

# Walk every entry in the folder, skipping anything that is not a CSV
for filename in os.listdir(source_directory):
    if not filename.endswith('.csv'):
        continue

    number_of_files += 1
    file_path = os.path.join(source_directory, filename)

    # A file that cannot be parsed is reported, but it still counts towards number_of_files
    try:
        df = pd.read_csv(file_path)
        # Any column count other than 17 is reported per file and tallied
        if len(df.columns) != 17:
            count_files_wrong_columns += 1
            print(f"File {filename} has {df.shape[1]} columns")
    except Exception as e:
        print(f'Failed to process {filename}: {e}')

# Report both tallies
print(f"Total CSV files with wrong number of columns: {count_files_wrong_columns}")
print(f"Total number of CSV files: {number_of_files}")


Total CSV files with wrong number of columns: 0
Total number of CSV files: 1979
