# clean type

In [11]:
import os
import pandas as pd
import json
import glob
from concurrent.futures import ThreadPoolExecutor, as_completed

def load_state(state_file):
    try:
        with open(state_file, "r") as file:
            return json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        return {}
    
def save_state(state_file, state_data):
    with open(state_file, "w") as file:
        json.dump(state_data, file)

    
# Define a function to clean drug names based on the given patterns
def clean_drug_names(drug_name):
    if pd.isna(drug_name):
        return pd.NA
    # Patterns and their replacements
    replacements = {
        "甲基苯丙胺": ["甲基苯丙", "冰毒", "麻果", "麻古"],
        "海洛因": ["海洛", "heroin"],
        "氯胺酮": ["氯胺酮", "K粉", "King粉", "k粉"],
        "罂粟": ["罂粟"],
        "大麻": ["大麻"],
        "鸦片": ["鸦片"],
        "Other": []  # This will be used as a catch-all category
    }
    
    for canonical, patterns in replacements.items():
        if any(pattern in drug_name for pattern in patterns):
            return canonical
    return "Other"  # Return 'Other' if no patterns match

# Function to process the DataFrame and clean the drug columns
def process_drug_columns(df):
    # Convert empty strings to NaN and ensure you are modifying the DataFrame directly
    df.loc[:, 'drug_a'] = df['drug_a'].replace('', pd.NA)
    df.loc[:, 'drug_b'] = df['drug_b'].replace('', pd.NA)
    
    # Drop rows where 'drug_a' is NaN or empty
    df = df.dropna(subset=['drug_a'])
    
    # Apply the cleaning function to non-NaN, non-empty values using .loc to ensure direct modification
    df.loc[:, 'drug_a'] = df['drug_a'].apply(clean_drug_names)
    df.loc[:, 'drug_b'] = df['drug_b'].apply(clean_drug_names)
    
    # Convert amounts to numeric, setting errors='coerce' will turn errors into NaN
    df['amount_a'] = pd.to_numeric(df['amount_a'], errors='coerce')
    df['amount_b'] = pd.to_numeric(df['amount_b'], errors='coerce')
    
   # Sum amounts where drug_a and drug_b are the same, and both amounts are non-NaN
    mask = (df['drug_a'] == df['drug_b']) & df['amount_a'].notna() & df['amount_b'].notna()
    df.loc[mask, 'amount_a'] += df.loc[mask, 'amount_b']
    
    df['amount_a'] = df['amount_a'].round(2)

    df = df.drop(columns=['drug_b', 'amount_b', 'ResponseText', 'TrimmedType', 'TextAroundTrimmedPoint'])  # Drop columns if no longer needed
    
    return df


# Main processing function
def process_files(base_path, state_file):
    state_data = load_state(state_file)

    if not os.path.exists(os.path.dirname(state_file)):
        os.makedirs(os.path.dirname(state_file))

    for file_path in glob.glob(os.path.join(base_path, "**", "*.csv"), recursive=True):
        file_name = os.path.basename(file_path)
        if state_data.get(file_name):
            print(f"Skipping already processed file: {file_name}")
            continue

        print(f"Processing file: {file_name}")
        df = pd.read_csv(file_path, on_bad_lines='skip')

        # Clean drug columns
        df = process_drug_columns(df)
        

        output_file_path = os.path.join(output_folder, file_name)
        df.to_csv(output_file_path, index=False)
        
        # Update state file after processing
        state_data[file_name] = True
        save_state(state_file, state_data)

base_path = "/Users/jiaruisong/Documents/Coding/INFO 288 Big Data and Development_Data/drug_related_full_dataset_percent_type_amount"
output_folder = "/Users/jiaruisong/Documents/Coding/INFO 288 Big Data and Development_Data/drug_related_full_dataset_type_amount_cleaned"
state_file = "/Users/jiaruisong/Documents/Coding/INFO 288 Big Data and Development_Data/statefile/state_file_drug_type_amount_clean_combine.json"
process_files(base_path, state_file)

Skipping already processed file: part-00003-tid-1928088339454322354-ceb47e75-20cd-42f3-878f-8d0f000b669f-4493-1-c000.csv
Skipping already processed file: part-00007-tid-2050708359596842787-ff866193-608c-4ef6-9c78-d50589914e7e-3155-1-c000.csv
Skipping already processed file: part-00043-tid-3342515310755238742-2e94fd7b-2d66-4cf3-b445-b2f56643ce20-3898-1-c000.csv
Skipping already processed file: part-00026-tid-2050708359596842787-ff866193-608c-4ef6-9c78-d50589914e7e-3174-1-c000.csv
Skipping already processed file: part-00009-tid-2121029807467275532-4545430e-426d-48ce-9c4a-53de4ae6730c-2983-1-c000.csv
Skipping already processed file: part-00014-tid-6194725384779829736-e34b779a-ac33-4b5b-9b0b-d43caf95ef05-1719-1-c000.csv
Skipping already processed file: part-00003-tid-4463855928830684722-02083f46-5673-4cfe-82a0-5046b28defde-2927-1-c000.csv
Skipping already processed file: part-00024-tid-1774310925749392111-a7c95fb6-a172-44be-983a-d2c243c84d7f-2822-1-c000.csv
Skipping already processed file:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_a'] = pd.to_numeric(df['amount_a'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_b'] = pd.to_numeric(df['amount_b'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_a'] = df['amount_a'].round(2)
A value is trying to be set on a cop

Skipping already processed file: part-00027-tid-899378660303502663-a1412f7f-f50a-491f-8fea-b0cea8499e8e-1229-1-c000.csv
Skipping already processed file: part-00012-tid-899378660303502663-a1412f7f-f50a-491f-8fea-b0cea8499e8e-1214-1-c000.csv
Skipping already processed file: part-00028-tid-4169140279769324862-af62b2d3-3e4b-4c26-9bf3-1d55523ea062-1442-1-c000.csv
Skipping already processed file: part-00044-tid-2050708359596842787-ff866193-608c-4ef6-9c78-d50589914e7e-3192-1-c000.csv
Skipping already processed file: part-00009-tid-7143810800454225208-bafaa04b-0713-4027-ba9b-18c22e3617a9-2154-1-c000.csv
Skipping already processed file: part-00007-tid-6879364726152781056-96ab9d6d-ae9e-45f8-9334-d2188a3c690a-4935-1-c000.csv
Skipping already processed file: part-00017-tid-6668636964318347777-96f2d544-07ae-4550-bbf6-60cd5ce089ab-1077-1-c000.csv
Skipping already processed file: part-00020-tid-8345041676783108651-73df31c9-464e-47ec-a3dc-0709c0a60d90-4599-1-c000.csv
Skipping already processed file: p

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_a'] = pd.to_numeric(df['amount_a'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_b'] = pd.to_numeric(df['amount_b'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_a'] = df['amount_a'].round(2)
A value is trying to be set on a cop

Skipping already processed file: part-00004-tid-1875109887902425094-bfe12b7e-1ac8-4d27-a384-16d2cb24e950-3569-1-c000.csv
Skipping already processed file: part-00017-tid-2050708359596842787-ff866193-608c-4ef6-9c78-d50589914e7e-3165-1-c000.csv
Skipping already processed file: part-00054-tid-3342515310755238742-2e94fd7b-2d66-4cf3-b445-b2f56643ce20-3909-1-c000.csv
Skipping already processed file: part-00039-tid-2050708359596842787-ff866193-608c-4ef6-9c78-d50589914e7e-3187-1-c000.csv
Skipping already processed file: part-00032-tid-6651406933924064188-efbe7d64-fbed-4129-b85b-6b344b438b77-3447-1-c000.csv
Skipping already processed file: part-00005-tid-6668636964318347777-96f2d544-07ae-4550-bbf6-60cd5ce089ab-1065-1-c000.csv
Skipping already processed file: part-00010-tid-5707708088734423729-95b6f48d-a233-49b7-af47-aaabd94b89d3-556-1-c000.csv
Skipping already processed file: part-00005-tid-1566816878721410657-3751a12a-84cf-40c6-add4-35e08d7323b1-4761-1-c000.csv
Skipping already processed file: 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_a'] = pd.to_numeric(df['amount_a'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_b'] = pd.to_numeric(df['amount_b'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_a'] = df['amount_a'].round(2)
A value is trying to be set on a cop

Skipping already processed file: part-00056-tid-3342515310755238742-2e94fd7b-2d66-4cf3-b445-b2f56643ce20-3911-1-c000.csv
Skipping already processed file: part-00002-tid-4560042665276353197-26d8b4a9-173c-4200-b95d-7f99106c01c8-2078-1-c000.csv
Skipping already processed file: part-00016-tid-3665697299382238007-038e0c85-a9d2-4ce9-a717-17f5645a097d-2304-1-c000.csv
Skipping already processed file: part-00034-tid-7891592605338472512-5e1362cb-16fd-49e4-8fb3-a7039f9e5a05-1532-1-c000.csv
Skipping already processed file: part-00015-tid-7891592605338472512-5e1362cb-16fd-49e4-8fb3-a7039f9e5a05-1513-1-c000.csv
Skipping already processed file: part-00041-tid-4879269636505753236-1a7ef2d6-ddcd-41e4-83bc-77be49616e9e-4036-1-c000.csv
Processing file: part-00013-tid-6510975564714990302-3bfa6bf2-3c3b-49a3-81fc-6648bf2046e9-1000-1-c000.csv
Skipping already processed file: part-00007-tid-330908229526123614-9eebdf2d-8ccc-410b-b606-e6a20c8adbd5-2355-1-c000.csv
Skipping already processed file: part-00016-tid-9

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_a'] = pd.to_numeric(df['amount_a'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_b'] = pd.to_numeric(df['amount_b'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_a'] = df['amount_a'].round(2)
A value is trying to be set on a cop

Skipping already processed file: part-00026-tid-1566816878721410657-3751a12a-84cf-40c6-add4-35e08d7323b1-4782-1-c000.csv
Skipping already processed file: part-00037-tid-6651406933924064188-efbe7d64-fbed-4129-b85b-6b344b438b77-3452-1-c000.csv
Skipping already processed file: part-00064-tid-1323465143157338736-ff930471-9114-4d6c-b4e5-6b7541f28dcc-4193-1-c000.csv
Skipping already processed file: part-00006-tid-2835676307537882535-d0d002c1-3e3e-4410-90dd-02caed0d285a-4431-1-c000.csv
Skipping already processed file: part-00047-tid-8870043383557983114-473032ac-c43b-4b65-9fb7-8c170157e0c9-4349-1-c000.csv
Skipping already processed file: part-00066-tid-8870043383557983114-473032ac-c43b-4b65-9fb7-8c170157e0c9-4368-1-c000.csv
Skipping already processed file: part-00006-tid-4463855928830684722-02083f46-5673-4cfe-82a0-5046b28defde-2930-1-c000.csv
Skipping already processed file: part-00013-tid-4169140279769324862-af62b2d3-3e4b-4c26-9bf3-1d55523ea062-1427-1-c000.csv
Skipping already processed file:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_a'] = pd.to_numeric(df['amount_a'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_b'] = pd.to_numeric(df['amount_b'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_a'] = df['amount_a'].round(2)
A value is trying to be set on a cop

Skipping already processed file: part-00039-tid-3342515310755238742-2e94fd7b-2d66-4cf3-b445-b2f56643ce20-3894-1-c000.csv
Skipping already processed file: part-00011-tid-8510476554429228893-444be295-5949-44f2-9e60-11cc12e6ec0f-2635-1-c000.csv
Skipping already processed file: part-00010-tid-1927729384859525796-68729ee9-e53c-4d7d-8d00-f7328125d97e-2118-1-c000.csv
Skipping already processed file: part-00028-tid-5149185796620252648-ec78fbd7-ac64-40b8-9c82-a0917ea4a1db-2453-1-c000.csv
Skipping already processed file: part-00011-tid-813975222350564112-4fb7077e-63c0-4976-aa26-ce0100b3b91a-873-1-c000.csv
Skipping already processed file: part-00010-tid-6668636964318347777-96f2d544-07ae-4550-bbf6-60cd5ce089ab-1070-1-c000.csv
Skipping already processed file: part-00011-tid-5149185796620252648-ec78fbd7-ac64-40b8-9c82-a0917ea4a1db-2436-1-c000.csv
Skipping already processed file: part-00023-tid-4137380717972787387-6ec44b47-d659-4eff-aead-7d410af2373f-3066-1-c000.csv
Skipping already processed file: p

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_a'] = pd.to_numeric(df['amount_a'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_b'] = pd.to_numeric(df['amount_b'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_a'] = df['amount_a'].round(2)
A value is trying to be set on a cop

Skipping already processed file: part-00006-tid-7143810800454225208-bafaa04b-0713-4027-ba9b-18c22e3617a9-2151-1-c000.csv
Skipping already processed file: part-00012-tid-2281028915943270437-37f12c7e-be69-419b-8a83-bfd6ce0ebae8-192-1-c000.csv
Skipping already processed file: part-00019-tid-8885333525305868225-e1aa2550-2b31-4220-95b4-3430eb1bb826-1777-1-c000.csv
Skipping already processed file: part-00015-tid-1432614296042910416-33cdf916-d299-4553-b502-380e9dd05c13-437-1-c000.csv
Skipping already processed file: part-00061-tid-6651406933924064188-efbe7d64-fbed-4129-b85b-6b344b438b77-3476-1-c000.csv
Skipping already processed file: part-00015-tid-3845790512386894615-a6bf98ae-04c1-4e96-83d7-97364fc9993a-714-1-c000.csv
Skipping already processed file: part-00014-tid-3214840170637168435-41add788-8db9-47bc-8c85-920fdf70f5e2-630-1-c000.csv
Skipping already processed file: part-00021-tid-1875109887902425094-bfe12b7e-1ac8-4d27-a384-16d2cb24e950-3586-1-c000.csv
Skipping already processed file: par

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_a'] = pd.to_numeric(df['amount_a'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_b'] = pd.to_numeric(df['amount_b'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amount_a'] = df['amount_a'].round(2)
A value is trying to be set on a cop

# combine amount