## Prepping (joining) datasets for DB

In [17]:
import pandas as pd
import os

samples_check = pd.read_csv("/Users/inesschwartz/Desktop/cleandata/samples_check.csv")
morpho = pd.read_csv("/Users/inesschwartz/Desktop/cleandata/morpho.csv")
lab_analysis = pd.read_csv("/Users/inesschwartz/Desktop/cleandata/analysis.csv")
site_info = pd.read_csv("/Users/inesschwartz/Desktop/cleandata/site_info.csv")
soil_type = pd.read_csv("/Users/inesschwartz/Desktop/cleandata/soil_type.csv")

profile_record = pd.read_csv("/Users/inesschwartz/Desktop/cleandata/profile_record_clean.csv")

In [1]:
import pandas as pd
import os

# Set the path to your folder containing the 10 CSV files
csv_folder = "/Users/inesschwartz/Desktop/cleandata"  

# List all CSV files in the folder
csv_files = [f for f in os.listdir(csv_folder) if f.endswith(".csv")]

# Loop through each file and display column data types
for file in csv_files:
    file_path = os.path.join(csv_folder, file)
    print(f"\n📄 File: {file}")
    
    try:
        df = pd.read_csv(file_path)
        print(df.dtypes)
    except Exception as e:
        print(f"⚠️ Error reading {file}: {e}")


📄 File: analysis.csv
lab_sample_id      int64
analysis_id       object
horizon_id        object
sample_id         object
profile           object
                  ...   
Tl               float64
Pb               float64
Bi               float64
Th               float64
U                float64
Length: 71, dtype: object

📄 File: soil_type.csv
soil_type_id     int64
profile         object
CEP_GR          object
CEP_NAME        object
FAO             object
dtype: object

📄 File: geology_mapping.csv
geology_code           object
geology_description    object
dtype: object

📄 File: lithology1954_mapping.csv
lithology_1954_code           object
lithology_1954_description    object
dtype: object

📄 File: samples_check.csv
sample_id              int64
site_info_id          object
profile_record_id    float64
profile               object
horizon_id           float64
shelf                 object
room                  object
year                 float64
dtype: object

📄 File: morpho.csv
horizo

In [4]:
# Folder containing the CSVs
csv_folder = "/Users/inesschwartz/Desktop/cleandata"

# List all CSV files in the folder
csv_files = [f for f in os.listdir(csv_folder) if f.endswith(".csv")]

# Define identifier columns
identifier_columns = [
    'sample_id', 'site_info_id', 'profile',
    'horizon_id', 'lab_sample_id', 'climate_id',
    'geo_features_id', 'topo_features_id', 'profile_record_id'
]

# Function to convert ID columns to string type
def convert_identifiers_to_string(df, id_columns):
    """
    Converts specified identifier columns in a DataFrame to string type,
    safely handling float64 values and preserving missing values (NA).
    """
    df = df.copy()
    for col in id_columns:
        if col in df.columns:
            df.loc[:, col] = df[col].apply(
                lambda x: str(int(x)) if pd.notna(x) and isinstance(x, float) and x.is_integer()
                else str(x) if pd.notna(x)
                else pd.NA
            ).astype("string")
    return df

# Loop through and clean all CSV files
for file in csv_files:
    file_path = os.path.join(csv_folder, file)
    df = pd.read_csv(file_path)
    df_cleaned = convert_identifiers_to_string(df, identifier_columns)
    df_cleaned.to_csv(file_path, index=False)
    print(f"✅ Cleaned: {file}")

print("🎯 All identifier columns standardized to string type.")


✅ Cleaned: analysis.csv
✅ Cleaned: soil_type.csv
✅ Cleaned: geology_mapping.csv
✅ Cleaned: lithology1954_mapping.csv
✅ Cleaned: samples_check.csv
✅ Cleaned: morpho.csv
✅ Cleaned: site_info.csv
✅ Cleaned: profile_record_clean.csv
✅ Cleaned: lithology_mapping.csv
✅ Cleaned: topo_feat.csv
✅ Cleaned: geo_feat.csv
✅ Cleaned: climate_feat.csv
🎯 All identifier columns standardized to string type.


An overview of how much data we had to clean out :(

In [12]:
samples_check.shape

(14715, 8)

In [11]:
morpho.shape

(10434, 30)

In [10]:
lab_analysis.shape

(7763, 71)

In [14]:
site_info.shape

(4321, 12)

## completing profile to sampe mampping table

this is a mapping table to link soil profile to its site info, soil type info, and sample_id

In [22]:
import pandas as pd

# Step 1: Copy site_info_clean and prep soil_profile_record
soil_profile_record = site_info.copy()

# Step 0: Extract sample_id and profile from both dataframes
samples_lab = lab_analysis[['sample_id', 'profile']].dropna().drop_duplicates()
samples_morpho = morpho[['sample_id', 'profile']].dropna().drop_duplicates()

# Ensure both dataframes have same dtypes for sample_id and profile
samples_lab['sample_id'] = samples_lab['sample_id'].astype(str)
samples_morpho['sample_id'] = samples_morpho['sample_id'].astype(str)
samples_lab['profile'] = samples_lab['profile'].astype(str)
samples_morpho['profile'] = samples_morpho['profile'].astype(str)

# Combine using concat and drop duplicates
morphoNlab_samples = pd.concat([samples_lab, samples_morpho], ignore_index=True)
morphoNlab_samples = morphoNlab_samples.drop_duplicates().reset_index(drop=True)

# Step 3: Get sample_id(s) per profile (from merged2)
profile_samples = morphoNlab_samples[['profile', 'sample_id']].dropna().drop_duplicates()
soil_profile_with_samples = soil_profile_record.merge(profile_samples, on='profile', how='left')

# Step 4: Join with soil_type to get soil_type_id by profile
profile_soil_types = soil_type[['profile', 'soil_type_id']].dropna().drop_duplicates()
soil_profile_full = soil_profile_with_samples.merge(profile_soil_types, on='profile', how='left')

# Step 5: Add profile_record_id as a primary key
soil_profile_full = soil_profile_full.reset_index(drop=True)
soil_profile_full.insert(0, 'profile_record_id', range(1, len(soil_profile_full) + 1))

# Step 6: Select required columns
soil_profile_record_final = soil_profile_full[[
    'profile_record_id',
    'profile',
    'site_info_id',
    'sample_id',
    'soil_type_id'
]].copy()

# correct datatypes to all be strings
soil_profile_record_final = soil_profile_record_final.astype("string")

# Optional: View or export
soil_profile_record_final.head()
soil_profile_record_final.to_csv("soil_profile_record.csv", index=False)


Unnamed: 0,profile_record_id,profile,site_info_id,sample_id,soil_type_id
0,1,1_57,1,4835.0,2.0
1,2,1_57,1,4836.0,2.0
2,3,1_57,1,4837.0,2.0
3,4,1_57,1,4838.0,2.0
4,5,1_57,1,4835.0,2.0


In [23]:
duplicates = soil_profile_record_final[soil_profile_record_final.duplicated(subset='sample_id', keep=False)]
print("🔍 Duplicate sample_id(s):")
print(duplicates)


🔍 Duplicate sample_id(s):
      profile_record_id profile site_info_id sample_id soil_type_id
16                   17    1_61            3      <NA>         <NA>
32                   33    1_64            5      <NA>         <NA>
33                   34    1_66            6      <NA>         <NA>
34                   35    1_69            7      <NA>         <NA>
35                   36    1_70            8      <NA>         <NA>
...                 ...     ...          ...       ...          ...
13369             13370  99c_62         4317      <NA>       2515.0
13370             13371  99c_63         4318      <NA>       2516.0
13371             13372   9c_60         4319      <NA>       2517.0
13372             13373   9c_63         4320      <NA>       2518.0
13373             13374   9c_65         4321      <NA>         <NA>

[3118 rows x 5 columns]
