In [1]:
# import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

In [3]:
# 🔹 1. Define your folder path (use raw string to avoid Windows escape errors)
csv_dir = r"C:\Users\Dell\OneDrive\Desktop\Project 1\2024"

# The merged result is written into csv_dir itself, so it has to be excluded
# from the input listing; otherwise re-running this cell would merge the
# previous output back in and duplicate every row.
merged_filename = "2024_merged_freight_data.csv"
output_path = os.path.join(csv_dir, merged_filename)

# Code-like columns are read as strings so values such as "0708" keep their
# leading zeros and a column never ends up with mixed int/str values.
# Keys that are absent from a given file are simply not applied.
# NOTE(review): default NA parsing turns the literal text "NA" into NaN --
# confirm that no state/province code in the raw files is spelled "NA".
code_column_dtypes = {
    "USASTATE": str,
    "DEPE": str,
    "MEXSTATE": str,
    "CANPROV": str,
    "CONTCODE": str,
}

# 🔹 2. Get all .csv files in the folder (sorted so the row order is reproducible)
csv_files = sorted(
    file for file in os.listdir(csv_dir)
    if file.lower().endswith(".csv") and file.lower() != merged_filename.lower()
)
print(f"🗂 Found {len(csv_files)} CSV files.")

# 🔹 3. Create a list to hold individual DataFrames
dataframes = []

# 🔹 4. Loop through files, read them, and append to the list
for file in csv_files:
    file_path = os.path.join(csv_dir, file)

    try:
        df = pd.read_csv(file_path, dtype=code_column_dtypes)
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest
        print(f"⚠️ Could not read {file}: {e}")
        continue

    df['source_file'] = file  # Optional: Track which file each row came from
    dataframes.append(df)
    print(f"✅ Loaded: {file}")

# 🔹 5. Concatenate all loaded DataFrames
# Stop with an explicit error when nothing was loaded; continuing would only
# fail later with a NameError on merged_df.
if not dataframes:
    raise RuntimeError("❌ No DataFrames loaded. Check file formats or folder path.")

merged_df = pd.concat(dataframes, ignore_index=True)
print(f"✅ Successfully merged {len(dataframes)} DataFrames.")

# 🔹 6. Save to disk
merged_df.to_csv(output_path, index=False)
print(f"📁 Merged CSV saved to: {output_path}")

# 🔹 7. Preview first 5 rows (kept as the last expression so the notebook displays it)
merged_df.head()


🗂 Found 26 CSV files.
✅ Loaded: dot1_0124.csv
✅ Loaded: dot1_0224.csv
✅ Loaded: dot1_0324.csv
✅ Loaded: dot1_0424.csv
✅ Loaded: dot1_0524.csv
✅ Loaded: dot1_0624.csv
✅ Loaded: dot1_0724.csv
✅ Loaded: dot1_0824.csv
✅ Loaded: dot1_0924.csv
✅ Loaded: dot2_0124.csv
✅ Loaded: dot2_0224.csv
✅ Loaded: dot2_0324.csv
✅ Loaded: dot2_0424.csv
✅ Loaded: dot2_0524.csv
✅ Loaded: dot2_0624.csv
✅ Loaded: dot2_0724.csv
✅ Loaded: dot2_0824.csv
✅ Loaded: dot2_0924.csv
✅ Loaded: dot3_0124.csv
✅ Loaded: dot3_0224.csv
✅ Loaded: dot3_0424.csv
✅ Loaded: dot3_0524.csv
✅ Loaded: dot3_0624.csv
✅ Loaded: dot3_0724.csv
✅ Loaded: dot3_0824.csv
✅ Loaded: dot3_0924.csv
✅ Successfully merged 26 DataFrames.
📁 Merged CSV saved to: C:\Users\Dell\OneDrive\Desktop\Project 1\2024\2024_merged_freight_data.csv


In [5]:
# Use the same path you saved to
merged_path = r"C:\Users\Dell\OneDrive\Desktop\Project 1\2024\2024_merged_freight_data.csv"

# Load the CSV into a new DataFrame.
# Code-like columns are read as strings: without an explicit dtype, pandas
# re-infers types on every read, which can turn a code such as "0708" into
# the integer 708 or leave a column with mixed int/str values.
TFDA_2024 = pd.read_csv(
    merged_path,
    dtype={
        "USASTATE": str,
        "DEPE": str,
        "MEXSTATE": str,
        "CANPROV": str,
        "CONTCODE": str,
    },
)

# Preview the first 5 rows
TFDA_2024.head()


Unnamed: 0,TRDTYPE,USASTATE,DEPE,DISAGMOT,MEXSTATE,CANPROV,COUNTRY,VALUE,SHIPWT,FREIGHT_CHARGES,DF,CONTCODE,MONTH,YEAR,source_file,COMMODITY2
0,1,AK,0708,5,,XO,1220,24865,0,74,2.0,X,1,2024,dot1_0124.csv,
1,1,AK,0712,5,,XM,1220,9990,0,1,1.0,X,1,2024,dot1_0124.csv,
2,1,AK,0901,5,,XO,1220,20374,0,392,1.0,X,1,2024,dot1_0124.csv,
3,1,AK,2006,3,,XC,1220,12373,39,236,1.0,X,1,2024,dot1_0124.csv,
4,1,AK,20XX,3,,XA,1220,40263,1822,408,1.0,X,1,2024,dot1_0124.csv,


### Understanding the dataset

In [8]:
# Quick overview: per-column non-null counts and dtypes, plus the row count
TFDA_2024.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1108543 entries, 0 to 1108542
Data columns (total 16 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   TRDTYPE          1108543 non-null  int64  
 1   USASTATE         969546 non-null   object 
 2   DEPE             400100 non-null   object 
 3   DISAGMOT         1108543 non-null  int64  
 4   MEXSTATE         329537 non-null   object 
 5   CANPROV          577941 non-null   object 
 6   COUNTRY          1108543 non-null  int64  
 7   VALUE            1108543 non-null  int64  
 8   SHIPWT           1108543 non-null  int64  
 9   FREIGHT_CHARGES  1108543 non-null  int64  
 10  DF               744242 non-null   float64
 11  CONTCODE         1108543 non-null  object 
 12  MONTH            1108543 non-null  int64  
 13  YEAR             1108543 non-null  int64  
 14  source_file      1108543 non-null  object 
 15  COMMODITY2       847440 non-null   float64
dtypes: float64(2), int64(8), object(6)

In [10]:
# Check the shape of the dataset as (rows, columns)
TFDA_2024.shape

(1108543, 16)

In [12]:
# List the column labels of the dataset
TFDA_2024.columns

Index(['TRDTYPE', 'USASTATE', 'DEPE', 'DISAGMOT', 'MEXSTATE', 'CANPROV',
       'COUNTRY', 'VALUE', 'SHIPWT', 'FREIGHT_CHARGES', 'DF', 'CONTCODE',
       'MONTH', 'YEAR', 'source_file', 'COMMODITY2'],
      dtype='object')

### Data Cleaning 

In [17]:
# List the distinct values that occur in TRDTYPE
TFDA_2024['TRDTYPE'].unique()

array([1, 2], dtype=int64)

In [19]:
# See the distribution of values in TRDTYPE
# (dropna=False so that missing values, if any, get their own count)
TFDA_2024['TRDTYPE'].value_counts(dropna=False)

TRDTYPE
1    744242
2    364301
Name: count, dtype: int64

In [21]:
# Replace missing values in the categorical code columns with sentinel codes.
# Each column is assigned back onto TFDA_2024, so the frame is updated in place.
categorical_placeholders = {
    'USASTATE': 'UNKNOWN',
    'DEPE': '0000',
    'MEXSTATE': 'OT',   # 'OT' = State Unknown
    'CANPROV': 'OT',    # 'OT' = Province Unknown
}
for column_name, placeholder in categorical_placeholders.items():
    TFDA_2024[column_name] = TFDA_2024[column_name].fillna(placeholder)


In [23]:
# Split the data on trade type: TRDTYPE == 2 rows are set aside untouched,
# every other row must have DEPE, COMMODITY2 and DF present to be kept.
# NOTE(review): incomplete rows are dropped only for the non-2 trade type;
# TRDTYPE == 2 rows with the same gaps are retained -- confirm this asymmetry
# is intended before comparing the two trade types.
is_trdtype_2 = TFDA_2024['TRDTYPE'].eq(2)
trdtype_2_rows = TFDA_2024.loc[is_trdtype_2]
other_rows = TFDA_2024.loc[~is_trdtype_2]

# Keep only the rows where none of the required columns is missing
required_columns = ['DEPE', 'COMMODITY2', 'DF']
has_all_required = other_rows[required_columns].notna().all(axis=1)
cleaned_other_rows = other_rows[has_all_required]

# Stack the cleaned rows on top of the protected TRDTYPE == 2 rows and
# renumber the index from zero
TFDA_2024_cleaned = pd.concat(
    [cleaned_other_rows, trdtype_2_rows],
    ignore_index=True,
).reset_index(drop=True)

# Confirm TRDTYPE distribution
trdtype_distribution = TFDA_2024_cleaned['TRDTYPE'].value_counts(dropna=False)
print(trdtype_distribution)

# From here on the main variable refers to the cleaned frame
TFDA_2024 = TFDA_2024_cleaned

TRDTYPE
1    579591
2    364301
Name: count, dtype: int64


In [25]:
# Fill the remaining gaps in the numeric code columns with a placeholder and
# cast the result to integers (the columns are float while they hold NaN).
# NOTE(review): 99 is used as the "missing" marker for COMMODITY2 -- confirm
# it cannot collide with a genuine commodity code in this dataset.
numeric_placeholders = {'DF': 0, 'COMMODITY2': 99}
for column_name, placeholder in numeric_placeholders.items():
    filled_column = TFDA_2024[column_name].fillna(placeholder)
    TFDA_2024[column_name] = filled_column.astype(int)

In [27]:
# Show the dtype of every column
TFDA_2024.dtypes

TRDTYPE             int64
USASTATE           object
DEPE               object
DISAGMOT            int64
MEXSTATE           object
CANPROV            object
COUNTRY             int64
VALUE               int64
SHIPWT              int64
FREIGHT_CHARGES     int64
DF                  int32
CONTCODE           object
MONTH               int64
YEAR                int64
source_file        object
COMMODITY2          int32
dtype: object

In [29]:
# Count rows that are exact duplicates of an earlier row (all columns equal)
TFDA_2024.duplicated().sum()

0

In [31]:
# Count the missing values (NaN) in each column
TFDA_2024.isnull().sum()

TRDTYPE            0
USASTATE           0
DEPE               0
DISAGMOT           0
MEXSTATE           0
CANPROV            0
COUNTRY            0
VALUE              0
SHIPWT             0
FREIGHT_CHARGES    0
DF                 0
CONTCODE           0
MONTH              0
YEAR               0
source_file        0
COMMODITY2         0
dtype: int64

In [33]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
TFDA_2024.describe()

Unnamed: 0,TRDTYPE,DISAGMOT,COUNTRY,VALUE,SHIPWT,FREIGHT_CHARGES,DF,MONTH,YEAR,COMMODITY2
count,943892.0,943892.0,943892.0,943892.0,943892.0,943892.0,943892.0,943892.0,943892.0,943892.0
mean,1.385956,4.738033,1530.130566,3140772.0,1280400.0,39922.18,0.820197,5.05058,2024.0,61.381021
std,0.486821,1.224175,385.774982,44841360.0,44779080.0,1160656.0,0.748185,2.576271,0.0,29.185375
min,1.0,1.0,1220.0,0.0,0.0,0.0,0.0,1.0,2024.0,1.0
25%,1.0,5.0,1220.0,14248.0,0.0,0.0,0.0,3.0,2024.0,38.0
50%,1.0,5.0,1220.0,73052.5,0.0,300.0,1.0,5.0,2024.0,69.0
75%,2.0,5.0,2010.0,439992.0,4499.0,3025.0,1.0,7.0,2024.0,87.0
max,2.0,9.0,2010.0,5595625000.0,8729049000.0,238945200.0,2.0,9.0,2024.0,99.0


In [35]:
# Shape of the dataset as (rows, columns)
TFDA_2024.shape

(943892, 16)

In [None]:
# Write the DataFrame to a CSV in the current working directory, without the
# index column; 'utf-8-sig' prefixes the file with a UTF-8 byte-order mark.
TFDA_2024.to_csv("TFDA_2024_cleaned.csv", index=False, encoding='utf-8-sig')