In [1]:
# import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including anything pandas reports while parsing the CSV files below.
# Consider narrowing the filter to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

In [3]:
# 🔹 1. Define your folder path (use raw string to avoid Windows escape errors)
csv_dir = r"C:\Users\Dell\OneDrive\Desktop\Project 1\2023"

# The merged result is written back into csv_dir (step 6), so its name is
# fixed up front and excluded from the inputs; otherwise re-running this cell
# would read the previous merge back in and duplicate every row.
output_name = "2023_merged_freight_data.csv"
output_path = os.path.join(csv_dir, output_name)

# 🔹 2. Get all .csv files in the folder (sorted so the row order is reproducible)
# NOTE(review): the folder holds both monthly files (e.g. dot1_0123.csv) and
# *_ytd_* files. If the ytd files are cumulative, merging both kinds
# double-counts records — confirm before aggregating.
csv_files = sorted(
    file
    for file in os.listdir(csv_dir)
    if file.endswith(".csv") and file != output_name
)
print(f"🗂 Found {len(csv_files)} CSV files.")

# 🔹 3. Create a list to hold individual DataFrames
dataframes = []

# 🔹 4. Loop through files, read them, and append to the list
for file in csv_files:
    file_path = os.path.join(csv_dir, file)

    try:
        # DEPE holds codes such as '0115' and '09XX'; reading it as text keeps
        # leading zeros even in files where every code happens to be numeric.
        df = pd.read_csv(file_path, dtype={"DEPE": str})
        df['source_file'] = file  # Optional: Track which file each row came from
        dataframes.append(df)
        print(f"✅ Loaded: {file}")
    except Exception as e:
        # Best effort: report the unreadable file and keep going with the rest
        print(f"⚠️ Could not read {file}: {e}")

# 🔹 5. Concatenate all loaded DataFrames
# Fail with a clear message instead of hitting a NameError on merged_df below
if not dataframes:
    raise ValueError("❌ No DataFrames loaded. Check file formats or folder path.")

merged_df = pd.concat(dataframes, ignore_index=True)
print(f"✅ Successfully merged {len(dataframes)} DataFrames.")

# 🔹 6. Save to disk
merged_df.to_csv(output_path, index=False)
print(f"📁 Merged CSV saved to: {output_path}")

# 🔹 7. Preview first 5 rows (must be the last expression of the cell to be displayed)
merged_df.head()


🗂 Found 60 CSV files.
✅ Loaded: dot1_0123.csv
✅ Loaded: dot1_0223.csv
✅ Loaded: dot1_0323.csv
✅ Loaded: dot1_0423.csv
✅ Loaded: dot1_0523.csv
✅ Loaded: dot1_0623.csv
✅ Loaded: dot1_0723.csv
✅ Loaded: dot1_0823.csv
✅ Loaded: dot1_0923.csv
✅ Loaded: dot1_1023.csv
✅ Loaded: dot1_1123.csv
✅ Loaded: dot1_1223.csv
✅ Loaded: dot1_ytd_0123.csv
✅ Loaded: dot1_ytd_0223.csv
✅ Loaded: dot1_ytd_0323.csv
✅ Loaded: dot1_ytd_0423.csv
✅ Loaded: dot1_ytd_0523.csv
✅ Loaded: dot1_ytd_0623.csv
✅ Loaded: dot1_ytd_0723.csv
✅ Loaded: dot1_ytd_0823.csv
✅ Loaded: dot2_0123.csv
✅ Loaded: dot2_0223.csv
✅ Loaded: dot2_0323.csv
✅ Loaded: dot2_0423.csv
✅ Loaded: dot2_0523.csv
✅ Loaded: dot2_0623.csv
✅ Loaded: dot2_0723.csv
✅ Loaded: dot2_0823.csv
✅ Loaded: dot2_0923.csv
✅ Loaded: dot2_1023.csv
✅ Loaded: dot2_1123.csv
✅ Loaded: dot2_1223.csv
✅ Loaded: dot2_ytd_0123.csv
✅ Loaded: dot2_ytd_0223.csv
✅ Loaded: dot2_ytd_0323.csv
✅ Loaded: dot2_ytd_0423.csv
✅ Loaded: dot2_ytd_0523.csv
✅ Loaded: dot2_ytd_0623.csv
✅ Loaded: 

In [5]:
# Use the same path you saved to
merged_path = r"C:\Users\Dell\OneDrive\Desktop\Project 1\2023\2023_merged_freight_data.csv"

# Load the CSV into a new DataFrame.
# DEPE mixes all-digit codes ('0115') with alphanumeric ones ('09XX'). With the
# default chunked type inference on a file this large, digit-only stretches can
# be parsed as integers and lose their leading zeros, so the column is pinned
# to text explicitly.
TFDA_2023 = pd.read_csv(merged_path, dtype={"DEPE": str})

# Preview the first 5 rows
TFDA_2023.head()


Unnamed: 0,TRDTYPE,USASTATE,DEPE,DISAGMOT,MEXSTATE,CANPROV,COUNTRY,VALUE,SHIPWT,FREIGHT_CHARGES,DF,CONTCODE,MONTH,YEAR,source_file,COMMODITY2
0,1,AK,0115,5,,XB,1220,6536,0,222,1.0,X,1,2023,dot1_0123.csv,
1,1,AK,0115,5,,XB,1220,10294,0,350,2.0,X,1,2023,dot1_0123.csv,
2,1,AK,0712,5,,XQ,1220,60692,0,2209,1.0,X,1,2023,dot1_0123.csv,
3,1,AK,0901,5,,XO,1220,5480,0,107,1.0,X,1,2023,dot1_0123.csv,
4,1,AK,09XX,3,,XO,1220,12284,5,240,2.0,X,1,2023,dot1_0123.csv,


### Understanding the dataset

In [8]:
# Quick overview: index range, column names with their dtypes, and memory usage
TFDA_2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5919169 entries, 0 to 5919168
Data columns (total 16 columns):
 #   Column           Dtype  
---  ------           -----  
 0   TRDTYPE          int64  
 1   USASTATE         object 
 2   DEPE             object 
 3   DISAGMOT         int64  
 4   MEXSTATE         object 
 5   CANPROV          object 
 6   COUNTRY          int64  
 7   VALUE            int64  
 8   SHIPWT           int64  
 9   FREIGHT_CHARGES  int64  
 10  DF               float64
 11  CONTCODE         object 
 12  MONTH            int64  
 13  YEAR             int64  
 14  source_file      object 
 15  COMMODITY2       float64
dtypes: float64(2), int64(8), object(6)
memory usage: 722.6+ MB


In [10]:
# checking the shape of the dataset as a (rows, columns) tuple, before any cleaning
TFDA_2023.shape

(5919169, 16)

In [12]:
# checking the various columns (includes the source_file column added during the merge)
TFDA_2023.columns

Index(['TRDTYPE', 'USASTATE', 'DEPE', 'DISAGMOT', 'MEXSTATE', 'CANPROV',
       'COUNTRY', 'VALUE', 'SHIPWT', 'FREIGHT_CHARGES', 'DF', 'CONTCODE',
       'MONTH', 'YEAR', 'source_file', 'COMMODITY2'],
      dtype='object')

### Data Cleaning 

In [15]:
# List the distinct TRDTYPE codes present in the data
TFDA_2023['TRDTYPE'].unique()

array([1, 2], dtype=int64)

In [17]:
# See the distribution of values in TRDTYPE; dropna=False would list missing
# values as their own bucket if there were any
TFDA_2023['TRDTYPE'].value_counts(dropna=False)

TRDTYPE
1    3943619
2    1975550
Name: count, dtype: int64

In [19]:
# Placeholder written into each categorical column wherever the value is missing
categorical_placeholders = {
    'USASTATE': 'UNKNOWN',
    'DEPE': '0000',
    'MEXSTATE': 'OT',   # 'OT' = State Unknown
    'CANPROV': 'OT',    # 'OT' = Province Unknown
}

# Apply the placeholders one column at a time
for column, placeholder in categorical_placeholders.items():
    TFDA_2023[column] = TFDA_2023[column].fillna(placeholder)


In [21]:
# Rows with TRDTYPE == 2 are kept untouched, whatever NaNs they contain
protected_mask = TFDA_2023['TRDTYPE'] == 2
trdtype_2_rows = TFDA_2023[protected_mask]

# Every remaining row must have all of these columns populated to survive.
# NOTE(review): the previewed rows from dot1_0123.csv have TRDTYPE 1 and an
# empty COMMODITY2, so rows like those are removed here — confirm that
# dropping them is intended.
required_columns = ['DEPE', 'COMMODITY2', 'DF']
other_rows = TFDA_2023[~protected_mask]
cleaned_other_rows = other_rows.dropna(subset=required_columns)

# Stack the surviving rows first, then the protected ones; ignore_index=True
# already produces a fresh 0..n-1 index, so no separate reset is needed
TFDA_2023_cleaned = pd.concat(
    [cleaned_other_rows, trdtype_2_rows],
    ignore_index=True,
)

# Confirm TRDTYPE distribution
print(TFDA_2023_cleaned['TRDTYPE'].value_counts(dropna=False))

# reassign to main variable
TFDA_2023 = TFDA_2023_cleaned

TRDTYPE
1    3093154
2    1975550
Name: count, dtype: int64


In [23]:
# Neutral value used for each numeric column wherever the value is missing
numeric_fill_values = {'DF': 0, 'COMMODITY2': 99}

# Fill the gaps, then cast the column to integer now that no NaN remains
for column, fill_value in numeric_fill_values.items():
    TFDA_2023[column] = TFDA_2023[column].fillna(fill_value).astype(int)

In [25]:
# Check the column dtypes after the fills and integer casts above
TFDA_2023.dtypes

TRDTYPE             int64
USASTATE           object
DEPE               object
DISAGMOT            int64
MEXSTATE           object
CANPROV            object
COUNTRY             int64
VALUE               int64
SHIPWT              int64
FREIGHT_CHARGES     int64
DF                  int32
CONTCODE           object
MONTH               int64
YEAR                int64
source_file        object
COMMODITY2          int32
dtype: object

In [27]:
# Count rows that duplicate an earlier row across all columns. source_file is
# one of those columns, so identical records coming from different files are
# not counted as duplicates here.
TFDA_2023.duplicated().sum()

0

In [29]:
# checking again for NaNs to be sure they have been worked on
# (per-column count of missing values; every column should report 0)
TFDA_2023.isnull().sum()

TRDTYPE            0
USASTATE           0
DEPE               0
DISAGMOT           0
MEXSTATE           0
CANPROV            0
COUNTRY            0
VALUE              0
SHIPWT             0
FREIGHT_CHARGES    0
DF                 0
CONTCODE           0
MONTH              0
YEAR               0
source_file        0
COMMODITY2         0
dtype: int64

In [31]:
# Summary statistics for the numeric columns only (object columns are left out)
TFDA_2023.describe()

Unnamed: 0,TRDTYPE,DISAGMOT,COUNTRY,VALUE,SHIPWT,FREIGHT_CHARGES,DF,MONTH,YEAR,COMMODITY2
count,5068704.0,5068704.0,5068704.0,5068704.0,5068704.0,5068704.0,5068704.0,5068704.0,5068704.0,5068704.0
mean,1.389754,4.736809,1523.908,3163571.0,1319163.0,44766.77,0.8144571,4.146702,2023.0,61.30545
std,0.4876945,1.234557,384.3532,41669200.0,46806850.0,1289883.0,0.748024,2.777258,0.0,29.20467
min,1.0,1.0,1220.0,2.0,0.0,0.0,0.0,1.0,2023.0,1.0
25%,1.0,5.0,1220.0,14333.0,0.0,0.0,0.0,2.0,2023.0,38.0
50%,1.0,5.0,1220.0,73986.0,0.0,324.0,1.0,4.0,2023.0,69.0
75%,2.0,5.0,2010.0,446287.2,5336.0,3225.0,1.0,6.0,2023.0,87.0
max,2.0,9.0,2010.0,4467612000.0,9068700000.0,248721400.0,2.0,12.0,2023.0,99.0


In [33]:
# Shape after cleaning, for comparison with the shape of the raw merged frame
TFDA_2023.shape

(5068704, 16)

In [37]:
# Write the cleaned frame to the current working directory, without the index
# column and with a UTF-8 byte-order mark
TFDA_2023.to_csv(
    "TFDA_2023_cleaned.csv",
    index=False,
    encoding='utf-8-sig',
)