In [1]:
# import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

In [3]:

# 🔹 1. Define your folder path (use raw string to avoid Windows escape errors)
csv_dir = r"C:\Users\Dell\OneDrive\Desktop\Project 1\2021"

# The merged file is written back into csv_dir, so it must be excluded from the
# inputs; otherwise a re-run would read the previous merge and duplicate every row.
merged_filename = "2021_merged_freight_data.csv"

# Code-like columns are read as text so that per-chunk type inference cannot turn
# all-digit codes into integers (warnings are silenced at the top of this notebook,
# so a DtypeWarning would go unnoticed). Keys missing from a given file are ignored.
code_column_dtypes = {
    'USASTATE': str,
    'DEPE': str,
    'MEXSTATE': str,
    'CANPROV': str,
    'CONTCODE': str,
}

# 🔹 2. Get all .csv files in the folder (sorted: os.listdir order is unspecified)
csv_files = sorted(
    file for file in os.listdir(csv_dir)
    if file.endswith(".csv") and file != merged_filename
)
print(f"🗂 Found {len(csv_files)} CSV files.")
# NOTE(review): every .csv in the folder is merged, including the `*_ytd_*` and
# `*_2021` files; presumably these overlap with the monthly files — confirm that
# merging all of them does not double count shipments.

# 🔹 3. Create a list to hold individual DataFrames
dataframes = []

# 🔹 4. Loop through files, read them, and append to the list
for file in csv_files:
    file_path = os.path.join(csv_dir, file)

    try:
        df = pd.read_csv(file_path, dtype=code_column_dtypes)
        df['source_file'] = file  # Optional: Track which file each row came from
        dataframes.append(df)
        print(f"✅ Loaded: {file}")
    except Exception as e:
        # Best effort: one unreadable file is reported and skipped, not fatal
        print(f"⚠️ Could not read {file}: {e}")

# 🔹 5. Concatenate all loaded DataFrames
# Stop here with a clear message when nothing was loaded; continuing would only
# fail later with a NameError on merged_df.
if not dataframes:
    raise RuntimeError("❌ No DataFrames loaded. Check file formats or folder path.")

merged_df = pd.concat(dataframes, ignore_index=True)
print(f"✅ Successfully merged {len(dataframes)} DataFrames.")

# 🔹 6. Save to disk
output_path = os.path.join(csv_dir, merged_filename)
merged_df.to_csv(output_path, index=False)
print(f"📁 Merged CSV saved to: {output_path}")

# 🔹 7. Preview first 5 rows (must be the last expression of the cell to be displayed)
merged_df.head()


🗂 Found 75 CSV files.
✅ Loaded: dot1_0121.csv
✅ Loaded: dot1_0221.csv
✅ Loaded: dot1_0321.csv
✅ Loaded: dot1_0421.csv
✅ Loaded: dot1_0521.csv
✅ Loaded: dot1_0621.csv
✅ Loaded: dot1_0721.csv
✅ Loaded: dot1_0821.csv
✅ Loaded: dot1_0921.csv
✅ Loaded: dot1_1021.csv
✅ Loaded: dot1_1121.csv
✅ Loaded: dot1_1221.csv
✅ Loaded: dot1_2021.csv
✅ Loaded: dot1_ytd_0121.csv
✅ Loaded: dot1_ytd_0221.csv
✅ Loaded: dot1_ytd_0321.csv
✅ Loaded: dot1_ytd_0421.csv
✅ Loaded: dot1_ytd_0521.csv
✅ Loaded: dot1_ytd_0621.csv
✅ Loaded: dot1_ytd_0721.csv
✅ Loaded: dot1_ytd_0821.csv
✅ Loaded: dot1_ytd_0921.csv
✅ Loaded: dot1_ytd_1021.csv
✅ Loaded: dot1_ytd_1121.csv
✅ Loaded: dot1_ytd_1221.csv
✅ Loaded: dot2_0121.csv
✅ Loaded: dot2_0221.csv
✅ Loaded: dot2_0321.csv
✅ Loaded: dot2_0421.csv
✅ Loaded: dot2_0521.csv
✅ Loaded: dot2_0621.csv
✅ Loaded: dot2_0721.csv
✅ Loaded: dot2_0821.csv
✅ Loaded: dot2_0921.csv
✅ Loaded: dot2_1021.csv
✅ Loaded: dot2_1121.csv
✅ Loaded: dot2_1221.csv
✅ Loaded: dot2_2021.csv
✅ Loaded: dot2_ytd

In [5]:
# Use the same path you saved to
merged_path = r"C:\Users\Dell\OneDrive\Desktop\Project 1\2021\2021_merged_freight_data.csv"

# Identifier-like columns hold codes such as '18XX' or 'X', not quantities. Reading
# them as text keeps the column type uniform; without it pandas infers types chunk
# by chunk, and the resulting DtypeWarning is hidden because warnings are silenced
# at the top of this notebook. Missing values are still read as NaN.
code_column_dtypes = {
    'USASTATE': str,
    'DEPE': str,
    'MEXSTATE': str,
    'CANPROV': str,
    'CONTCODE': str,
}

# Load the CSV into a new DataFrame
TFDA_2021 = pd.read_csv(merged_path, dtype=code_column_dtypes)

# Preview the first 5 rows
TFDA_2021.head()


Unnamed: 0,TRDTYPE,USASTATE,DEPE,DISAGMOT,MEXSTATE,CANPROV,COUNTRY,VALUE,SHIPWT,FREIGHT_CHARGES,DF,CONTCODE,MONTH,YEAR,source_file,COMMODITY2
0,1,AK,18XX,1,XX,,2010,5940,1136,0,1.0,1,1.0,2021,dot1_0121.csv,
1,1,AK,20XX,3,,XA,1220,7490,26,155,1.0,X,1.0,2021,dot1_0121.csv,
2,1,AK,20XX,3,,XA,1220,24885,13,78,2.0,X,1.0,2021,dot1_0121.csv,
3,1,AK,20XX,3,,XC,1220,16415,139,355,1.0,X,1.0,2021,dot1_0121.csv,
4,1,AK,20XX,3,,XC,1220,9025,5,35,2.0,X,1.0,2021,dot1_0121.csv,


### Understanding the dataset

In [8]:
# Quick overview: row count, column names, dtypes and memory footprint
TFDA_2021.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10982798 entries, 0 to 10982797
Data columns (total 16 columns):
 #   Column           Dtype  
---  ------           -----  
 0   TRDTYPE          int64  
 1   USASTATE         object 
 2   DEPE             object 
 3   DISAGMOT         int64  
 4   MEXSTATE         object 
 5   CANPROV          object 
 6   COUNTRY          int64  
 7   VALUE            int64  
 8   SHIPWT           int64  
 9   FREIGHT_CHARGES  int64  
 10  DF               float64
 11  CONTCODE         object 
 12  MONTH            float64
 13  YEAR             int64  
 14  source_file      object 
 15  COMMODITY2       float64
dtypes: float64(3), int64(7), object(6)
memory usage: 1.3+ GB


In [10]:
# Check the dimensions of the dataset as a (rows, columns) tuple
TFDA_2021.shape

(10982798, 16)

In [12]:
# List the column labels present in the merged dataset
TFDA_2021.columns

Index(['TRDTYPE', 'USASTATE', 'DEPE', 'DISAGMOT', 'MEXSTATE', 'CANPROV',
       'COUNTRY', 'VALUE', 'SHIPWT', 'FREIGHT_CHARGES', 'DF', 'CONTCODE',
       'MONTH', 'YEAR', 'source_file', 'COMMODITY2'],
      dtype='object')

### Data Cleaning 

In [14]:
# Inspect the distinct codes stored in TRDTYPE before changing its dtype
TFDA_2021['TRDTYPE'].unique()

array([1, 2], dtype=int64)

In [16]:
# Convert 'TRDTYPE' to a pandas categorical (optional but useful): the column only
# holds a handful of distinct codes, so it behaves like a label rather than a number
TFDA_2021['TRDTYPE'] = TFDA_2021['TRDTYPE'].astype('category')

In [18]:
# Row count per TRDTYPE code; dropna=False also reports a NaN bucket if one exists
TFDA_2021['TRDTYPE'].value_counts(dropna=False)

TRDTYPE
1    7283728
2    3699070
Name: count, dtype: int64

In [20]:
# Replace missing values in the text/code columns with a fixed placeholder each.
categorical_placeholders = {
    'USASTATE': 'UNKNOWN',
    'DEPE': '0000',
    'MEXSTATE': 'OT',  # 'OT' = State Unknown
    'CANPROV': 'OT',   # 'OT' = Province Unknown
}
for column_name, placeholder in categorical_placeholders.items():
    TFDA_2021[column_name] = TFDA_2021[column_name].fillna(placeholder)


In [52]:
# NOTE(review): the saved output of this cell carries execution count 52 and its
# totals match the final shape of the notebook, so it reflects a later re-run, not
# the state this cell sees during a top-to-bottom run.

# One boolean mask splits the frame: TRDTYPE == 2 rows are protected from cleaning
is_trdtype_2 = TFDA_2021['TRDTYPE'] == 2
trdtype_2_rows = TFDA_2021[is_trdtype_2]
other_rows = TFDA_2021[~is_trdtype_2]

# Among the remaining rows keep only those with DEPE, COMMODITY2 and DF all present
required_columns = ['DEPE', 'COMMODITY2', 'DF']
cleaned_other_rows = other_rows[other_rows[required_columns].notna().all(axis=1)]

# Stack the cleaned rows first, then the protected rows; ignore_index=True already
# gives the result a fresh 0..n-1 index, so no separate reset is needed
TFDA_2021_cleaned = pd.concat(
    [cleaned_other_rows, trdtype_2_rows],
    ignore_index=True,
)

# Show how many rows of each TRDTYPE survived
print(TFDA_2021_cleaned['TRDTYPE'].value_counts(dropna=False))

# From here on the cleaned frame is the working dataset
TFDA_2021 = TFDA_2021_cleaned

TRDTYPE
1    5601122
2    3620719
Name: count, dtype: int64


In [30]:
# Remaining gaps in the numeric code columns get a neutral value, after which the
# column is cast to integer: 0 for DF, 99 for COMMODITY2.
numeric_placeholders = {'DF': 0, 'COMMODITY2': 99}
for numeric_column, neutral_value in numeric_placeholders.items():
    TFDA_2021[numeric_column] = TFDA_2021[numeric_column].fillna(neutral_value).astype(int)

In [34]:
TFDA_2021.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9428572 entries, 0 to 9428571
Data columns (total 16 columns):
 #   Column           Dtype   
---  ------           -----   
 0   TRDTYPE          category
 1   USASTATE         object  
 2   DEPE             object  
 3   DISAGMOT         int64   
 4   MEXSTATE         object  
 5   CANPROV          object  
 6   COUNTRY          int64   
 7   VALUE            int64   
 8   SHIPWT           int64   
 9   FREIGHT_CHARGES  int64   
 10  DF               int32   
 11  CONTCODE         object  
 12  MONTH            float64 
 13  YEAR             int64   
 14  source_file      object  
 15  COMMODITY2       int32   
dtypes: category(1), float64(1), int32(2), int64(6), object(6)
memory usage: 1016.1+ MB


In [36]:
# Count rows that are exact duplicates of an earlier row across ALL columns.
# NOTE(review): 'source_file' is one of the compared columns, so identical records
# that came from two different input files are not counted here — confirm whether
# cross-file duplicates should be checked with that column excluded.
TFDA_2021.duplicated().sum()

0

In [38]:
# Count the remaining missing values per column to confirm the earlier fills and
# drops took effect
TFDA_2021.isnull().sum()

TRDTYPE                 0
USASTATE                0
DEPE                    0
DISAGMOT                0
MEXSTATE                0
CANPROV                 0
COUNTRY                 0
VALUE                   0
SHIPWT                  0
FREIGHT_CHARGES         0
DF                      0
CONTCODE                0
MONTH              206731
YEAR                    0
source_file             0
COMMODITY2              0
dtype: int64

In [40]:
# Drop rows that lack a MONTH value. The null check above shows MONTH is the only
# column with missing entries at this point; DEPE, COMMODITY2 and DF stay in the
# subset as a safeguard. Reassigning (instead of inplace=True) leaves
# TFDA_2021_cleaned, which refers to the same DataFrame object, unmodified.
# NOTE(review): this removes every row without a month — presumably rows from
# non-monthly source files; confirm that discarding them is intended.
TFDA_2021 = TFDA_2021.dropna(subset=['DEPE', 'COMMODITY2', 'DF', 'MONTH'])

In [44]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
TFDA_2021.describe()

Unnamed: 0,DISAGMOT,COUNTRY,VALUE,SHIPWT,FREIGHT_CHARGES,DF,MONTH,YEAR,COMMODITY2
count,9221841.0,9221841.0,9221841.0,9221841.0,9221841.0,9221841.0,9221841.0,9221841.0,9221841.0
mean,4.735469,1522.225,2686561.0,1334779.0,37416.01,0.8088051,4.954855,2021.0,60.79244
std,1.216247,383.9503,32436340.0,40334020.0,941061.9,0.7466581,3.054144,0.0,29.33959
min,1.0,1220.0,0.0,0.0,0.0,0.0,1.0,2021.0,1.0
25%,5.0,1220.0,13516.0,0.0,0.0,0.0,2.0,2021.0,35.0
50%,5.0,1220.0,66890.0,0.0,263.0,1.0,4.0,2021.0,68.0
75%,5.0,2010.0,399224.0,6252.0,2900.0,1.0,7.0,2021.0,87.0
max,9.0,2010.0,3921320000.0,8450848000.0,209324700.0,2.0,12.0,2021.0,99.0


In [46]:
# Confirm the dtype of every column after cleaning
TFDA_2021.dtypes

TRDTYPE            category
USASTATE             object
DEPE                 object
DISAGMOT              int64
MEXSTATE             object
CANPROV              object
COUNTRY               int64
VALUE                 int64
SHIPWT                int64
FREIGHT_CHARGES       int64
DF                    int32
CONTCODE             object
MONTH               float64
YEAR                  int64
source_file          object
COMMODITY2            int32
dtype: object

In [48]:
# Final dimensions of the cleaned dataset as a (rows, columns) tuple
TFDA_2021.shape

(9221841, 16)

In [50]:
# Persist the cleaned dataset. The relative path resolves against the current
# working directory; index=False omits the row index, and 'utf-8-sig' writes a
# byte-order mark at the start of the file (presumably for Excel — confirm).
TFDA_2021.to_csv("TFDA_2021_cleaned.csv", index=False, encoding='utf-8-sig')