<a href="https://colab.research.google.com/github/ihsanandrinal/customs-duty-estimation/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import numpy as np
import pandas as pd
import io
from google.colab import files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Load Dataset from CSV
upload = files.upload()
filename = list(upload.keys())[0]
df = pd.read_csv(
     io.BytesIO(upload[filename]),
     sep=";",  # Explicitly set the delimiter to comma
     quotechar='"',  # Explicitly set the quote character to double quote
     # You can use error_bad_lines=False to skip bad lines instead of raising an error
     # or on_bad_lines='skip' to skip them
     on_bad_lines='skip', # Skip lines with too many fields
    )

# Ensure the dataset has the required columns
required_columns = ["doc_date", "hscode", "description", "origin", "customs_duty", "coo"]
if not all(column in df.columns for column in required_columns):
  print(f"Avalaible columns in CSV: {df.columns.tolist()}")
  raise ValueError(f"The CSV file must contain the following columns: {required_columns}")

# Preprocess date data
df["doc_date"] = pd.to_datetime(df["doc_date"], format="%d/%m/%Y")
df["year"] = df["doc_date"].dt.year
df["month"] = df["doc_date"].dt.month
df["day"] = df["doc_date"].dt.day

year_feature = df['year'].values.reshape(-1, 1)  # Reshape to 2D array
month_feature = df['month'].values.reshape(-1, 1)
day_feature = df['day'].values.reshape(-1, 1)  # Add more date features as needed

# Preprocess hscode data
encoder = OneHotEncoder(sparse_output=False)
hscode_encoded = encoder.fit_transform(df[["hscode"]])

# Preprocess Text Descriptions
vectorizer = TfidfVectorizer(max_features=10)  # Limit features to 10 for simplicity
text_features = vectorizer.fit_transform(df["description"]).toarray()

# Encode Categorical Features
encoder = OneHotEncoder(sparse_output=False)
categorical_features = encoder.fit_transform(df[["origin"]])

# Normalize Numeric Features
df["customs_duty"] = df["customs_duty"].str.replace(',', '.').astype(float)
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[["customs_duty"]])

# Preprocess coo data
#encoder = OneHotEncoder(sparse_output=False)
#coo_encoded = encoder.fit_transform(df[["coo"]])

# Step 5: Combine All Features
combined_features = np.hstack((year_feature, month_feature, day_feature, hscode_encoded, text_features, categorical_features, scaled_values))

# Step 6: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(combined_features, df["customs_duty"], test_size=0.2, random_state=42)

# Step 7: Train the Model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Step 8: Make Predictions
predictions = model.predict(X_test)

# Step 9: Evaluate the Model
mae = mean_absolute_error(y_test, predictions)
print("Mean Absolute Error:", mae)

# Example: Predict for a new description
new_data = {
    "doc_date": ["01/01/2023"],
    "hscode": ["090210"],
    "description": ["Coffee"],
    "customs_duty": [10000],
    "origin": ["CH"]
}
new_df = pd.DataFrame(new_data)

# Preprocess new data
new_df["doc_date"] = pd.to_datetime(new_df["doc_date"], format="%d/%m/%Y")
new_df["year"] = new_df["doc_date"].dt.year
new_df["month"] = new_df["doc_date"].dt.month
new_df["day"] = new_df["doc_date"].dt.day

year_feature_new = new_df['year'].values.reshape(-1, 1)
month_feature_new = new_df['month'].values.reshape(-1, 1)
day_feature_new = new_df['day'].values.reshape(-1, 1)

encoder_hscode = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder_hscode.fit(df[["hscode"]])
hscode_encoded_new = encoder_hscode.transform(new_df[["hscode"]])

txt_features = vectorizer.transform(new_df["description"]).toarray()
scaled_val = scaler.transform(new_df[["customs_duty"]])
cat_features = encoder.transform(new_df[["origin"]])

new_combined_features = np.hstack((year_feature_new, month_feature_new, day_feature_new, hscode_encoded_new, txt_features, scaled_val, cat_features))
new_prediction = model.predict(new_combined_features)

print("Predicted Customs Duty:", new_prediction[0])

Saving test_sample.csv to test_sample (5).csv
Mean Absolute Error: 18986268.367750935
Predicted Customs Duty: 72189.32199999999
