In [26]:
import pandas as pd

# Read the labelled training set from the CSV in the working directory.
train_data_path = 'Cargo Volume - Train.csv'
train_data = pd.read_csv(filepath_or_buffer=train_data_path)

# Preview the top five records as a quick sanity check of the load.
train_data.head(n=5)


Unnamed: 0,Activity Period,Operating Airline,Operating Airline IATA Code,Published Airline,Published Airline IATA Code,GEO Summary,GEO Region,Activity Type Code,Cargo Type Code,Cargo Aircraft Type,Cargo Metric TONS
0,201601,Sun Country Airlines,SY,Sun Country Airlines,SY,Domestic,US,Enplaned,Cargo,Passenger,8.871509
1,200811,United Airlines,UA,United Airlines,UA,Domestic,US,Deplaned,Cargo,Passenger,53.946194
2,200709,Qantas Airways,QF,Qantas Airways,QF,International,Australia / Oceania,Deplaned,Express,Passenger,0.043999
3,201608,Philippine Airlines,PR,Philippine Airlines,PR,International,Asia,Deplaned,Mail,Passenger,19.02353
4,201708,Singapore Airlines,SQ,Singapore Airlines,SQ,International,Asia,Deplaned,Express,Passenger,6.95142


In [27]:
# Read the test set from the CSV in the working directory.
test_data_path = 'Cargo Volume - Test.csv'
test_data = pd.read_csv(filepath_or_buffer=test_data_path)

# Preview the top five records as a quick sanity check of the load.
test_data.head(n=5)

Unnamed: 0,Activity Period,Operating Airline,Operating Airline IATA Code,Published Airline,Published Airline IATA Code,GEO Summary,GEO Region,Activity Type Code,Cargo Type Code,Cargo Aircraft Type
0,200907,United Airlines,UA,United Airlines,UA,Domestic,US,Deplaned,Mail,Passenger
1,202001,Nippon Cargo Airlines,KZ,Nippon Cargo Airlines,KZ,International,Asia,Deplaned,Cargo,Freighter
2,201107,China Airlines,CI,China Airlines,CI,International,Asia,Enplaned,Cargo,Passenger
3,201809,China Airlines,CI,China Airlines,CI,International,Asia,Deplaned,Cargo,Passenger
4,200811,China Cargo Airlines,CK,China Cargo Airlines,CK,International,Asia,Enplaned,Cargo,Freighter


In [28]:
# Tally null entries per column, first for the training frame and then for the test frame.
missing_values_train, missing_values_test = (
    frame.isna().sum() for frame in (train_data, test_data)
)

# Display both per-column tallies together.
(missing_values_train, missing_values_test)


(Activity Period                 0
 Operating Airline               0
 Operating Airline IATA Code    31
 Published Airline               0
 Published Airline IATA Code    31
 GEO Summary                     0
 GEO Region                      0
 Activity Type Code              0
 Cargo Type Code                 0
 Cargo Aircraft Type             0
 Cargo Metric TONS               0
 dtype: int64,
 Activity Period                 0
 Operating Airline               0
 Operating Airline IATA Code    14
 Published Airline               0
 Published Airline IATA Code    14
 GEO Summary                     0
 GEO Region                      0
 Activity Type Code              0
 Cargo Type Code                 0
 Cargo Aircraft Type             0
 dtype: int64)

In [29]:
# Impute the missing IATA codes, then one-hot encode train and test so that both
# share exactly the same feature columns with the same meaning.

# The mode is learned from the TRAINING data only and reused for the test set, so
# both sets are filled with the same value, one the model has seen during training.
for column in ['Operating Airline IATA Code', 'Published Airline IATA Code']:
    mode_value_train = train_data[column].mode()[0]
    # Assign the filled column back instead of calling fillna(..., inplace=True) on a
    # column selection: the chained in-place form is deprecated in pandas 2.x and
    # does not modify the parent frame when Copy-on-Write is enabled.
    train_data[column] = train_data[column].fillna(mode_value_train)
    test_data[column] = test_data[column].fillna(mode_value_train)

# Verify if missing values have been filled
missing_values_train_after = train_data.isnull().sum()
missing_values_test_after = test_data.isnull().sum()

# Prepare data for one-hot encoding by converting categorical columns to type 'category'
categorical_columns = train_data.select_dtypes(include=['object']).columns
train_data[categorical_columns] = train_data[categorical_columns].astype('category')
test_data[categorical_columns] = test_data[categorical_columns].astype('category')

# Encode the test set from a COPY whose categoricals carry the training categories.
# With identical category lists, drop_first=True removes the same baseline level in
# both frames; encoding each frame with its own categories can drop a different
# level in the test set and silently relabel those rows as the training baseline.
# Levels that only occur in the test set become NaN in the copy and therefore
# encode as all zeros. `test_data` itself keeps its original values.
test_data_aligned = test_data.copy()
for column in categorical_columns:
    test_data_aligned[column] = test_data_aligned[column].cat.set_categories(
        train_data[column].cat.categories
    )

# Apply one-hot encoding to the categorical data
train_data_encoded = pd.get_dummies(train_data, drop_first=True)
test_data_encoded = pd.get_dummies(test_data_aligned, drop_first=True)

# Ensure the test matrix has exactly the training feature columns, in the same order.
# The target is excluded up front rather than being added by reindex and dropped again.
feature_columns = train_data_encoded.columns.drop('Cargo Metric TONS')
test_data_encoded = test_data_encoded.reindex(columns=feature_columns, fill_value=0)

(missing_values_train_after, missing_values_test_after, train_data_encoded.shape, test_data_encoded.shape)


(Activity Period                0
 Operating Airline              0
 Operating Airline IATA Code    0
 Published Airline              0
 Published Airline IATA Code    0
 GEO Summary                    0
 GEO Region                     0
 Activity Type Code             0
 Cargo Type Code                0
 Cargo Aircraft Type            0
 Cargo Metric TONS              0
 dtype: int64,
 Activity Period                0
 Operating Airline              0
 Operating Airline IATA Code    0
 Published Airline              0
 Published Airline IATA Code    0
 GEO Summary                    0
 GEO Region                     0
 Activity Type Code             0
 Cargo Type Code                0
 Cargo Aircraft Type            0
 dtype: int64,
 (27230, 360),
 (6807, 359))

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Separate the regression target from the predictor columns.
y_train = train_data_encoded['Cargo Metric TONS']
X_train = train_data_encoded.drop(columns='Cargo Metric TONS')

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression baseline on the training portion (fit() returns the estimator).
lr_model = LinearRegression().fit(X_train_split, y_train_split)

# Score the baseline on the held-out rows using mean absolute error.
y_val_pred = lr_model.predict(X_val_split)
mae = mean_absolute_error(y_val_split, y_val_pred)
mae


170.07216307152686

In [31]:
# Score the encoded test rows with the fitted linear regression model.
test_predictions = lr_model.predict(test_data_encoded)

# The test set carries no target column, so no error metric can be computed here;
# the predictions are simply wrapped in a single-column frame for later use.
test_predictions_linear_df = pd.DataFrame(
    {'Predicted Cargo Metric TONS - Linear Regression': test_predictions}
)

# Preview the first few predictions.
test_predictions_linear_df.head()

Unnamed: 0,Predicted Cargo Metric TONS - Linear Regression
0,469.074877
1,404.838212
2,283.131343
3,305.419591
4,206.267583


In [32]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Random Forest: fit on the training portion, then score the held-out rows with MAE.
rf_model = RandomForestRegressor(random_state=42).fit(X_train_split, y_train_split)
y_val_pred_rf = rf_model.predict(X_val_split)
mae_rf = mean_absolute_error(y_val_split, y_val_pred_rf)

# Gradient Boosting: same procedure, same seed, same validation rows.
gb_model = GradientBoostingRegressor(random_state=42).fit(X_train_split, y_train_split)
y_val_pred_gb = gb_model.predict(X_val_split)
mae_gb = mean_absolute_error(y_val_split, y_val_pred_gb)

# Display the two validation errors together for comparison.
(mae_rf, mae_gb)


(41.41488617408556, 148.56370999554102)

In [33]:
# Score the encoded test rows with the fitted Random Forest model.
test_predictions_rf = rf_model.predict(test_data_encoded)

# Wrap the predictions in a single-column frame for later use.
test_predictions_rf_df = pd.DataFrame(
    {'Predicted Cargo Metric TONS - RF': test_predictions_rf}
)

# Preview the first few predictions.
test_predictions_rf_df.head()

Unnamed: 0,Predicted Cargo Metric TONS - RF
0,57.078366
1,159.869728
2,173.231605
3,221.179678
4,108.546884


In [34]:
# Score the encoded test rows with the fitted Gradient Boosting model.
test_predictions_gb = gb_model.predict(test_data_encoded)

# Wrap the predictions in a single-column frame for later use.
test_predictions_gb_df = pd.DataFrame(
    {'Predicted Cargo Metric TONS - GB': test_predictions_gb}
)

# Preview the first few predictions.
test_predictions_gb_df.head()

Unnamed: 0,Predicted Cargo Metric TONS - GB
0,398.788651
1,446.741773
2,203.364812
3,227.265092
4,397.325644


In [36]:
# Attach the predictions of all three models (Linear Regression, Random Forest,
# Gradient Boosting) to a copy of the test rows; assign() returns a new frame and
# aligns each prediction Series on the index, leaving `test_data` untouched.
result_df = test_data.assign(**{
    'Predicted Cargo Metric TONS - Linear Regression':
        test_predictions_linear_df['Predicted Cargo Metric TONS - Linear Regression'],
    'Predicted Cargo Metric TONS - RF':
        test_predictions_rf_df['Predicted Cargo Metric TONS - RF'],
    'Predicted Cargo Metric TONS - GB':
        test_predictions_gb_df['Predicted Cargo Metric TONS - GB'],
})

# Write the combined frame to disk without the row index.
result_file = 'dc1.csv'
result_df.to_csv(result_file, index=False)

# Echo the output path as the cell result.
result_file


'dc1.csv'