In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tqdm import tqdm

import time
import pandas as pd

In [3]:
"""
Data processing
"""
train_and_val = pd.read_excel('/Users/trongphan/Downloads/Rice_Datathon/rice-datathon-2025/data/training.xlsx')

# Convert 'Date' to datetime format and extract the year
train_and_val['Date'] = pd.to_datetime(train_and_val['Date'], format='%Y').dt.year

# Fill missing values in 'Model Year' with the median value
train_and_val['Model Year'].fillna(train_and_val['Model Year'].median(), inplace=True)

# Convert 'Model Year' to integer
train_and_val['Model Year'] = train_and_val['Model Year'].astype(int)

# Convert categorical columns to category dtype
categorical_columns = ['Vehicle Category', 'GVWR Class', 'Fuel Type', 'Fuel Technology', 'Electric Mile Range', 'Number of Vehicles Registered at the Same Address', 'Model Year']
for col in categorical_columns:
    train_and_val[col] = train_and_val[col].astype('category')

# Remove duplicates
train_and_val.drop_duplicates(inplace=True)

# Reset index
train_and_val.reset_index(drop=True, inplace=True)

# Display the cleaned dataframe
train_and_val = train_and_val.drop(columns=["Region"])
# Load data (replace with your actual dataset)
# Assuming the data is in a CSV file named "data.csv"
# data = train_and_val[:1000]
data = train_and_val
# Preprocessing
# One-hot encode categorical columns
categorical_cols = ["Vehicle Category", "Fuel Type", "Fuel Technology", "Electric Mile Range"]
data = pd.get_dummies(data, columns=categorical_cols)
# Ordinal encode "Number of Vehicles Registered at the Same Address"
ordinal_mapping = {1: int(1), 2: int(2), 3: int(3), "\u22654": 4, "Unknown": -1}
data["Number of Vehicles Registered at the Same Address"] = data["Number of Vehicles Registered at the Same Address"].map(ordinal_mapping)
# Preprocessing
# Impute missing values in "Model Year"
imputer = SimpleImputer(strategy="median")
data["Model Year"] = imputer.fit_transform(data[["Model Year"]])
# Handle non-numeric values in "GVWR Class"
data["GVWR Class"] = data["GVWR Class"].replace({"Not Applicable": -1, "Unknown": -1})
# Feature engineering
data["Vehicle Age"] = data["Date"] - data["Model Year"]
# Check for missing values in the dataset
print("Missing values in each column:")
print(data.isnull().sum())

# Handle missing values in all columns
for col in data.columns:
    if data[col].dtype == "object" or pd.api.types.is_categorical_dtype(data[col]):  # Categorical columns
        data[col].fillna(data[col].mode()[0], inplace=True)
    else:  # Numerical columns
        data[col].fillna(data[col].median(), inplace=True)
# Split data
X_train = data.drop(columns=["Vehicle Population"])
y_train = data["Vehicle Population"]
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


"""
Test set
"""
df_test = pd.read_excel('/Users/trongphan/Downloads/Rice_Datathon/rice-datathon-2025/data/scoring & submission format.xlsx')
test_data = df_test
# Convert 'Date' to datetime format and extract the year
test_data['Date'] = pd.to_datetime(test_data['Date'], format='%Y').dt.year

# Fill missing values in 'Model Year' with the median value
test_data['Model Year'].fillna(test_data['Model Year'].median(), inplace=True)

# Convert 'Model Year' to integer
test_data['Model Year'] = test_data['Model Year'].astype(int)

# Convert categorical columns to category dtype
categorical_columns = ['Vehicle Category', 'GVWR Class', 'Fuel Type', 'Fuel Technology', 'Electric Mile Range', 'Number of Vehicles Registered at the Same Address', 'Region']
for col in categorical_columns:
    test_data[col] = test_data[col].astype('category')
# One-hot encode categorical columns
categorical_cols = ["Vehicle Category", "Fuel Type", "Fuel Technology", "Electric Mile Range"]
test_data = pd.get_dummies(df_test, columns=categorical_cols)
# Display the cleaned dataframe
test_data = test_data.drop(columns=["Region"])
# Remove duplicates
test_data.drop_duplicates(inplace=True)
# Reset index
test_data.reset_index(drop=True, inplace=True)
# Ordinal encode "Number of Vehicles Registered at the Same Address"
ordinal_mapping = {1: int(1), 2: int(2), 3: int(3), "\u22654": 4, "Unknown": -1}
test_data["Number of Vehicles Registered at the Same Address"] = test_data["Number of Vehicles Registered at the Same Address"].map(ordinal_mapping)

# Preprocessing
# Impute missing values in "Model Year"
imputer = SimpleImputer(strategy="median")
test_data["Model Year"] = imputer.fit_transform(test_data[["Model Year"]])
# Handle non-numeric values in "GVWR Class"
test_data["GVWR Class"] = test_data["GVWR Class"].replace({"Not Applicable": -1, "Unknown": -1})

# Feature engineering
test_data["Vehicle Age"] = test_data["Date"] - test_data["Model Year"]

# Check for missing values in the test_dataset
print("Missing values in each column:")
print(test_data.isnull().sum())
# Handle missing values in all columns
for col in test_data.columns:
    if test_data[col].dtype == "object" or pd.api.types.is_categorical_dtype(test_data[col]):  # Categorical columns
        test_data[col].fillna(test_data[col].mode()[0], inplace=True)
    else:  # Numerical columns
        test_data[col].fillna(test_data[col].median(), inplace=True)
  
# Split test_data
X_test = test_data.drop(columns=["Vehicle Population"])
y_test = test_data["Vehicle Population"]

"""
Handle missing columns of X_test, which is Fuel Type_Unknown
"""

# Find the index of 'Fuel Type_Natural Gas'
insert_index = X_test.columns.get_loc('Fuel Type_Natural Gas') + 1

# Insert 'Fuel Type_Unknown' after 'Fuel Type_Natural Gas' and fill with 0
X_test.insert(insert_index, 'Fuel Type_Unknown', 0)

# Now X_test has the 'Fuel Type_Unknown' column after 'Fuel Type_Natural Gas'
print(X_test.columns)



Missing values in each column:
Date                                                 0
GVWR Class                                           0
Model Year                                           0
Number of Vehicles Registered at the Same Address    0
Vehicle Population                                   0
Vehicle Category_B                                   0
Vehicle Category_BS                                  0
Vehicle Category_BT                                  0
Vehicle Category_MC                                  0
Vehicle Category_MH                                  0
Vehicle Category_P                                   0
Vehicle Category_T1                                  0
Vehicle Category_T2                                  0
Vehicle Category_T3                                  0
Vehicle Category_T4                                  0
Vehicle Category_T5                                  0
Vehicle Category_T6                                  0
Vehicle Category_T7               

#### Label Encoder

In [97]:
# Convert all values in 'GVWR Class' column to strings
df_train['GVWR Class'] = df_train['GVWR Class'].astype(str)
df_test['GVWR Class'] = df_test['GVWR Class'].astype(str)

In [98]:
from sklearn.preprocessing import LabelEncoder

# Convert all values in 'GVWR Class' column to strings
df_train['GVWR Class'] = df_train['GVWR Class'].astype(str)
df_test['GVWR Class'] = df_test['GVWR Class'].astype(str)
                                                        
df_train['Number of Vehicles Registered at the Same Address'] = df_train['Number of Vehicles Registered at the Same Address'].astype(str)
df_test['Number of Vehicles Registered at the Same Address'] = df_test['Number of Vehicles Registered at the Same Address'].astype(str)

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the 'Vehicle Category' column
df_train['Vehicle Category'] = label_encoder.fit_transform(df_train['Vehicle Category'])
df_test['Vehicle Category'] = label_encoder.fit_transform(df_test['Vehicle Category'])

# Fit and transform the 'GVWR Class' column
df_train['GVWR Class'] = label_encoder.fit_transform(df_train['GVWR Class'])
df_test['GVWR Class'] = label_encoder.fit_transform(df_test['GVWR Class'])

# Fit and transform the 'Fuel Type' column
df_train['Fuel Type'] = label_encoder.fit_transform(df_train['Fuel Type'])
df_test['Fuel Type'] = label_encoder.fit_transform(df_test['Fuel Type'])

# Fit and transform the 'Future Technology' column
df_train['Fuel Technology'] = label_encoder.fit_transform(df_train['Fuel Technology'])
df_test['Fuel Technology'] = label_encoder.fit_transform(df_test['Fuel Technology'])

# Fit and transform the 'Electric Mile Range' column
df_train['Electric Mile Range'] = label_encoder.fit_transform(df_train['Electric Mile Range'])
df_test['Electric Mile Range'] = label_encoder.fit_transform(df_test['Electric Mile Range'])

# Apply the custom function to the 'Number of Vehicles Registered at the Same Address' column

df_train['Number of Vehicles Registered at the Same Address'] = label_encoder.fit_transform(df_train['Number of Vehicles Registered at the Same Address'])
df_test['Number of Vehicles Registered at the Same Address'] = label_encoder.fit_transform(df_test['Number of Vehicles Registered at the Same Address'])

# Display the first few rows of the updated dataset
print(df_train.head())
print(df_test.head())

   Date  Vehicle Category  GVWR Class  Fuel Type  Model Year  Fuel Technology  \
0  2019                 5           8          2      2020.0                2   
1  2020                 5           8          2      2020.0                2   
2  2021                 5           8          2      2020.0                2   
3  2019                 5           8          2      2019.0                2   
4  2019                 5           8          2      2018.0                2   

   Electric Mile Range  Number of Vehicles Registered at the Same Address  \
0                    4                                                  4   
1                    4                                                  0   
2                    4                                                  0   
3                    4                                                  4   
4                    4                                                  4   

   Vehicle Population  
0              395883  
1 

#### Fill missing values (I do not think this is necessary)

In [24]:
# Fill the model year empty entry with the mean
df_train['Model Year'] = df_train['Model Year'].fillna(df_train['Model Year'].mean())
df_test['Model Year'] = df_test['Model Year'].fillna(df_test['Model Year'].mean())

In [25]:
# Convert model year to normal distribution
df_train['Model Year'] = df_train['Model Year'].apply(lambda x: (x - df['Model Year'].mean()) / df['Model Year'].std())

# Convert date to normal distribution
df_train['Date'] = df_train['Date'].apply(lambda x: (x - df['Date'].mean()) / df['Date'].std())

# Convert model year to normal distribution
df_test['Model Year'] = df_test['Model Year'].apply(lambda x: (x - df['Model Year'].mean()) / df['Model Year'].std())

# Convert date to normal distribution
df_test['Date'] = df_test['Date'].apply(lambda x: (x - df['Date'].mean()) / df['Date'].std())

print(df_train.head())
print(df_test.head())

NameError: name 'df' is not defined

In [86]:
# Handle missing values (example: fill with mean)
# df_train = df_train.fillna(df_train.mean())
df_test = df_test.fillna(df_test.mean())

#### Model Training

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import numpy as np

In [5]:
from tqdm import tqdm

# Define the K-fold Cross Validator
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [6]:
# Convert all columns to categorical data using one-hot encoding
X_train_encoded = pd.get_dummies(X_train, columns=X_train.columns)
X_test_encoded = pd.get_dummies(X_test, columns=X_test.columns)

# Ensure the same columns in train and test sets
X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='left', axis=1, fill_value=0)

In [7]:
print(X_train_encoded)
print(X_test_encoded)

       Date_2019  Date_2020  Date_2021  Date_2022  Date_2023  GVWR Class_1  \
0              1          0          0          0          0             0   
1              0          1          0          0          0             0   
2              0          0          1          0          0             0   
3              1          0          0          0          0             0   
4              1          0          0          0          0             0   
...          ...        ...        ...        ...        ...           ...   
41035          1          0          0          0          0             0   
41036          1          0          0          0          0             0   
41037          1          0          0          0          0             0   
41038          1          0          0          0          0             0   
41039          1          0          0          0          0             0   

       GVWR Class_2  GVWR Class_3  GVWR Class_4  GVWR Class_5  

#### Simple neural network

In [8]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import numpy as np

# Define the K-fold Cross Validator
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define hyperparameters to tune
hyperparameters = [
    {'units1': 64, 'units2': 32, 'dropout': 0.2, 'learning_rate': 0.001},
    {'units1': 128, 'units2': 64, 'dropout': 0.3, 'learning_rate': 0.001},
    {'units1': 128, 'units2': 64, 'dropout': 0.2, 'learning_rate': 0.0005},
    # Add more hyperparameter combinations as needed
]

# Initialize lists to store results
models = []
mse_scores = []

In [9]:
from joblib import Parallel, delayed

# Function to train and evaluate the model for one fold
def train_evaluate_fold(train_index, val_index, params):
    X_train_fold, X_val_fold = X_train_encoded.iloc[train_index], X_train_encoded.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Build the neural network model
    model = Sequential()
    model.add(Dense(params['units1'], input_dim=X_train_encoded.shape[1], activation='relu'))
    model.add(Dropout(params['dropout']))
    model.add(Dense(params['units2'], activation='relu'))
    model.add(Dropout(params['dropout']))
    model.add(Dense(1, activation='linear'))

    # Compile the model
    optimizer = Adam(learning_rate=params['learning_rate'])
    model.compile(loss='mean_squared_error', optimizer=optimizer)

    # Train the model
    model.fit(X_train_fold, y_train_fold, epochs=50, batch_size=16, verbose=0)

    # Evaluate the model on the validation fold
    y_val_pred = model.predict(X_val_fold)
    mse = mean_squared_error(y_val_fold, y_val_pred)
    return mse, model

# Perform KFold cross-validation
for params in hyperparameters:
    results = Parallel(n_jobs=-1)(delayed(train_evaluate_fold)(train_index, val_index, params) for train_index, val_index in kf.split(X_train_encoded))
    fold_mse_scores, fold_models = zip(*results)

    # Store the average MSE for this hyperparameter combination
    avg_mse = np.mean(fold_mse_scores)
    mse_scores.append(avg_mse)
    models.append(fold_models[0])  # Store one of the trained models

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 740us/step
[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 509us/step
[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 977us/step
[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [10]:
# Select the top 3 models based on validation MSE
top_3_indices = np.argsort(mse_scores)[:3]
top_3_models = [models[i] for i in top_3_indices]

# Evaluate the top 3 models on the test set
test_mse_scores = []
test_rmse_scores = []
for model in top_3_models:
    y_test_pred = model.predict(X_test_encoded)
    mse_test = mean_squared_error(y_test, y_test_pred)
    test_mse_scores.append(mse_test)
    test_rmse_scores.append(np.sqrt(mse_test))

# Print the test MSE scores for the top 3 models
print("Test MSE scores for the top 3 models:", test_mse_scores)
print("Test RMSE scores for the top 3 models:", test_rmse_scores)

[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 903us/step
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Test MSE scores for the top 3 models: [28039246.740397878, 35819880.639902145, 40260046.056439005]
Test RMSE scores for the top 3 models: [5295.209791915508, 5984.971231334545, 6345.0804609901525]


In [11]:
import math
y_test_pred = model.predict(X_test_encoded)
mse_test = mean_squared_error(y_test, y_test_pred)
print(f"TEST RMSE: {math.sqrt(mse_test)}")

[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
TEST RMSE: 6345.0804609901525


In [12]:
# Construct a DataFrame of y_test and y_test_pred
df_results = pd.DataFrame({
    'y_test': y_test,
    'y_test_pred': y_test_pred.flatten()
})

print(df_results)

      y_test    y_test_pred
0     316065  266766.843750
1     315986  270386.937500
2     306487  268434.906250
3     284754  245489.625000
4     284153  264903.218750
...      ...            ...
7540       1     -44.441864
7541       1     -51.635880
7542       1     -53.184044
7543       1     -38.355873
7544       1     -71.180710

[7545 rows x 2 columns]


#### K-fold Cross Val

In [43]:
# K-fold Cross Validation model evaluation
for train_index, val_index in tqdm(kf.split(X_train)):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Build the neural network model
    model = Sequential()
    model.add(Dense(64, input_dim=X_train_fold.shape[1], activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='linear'))
    
    # Compile the model
    optimizer = Adam(learning_rate=0.001)
    model.compile(loss='mean_squared_error', optimizer=optimizer)
    
    # Train the model
    model.fit(X_train_fold, y_train_fold, epochs=100, batch_size=32, verbose=0)

    # Evaluate the model on the validation fold
    y_val_pred = model.predict(X_val_fold)
    mse = mean_squared_error(y_val_fold, y_val_pred)
    print(f"MSE: {mse}")
    mse_scores.append(mse)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 332us/step


1it [00:35, 35.66s/it]

MSE: 0.09478531330351113


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
1it [00:50, 50.67s/it]


KeyboardInterrupt: 

In [39]:
# Calculate the average MSE from cross-validation
print(mse_scores)
average_mse_cv = np.mean(mse_scores)
print(f"Cross-Validation Mean Squared Error: {average_mse_cv}")

# Evaluate the model on the testing dataset
y_test_pred = model.predict(X_test)
# Calculate RMSE for y_test_pred
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"Testing Root Mean Squared Error: {rmse_test}")

# Testing Mean Squared Error: 0.26270263641985175

[0.07589031293707155, 0.09949474760384672, 0.11542433867776858, 0.0964253643221126, 0.16584865380707592]
Cross-Validation Mean Squared Error: 0.11061668346957507
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 349us/step
Testing Root Mean Squared Error: 0.4532908615170071


In [42]:
y_test_pred

array([[ 8.858052  ],
       [ 9.234387  ],
       [ 9.411184  ],
       ...,
       [-0.15806201],
       [-0.15560657],
       [-0.15710393]], dtype=float32)

In [41]:
# Construct a DataFrame of y_test and y_test_pred
df_results = pd.DataFrame({
    'y_test': y_test,
    'y_test_pred': y_test_pred.flatten()
})

print(df_results.head())

      y_test  y_test_pred
0  16.477212     8.858052
1  16.473047     9.234387
2  15.972279     9.411184
3  14.826558     9.821026
4  14.794875     9.599652


#### Another Model (MLP)

In [89]:
# Implement a neural network model to it to predict the Vehicle Population
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

# Define the target and features
X_train = df_train.drop(columns=['Vehicle Population']) 
y_train = df_train['Vehicle Population']  
X_test = df_test.drop(columns=['Vehicle Population'])  
y_test = df_test['Vehicle Population']  


# Create a pipeline with standardization and MLPRegressor
pipeline = make_pipeline(StandardScaler(), MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42))
# Fit the model
pipeline.fit(X_train, y_train)
# Make predictions
y_pred = pipeline.predict(X_test)
# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)

In [90]:
print("Mean Squared Error:", mse)

Mean Squared Error: 0.3220264745127927


In [91]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import numpy as np

# Define the target and features
X_train = df_train.drop(columns=['Vehicle Population']) 
y_train = df_train['Vehicle Population']  
X_test = df_test.drop(columns=['Vehicle Population'])  
y_test = df_test['Vehicle Population']  

# Build the neural network model
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='linear'))

# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='mean_squared_error', optimizer=optimizer)

# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)

# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78