In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
collision_train = pd.read_csv("train_data-Dell.csv")
#collision_train.info()
#collision_train.head()

In [3]:
collision_test = pd.read_csv("test_data-Dell.csv")
#collision_test.info()
#collision_test.head()

Dummy encoding converts a variable that has n possible distinct values, into n-1 binary variables. 

For regression-type problems Dummy Encoding is preferable as we wish to avoid perfect collinearity between the binary variables that are being generated by the encoding. In the classification problem however, we are not concerned about inflating factors and hence can choose either of the two schemes. They will produce equivalent encodings.

By default pd.get_dummies() does one-hot enocoding. To do dummy encoding, you need to pass the parameter drop_first=True.

In [4]:
categorical_columns = collision_train.select_dtypes(include=['object']).columns

#print("Categorical columns train:")
#print(categorical_columns)

#test
categorical_columns_test = collision_test.select_dtypes(include=['object']).columns

#print("Categorical columns test:")
#print(categorical_columns_test)


In [5]:
# Assuming collision_train is your original DataFrame
# Copy the DataFrame to avoid chained assignment issues
collision_train_copy = collision_train.copy()

# Drop rows with any NaN values
collision_train_copy.dropna(inplace=True)

# Replace missing values in 'c_object_type' column with the mode of non-NaN values
mode_c_object_type = collision_train_copy['c_object_type'].mode()[0]  # Get the mode value
collision_train_copy['c_object_type'].fillna(mode_c_object_type, inplace=True)

# Creating dummy variables
dummy_columns = pd.get_dummies(collision_train_copy['c_object_type'], prefix='c_object_type')

# Concatenating the original DataFrame with the dummy variables
new_collision_train = pd.concat([collision_train_copy, dummy_columns], axis=1)

# Dropping the original 'c_object_type' column
new_collision_train.drop('c_object_type', axis=1, inplace=True)

# Displaying the new DataFrame
print(new_collision_train)


       event_id  time_to_tca  mission_id       risk  max_risk_estimate  \
5             1     6.530455           5  -7.561299          -7.254301   
6             1     5.561646           5  -9.315693          -7.468904   
7             1     5.226504           5  -7.422508          -7.051001   
8             1     3.570013           5  -9.248105          -7.327533   
29            3     6.950088          19  -9.568315          -7.057793   
...         ...          ...         ...        ...                ...   
70096      5638     1.441664           2 -30.000000          -5.937042   
70097      5638     1.259823           2 -30.000000          -5.936667   
70098      5638     0.852358           2 -30.000000          -5.936667   
70099      5638     0.512948           2 -30.000000          -5.934047   
70100      5638     0.228585           2 -30.000000          -6.259953   

       max_risk_scaling  miss_distance  relative_speed  relative_position_r  \
5              2.746782         

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  collision_train_copy['c_object_type'].fillna(mode_c_object_type, inplace=True)


In [6]:
# Repeat the same process for the test
# Copy the DataFrame to avoid chained assignment issues
collision_test_copy = collision_test.copy()

# Drop rows with any NaN values
collision_test_copy.dropna(inplace=True)

# Replace missing values in 'c_object_type' column with the mode of non-NaN values
mode_c_object_type_test = collision_test_copy['c_object_type'].mode()[0]  # Get the mode value
collision_test_copy['c_object_type'].fillna(mode_c_object_type_test, inplace=True)

# Creating dummy variables
dummy_columns_test = pd.get_dummies(collision_test_copy['c_object_type'], prefix='c_object_type')

# Concatenating the original DataFrame with the dummy variables
new_collision_test = pd.concat([collision_test_copy, dummy_columns_test], axis=1)

# Dropping the original 'c_object_type' column
new_collision_test.drop('c_object_type', axis=1, inplace=True)

# Displaying the new DataFrame
print(new_collision_test)


       event_id  time_to_tca  mission_id       risk  max_risk_estimate  \
37            4     6.702359           5 -30.000000          -6.957818   
38            4     6.351916           5  -6.657380          -6.283746   
39            4     6.038843           5  -6.884722          -6.387534   
40            4     5.705221           5  -6.868061          -6.378408   
41            4     5.038975           5  -9.558462          -6.828566   
...         ...          ...         ...        ...                ...   
24464      2165     3.596498           2 -14.871278          -6.488518   
24465      2165     3.290859           2 -13.675101          -6.458296   
24466      2165     2.950547           2 -13.856673          -6.463442   
24467      2165     2.570998           2 -13.486516          -6.450629   
24468      2165     2.263915           2 -13.484391          -6.450751   

       max_risk_scaling  miss_distance  relative_speed  relative_position_r  \
37            88.002734        1

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  collision_test_copy['c_object_type'].fillna(mode_c_object_type_test, inplace=True)


In [7]:
#transform data to proper scale for train

from sklearn.preprocessing import MinMaxScaler

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the data
new_collision_train_scaled = scaler.fit_transform(new_collision_train)

# Convert the scaled data back to a DataFrame
new_collision_train_scaled = pd.DataFrame(new_collision_train_scaled, columns=new_collision_train.columns)


In [8]:
#option for test

new_collision_test_scaled = scaler.transform(new_collision_test)

new_collision_test_scaled = pd.DataFrame(new_collision_test_scaled, columns=new_collision_test.columns)

In [9]:
X_train = new_collision_train_scaled.drop(columns=['time_to_tca']).values
y_train = new_collision_train_scaled['time_to_tca'].values

In [17]:
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor

# Define the chunk size
chunk_size = 3  # Adjust this value according to your memory constraints

# Determine the number of chunks needed
num_chunks = len(X_train) // chunk_size + 1

# Split the data into chunks
data_splits = []
for i in range(num_chunks):
    start_idx = i * chunk_size
    end_idx = min((i + 1) * chunk_size, X_train.shape[0])
    X_chunk = X_train[start_idx:end_idx]
    y_chunk = y_train[start_idx:end_idx]
    data_splits.append((X_chunk, y_chunk))

# Now data_splits is an iterable of (X_chunk, y_chunk) pairs

# Initialize the classifier
regressor = RandomForestRegressor(warm_start=True, n_estimators=20)
#max_depth=4, min_samples_split=2
# Training loop with progress bar
for _ in tqdm(range(15), desc="Training Progress"):  # 10 passes through the data
    for X, y in data_splits:
        if len(X) == 0:  # Skip empty chunks
            continue
        regressor.fit(X, y)
        regressor.n_estimators += 1


Training Progress: 100%|██████████| 15/15 [06:11<00:00, 24.79s/it]


from sklearn.ensemble import RandomForestRegressor

# Define hyperparameters
# n_estimators = [3, 5]  # Assign specific values
# max_features = ['auto', 'sqrt']
max_depth = [2, 3]
min_samples_split = [2, 3]
# min_samples_leaf = [1, 2]
bootstrap = [True, False]

param_grid = {
    # 'n_estimators': n_estimators,
    # 'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    # 'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

print(param_grid)


In [18]:
#from sklearn.ensemble import RandomForestRegressor
# Initialize RandomForestRegressor
#regressor = RandomForestRegressor()

# Fit the model using the training data
regressor.fit(X_train, y_train)


In [18]:
#from sklearn.model_selection import GridSearchCV
#rf_Grid = GridSearchCV(estimator = regressor, param_grid = param_grid, cv = 3, verbose=2, n_jobs = 4)

In [None]:
#rf_Grid.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [None]:
#rf_Grid.best_params_

In [19]:
X_test = new_collision_test_scaled.drop(columns=['time_to_tca']).values
y_test = new_collision_test_scaled['time_to_tca'].values

In [20]:
# Prediction on the testing dataset
y_test_pred = regressor.predict(X_test)

In [21]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
#Evaluating on test set

mae = mean_absolute_error(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)
rmse = np.sqrt(mse)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R^2 Score:", r2)

Mean Absolute Error (MAE): 0.19825734753457974
Mean Squared Error (MSE): 0.05983666369380995
Root Mean Squared Error (RMSE): 0.24461533822270826
R^2 Score: -0.47989129164619926


In [22]:
from sklearn.metrics import explained_variance_score
evs = explained_variance_score(y_test, y_test_pred)

print('The EVS of the test is: %.2f' % evs)

The EVS of the test is: 0.01


In [23]:
#Prediction on the training set

y_train_pred = regressor.predict(X_train)

mae = mean_absolute_error(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
r2 = r2_score(y_train, y_train_pred)
rmse = np.sqrt(mse)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R^2 Score:", r2)

Mean Absolute Error (MAE): 0.24304681663967873
Mean Squared Error (MSE): 0.07879575614328413
Root Mean Squared Error (RMSE): 0.2807058177937966
R^2 Score: 0.01536414862818869


In [17]:
from sklearn.metrics import explained_variance_score
evs = explained_variance_score(y_train, y_train_pred)

print('The EVS of the train is: %.2f' % evs)

The EVS of the train is: 0.99


In [18]:
# Create loop throuth x columns
feature_list = [f"feature_{i}" for i in range(X_train.shape[1])]

# Create a pandas Series with feature importances
feature_imp = pd.Series(regressor.feature_importances_, index=feature_list).sort_values(ascending = False)

print("List of important features:", feature_imp)

List of important features: feature_32     4.312650e-01
feature_62     8.198626e-02
feature_2      7.634163e-02
feature_87     5.045525e-02
feature_24     3.793469e-02
                   ...     
feature_101    2.339181e-05
feature_102    2.270022e-05
feature_83     1.450949e-05
feature_84     1.796103e-06
feature_104    1.383355e-07
Length: 105, dtype: float64


An Explained Variance Score (EVS) 
In this case, an EVS of 0.99 suggests that the model's predictions are highly accurate and explain almost all of the variance in the target variable.

import matplotlib.pyplot as plt

# Initialize the classifier
#regressor_test = RandomForestRegressor(warm_start=True, n_estimators=1)

# Training loop
#for _ in range(10):  # 10 passes through the data
    #for X, y in data_splits:
        #regressor.fit(X, y)
        #regressor.n_estimators += 1

# Get feature importances from the trained Random Forest model
feature_importances_test = regressor_test.feature_importances_

# Get feature names
feature_names_test = new_collision_test_scaled.drop(columns=['time_to_tca']).columns

# Plotting feature importances with feature names
plt.figure(figsize=(10, 6))
plt.bar(range(len(feature_importances_test)), feature_importances_test, align='center')
plt.xticks(range(len(feature_importances_test)), feature_names_test, rotation=90)  # Set x-axis labels to feature names
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Random Forest Feature Importances')
plt.tight_layout()
plt.show()
