In [1]:
import pandas as pd
import numpy as np
import vaex
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, accuracy_score
from sklearn.model_selection import train_test_split

# Read the data from CSV files
data = pd.read_csv('Doceree-HCP_Train.csv', encoding='latin-1')
test = pd.read_csv('Doceree-HCP-Test.csv', encoding='latin-1')

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Encode string columns in 'data'
for column in data.columns:
    if data[column].dtype == 'object':
        data[column] = label_encoder.fit_transform(data[column])

# Encode string columns in 'test'
for column in test.columns:
    if test[column].dtype == 'object':
        test[column] = label_encoder.fit_transform(test[column])

# Export modified dataframes to HDF5 files
vaex_data=vaex.from_pandas(data)
vaex_test=vaex.from_pandas(test)
vaex_data.export_hdf5('Doceree-HCP_Train.hdf5')
vaex_test.export_hdf5('Doceree-HCP-Test.hdf5')

# data.to_hdf('Doceree-HCP_Train.hdf5', key='data', mode='w')
# test.to_hdf('Doceree-HCP-Test.hdf5', key='test', mode='w')

# Open HDF5 files as Vaex DataFrames
vaex_df = vaex.open('Doceree-HCP_Train.hdf5')
vaex_test = vaex.open('Doceree-HCP-Test.hdf5')

# Shuffle the DataFrame with a fixed random state
vaex_df = vaex_df.shuffle(random_state=42)

# Fill missing values with 0 in vaex_df
for column in vaex_df.columns:
    vaex_df[column].fillna(0, inplace=True)

for column in vaex_test.columns:
    # Fill missing values with 0 for the current column
    vaex_test[column].fillna(0, inplace=True)

# Split vaex_df into train and test sets
df_train, df_test = vaex_df.ml.train_test_split(test_size=0.2, verbose=False)

# Define features and target for IS_HCP model
features = ['DEVICETYPE', 'PLATFORM_ID', 'BIDREQUESTIP', 'USERPLATFORMUID', 'USERCITY', 'USERZIPCODE','USERAGENT', 'PLATFORMTYPE', 'CHANNELTYPE', 'URL', 'KEYWORDS']
target = 'IS_HCP'

# Extract features and target arrays for IS_HCP model
X_train = df_train[features].values
y_train = df_train[target].values

X_test = df_test[features].values
y_test = df_test[target].values

# Create and fit the RandomForestRegressor model for IS_HCP
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Set the threshold for classification
threshold = 0.5

# Make predictions for IS_HCP model
predictions = model.predict(X_test)

# Apply threshold for classification
class_predictions = np.where(predictions >= threshold, 1, 0)

# Calculate and print evaluation metrics for IS_HCP model
r2 = r2_score(y_test, class_predictions)
mae = mean_absolute_error(y_test, class_predictions)
print("IS_HCP Model - R-squared:", r2)
print("IS_HCP Model - Mean Absolute Error:", mae)

# Define features and target for Taxonomy model
txo_features = ['DEVICETYPE', 'BIDREQUESTIP', 'USERPLATFORMUID', 'USERCITY', 'USERZIPCODE','USERAGENT', 'PLATFORMTYPE', 'CHANNELTYPE', 'URL', 'KEYWORDS']
txo_target = 'IS_HCP'

# Extract features and target arrays for Taxonomy model
X = vaex_df[txo_features].values
y = vaex_df[txo_target].values

# Encode the target variable for the Taxonomy model
encoded_labels = label_encoder.fit_transform(y)

# Split data for the Taxonomy model
X_train, X_test, y_train, y_test = train_test_split(X, encoded_labels, test_size=0.2, random_state=42)

# Create and fit the RandomForestRegressor model for Taxonomy
txo_model = RandomForestRegressor()
txo_model.fit(X_train, y_train)

# Make predictions for the Taxonomy model
t_predictions = txo_model.predict(X_test)

# Calculate and print evaluation metrics for the Taxonomy model
r2 = r2_score(y_test, t_predictions)
mae = mean_absolute_error(y_test, t_predictions)
print("Taxonomy Model - R-squared:", r2)
print("Taxonomy Model - Mean Absolute Error:", mae)

# Decode the encoded labels back to original labels for accuracy calculation
decoded_labels = label_encoder.inverse_transform(y_test)

# Calculate accuracy for the Taxonomy model
accuracy = accuracy_score(decoded_labels, np.round(t_predictions))
print("Taxonomy Model - Accuracy:", accuracy)


ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values