In [20]:
import pandas as pd
import joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor

In [26]:
# Load training data
train_data = pd.read_csv("/home/khanh/projects/Project-ML/train_file.csv")
X_train = train_data.drop('price', axis=1)
y_train = train_data['price']

# One-hot encode categorical variables in the training data
x_train_en = pd.get_dummies(X_train, drop_first=True)

# Impute missing values
imputer = KNNImputer()
x_train_imputed = imputer.fit_transform(x_train_en)
x_train_imputed_df = pd.DataFrame(x_train_imputed, columns=x_train_en.columns)

# Scale features
feature_scaler = MinMaxScaler()
x_train_scaled = feature_scaler.fit_transform(x_train_imputed_df)

# Scale target
target_scaler = MinMaxScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))

# Feature selection using RFE with a DecisionTreeRegressor
dtr = DecisionTreeRegressor()
rfe = RFE(dtr, n_features_to_select=6)  # Adjust n_features_to_select as needed
rfe.fit(x_train_scaled, y_train_scaled.ravel())

# Select the important features
selected_columns = x_train_imputed_df.columns[rfe.support_].tolist()

# Train the model using only the selected features
x_train_selected = x_train_imputed_df[selected_columns]
x_train_selected_scaled = feature_scaler.fit_transform(x_train_selected)

model = GradientBoostingRegressor(learning_rate=0.10903, max_depth=5, n_estimators=54)
model.fit(x_train_selected_scaled, y_train_scaled.ravel())

# Save the model, scalers, and selected columns
joblib.dump(model, 'final_gradient_model.pkl')
joblib.dump(feature_scaler, 'feature_scaler.pkl')
joblib.dump(target_scaler, 'target_scaler.pkl')
joblib.dump(selected_columns, 'selected_columns.pkl')


['selected_columns.pkl']

In [25]:
import pandas as pd
import joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

# Load the saved model
final_gradient = joblib.load('final_gradient_model.pkl')

# Load the original training data to ensure alignment of columns
train_data = pd.read_csv("/home/khanh/projects/Project-ML/train_file.csv")

# Separate features (X) and target (y) from the training data
X_train = train_data.drop('price', axis=1)
y_train = train_data['price']

# One-hot encode categorical variables in the training data
x_train_en = pd.get_dummies(X_train, drop_first=True)

# Load test data
test_data = pd.read_csv("/home/khanh/projects/Project-ML/test_file.csv")

# Preprocess the first row of test data
test_row = test_data.head(1).copy()

# Save test data identifiers
Id_pred = test_row['id']
Room_pred = test_row['room']
Area_pred = test_row['area']
Toilet_pred = test_row['toilet']

# Drop unnecessary columns
test_row.drop(['room', 'area', 'toilet', 'id', 'price'], inplace=True, axis=1)

# One-hot encode categorical variables in the test data
test_en = pd.get_dummies(test_row, drop_first=True)

# Align train and test data columns
test_en = test_en.reindex(columns=x_train_en.columns, fill_value=0)

# Impute missing values in the test data
imputer = KNNImputer()
imputer.fit(x_train_en)
test_en_3 = imputer.transform(test_en)
test_en_3 = pd.DataFrame(test_en_3, columns=test_en.columns)

# Scale the test data
mm_scaler = MinMaxScaler()
mm_scaler.fit(x_train_en)
test_scaled = pd.DataFrame(mm_scaler.transform(test_en_3), columns=test_en_3.columns)

# Select the same features as used in the training
selected_features = ['room', 'area', 'x', 'y', 'khoang_cach', 'n_hospital']
test_selected = test_scaled[selected_features]

# Predict using the loaded model
ypred_scale = final_gradient.predict(test_selected)
ypred_scale = pd.DataFrame(ypred_scale)

# Load the target scaler used during training
target_scaler = MinMaxScaler()
y_train_df = pd.DataFrame(y_train)
target_scaler.fit(y_train_df)

# Inverse transform the prediction to get the original scale
ypred = target_scaler.inverse_transform(ypred_scale)

# Create a DataFrame for predictions
pred_data = pd.DataFrame(ypred, columns=['price'])
target_pred = pd.concat([Id_pred, Room_pred, Area_pred, Toilet_pred, pred_data], axis=1)

# Display the prediction
print(target_pred)


     id  room  area  toilet      price
0  3309     3  84.7       2  28.514112
