Given our target variable ('total_cost') is a continuous variable, this is a regression problem. 

Here's a simplified step-by-step guide using Python and scikit-learn, a popular machine learning library:

1. Data Preprocessing:
    - Handle missing values: Check and handle missing values in your dataset.
    - Encode categorical variables: Convert categorical variables into numerical format, as most machine learning models require numerical input.
    
2. Feature Selection:
    - Identify relevant features: Select features that are likely to have an impact on predicting 'total_cost.'
    
3. Split the Data:
    - Split your dataset into a training set and a testing set. The training set is used to train the model, and the testing set is used to evaluate its performance.

4. Build a Model:
    - Choose a regression algorithm: Common choices include Linear Regression, Decision Trees, Random Forest, Gradient Boosting, or Support Vector Machines.
    - Train the model using the training data.

5. Evaluate the Model:
    - Use the testing set to evaluate the model's performance.
    - Common evaluation metrics for regression include Mean Absolute Error (MAE), Mean Squared Error (MSE), and R-squared.

6. Tune and Improve:
    - If the model performance is not satisfactory, you may need to fine-tune hyperparameters or try different algorithms.
    - Feature engineering: Experiment with creating new features or transforming existing ones to improve model performance.


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [110]:
df = pd.read_csv('savegame.csv')

# Handle missing values
df.fillna(0, inplace=True)

# Encode categorical variables
label_encoder = LabelEncoder()
df['tour_arrangement'] = label_encoder.fit_transform(df['tour_arrangement'])
df['main_activity'] = label_encoder.fit_transform(df['main_activity'])

# Select features and target
features = df[['tour_arrangement', 'main_activity', 'night_total']]
target = df['total_cost']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Build the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate the model
print('Mean Absolute Error:', mean_absolute_error(y_test, predictions))
print('Mean Squared Error:', mean_squared_error(y_test, predictions))
print('R-squared:', r2_score(y_test, predictions))

Mean Absolute Error: 6109365.916803202
Mean Squared Error: 108286955223790.94
R-squared: 0.23309402707635152


In [113]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


df = pd.read_csv('savegame.csv')

# Split the data
features = df[['tour_arrangement', 'main_activity', 'night_total', 'country', 'age_group']]
target = df['total_cost']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Define the numerical and categorical features
numeric_features = ['night_total']
categorical_features = ['tour_arrangement', 'main_activity', 'country', 'age_group']

# Create transformers for numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),

    
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with the preprocessor and the regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

# Fit the model
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate the model
print('Mean Absolute Error:', mean_absolute_error(y_test, predictions))
print('Mean Squared Error:', mean_squared_error(y_test, predictions))
print('R-squared:', r2_score(y_test, predictions))



Mean Absolute Error: 6159056.626087277
Mean Squared Error: 102174374823900.92
R-squared: 0.2763843237604138


In [63]:
o=OneHotEncoder()
o.fit(X_train, y_train)


In [118]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load your data (replace 'your_data.csv' with the actual file name)
df = pd.read_csv('savegame.csv')

# Split the data
features = df[['tour_arrangement', 'main_activity', 'night_total', 'country', 'age_group', 'most_impressing', 'first_trip_tz', 'info_source']]
target = df['total_cost']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Define the numerical and categorical features
numeric_features = ['night_total']
categorical_features = ['tour_arrangement', 'main_activity', 'country', 'age_group', 'most_impressing', 'first_trip_tz', 'info_source']

# Create transformers for numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with the preprocessor and a regression model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                            ('regressor', LinearRegression())])

# Define a parameter grid for GridSearchCV
param_grid = {
    'regressor': [LinearRegression(), Ridge(alpha=0.5), DecisionTreeRegressor(), RandomForestRegressor(), XGBRegressor()],
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Make predictions
predictions = best_model.predict(X_test)

# Evaluate the best model
print('Best Model:', best_model.named_steps['regressor'])
print('Mean Absolute Error:', mean_absolute_error(y_test, predictions))
print('Mean Squared Error:', mean_squared_error(y_test, predictions))
print('R-squared:', r2_score(y_test, predictions))

def calculate_mape(actual, predicted):
    return np.mean(np.abs((actual - predicted) / actual)) * 100

mape_value = calculate_mape(y_test, predictions)
print(f'MAPE: {mape_value:.2f}%')

best_model

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END .......................regressor=LinearRegression(); total time=   0.0s
[CV] END .......................regressor=LinearRegression(); total time=   0.0s
[CV] END .......................regressor=LinearRegression(); total time=   0.0s
[CV] END .......................regressor=LinearRegression(); total time=   0.0s
[CV] END .......................regressor=LinearRegression(); total time=   0.0s
[CV] END .........................regressor=Ridge(alpha=0.5); total time=   0.0s
[CV] END .........................regressor=Ridge(alpha=0.5); total time=   0.0s
[CV] END .........................regressor=Ridge(alpha=0.5); total time=   0.0s
[CV] END .........................regressor=Ridge(alpha=0.5); total time=   0.0s
[CV] END .........................regressor=Ridge(alpha=0.5); total time=   0.0s
[CV] END ..................regressor=DecisionTreeRegressor(); total time=   0.0s
[CV] END ..................regressor=DecisionTree

# XGBoost

In [119]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load your data (replace 'your_data.csv' with the actual file name)
df = pd.read_csv('savegame.csv')

# Split the data
features = df[['tour_arrangement', 'main_activity', 'purpose', 'night_total', 'age_group', 'travel_with']]
target = df['total_cost']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Define the numerical and categorical features
numeric_features     = ['night_total']
categorical_features = ['tour_arrangement', 'main_activity', 'purpose', 'night_total', 'age_group', 'travel_with']

# Create transformers for numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with the preprocessor and XGBoost regressor
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                            ('regressor', XGBRegressor())])

# Fit the model
pipeline.fit(X_train, y_train)

# Make predictions
predictions = pipeline.predict(X_test)

# Evaluate the model
print('Mean Absolute Error:', mean_absolute_error(y_test, predictions))
print('Mean Squared Error:', mean_squared_error(y_test, predictions))
print('R-squared:', r2_score(y_test, predictions))


Mean Absolute Error: 5507358.952724207
Mean Squared Error: 100199676884897.12
R-squared: 0.2903694583596117


# Saving the model

In [37]:
from scripts.model_serializer import ModelSerializer

serializer = ModelSerializer('models/blabla.bla')
serializer.dump(pipeline)

# Loading the Model
# serializer = ModelSerializer('models/blabla.bla')
#pipeline = serializer.load()
#pipeline

In this example:

The ColumnTransformer is used to apply different transformations to numerical and categorical features separately.
The Pipeline includes both preprocessing steps and the regression model.
Numerical features are imputed with the mean and scaled using StandardScaler.
Categorical features are imputed with the most frequent value and one-hot encoded using OneHotEncoder.
This pipeline allows for a cleaner and more organized approach to data preprocessing and model building. Adjust the column names and other parameters based on your specific dataset and requirements.


In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load your data (replace 'your_data.csv' with the actual file name)
df = pd.read_csv('savegame.csv')

# Define features (add more columns as needed)
features = df[['tour_arrangement', 'main_activity', 'night_total', 'age_group', 'travel_with', 'total_female', 'total_male']]

# Add more numerical and categorical features based on your data
numeric_features = ['night_total', 'total_female', 'total_male']
categorical_features = ['tour_arrangement', 'main_activity', 'age_group', 'travel_with']

# Split the data
target = df['total_cost']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Create transformers for numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with the preprocessor and Random Forest Regressor
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', RandomForestRegressor())])

# Create a pipeline with the preprocessor and XGBoost Regressor
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('regressor', XGBRegressor())])

# Fit and evaluate Random Forest
rf_pipeline.fit(X_train, y_train)
rf_predictions = rf_pipeline.predict(X_test)

# Fit and evaluate XGBoost
xgb_pipeline.fit(X_train, y_train)
xgb_predictions = xgb_pipeline.predict(X_test)

# Evaluate the models
print("Random Forest Metrics:")
print('Mean Absolute Error:', mean_absolute_error(y_test, rf_predictions))
print('Mean Squared Error:', mean_squared_error(y_test, rf_predictions))
print('R-squared:', r2_score(y_test, rf_predictions))

print("\nXGBoost Metrics:")
print('Mean Absolute Error:', mean_absolute_error(y_test, xgb_predictions))
print('Mean Squared Error:', mean_squared_error(y_test, xgb_predictions))
print('R-squared:', r2_score(y_test, xgb_predictions))

Random Forest Metrics:
Mean Absolute Error: 5520792.649007987
Mean Squared Error: 109639976056711.22
R-squared: 0.22351171168007633

XGBoost Metrics:
Mean Absolute Error: 5681060.104981062
Mean Squared Error: 115560834311945.83
R-squared: 0.18157922266152737


In [72]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Load your data (replace 'your_data.csv' with the actual file name)
df = pd.read_csv('savegame.csv')

# Split the data
features = df[['tour_arrangement', 'main_activity', 'night_total', 'age_group', 'purpose']]
target = df['total_cost']

# Log-transform the target variable
target_log = np.log1p(target)

# Define the numerical and categorical features
numeric_features = ['night_total']
categorical_features = ['tour_arrangement', 'main_activity']

# Create transformers for numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with the preprocessor and XGBoost regressor
xgboost_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('regressor', XGBRegressor())])

# Create a pipeline with the preprocessor and Linear Regression
linear_regression_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                              ('regressor', LinearRegression())])

# Create a pipeline with the preprocessor and RandomForestRegressor
random_forest_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                          ('regressor', RandomForestRegressor())])

# Define the models
models = {
    'XGBoost': xgboost_pipeline,
    'Linear Regression': linear_regression_pipeline,
    'Random Forest': random_forest_pipeline
}

# Evaluate each model using k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for model_name, model in models.items():
    print(f'\n{model_name} Results:')

    # Use cross_val_predict for k-fold cross-validation
    predictions_log = cross_val_predict(model, features, target_log, cv=kf)
    
    # Inverse transform log predictions
    predictions_inverse = np.expm1(predictions_log)

    # Evaluate the model
    print('Mean Absolute Error:', mean_absolute_error(target, predictions_inverse))
    print('Mean Squared Error:', mean_squared_error(target, predictions_inverse))
    print('R-squared:', r2_score(target, predictions_inverse))


XGBoost Results:
Mean Absolute Error: 5677866.177860651
Mean Squared Error: 133249143149661.16
R-squared: 0.10820736470381032


NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.