# Importing Libraries

In [46]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

# Importing the cleaned dataset

In [16]:
# Read the cleaned training data and the test data from the local data folder
df_train = pd.read_csv(filepath_or_buffer='data/Train_cleaned.csv')
df_test = pd.read_csv(filepath_or_buffer='data/Test.csv')

### Separate numerical and categorical features

In [35]:
# Numerical predictors used by the model.
num_features = ['night_total', 'number_travelers']
# Disabled alternatives, kept for reference:
#num_features = ['total_female', 'total_male', 'night_mainland', 'night_zanzibar', 'night_total', 'number_travelers']
#num_features = df_train.select_dtypes('number')

# Categorical predictors, listed explicitly. The previous set-difference over
# df_train.columns was overwritten immediately (dead code) and, being built
# from a set, had no stable column order.
cat_features = ['country', 'age_group', 'travel_with', 'purpose', 'main_activity', 'tour_arrangement', 'first_trip_tz']

### Setup Target and Features

In [36]:
# Target: total trip cost in euros
y_train = df_train.total_cost_euro
# Predictors: the numerical columns followed by the categorical ones
X_train = df_train[[*num_features, *cat_features]]

In [57]:
# Plain assignment: X_test and df_test refer to the same DataFrame (no copy is made).
# NOTE(review): unlike X_train, no column subset is applied here — confirm the
# downstream pipeline selects the columns it needs.
X_test  = df_test

# Baseline Model

Given that our target variable (`total_cost_euro`) is continuous, this is a regression problem.

In [55]:
# Naive baselines for the target: most frequent value and arithmetic mean
print(df_train['total_cost_euro'].mode())
print(df_train['total_cost_euro'].mean())

0    604.99
Name: total_cost_euro, dtype: float64
3070.1119641657333


In [58]:
# Summary statistics (count, mean, std, quartiles, extremes) of the euro cost
df_train['total_cost_euro'].describe()

count     4465.000000
mean      3070.111964
std       4522.171829
min         17.880000
25%        302.490000
50%       1337.020000
75%       3690.420000
max      36329.500000
Name: total_cost_euro, dtype: float64

## Categorization of total_cost

In [78]:

# Left-closed euro intervals: [0, 500), [500, 1000), [1000, 2000), [2000, inf)
bin_edges = [0, 500, 1000, 2000, float('inf')]
bin_labels = ['Low', 'Medium', 'High', 'Very High']

# Tag every row with the label of the interval its euro cost falls into
df_train['cost_category'] = pd.cut(
    x=df_train['total_cost_euro'],
    bins=bin_edges,
    labels=bin_labels,
    right=False,
)

# Show the cost side by side with the category it was assigned
print(df_train[['total_cost_euro', 'cost_category']])

      total_cost_euro cost_category
0              246.23           Low
1             1173.44          High
2             1209.97          High
3             2843.44     Very High
4              604.99        Medium
...               ...           ...
4460          1209.97          High
4461          3902.17     Very High
4462           820.02        Medium
4463           423.49           Low
4464          4839.90     Very High

[4465 rows x 2 columns]


In [83]:
# Rebind the training inputs for the classification setup: every column except
# the two cost columns and the derived label becomes a predictor.
X_train = df_train.drop(columns=['total_cost', 'total_cost_euro', 'cost_category'])
# The binned cost category is now the target
y_train = df_train.cost_category



# Create a pipeline

### Building the pipeline

In [89]:
# Evaluate the best model
# NOTE(review): the recorded output shows the selected estimator is a
# LogisticRegression classifier and y_train is the categorical 'cost_category';
# the regression metrics below only make sense for numeric targets — confirm
# which task this cell is meant to score. y_test / y_pred_test are not defined
# in any visible cell (see the NameError in the saved output).
print('Best Model:', best_model.named_steps['regressor'])
print('Mean Absolute Percentage Error:', round(mean_absolute_percentage_error(y_test, y_pred_test), 2))
# MAE is already in the target's units; it must not be square-rooted.
print('Mean Absolute Error:', round(mean_absolute_error(y_test, y_pred_test), 2))
# The square root of the MSE is the RMSE, so label it as such.
print('Root Mean Squared Error:', round(np.sqrt(mean_squared_error(y_test, y_pred_test)), 2))
print('R-squared:', round(r2_score(y_test, y_pred_test), 2))








def coeff_info(model):
    """Print how many features have a non-zero coefficient and the largest coefficient.

    Works for both 1-D ``coef_`` arrays (e.g. linear regressors) and 2-D
    ``coef_`` arrays of shape (n_outputs_or_classes, n_features); in the 2-D
    case a feature counts as used when any of its coefficients is non-zero.

    Args:
        model: fitted estimator exposing a ``coef_`` attribute.
    """
    # Normalise to 2-D so the feature axis is always the last one.
    coef = np.atleast_2d(np.asarray(model.coef_))
    n_features = coef.shape[1]
    coeff_used = int(np.sum(np.any(coef != 0, axis=0)))
    # The total is derived from the model instead of the former hard-coded 66.
    print('The model is using', coeff_used, 'out of', n_features, 'features.')
    # ndarray.max() handles 2-D input; builtin max() raised on multi-row arrays.
    print( "The highest coefficient has a value of:", np.round(coef.max(), 3))
    
#coeff_info(best_model)

Best Model: LogisticRegression(class_weight='balanced', max_iter=10000)


NameError: name 'y_test' is not defined

In [None]:
def error_analysis(y_test, y_pred_test):
    """Generate a true vs. predicted scatter plot and a residual scatter plot.

    Args:
        y_test (array): true values for y_test
        y_pred_test (array): predicted values of model for y_test
    """
    # Calculate residuals
    residuals = y_test - y_pred_test

    # Axis ranges, padded by the same margins as before (10 / 40 / 10)
    pred_lim = (y_pred_test.min() - 10, y_pred_test.max() + 10)
    true_lim = (y_test.min() - 40, y_test.max() + 40)
    resid_lim = (residuals.min() - 10, residuals.max() + 10)

    # Reference lines follow the data range; the former fixed [-400, 350]
    # span did not cover the plotted area for targets outside that window.
    diag = [min(pred_lim[0], true_lim[0]), max(pred_lim[1], true_lim[1])]

    # Plot real vs. predicted values
    fig, ax = plt.subplots(1,2, figsize=(15, 5))
    plt.subplots_adjust(right=1)
    plt.suptitle('Error Analysis')

    ax[0].scatter(y_pred_test, y_test, color="#FF5A36", alpha=0.7)
    ax[0].plot(diag, diag, color="#193251")  # perfect-prediction line y = x
    ax[0].set_title("True vs. predicted values", fontsize=16)
    ax[0].set_xlabel("predicted values")
    ax[0].set_ylabel("true values")
    ax[0].set_xlim(*pred_lim)
    ax[0].set_ylim(*true_lim)

    ax[1].scatter(y_pred_test, residuals, color="#FF5A36", alpha=0.7)
    ax[1].plot(list(pred_lim), [0,0], color="#193251")  # zero-residual line
    ax[1].set_title("Residual Scatter Plot", fontsize=16)
    ax[1].set_xlabel("predicted values")
    ax[1].set_ylabel("residuals")
    ax[1].set_xlim(*pred_lim)
    ax[1].set_ylim(*resid_lim)

error_analysis(y_test, y_pred_test)

In [32]:

# Actual-vs-predicted diagnostic on a single 8x6 inch figure
plt.figure(figsize=(8, 6))

# One point per sample: x is the true value, y is the model's prediction
# NOTE(review): other cells call the predictions `y_pred_test`; confirm that
# `y_pred` is defined somewhere and is the intended array.
plt.scatter(y_test, y_pred, color='blue', label='Actual vs. Predicted')

# Dashed y = x reference drawn across the observed range of true values
true_span = [min(y_test), max(y_test)]
plt.plot(true_span, true_span, color='red', linestyle='--', label='Identity Line')

# Titles and axis labels
plt.gca().set(
    title='Regression Model: Actual vs. Predicted Values',
    xlabel='True Values',
    ylabel='Predicted Values',
)
plt.legend()
plt.show()

NameError: name 'y_test' is not defined

<Figure size 800x600 with 0 Axes>

# Saving the model

In [None]:
# Project-local helper; its implementation is not visible in this notebook.
from scripts.model_serializer import ModelSerializer

# Hand the selected model to the serializer bound to 'models/best_model.sav'.
# NOTE(review): presumably dump() writes the model to that path — confirm the
# on-disk format in scripts/model_serializer.py.
best_model_serializer = ModelSerializer('models/best_model.sav')
best_model_serializer.dump(best_model)

# Loading the model

In [None]:
# Relies on ModelSerializer having been imported in the "Saving the model" cell.
# NOTE(review): if load() unpickles the file, only load artifacts from a trusted
# source — confirm the format in scripts/model_serializer.py.
serializer = ModelSerializer('models/best_model.sav')
# Rebinds best_model to whatever the serializer returns for that path
best_model = serializer.load()
# Bare last expression so the notebook displays the loaded object
best_model