In [4]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import pickle
from sklearn.preprocessing import StandardScaler
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv('risk_appetite_data.csv')

df = data[['Age','Income','Expenses','Investment','riskcategory']]

#df['q1_answer'] = df['q1_answer'].replace(6, 5)
df['Savings'] = df['Income']-df['Expenses']
df = df[df['Savings']>0]

y = df['riskcategory']
X = df.drop('riskcategory', axis=1)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the encoded data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Set the parameters for XGBoost
params = {
    'objective': 'multi:softmax',  # Multi-class classification objective
    'num_class': len(set(y)),  # Number of classes
    'max_depth': 3,  # Maximum tree depth
    'eta': 0.01,  # Learning rate
    'subsample': 0.8,  # Subsample ratio of the training instances
    'colsample_bytree': 0.8 # Subsample ratio of features
}

# Create XGBoost DMatrix for efficient processing
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
dtest = xgb.DMatrix(X_test_scaled, label=y_test)

# Train the XGBoost model
model = xgb.train(params, dtrain)

# Make predictions on the test set
y_pred = model.predict(dtest)

# Convert the predicted labels back to their original values
y_pred = label_encoder.inverse_transform(y_pred.astype(int))

# Calculate the accuracy of the model
accuracy = accuracy_score(label_encoder.inverse_transform(y_test), y_pred)
print("Accuracy:", accuracy)

# Take input data from the user
age = int(input("Enter age: "))
income = float(input("Enter income: "))
expenses = float(input("Enter expenses: "))
investment = float(input("Enter investment: "))
savings = income-expenses

# Create a dictionary with the input data
input_data = {
    'age': [age],
    'income': [income],
    'expenses': [expenses],
    'investment': [investment],
    'savings': [savings]
    # Add more features as required
}

Input = pd.DataFrame(input_data)

# Convert input_data to DMatrix format
dinput = xgb.DMatrix(Input)

# Make predictions on the input data
y_pred = model.predict(dinput)

# Map the predicted label to its respective category using a dictionary
label_to_category = {
    1: "Highly Conservative",
    2: "Low Conservative",
    3: "Neutral",
    4: "Low Risk Taking",
    5: "Highly Risk Taking"
}

predicted_category = label_to_category[int(y_pred[0])]

# Output the predicted category
print("Predicted Category:", predicted_category)

with open('xgboost.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the model using pickle
with open('xgboost.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

Accuracy: 0.5135135135135135
Enter age: 23
Enter income: 340000
Enter expenses: 3489
Enter investment: 1245
Predicted Category: Neutral
