In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read in the data

In [2]:
## Importing the dataset
data = pd.read_csv('data/divorce_data.csv', sep=';')

# Data Exploration

In [4]:
# Get the shape of the dataset
num_rows, num_cols = data.shape

# Check for missing values
missing_values = data.isnull().sum().sum()

# Check the balance of the target variable
divorce_counts = data['Divorce'].value_counts()

num_rows, num_cols, missing_values, divorce_counts


(170,
 55,
 0,
 0    86
 1    84
 Name: Divorce, dtype: int64)

The dataset contains 170 rows (i.e., couples) and 55 columns (54 predictors and 1 target). There are no missing values in the dataset, which is good as it simplifies the preprocessing steps.

The target variable "Divorce" is fairly balanced with 86 instances of non-divorced couples (value 0) and 84 instances of divorced couples (value 1). This is beneficial because imbalanced datasets can often lead to biased models.

# Data Preprocessing

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate features and target
features = data.drop('Divorce', axis=1)
target = data['Divorce']

# Split the data into training and test sets
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
features_train = scaler.fit_transform(features_train)
features_test = scaler.transform(features_test)

features_train.shape, features_test.shape, target_train.shape, target_test.shape


((136, 54), (34, 54), (136,), (34,))

The data has been successfully split into training and test sets. We have 136 instances in the training set and 34 instances in the test set. Each instance has 54 features.

In [36]:
## KNN Classifier using all 54 features in the dataset

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

knn = KNeighborsClassifier(n_neighbors=3)

# Fit the model to the training data
knn.fit(features_train, target_train)

# Predict the target for the test data
target_pred_knn = knn.predict(features_test)

# Compute accuracy
accuracy_knn = accuracy_score(target_test, target_pred_knn)

# Compute F1 score
f1_knn = f1_score(target_test, target_pred_knn)

accuracy_knn, f1_knn

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


(0.9705882352941176, 0.9743589743589743)

In [37]:
## Permutation Feature Importance on the KNN Classifier with all 54 features
## (calculated manually - i.e. without sci-kit learn)

# Define feature names
feature_names = features.columns.tolist()

# Convert pandas DataFrames to numpy arrays for faster computation
X_test = features_test.copy()
y_test = target_test.to_numpy()

importances = []

# Compute the original accuracy
original_accuracy = accuracy_score(y_test, target_pred_knn)

for i in range(X_test.shape[1]):
    # Permute the ith feature in a copy of the test set
    X_test_permuted = X_test.copy()
    np.random.shuffle(X_test_permuted[:, i])
    
    # Make predictions using the permuted data
    predictions_permuted = knn.predict(X_test_permuted)
    
    # Compute the accuracy
    accuracy_permuted = accuracy_score(y_test, predictions_permuted)
    
    # Compute the feature importance
    importance = original_accuracy - accuracy_permuted
    
    # Add the feature importance to the list
    importances.append(importance)

# Create a DataFrame to store the results
perm_importance_custom_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

# Sort the DataFrame by importance
perm_importance_custom_df = perm_importance_custom_df.sort_values(by='Importance', ascending=False)

perm_importance_custom_df


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mo

Unnamed: 0,Feature,Importance
0,Q1,0.0
40,Q41,0.0
29,Q30,0.0
30,Q31,0.0
31,Q32,0.0
32,Q33,0.0
33,Q34,0.0
34,Q35,0.0
35,Q36,0.0
36,Q37,0.0


In [39]:
## Permutation Feature Importance on the RandomForest Classifier with all 54 features
## - note: this may be better than KNN initially with lots of features

from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

# Initialize the RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Fit the model to the training data
rf.fit(features_train, target_train)

# Compute permutation feature importance
# perm_importance = permutation_importance(rf, features_test, target_test, n_repeats=10, random_state=42)

importances = rf.feature_importances_

# Create a DataFrame to store the results
perm_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

# Sort the DataFrame by importance
perm_importance_df = perm_importance_df.sort_values(by='Importance', ascending=False)

perm_importance_df

Unnamed: 0,Feature,Importance
39,Q40,0.096593
16,Q17,0.095148
17,Q18,0.091988
18,Q19,0.089627
11,Q12,0.089565
19,Q20,0.063145
15,Q16,0.057146
10,Q11,0.055608
14,Q15,0.047422
25,Q26,0.041697


# Feature Importance

In this step, we'll use a decision tree-based method to rank the importance of the features in predicting divorce. This will help us identify the key predictors of divorce.

We'll use the Random Forest algorithm from scikit-learn for this. A Random Forest is an ensemble of Decision Trees that is often used for feature selection because it provides a measure of the importance of each feature.

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Fit the model to the training data
rf.fit(features_train, target_train)

# Get feature importances
importances = rf.feature_importances_

# Create a DataFrame of features and importances
feature_importances = pd.DataFrame({
    'Feature': features.columns,
    'Importance': importances
})

# Sort the DataFrame by importance
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

display(feature_importances)

# Select the top 10 features
top_features = feature_importances['Feature'][:10].tolist()

# Select these top features from the training and test data
features_train_selected = features_train[top_features]
features_test_selected = features_test[top_features]

features_train_selected.head()


Unnamed: 0,Feature,Importance
39,Q40,0.096593
16,Q17,0.095148
17,Q18,0.091988
18,Q19,0.089627
11,Q12,0.089565
19,Q20,0.063145
15,Q16,0.057146
10,Q11,0.055608
14,Q15,0.047422
25,Q26,0.041697


Unnamed: 0,Q40,Q17,Q18,Q19,Q12,Q20,Q16,Q11,Q15,Q26
69,0,4,4,4,4,4,4,4,4,4
138,0,0,0,0,0,0,0,0,0,0
2,3,3,3,3,4,2,3,3,3,2
93,0,0,0,0,0,0,0,0,0,0
136,0,1,0,0,1,0,0,0,0,0


The Random Forest has ranked the features by their importance in predicting the target variable "Divorce".

The five most important features, according to this model, are:

Q40 with an importance of approximately 0.0966

Q17 with an importance of approximately 0.0951

Q18 with an importance of approximately 0.0920

Q19 with an importance of approximately 0.0896

Q12 with an importance of approximately 0.0896

These results suggest that these questions may be particularly important in predicting divorce.

# Feature Selection

As a starting point, let's choose the top 10 features. However, we can adjust this number later if necessary. Now, let's select these top features from our training and test datasets.

In [8]:
# Select the top 10 features
top_features = feature_importances['Feature'][:10].tolist()

# Select these top features from the training and test data
features_train_selected = features_train[top_features]
features_test_selected = features_test[top_features]

features_train_selected.head()


Unnamed: 0,Q40,Q17,Q18,Q19,Q12,Q20,Q16,Q11,Q15,Q26
69,0,4,4,4,4,4,4,4,4,4
138,0,0,0,0,0,0,0,0,0,0
2,3,3,3,3,4,2,3,3,3,2
93,0,0,0,0,0,0,0,0,0,0
136,0,1,0,0,1,0,0,0,0,0


# Implementing k-Nearest Neighbors with Scikt Learn

To train and evaluate a k-NN model using scikit-learn, we can use the KNeighborsClassifier class. After training the model, we can use it to make predictions on the test set, and then compute accuracy and F1 score. We are only going to use the top 10 features.

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

# Initialize the KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)

# Fit the model to the training data
knn.fit(features_train_selected, target_train)

print(features_test_selected)

# Predict the target for the test data
target_pred_knn = knn.predict(features_test_selected)

# Compute accuracy
accuracy_knn = accuracy_score(target_test, target_pred_knn)

# Compute F1 score
f1_knn = f1_score(target_test, target_pred_knn)

accuracy_knn, f1_knn


     Q40  Q17  Q18  Q19  Q12  Q20  Q16  Q11  Q15  Q26
139    0    0    0    0    0    0    1    1    1    0
30     4    3    4    3    4    4    2    3    3    4
119    0    0    0    0    2    0    0    0    0    0
29     4    4    3    4    3    3    2    4    3    3
144    0    0    0    2    0    0    0    0    0    0
163    0    1    0    0    0    0    2    1    2    1
166    1    0    0    0    0    0    0    0    0    0
51     4    3    3    3    3    3    2    3    3    3
105    0    0    0    0    1    0    0    0    1    0
60     4    3    2    3    2    2    2    3    3    2
15     4    4    4    4    4    4    2    4    3    4
158    1    0    0    0    2    0    0    1    0    1
135    0    0    0    0    0    0    0    0    0    0
45     4    3    3    3    3    3    3    3    2    3
68     3    3    3    3    3    3    2    3    3    3
85     0    0    0    0    0    0    0    0    0    0
24     4    4    2    4    2    2    4    4    3    2
109    0    0    0    0    0

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


(0.9705882352941176, 0.9743589743589743)

The k-Nearest Neighbors (k-NN) model from scikit-learn achieved an accuracy of approximately 0.971 (or 97.1%) and an F1 score of approximately 0.974 on the test data.

# Implementing k-Nearest Neighbors from Scratch

Now let's move on to implement the k-NN algorithm from scratch. The steps are as follows:

1.) Calculate Euclidean distance between two instances.


2.) Get the k nearest neighbors of a given test instance.

3.) Predict the class of the test instance by taking the mode of the class labels of the k nearest neighbors.

Let's start by defining the function for calculating the Euclidean distance.

Converting our DataFrames to numpy arrays will make life easier!

In [28]:

X_train = features_train.to_numpy()
y_train = target_train.to_numpy()
X_test = features_test.to_numpy()
y_test = target_test.to_numpy()

print("Data before conversion to Numpy arrays:")
print("features_train:")
print(features_train.head())
print("target_train:")
print(target_train.head())
print("features_test:")
print(features_test.head())
print("target_test:")
print(target_test.head())

print("\nData after conversion to Numpy arrays:")
print("X_train:")
print(X_train)
print("y_train:")
print(y_train)
print("X_test:")
print(X_test)
print("y_test:")
print(y_test)

Data before conversion to Numpy arrays:
features_train:
     Q1  Q2  Q3  Q4  Q5  Q6  Q7  Q8  Q9  Q10  ...  Q45  Q46  Q47  Q48  Q49  \
69    4   4   4   3   4   2   4   4   4    3  ...    4    0    4    4    4   
138   0   0   1   0   0   0   0   1   1    0  ...    3    3    3    3    0   
2     2   2   2   2   1   3   2   1   1    2  ...    2    3    2    3    1   
93    0   1   0   1   0   0   0   0   0    1  ...    1    1    1    2    1   
136   0   0   2   0   0   0   0   0   0    0  ...    2    3    1    2    1   

     Q50  Q51  Q52  Q53  Q54  
69     3    4    4    4    4  
138    1    3    3    3    1  
2      1    1    2    2    2  
93     1    1    0    0    0  
136    2    1    2    2    0  

[5 rows x 54 columns]
target_train:
69     1
138    0
2      1
93     0
136    0
Name: Divorce, dtype: int64
features_test:
     Q1  Q2  Q3  Q4  Q5  Q6  Q7  Q8  Q9  Q10  ...  Q45  Q46  Q47  Q48  Q49  \
139   3   1   1   0   0   0   0   0   0    0  ...    3    3    2    2    0   
30    3 

In [29]:
def calculate_euclidean_distance(instance1, instance2):
    """
    Calculate the Euclidean distance between two instances.
    - instance1: first instance
    - instance2: second instance
    """
    return np.sqrt(np.sum((instance1 - instance2) ** 2)) # subtracts corresponding elements of the two arrays

print(f"Euclidean distance between the first and second training instance: {calculate_euclidean_distance(X_train[0], X_train[1])}")


Euclidean distance between the first and second training instance: 21.97726097583591


Next, we'll implement the function to get the k nearest neighbors of a given test instance. This function will compute the Euclidean distance from the test instance to each training instance, keep track of the k instances with the smallest distances, and return their indices.

In [30]:
def get_k_nearest_neighbors(X_train, x_test, k):
    """
    Get the k nearest neighbors of a test instance.
    - X_train: training features
    - x_test: test instance
    - k: number of neighbors to return
    """
    # Calculate the Euclidean distance from the test instance to each training instance
    distances = np.array([calculate_euclidean_distance(x_train, x_test) for x_train in X_train])
    
    # Get the indices of the k training instances with the smallest distances
    nearest_neighbors = distances.argsort()[:k]
    
    return nearest_neighbors

# Test the function
print(f"Indices of the 3 nearest neighbors of the first test instance: {get_k_nearest_neighbors(X_train, X_test[0], 3)}")


Indices of the 3 nearest neighbors of the first test instance: [122  89  33]


Now, we implement the function to predict the class of a test instance. This function will get the k nearest neighbors of the test instance, find the most common class label among these neighbors, and return this class label as the prediction.

In [31]:
from scipy import stats
def predict_with_k_nearest_neighbors(X_train, y_train, x_test, k):
    """
    Predict the class of a test instance using the k nearest neighbors.
    - X_train: training features
    - y_train: training target values
    - x_test: test instance
    - k: number of neighbors to consider
    """
    # Get the k nearest neighbors of the test instance
    nearest_neighbors = get_k_nearest_neighbors(X_train, x_test, k)
    
    # Get the class labels of the nearest neighbors
    class_labels = y_train[nearest_neighbors]
   
    # Predict the most common class label
    prediction = stats.mode(class_labels)[0]
    
    return prediction

# Test the function
prediction = predict_with_k_nearest_neighbors(X_train, y_train, X_test[0], 3)
print(f"Predicted class: {prediction}, Actual class: {target_test.values[0]}")


Predicted class: 0, Actual class: 0


The next step is to use this function to make predictions for multiple test instances. 

In [32]:
def predict_with_k_nearest_neighbors_multiple(X_train, y_train, X_test, k):
    """
    Predict the class of multiple test instances using the k nearest neighbors.
    - X_train: training features
    - y_train: training target values
    - X_test: test features
    - k: number of neighbors to consider
    """
    # Make predictions for each test instance
    predictions = [predict_with_k_nearest_neighbors(X_train, y_train, x_test, k) for x_test in X_test]
    
    return predictions

# Test the function
predictions = predict_with_k_nearest_neighbors_multiple(X_train, y_train, X_test, 3)
print(f"Predicted classes: {predictions[:10]}, Actual classes: {y_test[:10]}")


Predicted classes: [0, 1, 0, 1, 0, 0, 0, 1, 0, 1], Actual classes: [0 1 0 1 0 0 0 1 0 1]


The custom implementation of the k-Nearest Neighbors (k-NN) algorithm is working correctly. It made correct predictions for the first 10 instances in the test set!


Now that we have implemented and tested the k-NN algorithm from scratch, let's evaluate its performance on the entire test set. We'll compute the accuracy and F1 score as we did before.

In [33]:
# Make predictions for the entire test set
predictions = predict_with_k_nearest_neighbors_multiple(X_train, y_train, X_test, 3)

# Compute accuracy
accuracy_knn_custom = accuracy_score(y_test, predictions)

# Compute F1 score
f1_knn_custom = f1_score(y_test, predictions)

accuracy_knn_custom, f1_knn_custom


(0.9705882352941176, 0.9743589743589743)

The custom implementation of the k-Nearest Neighbors (k-NN) algorithm achieved an accuracy of approximately 0.971 (or 97.1%) and an F1 score of approximately 0.974 on the test data. These values are quite high and turned out to be the same as the scikit-learn k-NN. Now, let's compute the permutation feature importance for both the scikit-learn and custom k-NN models.

# Permutation Feature Importance

In [18]:
## Importing the dataset
data = pd.read_csv('data/divorce_data.csv', sep=';')
display(data)

from sklearn.model_selection import train_test_split

# Separate features and target
features = data.drop('Divorce', axis=1)
target = data['Divorce']

# Split the data into training and test sets
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.2, random_state=42)

from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest classifier
rf = RandomForestClassifier(random_state=42)
# Fit the model to the training data
rf.fit(features_train, target_train)
# Evaluate the model on the test data
accuracy_rf = rf.score(features_test, target_test)
print(f"Accuracy: {accuracy_rf}")


# Let's start with scikit learn's implementation
from sklearn.inspection import permutation_importance

display(features)
display(target)

# Compute permutation feature importance
result = permutation_importance(rf, features, target, n_repeats=30, random_state=42)

importances = result['importances_mean']

for feature_name, importance in zip(features_test.columns, importances):
    print(f"{feature_name}: {importance}")

from sklearn.model_selection import cross_val_score

# # Create a new RandomForestClassifier
# rf = RandomForestClassifier(random_state=42)
# # Fit the model to the training data
# rf.fit(features_train, target_train)
# # Evaluate the model on the test data
# accuracy_rf = rf.score(features_test, target_test)
# print(f"Accuracy: {accuracy_rf}")

# # Perform cross-validation and compute permutation feature importance
# cv_results = cross_val_score(rf, features, target, cv=5, scoring='accuracy')
# print("Cross-validation Accuracy:", cv_results.mean())

# # Compute permutation feature importance on the entire dataset
# result = permutation_importance(rf, features, target, n_repeats=30, random_state=42)
# importances = result['importances_mean']

# # Print feature importances
# for feature_name, importance in zip(features.columns, importances):
#     print(f"{feature_name}: {importance}")



Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q46,Q47,Q48,Q49,Q50,Q51,Q52,Q53,Q54,Divorce
0,2,2,4,1,0,0,0,0,0,0,...,2,1,3,3,3,2,3,2,1,1
1,4,4,4,4,4,0,0,4,4,4,...,2,2,3,4,4,4,4,2,2,1
2,2,2,2,2,1,3,2,1,1,2,...,3,2,3,1,1,1,2,2,2,1
3,3,2,3,2,3,3,3,3,3,3,...,2,2,3,3,3,3,2,2,2,1
4,2,2,1,1,1,1,0,0,0,0,...,2,1,2,3,2,2,2,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,0,0,0,0,0,0,0,0,0,0,...,1,0,4,1,1,4,2,2,2,0
166,0,0,0,0,0,0,0,0,0,0,...,4,1,2,2,2,2,3,2,2,0
167,1,1,0,0,0,0,0,0,0,1,...,3,0,2,0,1,1,3,0,0,0
168,0,0,0,0,0,0,0,0,0,0,...,3,3,2,2,3,2,4,3,1,0


Accuracy: 0.9705882352941176


Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q45,Q46,Q47,Q48,Q49,Q50,Q51,Q52,Q53,Q54
0,2,2,4,1,0,0,0,0,0,0,...,3,2,1,3,3,3,2,3,2,1
1,4,4,4,4,4,0,0,4,4,4,...,2,2,2,3,4,4,4,4,2,2
2,2,2,2,2,1,3,2,1,1,2,...,2,3,2,3,1,1,1,2,2,2
3,3,2,3,2,3,3,3,3,3,3,...,3,2,2,3,3,3,3,2,2,2
4,2,2,1,1,1,1,0,0,0,0,...,2,2,1,2,3,2,2,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,0,0,0,0,0,0,0,0,0,0,...,0,1,0,4,1,1,4,2,2,2
166,0,0,0,0,0,0,0,0,0,0,...,3,4,1,2,2,2,2,3,2,2
167,1,1,0,0,0,0,0,0,0,1,...,2,3,0,2,0,1,1,3,0,0
168,0,0,0,0,0,0,0,0,0,0,...,4,3,3,2,2,3,2,4,3,1


0      1
1      1
2      1
3      1
4      1
      ..
165    0
166    0
167    0
168    0
169    0
Name: Divorce, Length: 170, dtype: int64

Q1: 0.0
Q2: 0.0
Q3: 0.0
Q4: 0.0
Q5: 0.0
Q6: 0.0
Q7: 0.0
Q8: 0.0
Q9: 0.0
Q10: 0.0
Q11: 0.0
Q12: 0.0
Q13: 0.0
Q14: 0.0
Q15: 0.0
Q16: 0.0
Q17: 0.0
Q18: 0.0
Q19: 0.0
Q20: 0.0
Q21: 0.0
Q22: 0.0
Q23: 0.0
Q24: 0.0
Q25: 0.0
Q26: 0.0
Q27: 0.0
Q28: 0.0
Q29: 0.0
Q30: 0.0
Q31: 0.0
Q32: 0.0
Q33: 0.0
Q34: 0.0
Q35: 0.0
Q36: 0.0
Q37: 0.0
Q38: 0.0
Q39: 0.0
Q40: 0.0021568627450980317
Q41: 0.0
Q42: 0.0
Q43: 0.0
Q44: 0.0
Q45: 0.0
Q46: 0.0
Q47: 0.0
Q48: 0.0
Q49: 0.0
Q50: 0.0
Q51: 0.0
Q52: 0.0
Q53: 0.0
Q54: 0.0


# Divorce Predictor

In [50]:
print("Welcome to the Divorce Predictor. Please answer the following questions about your relationship.")
print("Your responses should be one of the following: 0 (Never), 1 (Seldom), 2 (Averagely), 3 (Frequently), 4 (Always)")
print("Please respond as honestly as possible.\n")

questions_df = pd.read_csv('data/reference.tsv', delimiter='|', header=None, names=['attribute_id', 'description'])

selected_feature_names = ['Q40', 'Q17', 'Q18', 'Q19', 'Q12', 'Q20', 'Q16', 'Q11', 'Q15', 'Q26']


# Correct the attribute IDs and re-extract the selected questions
corrected_ids = [str(int(x[1:])) for x in selected_feature_names]
selected_questions = questions_df[questions_df['attribute_id'].isin(corrected_ids)]

# Get the questions in a list
selected_questions_list = selected_questions['description'].tolist()

# Display the selected questions
selected_questions_list

# List of top 10 most important questions
questions = selected_questions_list  # replace with the list of the top 10 questions

# List to store the user's responses
responses = []

# Ask the user to input their responses
for i, question in enumerate(questions, start=1):
    print(f"Question {i}: {question}")
    response = input("Your response (0-4): ")
    while not response.isdigit() or int(response) not in range(5):
        print("Invalid response. Please enter a number from 0 to 4.")
        response = input("Your response (0-4): ")
    responses.append(int(response))

print("\nThank you for your responses. The model is now making a prediction...\n")

# Convert the responses into a format that can be used for prediction
responses_df = pd.DataFrame([responses], columns=selected_feature_names)

# Use the trained model to make a prediction
prediction = knn.predict(responses_df)

# Display the prediction
if prediction == 0:
    print("The model predicts that the couple will not get divorced.")
else:
    print("The model predicts that the couple will get divorced.")

Welcome to the Divorce Predictor. Please answer the following questions about your relationship.
Your responses should be one of the following: 0 (Never), 1 (Seldom), 2 (Averagely), 3 (Frequently), 4 (Always)
Please respond as honestly as possible.

Question 1: I think that one day in the future, when I look back, I see that my spouse and I have been in harmony with each other.
Question 2: My spouse and I have similar values in terms of personal freedom.
Question 3: Our dreams with my spouse are similar and harmonious.
Question 4: We're compatible with my spouse about what love should be.
Question 5: We share the same views about being happy in our life with my spouse
Question 6: My spouse and I have similar ideas about how marriage should be
Question 7: My spouse and I have similar ideas about how roles should be in marriage
Question 8: My spouse and I have similar values in trust.
Question 9: I know my spouse's basic anxieties.
Question 10: We're just starting a discussion before I k