In [16]:
import pandas as pd

In [17]:
import warnings

# Suppressing warnings for metrics due to nature of data set
warnings.filterwarnings("ignore", category = UserWarning, module = "sklearn.metrics._classification")

In [18]:
df = pd.read_csv('final_cleaned.csv')

Kmeans center accuracies
- 20: 0.33
- 15: 0.46
- 10: 0.64
- 9: 0.63
- 8: 0.64

In [19]:
df['length_of_stay_category'] = ''

In [20]:
## 10 centers
# Add new column, length of stay categories
bins = [19, 56, 102, 166, 277, 411, 567, 774, 1024, 1612, float('inf')]
labels = ['0-19', '20-56', '57-102', '103-166', '167-277', '278-411', '412-567', '568-774', '775-1024', '1025-1612']

df['length_of_stay_category'] = pd.cut(df.length_of_stay_days, bins = bins, labels = labels, right = False)

In [21]:
df = df.drop(['length_of_stay_days'], axis = 1)

In [22]:
columns_to_exclude = ['length_of_stay_category', 'age_total_months', 'sentiment_score']
df_dummies = pd.get_dummies(df, columns=[col for col in df.columns if col not in columns_to_exclude])
df_dummies = df_dummies.dropna()

In [23]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

from sklearn.decomposition import PCA


In [24]:
# Separate features (X) and target variable (y)
X = df_dummies.drop('length_of_stay_category', axis=1)
y = df_dummies['length_of_stay_category']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, train_size = 0.8, random_state = 42)

# Standardize features (optional but can be beneficial for logistic regression)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Apply PCA
# Choose the number of components you want to keep (here, 2 components)
n_components = 10
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [25]:
# Create a Random Forest Classifier
rf_classifier_original = RandomForestClassifier(n_estimators = 1000, random_state=42)

# Train the classifier on the training data
rf_classifier_original.fit(X_train, y_train)

# Make predictions on the test data
y_pred_original = rf_classifier_original.predict(X_test)

# Evaluate the performance of the classifier
accuracy_original = accuracy_score(y_test, y_pred_original)
print(f"Accuracy: {accuracy_original:.2f}")

# Train a classifier on the PCA-transformed data
rf_pca = RandomForestClassifier(n_estimators = 1000, random_state=42)
rf_pca.fit(X_train_pca, y_train)
y_pred_pca = rf_pca.predict(X_test_pca)
accuracy_pca = accuracy_score(y_test, y_pred_pca)
print(f'Accuracy after PCA: {accuracy_pca:.2f}')

Accuracy: 0.64
Accuracy after PCA: 0.61


In [26]:
# Display classification report
print("Classification Report:\n", classification_report(y_test, y_pred_original))

Classification Report:
               precision    recall  f1-score   support

        0-19       0.63      0.94      0.76       143
   1025-1612       0.00      0.00      0.00         1
     103-166       0.00      0.00      0.00        12
     167-277       0.00      0.00      0.00         3
       20-56       0.69      0.48      0.57        79
     278-411       0.00      0.00      0.00         3
     412-567       0.00      0.00      0.00         5
     568-774       1.00      1.00      1.00         1
      57-102       0.50      0.08      0.13        26

    accuracy                           0.64       273
   macro avg       0.31      0.28      0.27       273
weighted avg       0.58      0.64      0.58       273



In [27]:
feature_importances = rf_classifier_original.feature_importances_

In [28]:
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance':feature_importances})

In [29]:
feature_importance_df = feature_importance_df.sort_values(by = 'Importance', ascending = False)

In [30]:
feature_importance_df.head(20)

Unnamed: 0,Feature,Importance
0,age_total_months,0.081462
127,gender_male,0.01887
126,gender_female,0.018104
158,litter_no,0.016156
159,litter_yes,0.015563
8,size_medium,0.015507
143,special_need_not sure,0.015176
139,housebroken_no,0.01506
7,size_large,0.014891
146,gets_along_with_cats_not sure,0.014847
