In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Loading data
file_path = Path("result.csv")
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,animal_id,datetime_intake,found_location,intake_type,intake_condition,animal_type_intake,name_intake,sex_intake,color_intake,breed_type,datetime_outcome,outcome_type,sex_upon_outcome,fixed_changed,age_bucket,datetime_length
0,A730601,2016-07-07 12:11:00,1109 Shady Ln in Austin (TX),Stray,Normal,Cat,No,Intact Male,Tabby,Mix,2016-07-08 09:00:00,Transfer,Neutered Male,1,7-12 months,74940000000000
1,A676515,2014-04-11 08:45:00,615 E. Wonsley in Austin (TX),Stray,Normal,Dog,Yes,Intact Male,Bicolor,Mix,2014-04-14 18:38:00,Return to Owner,Neutered Male,1,1-6 months,294780000000000
2,A679549,2014-05-22 15:43:00,124 W Anderson in Austin (TX),Stray,Normal,Cat,Yes,Intact Male,Bicolor,Mix,2014-06-16 13:54:00,Transfer,Neutered Male,1,1-6 months,2153460000000000
3,A683798,2016-07-21 12:16:00,3118 Windsor Rd in Austin (TX),Stray,Normal,Cat,Yes,Spayed Female,Bicolor,Mix,2016-10-18 10:55:00,Adoption,Spayed Female,0,1-3 years,7684740000000000
4,A683656,2014-07-13 13:20:00,8238 Research Blvd in Austin (TX),Stray,Normal,Cat,No,Intact Male,Point,Mix,2014-07-17 16:57:00,Adoption,Neutered Male,1,1-6 months,358620000000000


In [3]:
new_df=df.drop(columns=['animal_id','datetime_intake','found_location','datetime_outcome',
                    'sex_intake','fixed_changed'])
new_df.head()

Unnamed: 0,intake_type,intake_condition,animal_type_intake,name_intake,color_intake,breed_type,outcome_type,sex_upon_outcome,age_bucket,datetime_length
0,Stray,Normal,Cat,No,Tabby,Mix,Transfer,Neutered Male,7-12 months,74940000000000
1,Stray,Normal,Dog,Yes,Bicolor,Mix,Return to Owner,Neutered Male,1-6 months,294780000000000
2,Stray,Normal,Cat,Yes,Bicolor,Mix,Transfer,Neutered Male,1-6 months,2153460000000000
3,Stray,Normal,Cat,Yes,Bicolor,Mix,Adoption,Spayed Female,1-3 years,7684740000000000
4,Stray,Normal,Cat,No,Point,Mix,Adoption,Neutered Male,1-6 months,358620000000000


In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
# encode text columns with dummy values
df_encoded = pd.get_dummies(new_df, columns=['intake_type', 'intake_condition', 'animal_type_intake', 'name_intake', 
                    'color_intake', 'sex_upon_outcome', 'age_bucket', 'breed_type'])

In [6]:
le = LabelEncoder()
df_encoded['outcome_type'] = le.fit_transform(df_encoded['outcome_type'])
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)

{'Adoption': 0, 'Died': 1, 'Euthanasia': 2, 'Missing': 3, 'Return to Owner': 4, 'Transfer': 5}


In [7]:
df_encoded.head()

Unnamed: 0,outcome_type,datetime_length,intake_type_Euthanasia Request,intake_type_Owner Surrender,intake_type_Public Assist,intake_type_Stray,intake_type_Wildlife,intake_condition_Maternity,intake_condition_Medical,intake_condition_Normal,...,age_bucket_4-6 years,age_bucket_7+ years,age_bucket_7-12 months,age_bucket_Less than 1 week,breed_type_Bully Breeds,breed_type_Domestic Shorthair,breed_type_Mix,breed_type_Other,breed_type_Toy Breeds,breed_type_Wildlife
0,5,74940000000000,0,0,0,1,0,0,0,1,...,0,0,1,0,0,0,1,0,0,0
1,4,294780000000000,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,5,2153460000000000,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,0,7684740000000000,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,0,358620000000000,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


In [8]:
# Define the features set.
X = df_encoded.copy()
X=X.drop('outcome_type',axis=1)
X.head()

Unnamed: 0,datetime_length,intake_type_Euthanasia Request,intake_type_Owner Surrender,intake_type_Public Assist,intake_type_Stray,intake_type_Wildlife,intake_condition_Maternity,intake_condition_Medical,intake_condition_Normal,intake_condition_Other,...,age_bucket_4-6 years,age_bucket_7+ years,age_bucket_7-12 months,age_bucket_Less than 1 week,breed_type_Bully Breeds,breed_type_Domestic Shorthair,breed_type_Mix,breed_type_Other,breed_type_Toy Breeds,breed_type_Wildlife
0,74940000000000,0,0,0,1,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
1,294780000000000,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,2153460000000000,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,7684740000000000,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,358620000000000,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [9]:
# Define the target set.
y = df_encoded["outcome_type"].ravel()
y[:5]

array([5, 4, 5, 0, 0])

In [10]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [11]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Combination resampling

In [12]:
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled, y_train)

# Random Forest

In [13]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [14]:
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

In [15]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [16]:
results = pd.DataFrame({
   "Prediction": predictions,
   "Actual": y_test
}).reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,0,0
1,5,5
2,0,0
3,5,5
4,4,4


In [17]:
# print confusion matrix
cm = confusion_matrix(predictions, y_test)
cm_df = pd.DataFrame(cm,
                     index = ['Adoption', 'Died', 'Euthanasia', 'Missing', 'Return to Owner', 'Transfer'], 
                     columns = ['Adoption', 'Died', 'Euthanasia', 'Missing', 'Return to Owner', 'Transfer'])
cm_df

Unnamed: 0,Adoption,Died,Euthanasia,Missing,Return to Owner,Transfer
Adoption,2602,6,35,3,140,553
Died,38,60,108,1,2,91
Euthanasia,28,50,678,0,23,146
Missing,2,1,1,1,1,3
Return to Owner,198,5,36,0,1014,171
Transfer,157,29,95,0,54,2070


In [18]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [19]:
print(f"Accuracy Score : {acc_score}")
# print classification report
print(classification_report(predictions, y_test, target_names=['Adoption', 'Died', 'Euthanasia', 'Missing', 'Return to Owner', 'Transfer']))

Accuracy Score : 0.7646988812187574
                 precision    recall  f1-score   support

       Adoption       0.86      0.78      0.82      3339
           Died       0.40      0.20      0.27       300
     Euthanasia       0.71      0.73      0.72       925
        Missing       0.20      0.11      0.14         9
Return to Owner       0.82      0.71      0.76      1424
       Transfer       0.68      0.86      0.76      2405

       accuracy                           0.76      8402
      macro avg       0.61      0.57      0.58      8402
   weighted avg       0.77      0.76      0.76      8402



# Boosting Gradient

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

In [21]:
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20,learning_rate=learning_rate,max_features=5,
   max_depth=3,
   random_state=0)
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           X_train_scaled,
           y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           X_test_scaled,
           y_test)))

Learning rate:  0.05
Accuracy score (training): 0.747
Accuracy score (validation): 0.749
Learning rate:  0.1
Accuracy score (training): 0.769
Accuracy score (validation): 0.771
Learning rate:  0.25
Accuracy score (training): 0.796
Accuracy score (validation): 0.796
Learning rate:  0.5
Accuracy score (training): 0.806
Accuracy score (validation): 0.800
Learning rate:  0.75
Accuracy score (training): 0.744
Accuracy score (validation): 0.741
Learning rate:  1
Accuracy score (training): 0.283
Accuracy score (validation): 0.274


In [22]:
classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=0.5, max_features=5, max_depth=3, random_state=0)

classifier.fit(X_resampled, y_resampled)
predictions = classifier.predict(X_test_scaled)

In [23]:
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")
print(classification_report(predictions, y_test, target_names=['Adoption', 'Died', 'Euthanasia', 'Missing', 'Return to Owner', 'Transfer']))

Accuracy Score : 0.732325636753154
                 precision    recall  f1-score   support

       Adoption       0.87      0.77      0.82      3415
           Died       0.48      0.16      0.24       452
     Euthanasia       0.63      0.71      0.67       847
        Missing       0.20      0.02      0.04        47
Return to Owner       0.82      0.65      0.73      1556
       Transfer       0.60      0.88      0.72      2085

       accuracy                           0.73      8402
      macro avg       0.60      0.53      0.53      8402
   weighted avg       0.75      0.73      0.72      8402

