# Prediction of number of days an animal (dog-cat) will spend in the shelter before being addopted.

# Table of Contents
1. [Data cleaning and preparation](#1)
2. [EDA](#2)
3. [Model evaluation](#3)   
    3.1 [Features selection](#31)  
    3.2 [Training models](#32)  
        - 3.2.1 [Linear Regression](#321)  
        - 3.2.2 [Random Forest](#322)


In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from collections import Counter
from itertools import combinations
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score



In [2]:
# read prepared dataset
df = pd.DataFrame(pd.read_csv('data.csv'))

#### 1.1 Data types & formats inside df

parameters

In [None]:
percentage_limit_breed_words = 80
percentage_limit_breed_words_combination = 0.8  
percentage_limit_breed_words_unique = 2 
percentage_limit_color = 0.05  

creating function

In [None]:

datetime_columns = ['datetime_intake', 'datetime_outcome']
# convert the datetime columns to datetime type
for column in datetime_columns:
    df[column] = pd.to_datetime(df[column])
datetime_columns

# rest of the columns from dataframe that are not datetime categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
categorical_columns

for column in categorical_columns:
    df[column] = df[column].str.lower()

##########################
# days in shelter
df['days_in_shelter'] = (df['datetime_outcome'] - df['datetime_intake']).dt.days

##########################
# castrated
#########################
def castrated_status(row):
    if 'neutered' in row or 'spayed' in row:
        return 'yes'
    else:
        return 'no'

df['castrated'] = df['sex_upon_intake'].apply(castrated_status)
df['is_castrated'] = df['castrated'].apply(lambda x: 1 if x =='yes' else 0)

##########################
# sex upon intake
##########################
def sex_upon_intake(row):
    if 'female' in row:
        return 'female'
    elif 'male' in row:
        return 'male'
    else:
        return 'unknown'

df['sex_upon_intake'] = df['sex_upon_intake'].apply(sex_upon_intake) 

#############################################
# Age in months when the animal was taken in
#############################################
def convert_to_months(age):
    # Split the age into value and unit
    parts = age.split()
    if len(parts) != 2:  # Handle unexpected formats
        return None
    
    value, unit = int(parts[0]), parts[1].lower()
    
    # Convert the age to months
    if 'year' in unit:
        return value * 12
    elif 'month' in unit:
        return value 
    elif 'week' in unit:
        return value / 4
    elif 'day' in unit:
        return value / 30
    else:
        return None  # Handle unknown units

df['age_in_months'] = df['age_upon_intake'].apply(convert_to_months) 
df['age_in_months'] = df['age_in_months'].abs()

##########################
# Breed group creation 
##########################
# Hair type
df['hair_type'] = df['breed'].apply(
    lambda x: 'long' if 'longhair' in x else 'short' if 'shorthair' in x
    else 'medium' if 'medium hair' in x else 'unknown'
)

# Mix breed
df['mix_breed'] = df['breed'].apply(lambda x: 'mix' if 'mix' in x else 'not mix')
df['is_mix_breed'] = df['mix_breed'].apply(lambda x: 1 if x == 'mix' else 0)

# Miniature breed
df['miniature'] = df['breed'].apply(lambda x: 'miniature' if 'miniature' in x else 'non-miniature')
df['is_miniature'] = df['miniature'].apply(lambda x: 1 if x == 'miniature' else 0)

# Domestic breed
df['domestic'] = df['breed'].apply(lambda x: 'domestic' if 'domestic' in x else 'non-domestic')
df['is_domestic'] = df['domestic'].apply(lambda x: 1 if x == 'domestic' else 0)

# Clean up breed column by removing specific words
words_to_remove = ['mix', 'shorthair', 'longhair', 'medium hair', 'miniature', 'domestic', 'dog', 'cat']
for word in words_to_remove:
    df['breed'] = df['breed'].str.replace(word, '', regex=True).str.strip()

# Tokenize and count words in the breed column
all_words = df['breed'].str.split(expand=True).stack()
word_counts = Counter(all_words)
word_freq_df = pd.DataFrame(word_counts.items(), columns=['Word', 'Frequency']).sort_values(by='Frequency', ascending=False)

# Calculate percentage and cumulative percentage of word frequencies
total_words = word_freq_df['Frequency'].sum()
word_freq_df['Percentage'] = (word_freq_df['Frequency'] / total_words) * 100
word_freq_df['Cumulative Percentage'] = word_freq_df['Percentage'].cumsum()

# Select important words based on cumulative percentage limit
important_words_df = word_freq_df[word_freq_df['Cumulative Percentage'] <= percentage_limit_breed_words]

# Generate pairs of words in the breed column
df['breed_tokenized'] = df['breed'].str.split()
word_pairs = df['breed_tokenized'].apply(lambda x: list(combinations(x, 2)))

all_pairs = [pair for pairs in word_pairs for pair in pairs]
pair_counts = Counter(all_pairs)
pair_freq_df = pd.DataFrame(pair_counts.items(), columns=['Pair', 'Frequency']).sort_values(by='Frequency', ascending=False)

# Calculate percentage and cumulative percentage of pair frequencies
total_pairs = pair_freq_df['Frequency'].sum()
pair_freq_df['Percentage'] = (pair_freq_df['Frequency'] / total_pairs) * 100
pair_freq_df['Cumulative Percentage'] = pair_freq_df['Percentage'].cumsum()

# Select frequent pairs based on percentage limit
frequent_pairs_df = pair_freq_df[pair_freq_df['Percentage'] >= percentage_limit_breed_words_combination]

# Function to assign a breed group based on frequent pairs
def assign_breed_group(breed, frequent_pairs):
    for pair in frequent_pairs:
        if all(word in breed for word in pair):
            return f"{pair[0]}_{pair[1]}"
    return None

df['breed_group1'] = df['breed'].apply(lambda x: assign_breed_group(x, frequent_pairs_df['Pair'].tolist()))

# Clean up the breed column by removing frequent pairs
top_combinations = frequent_pairs_df['Pair'].tolist()

def remove_combinations(breed, combinations):
    for pair in combinations:
        if all(word in breed for word in pair):
            breed = breed.replace(f"{pair[0]} {pair[1]}", "")
    return breed.strip()

df['breed'] = df['breed'].apply(lambda x: remove_combinations(x, top_combinations))

# Recalculate frequent words after cleaning the breed column
df_no_breed_group = df[df['breed_group1'].isna()]
all_words = df_no_breed_group['breed'].str.split(expand=True).stack()
word_counts = Counter(all_words)

word_freq_df = pd.DataFrame(word_counts.items(), columns=['Word', 'Frequency']).sort_values(by='Frequency', ascending=False)

# Select important words based on frequency percentage limit
total_words = word_freq_df['Frequency'].sum()
word_freq_df['Percentage'] = (word_freq_df['Frequency'] / total_words) * 100
word_freq_df['Cumulative Percentage'] = word_freq_df['Percentage'].cumsum()
frequent_words_df = word_freq_df[word_freq_df['Percentage'] >= percentage_limit_breed_words_unique]

# Exclude specific words from the frequent words
exclude_words = ['bull', 'pit']
frequent_words_df = frequent_words_df[~frequent_words_df['Word'].isin(exclude_words)]

# Function to assign breed group based on frequent words
def assign_breed_word(breed, frequent_words):
    for word in frequent_words:
        if word in breed:
            return word
    return None

df['breed_group2'] = df['breed'].apply(lambda x: assign_breed_word(x, frequent_words_df['Word'].tolist()))

# Combine breed groups into a single column
df['breed_group'] = df['breed_group1'].fillna(df['breed_group2']).fillna('Other')

# Group breeds into specific categories
df['breed_group'] = df['breed_group'].apply(
    lambda x: 'larger_dangerous' if 'pit' in x or 'bull' in x or 'american_terrier' in x
    else 'small_dog' if 'chihuahua' in x or 'terrier' in x or 'dachshund' in x or 'poodle' in x or 'jack_russell' in x or 'russell_terrier' in x
    else x
)
##########################
# color 
#########################
# Split the color column into components
color_combinations = df['color'].str.split('/')  # Split by '/'
split_colors = color_combinations.apply(lambda x: [part.split()[0] for part in x] if isinstance(x, list) else [])  # Extract first word of each component

# Extract primary and secondary colors
df['color_primary'] = split_colors.apply(lambda x: x[0] if len(x) > 0 else None)  # First color
df['color_secondary'] = split_colors.apply(lambda x: x[1] if len(x) > 1 else None)  # Second color

# Identify dominant single-color groups 
single_colors = df[df['color_secondary'].isnull()]['color_primary']  # Single-color records
single_color_counts = single_colors.value_counts()
total_single = single_colors.count()
dominant_single_colors = single_color_counts[single_color_counts / total_single > percentage_limit_color].index

df['single_color_group'] = df['color_primary'].apply(
    lambda x: x if x in dominant_single_colors else 'other_single_colour'
)

# Process two-color combinations
df['sorted_combination'] = df.apply(
    lambda row: tuple(sorted([row['color_primary'], row['color_secondary']]))
    if pd.notnull(row['color_secondary']) else None,
    axis=1
)

# Identify dominant two-color combinations
percentage_limit_combinations = 0.05
combination_counts = df['sorted_combination'].dropna().value_counts()
total_combinations = combination_counts.sum()
dominant_combinations = combination_counts[combination_counts / total_combinations > percentage_limit_color].index

df['combination_group'] = df['sorted_combination'].apply(
    lambda x: x if x in dominant_combinations else 'other_multiple_color'
)

# Final color group assignment
df['color_group'] = df.apply(
    lambda row: row['single_color_group']
    if pd.isnull(row['sorted_combination'])
    else row['combination_group'],
    axis=1
)

# Convert tuples in color group to readable strings
df['color_group'] = df['color_group'].apply(
    lambda x: ' & '.join(x) if isinstance(x, tuple) else x
)

##########################
# in sex_upon_intake column, just add if is male or not to avoid having unknown values
df['is_male'] = df['sex_upon_intake'].apply(lambda x: 1 if x =='male' else 0) 
# condition at intake: grouping medical (med attn and medical), normal, sick_injured and rest (other)
df['intake_condition_group'] = df['intake_condition'].apply(
    lambda x: 'normal' if 'normal' in x 
    else 'sick_injured' if 'sick' in x 
    else 'sick_injured' if 'injured' in x 
    else 'medical' if 'medical' in x 
    else 'medical' if 'med attn' in x  
    else 'nursing' if 'nursing' in x 
    else 'nursing' if 'neonatal' in x 
    else 'other')

# group intake type: stray, owner surrended and Other
df['intake_type_group'] = df['intake_type'].apply(
    lambda x: 'stray' if 'stray' in x 
    else 'owner surrender' if 'owner surrender' in x 
    else 'other'
)

####################
# is dog
df['is_dog'] = df['animal_type'].apply(lambda x: 1 if x =='dog' else 0)
######################
# day of intake mothn and day of week
df['day_of_week_in'] = df['datetime_intake'].dt.day_of_week
df['month_in'] = df['datetime_intake'].dt.month

#####################
# Function to calculate the overlap count for a given row, considering animal_type
def count_overlapping_by_type(row, df):
    overlapping = df[
        (df['datetime_intake'] <= row['datetime_outcome']) &  # Shelter intake is before or during this outcome
        (df['datetime_outcome'] >= row['datetime_intake']) &  # Shelter outcome is after or during this intake
       ##(df['animal_type'] == row['animal_type']) &           # Same animal type
       ## (df['breed_group'] == row['breed_group']) &        # Same day of the week
        (df.index != row.name)  # Exclude the current record itself
    ]
    return len(overlapping) 
df['animals_in_shelter'] = df.apply(lambda row: count_overlapping_by_type(row, df), axis=1)

# Drop unused columns
cols_to_drop = ['name', 'age_upon_intake', 'breed_group1', 'breed_group2', 'breed', 'breed_tokenized', 'color', 'color_primary', 'color_secondary', 'single_color_group', 'sorted_combination', 'combination_group']
df.drop(columns=cols_to_drop, errors='ignore', inplace=True)


KeyError: 'age_upon_intake'

In [None]:
# # datetime columns: datetime_intake, datetime_outcome
# datetime_columns = ['datetime_intake', 'datetime_outcome']
# # convert the datetime columns to datetime type
# for column in datetime_columns:
#     df[column] = pd.to_datetime(df[column])
# datetime_columns

# # rest of the columns from dataframe that are not datetime categorical columns
# categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
# categorical_columns

# for column in categorical_columns:
#     df[column] = df[column].str.lower()

# ##########################
# # days in shelter
# df['days_in_shelter'] = (df['datetime_outcome'] - df['datetime_intake']).dt.days

# ##########################
# # castrated
# def castrated_status(row):
#     if 'neutered' in row or 'spayed' in row:
#         return 'yes'
#     else:
#         return 'no'

# df['castrated'] = df['sex_upon_intake'].apply(castrated_status)
# df['is_castrated'] = df['castrated'].apply(lambda x: 1 if x =='yes' else 0)

# ##########################
# # sex upon intake
# def sex_upon_intake(row):
#     if 'female' in row:
#         return 'female'
#     elif 'male' in row:
#         return 'male'
#     else:
#         return 'unknown'

# df['sex_upon_intake'] = df['sex_upon_intake'].apply(sex_upon_intake) 

# ##########################
# # age upon intake
# def convert_to_months(age):
#     # Split the age into value and unit
#     parts = age.split()
#     if len(parts) != 2:  # Handle unexpected formats
#         return None
    
#     value, unit = int(parts[0]), parts[1].lower()
    
#     # Convert the age to months
#     if 'year' in unit:
#         return value * 12
#     elif 'month' in unit:
#         return value 
#     elif 'week' in unit:
#         return value / 4
#     elif 'day' in unit:
#         return value / 30
#     else:
#         return None  # Handle unknown units

# df['age_in_months'] = df['age_upon_intake'].apply(convert_to_months) 
# df['age_in_months'] = df['age_in_months'].abs()

# ##########################
# # breed and other properties from it
# # 'hair_type'
# df['hair_type'] = df['breed'].apply(
#     lambda x: 'long' if 'longhair' in x else 'short' if 'shorthair' in x
#       else 'medium' if 'medium hair' in x else 'unknown')

# # mix breed column
# df['mix_breed'] = df['breed'].apply(lambda x: 'mix' if 'mix' in x else 'not mix') 
# df['is_mix_breed'] = df['mix_breed'].apply(lambda x: 1 if x =='mix' else 0) 
# # miniature breed column
# df['miniature'] = df['breed'].apply(lambda x: 'miniature' if 'miniature' in x else 'non-miniature')
# df['is_miniature'] = df['miniature'].apply(lambda x: 1 if x =='miniature' else 0)
# # domestic breed column
# df['domestic'] = df['breed'].apply(lambda x: 'domestic' if 'domestic' in x else 'non-domestic')
# df['is_domestic'] = df['domestic'].apply(lambda x: 1 if x =='domestic' else 0)


# words_to_remove = ['mix', 'shorthair', 'longhair', 'medium hair', 'miniature', 'domestic','dog','cat']
# for word in words_to_remove:
#     df['breed'] = df['breed'].str.replace(word, '').str.strip()

# # Split the breed column into individual words
# all_words = df['breed'].str.split(expand=True).stack()
# word_counts = Counter(all_words)
 
# word_freq_df = pd.DataFrame(word_counts.items(), columns=['Word', 'Frequency']).sort_values(by='Frequency', ascending=False)
 
# total_words = word_freq_df['Frequency'].sum()
# word_freq_df['Percentage'] = (word_freq_df['Frequency'] / total_words) * 100
# word_freq_df = word_freq_df.sort_values(by='Percentage', ascending=False)
# word_freq_df['Cumulative Percentage'] = word_freq_df['Percentage'].cumsum()
# important_words_df = word_freq_df[word_freq_df['Cumulative Percentage'] <= percentage_limit_breed_words]


# # Tokenize each row in the breed column
# df['breed_tokenized'] = df['breed'].str.split()
# word_pairs = df['breed_tokenized'].apply(lambda x: list(combinations(x, 2)))

# all_pairs = [pair for pairs in word_pairs for pair in pairs]

# pair_counts = Counter(all_pairs)
# pair_freq_df = pd.DataFrame(pair_counts.items(), columns=['Pair', 'Frequency']).sort_values(by='Frequency', ascending=False)
# total_pairs = pair_freq_df['Frequency'].sum()
# pair_freq_df['Percentage'] = (pair_freq_df['Frequency'] / total_pairs) * 100
# pair_freq_df['Cumulative Percentage'] = pair_freq_df['Percentage'].cumsum()

# # I decided to use 80% as the limit to cover the most important pairs
# frequent_pairs_df = pair_freq_df[pair_freq_df['Percentage'] >= percentage_limit_breed_words_combination]

# # Ensure the breed column and frequent pairs are processed correctly
# def assign_breed_group(breed, frequent_pairs):
#     for pair in frequent_pairs:
#         if all(word in breed for word in pair):
#             return f"{pair[0]}_{pair[1]}"
#     return None

# # Apply the function to the DataFrame
# df['breed_group1'] = df['breed'].apply(lambda x: assign_breed_group(x, frequent_pairs_df['Pair'].tolist())) 

# top_combinations = frequent_pairs_df['Pair'].tolist()

# def remove_combinations(breed, combinations):
#     for pair in combinations:
#         if all(word in breed for word in pair):
#             breed = breed.replace(f"{pair[0]} {pair[1]}", "")
#     return breed.strip()

# # Apply the function to clean up the breed column
# df['breed'] = df['breed'].apply(lambda x: remove_combinations(x, top_combinations))

# # after cleaning the breed column, we can re-calculate the frequent words
# df_no_breed_group = df[df['breed_group1'].isna()]

# all_words = df_no_breed_group['breed'].str.split(expand=True).stack()
# word_counts = Counter(all_words)


# word_freq_df = pd.DataFrame(word_counts.items(), columns=['Word', 'Frequency']).sort_values(by='Frequency', ascending=False)
# # I decided to use 2% as the limit to cover the most important words
# total_words = word_freq_df['Frequency'].sum()
# word_freq_df['Percentage'] = (word_freq_df['Frequency'] / total_words) * 100
# word_freq_df = word_freq_df.sort_values(by='Percentage', ascending=False)
# word_freq_df['Cumulative Percentage'] = word_freq_df['Percentage'].cumsum()
# frequent_words_df = word_freq_df[word_freq_df['Percentage'] >= percentage_limit_breed_words_unique]

# exclude_words = ['bull', 'pit'] 
# frequent_words_df = frequent_words_df[~frequent_words_df['Word'].isin(exclude_words)]

# # Ensure the breed column and frequent pairs are processed correctly
# def assign_breed_word(breed, frequent_words):
#     for word in frequent_words:
#         if word in breed:
#             return word
#     return None
# # Apply the function to the DataFrame
# df['breed_group2'] = df['breed'].apply(lambda x: assign_breed_word(x, frequent_words_df['Word'].tolist()))

# df['breed_group'] = df['breed_group1'].fillna(df['breed_group2'])

# # assign the breed_group to other if it is still null
# df['breed_group'] = df['breed_group'].fillna('Other')

# df['breed_group'] = df['breed_group'].apply(
#     lambda x: 'larger_dangerous' if 'pit' in x 
#     else 'larger_dangerous' if 'bull' in x 
#     else 'larger_dangerous' if 'american_terrier' in x  
#     else 'small_dog' if 'chihuahua' in x 
#     else 'small_dog' if 'terrier' in x 
#     else 'small_dog' if 'dachshund' in x 
#     else 'small_dog' if 'poodle' in x 
#     else 'small_dog' if 'jack_russell' in x 
#     else 'small_dog' if 'russell_terrier' in x 
#     else x)   
# ##########################
# # color 
# color_combinations = df['color'].str.split('/') # Split the color column by '/'
# split_colors = color_combinations.apply(lambda x: [part.split()[0] for part in x] if isinstance(x, list) else []) # get the first word of each component

# df['color_primary'] = split_colors.apply(lambda x: x[0] if len(x) > 0 else None)  # First color
# df['color_secondary'] = split_colors.apply(lambda x: x[1] if len(x) > 1 else None)  # Second color

# # single-color groups
# single_colors = df[df['color_secondary'].isnull()]['color_primary'] # get single-color records

# percentage_limit  = 0.05
# single_color_counts = single_colors.value_counts()
# total_single = single_colors.count()
# dominant_single_colors = single_color_counts[single_color_counts / total_single >percentage_limit].index

# df['single_color_group'] = df['color_primary'].apply(
#     lambda x: x if x in dominant_single_colors else 'other_single_colour'
# ) 

# # two-color combinations
# df['sorted_combination'] = df.apply(
#     lambda row: tuple(sorted([row['color_primary'], row['color_secondary']]))
#     if pd.notnull(row['color_secondary']) else None,
#     axis=1
# )

# percentage_limit_combinations = 0.05
# combination_counts = df['sorted_combination'].dropna().value_counts()
# total_combinations = combination_counts.sum()
# dominant_combinations = combination_counts[combination_counts / total_combinations > percentage_limit_combinations].index

# df['combination_group'] = df['sorted_combination'].apply(
#     lambda x: x if x in dominant_combinations else 'other_multiple_color'
# ) 

# # final color group assignment
# df['color_group'] = df.apply(
#     lambda row: row['single_color_group']
#     if pd.isnull(row['sorted_combination'])
#     else row['combination_group'],
#     axis=1
# )
# df['color_group'] = df['color_group'].apply(
#     lambda x: ' & '.join(x) if isinstance(x, tuple) else x  # Convert tuples to readable strings
# ) 

# ##########################
# # in sex_upon_intake column, just add if is male or not to avoid having unknown values
# df['is_male'] = df['sex_upon_intake'].apply(lambda x: 1 if x =='male' else 0) 
# # condition at intake: grouping medical (med attn and medical), normal, sick_injured and rest (other)
# df['intake_condition_group'] = df['intake_condition'].apply(
#     lambda x: 'normal' if 'normal' in x 
#     else 'sick_injured' if 'sick' in x 
#     else 'sick_injured' if 'injured' in x 
#     else 'medical' if 'medical' in x 
#     else 'medical' if 'med attn' in x  
#     else 'nursing' if 'nursing' in x 
#     else 'nursing' if 'neonatal' in x 
#     else 'other')

# # group intake type: stray, owner surrended and Other
# df['intake_type_group'] = df['intake_type'].apply(
#     lambda x: 'stray' if 'stray' in x 
#     else 'owner surrender' if 'owner surrender' in x 
#     else 'other'
# )



# ####################
# # is dog
# df['is_dog'] = df['animal_type'].apply(lambda x: 1 if x =='dog' else 0)
# ######################
# # day of intake mothn and day of week
# df['day_of_week_in'] = df['datetime_intake'].dt.day_of_week
# df['month_in'] = df['datetime_intake'].dt.month

# #####################
# # Function to calculate the overlap count for a given row, considering animal_type
# def count_overlapping_by_type(row, df):
#     overlapping = df[
#         (df['datetime_intake'] <= row['datetime_outcome']) &  # Shelter intake is before or during this outcome
#         (df['datetime_outcome'] >= row['datetime_intake']) &  # Shelter outcome is after or during this intake
#        ##(df['animal_type'] == row['animal_type']) &           # Same animal type
#        ## (df['breed_group'] == row['breed_group']) &        # Same day of the week
#         (df.index != row.name)  # Exclude the current record itself
#     ]
#     return len(overlapping) 
# df['animals_in_shelter'] = df.apply(lambda row: count_overlapping_by_type(row, df), axis=1)


# # drop the columns that are not useful 
# del df['name'] 
# del df['age_upon_intake']
# del df['breed_group1']
# del df['breed_group2']
# del df['breed']
# del df['breed_tokenized']
# del df['color']
# del df['color_primary']
# del df['color_secondary']   
# del df['single_color_group']
# del df['sorted_combination']
# del df['combination_group']

<h2 id="3">3. Model evaluation</h2>


<h3 id="31">3.1 Features selection pre-model</h3>


In [None]:
df.columns

Index(['animal_id', 'datetime_intake', 'found_location', 'intake_type',
       'intake_condition', 'animal_type', 'sex_upon_intake',
       'datetime_outcome', 'outcome_type', 'days_in_shelter', 'castrated',
       'is_castrated', 'age_in_months', 'hair_type', 'mix_breed',
       'is_mix_breed', 'miniature', 'is_miniature', 'domestic', 'is_domestic',
       'breed_group', 'color_group', 'is_male', 'intake_condition_group',
       'intake_type_group', 'is_dog', 'day_of_week_in', 'month_in',
       'animals_in_shelter'],
      dtype='object')

In [None]:
numerical = [
    'age_in_months', 
    'month_in', 
    'animals_in_shelter',
    'day_of_week_in',
    'is_dog',
    'is_mix_breed', 
    'is_miniature', 
    'is_domestic',
    'is_castrated'] 
  
categorical = [    
    'intake_type_group',
    'intake_condition_group',  
    'is_male', 
    'hair_type',
   # 'day_of_week_in',
     'breed_group',     
     'color_group'
]
#   

In [None]:
# checckiing for missing values in columns inside numerical or categorical
missing_numerical = df[numerical].isnull().sum()
missing_categorical = df[categorical].isnull().sum()

print(f"Missing values in numerical columns:\n{missing_numerical[missing_numerical > 0]}\n")
print(f"Missing values in categorical columns:\n{missing_categorical[missing_categorical > 0]}")


Missing values in numerical columns:
Series([], dtype: int64)

Missing values in categorical columns:
Series([], dtype: int64)


------

<h3 id="32">3.2 Training models</h3>


For training the model we will use the whole dataset except the last 20% of the records ordered by datetime_intake that would be the test

Preparing the training dataset, validation and test dataset.

In [None]:
# for test: last 20% of the records based on the datetime_intake column
df.reset_index(drop=True, inplace=True)
df_test = df.sort_values('datetime_intake').tail(int(len(df) * 0.2))
print('test df from: ',df_test['datetime_intake'].min(), 'test df to: ' ,df_test['datetime_intake'].max())

# for train: the rest of the records that I will divide into train and validation sets
df_full_train = df.drop(df_test.index).reset_index(drop=True)
df_train, df_val = train_test_split(df_full_train, test_size=0.20, random_state=1)

df_full_train = df_full_train.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_full_train = df_full_train.days_in_shelter.values
y_train = df_train.days_in_shelter.values
y_val = df_val.days_in_shelter.values
y_test = df_test.days_in_shelter.values

del df_full_train['days_in_shelter']
del df_train['days_in_shelter']
del df_val['days_in_shelter']
del df_test['days_in_shelter']

test df from:  2022-06-22 12:20:00 test df to:  2024-11-10 13:10:00


In [None]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

In [None]:
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)  # Use transform, not fit_transform

In [None]:
full_train_dict = df_full_train[categorical + numerical].to_dict(orient='records')
X_full_train = dv.transform(full_train_dict)


In [None]:
test_dict = df_test[categorical + numerical].to_dict(orient='records')
X_test = dv.transform(test_dict)

In [None]:
print(dv.feature_names_)

['age_in_months', 'animals_in_shelter', 'breed_group=Other', 'breed_group=australian_shepherd', 'breed_group=australian_tle', 'breed_group=black_mouth', 'breed_group=border_collie', 'breed_group=boxer', 'breed_group=german_shepherd', 'breed_group=great_pyrenees', 'breed_group=hound', 'breed_group=labrador_retriever', 'breed_group=larger_dangerous', 'breed_group=siamese', 'breed_group=siberian_husky', 'breed_group=small_dog', 'color_group=black', 'color_group=black & brown', 'color_group=black & tan', 'color_group=black & white', 'color_group=blue', 'color_group=blue & white', 'color_group=brown', 'color_group=brown & white', 'color_group=orange', 'color_group=other_multiple_color', 'color_group=other_single_colour', 'color_group=tan', 'color_group=tan & white', 'color_group=tricolor', 'color_group=white', 'day_of_week_in', 'hair_type=long', 'hair_type=medium', 'hair_type=short', 'hair_type=unknown', 'intake_condition_group=medical', 'intake_condition_group=normal', 'intake_condition_gr

---

#### XGboost

In [None]:
import io
from contextlib import redirect_stdout
import xgboost as xgb
 
features = list(dv.get_feature_names_out()) 

dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

# Watchlist for monitoring training and validation
watchlist = [(dtrain, 'train'), (dval, 'val')]



In [None]:
# final model
# Train the final model with optimal hyperparameters
xgb_params = {
    'eta': 0.1,                    
    'max_depth': 6,                
    'gamma': 0.1,                  
    'subsample': 0.8,              
    'colsample_bytree': 0.8,       
    'min_child_weight': 15,        
    'objective': 'reg:squarederror',
    'nthread': -1,                 
    'seed': 1,                     
    'verbosity': 1,                
    'eval_metric': 'rmse'                
}

In [None]:
model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=200,  # Maximum boosting rounds
    evals=[(dtrain, 'train'), (dval, 'val')],  # Monitor train and validation RMSE
    verbose_eval=5,  # Print every 5 rounds
    early_stopping_rounds=5  # Stop if no improvement for 10 rounds
)

# Make predictions on the validation set
y_pred = model.predict(dval)

# Calculate RMSE
rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f"Validation RMSE: {rmse}")


[0]	train-rmse:31.53376	val-rmse:31.32161
[5]	train-rmse:21.95944	val-rmse:21.91448
[10]	train-rmse:16.33567	val-rmse:16.43894
[15]	train-rmse:13.18411	val-rmse:13.42728
[20]	train-rmse:11.55946	val-rmse:11.89706
[25]	train-rmse:10.65552	val-rmse:11.07049
[30]	train-rmse:10.15637	val-rmse:10.61791
[35]	train-rmse:9.88664	val-rmse:10.40141
[40]	train-rmse:9.76418	val-rmse:10.30913
[45]	train-rmse:9.68331	val-rmse:10.26511
[50]	train-rmse:9.60270	val-rmse:10.21823
[55]	train-rmse:9.55257	val-rmse:10.19293
[60]	train-rmse:9.50368	val-rmse:10.16764
[65]	train-rmse:9.46555	val-rmse:10.14825
[70]	train-rmse:9.42785	val-rmse:10.13125
[75]	train-rmse:9.39144	val-rmse:10.11717
[80]	train-rmse:9.36862	val-rmse:10.11293
[85]	train-rmse:9.34698	val-rmse:10.11335
[90]	train-rmse:9.30707	val-rmse:10.10562
[95]	train-rmse:9.28346	val-rmse:10.10230
[100]	train-rmse:9.24718	val-rmse:10.09624
[105]	train-rmse:9.22805	val-rmse:10.09146
[110]	train-rmse:9.20983	val-rmse:10.08727
[115]	train-rmse:9.17874	v



In [None]:

model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=171
)

y_pred = model.predict(dval)

# Calculate RMSE
rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f"Final Validation RMSE: {rmse}")


Final Validation RMSE: 10.044178004245143




In [None]:
# Make predictions on the test set with full training data
dtest = xgb.DMatrix(X_test, feature_names=features)
y_pred = model.predict(dtest)

# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"Test RMSE: {rmse}") 

Test RMSE: 10.299410796312996


