In [12]:
import json
import requests
import pandas as pd
import numpy as np
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
from thefuzz import process
from model.pet_adoption_api import PetAdoptionAPI
# Make pandas' own `.plot()` accessor render through plotly.
pd.options.plotting.backend = "plotly"

## Data Ingestion

In [37]:
# Disabled: live fetch through the project's PetAdoptionAPI wrapper.
# NOTE(review): presumably this is how ../data/dog_data.csv was first produced — confirm.
# search_params = {
#     "animal_kind": "狗", 
#     "fuzzy_search_dict": {"animal_opendate": "20"}
# }

# pet_food_api = PetAdoptionAPI(**search_params)
# url = pet_food_api.build_api_url()
# resp = requests.get(url)
# df = pd.DataFrame(json.loads(resp.text))

In [3]:
# Read the local CSV snapshot, using its first column as the row index.
data = pd.read_csv(
    '../data/dog_data.csv',
    index_col=0,
)
data.head()

Unnamed: 0,animal_id,animal_subid,animal_area_pkid,animal_shelter_pkid,animal_place,animal_kind,animal_Variety,animal_sex,animal_bodytype,animal_colour,...,animal_opendate,animal_closeddate,animal_update,animal_createtime,shelter_name,album_file,album_update,cDate,shelter_address,shelter_tel
0,337213,VAAAG112120103,2,49,臺北市動物之家,狗,混種犬,F,MEDIUM,黑色,...,2023-12-17,2999-12-31,2023/12/17,2023/12/01,臺北市動物之家,https://www.pet.gov.tw/upload/pic/170141495180...,,2023/12/17,臺北市內湖區安美街191號,02-87913254
1,338905,VAAAG112121403,2,49,臺北市動物之家,狗,混種犬,F,MEDIUM,棕黑色,...,2023-12-17,2999-12-31,2023/12/17,2023/12/14,臺北市動物之家,https://www.pet.gov.tw/upload/pic/170254017903...,,2023/12/17,臺北市內湖區安美街191號,02-87913254
2,339155,AAAHG1121217001,3,58,新北市五股區公立動物之家,狗,貴賓犬,F,SMALL,米色,...,,2999-12-31,2023/12/17,2023/12/17,新北市五股區公立動物之家,https://www.pet.gov.tw/upload/pic/170278261086...,,2023/12/17,新北市五股區外寮路9-9號,02-82925265
3,339152,DAAAG1121216002,7,63,新竹縣動物保護教育園區,狗,混種犬,M,MEDIUM,黑色,...,2023-12-16,2999-12-31,2023/12/16,2023/12/15,新竹縣動物保護教育園區,https://www.pet.gov.tw/upload/pic/170273775931...,,2023/12/16,新竹縣竹北市縣政五街192號,03-5519548
4,339154,DAAAG1121216004,7,63,新竹縣動物保護教育園區,狗,混種犬,F,MEDIUM,黑色,...,2023-12-16,2999-12-31,2023/12/16,2023/12/15,新竹縣動物保護教育園區,https://www.pet.gov.tw/upload/pic/170273799146...,,2023/12/16,新竹縣竹北市縣政五街192號,03-5519548


In [3]:
# (rows, columns) of the raw frame before any cleaning.
data.shape

(5757, 28)

In [4]:
# Earliest and latest animal_createtime, compared as plain strings.
# NOTE(review): this is only chronological if every value is zero-padded 'YYYY/MM/DD' — confirm.
data['animal_createtime'].min(), data['animal_createtime'].max()

('2013/04/02', '2023/12/17')

In [4]:
# Build the working frame from a copy of `data`: drop exact duplicate rows,
# then drop columns that contain no values at all.
df = (
    data.copy()
    .drop_duplicates()
    .dropna(axis=1, how='all')
)
df.shape

(5757, 25)

In [38]:
# Disabled on purpose: running this overwrites the same CSV that the loading cell reads.
# df.to_csv("../data/dog_data.csv")

## Data Preprocessing

In [8]:
# Display name for each `animal_area_pkid` code, listed in code order.
# Code 6 is labelled 桃園市: Taoyuan has been a city (not a county) since 2014,
# and this matches the 桃園市 prefix already used for its shelter in `shelter_dict`.
region_dict = {
    2: "臺北市",
    3: "新北市",
    4: "基隆市",
    5: "宜蘭縣",
    6: "桃園市",
    7: "新竹縣",
    8: "新竹市",
    9: "苗栗縣",
    10: "臺中市",
    11: "彰化縣",
    12: "南投縣",
    13: "雲林縣",
    14: "嘉義縣",
    15: "嘉義市",
    16: "臺南市",
    17: "高雄市",
    18: "屏東縣",
    19: "花蓮縣",
    20: "臺東縣",
    21: "澎湖縣",
    22: "金門縣",
    23: "連江縣",
}

# Display name for each `animal_shelter_pkid` code, listed in code order.
# Code 63 uses 新竹縣動物保護教育園區, the name the dataset itself reports in
# `shelter_name` / `animal_place` for that shelter, so chart labels agree with the data.
shelter_dict = {
    48: "基隆市寵物銀行",
    49: "臺北市動物之家",
    50: "新北市板橋區公立動物之家",
    51: "新北市新店區公立動物之家",
    53: "新北市中和區公立動物之家",
    55: "新北市淡水區公立動物之家",
    56: "新北市瑞芳區公立動物之家",
    58: "新北市五股區公立動物之家",
    59: "新北市八里區公立動物之家",
    60: "新北市三芝區公立動物之家",
    61: "桃園市動物保護教育園區",
    62: "新竹市動物收容所",
    63: "新竹縣動物保護教育園區",
    67: "臺中市動物之家南屯園區",
    68: "臺中市動物之家后里園區",
    69: "彰化縣流浪狗中途之家",
    70: "南投縣公立動物收容所",
    71: "嘉義市流浪犬收容中心",
    72: "嘉義縣流浪犬中途之家",
    73: "臺南市動物之家灣裡站",
    74: "臺南市動物之家善化站",
    75: "高雄市壽山動物保護教育園區",
    76: "高雄市燕巢動物保護關愛園區",
    77: "屏東縣流浪動物收容所",
    78: "宜蘭縣流浪動物中途之家",
    79: "花蓮縣流浪犬中途之家",
    80: "臺東縣動物收容中心",
    81: "連江縣流浪犬收容中心",
    82: "金門縣動物收容中心",
    83: "澎湖縣流浪動物收容中心",
    89: "雲林縣流浪動物收容所",
    92: "新北市政府動物保護防疫處",
    96: "苗栗縣生態保育教育中心",
}

# Canonical breed names used as the candidate list when fuzzy-matching the
# free-text `animal_Variety` values (see fuzzy_match_breed).
breed_standard_list = [
    '混種犬', '拉不拉多貴賓犬', '貴賓犬', '瑪爾貴賓混種犬', '瑪爾濟斯犬', '博美犬', '狐狸博美', '柴犬',
    '臘腸犬', '長毛臘腸', '吉娃娃犬', '長毛吉娃娃', '台灣犬', '約克夏', '西施犬', '迷你雪納瑞',
    '大型雪納瑞', '黃金獵犬', '雪納瑞', '米格魯犬', '拉不拉多犬', '哈士奇(西伯利亞雪橇犬)',
    '法國鬥牛犬', '威爾斯柯基犬', '日本狐狸犬', '鬥牛犬(英國)', '德國狼犬(德國牧羊犬)',
    '喜樂蒂牧羊犬', '比熊犬', '英國古代牧羊犬', '邊境牧羊犬', '哈瓦那犬', '可卡獵犬', '可卡犬(美系)',
    '澳洲牧羊犬', '可卡犬(英系)', '比利時牧羊犬(格羅安達/馬利諾/坦比連)', '迷你品(迷你杜賓)',
    '可麗牧羊犬(蘇格蘭牧羊犬)', '中亞牧羊犬', '巴哥犬', '馬瑞馬牧羊犬', '加泰隆尼亞牧羊犬',
    '迷你美國牧羊犬', '邊境梗', '白毛(瑞士)牧羊犬', '德國剛毛指示犬', '秋田犬', '美系秋田犬', '鬆獅犬',
    '西高地白梗', '大麥町', '諾福克梗犬', '北京犬', '獒犬', '杜賓犬', '洛威納犬(羅威那犬)',
    '傑克羅素梗', '拉薩犬', '日本狆', '蝴蝶犬', '牛頭梗', '波士頓梗', '沙皮犬', '英國獵狐犬', '獵狐梗',
    '史必茲(史畢諾犬)', '阿茲卡爾', '巴吉度犬', '高山犬', '騎士比熊', '喜樂蒂柯基', '奧斯卡貴賓犬',
    '黃金貴賓犬', '查理士犬', '美國惡霸犬', '標準貴賓犬', '迷你貴賓犬', '惠比特犬', '玩具貴賓犬',
    '拳師犬', '大白熊(庇里牛斯山犬)', '大丹狗', '薩摩耶犬', '伯恩山犬', '蘇格蘭梗', '阿富汗獵犬',
    '波音達指示犬(英系)', '靈提', '阿拉斯加雪橇犬', '高加索犬', '西藏獒犬', '騎士查理王獵犬',
    '馬士提夫犬', '聖伯納犬', '鬥牛獒犬', '萬能梗', '澳洲(絲毛)梗', '紐波利頓犬(拿波里獒犬)',
    '中國冠毛犬', '愛斯基摩犬', '日本土佐犬', '甲斐犬', '紀州犬', '灰狗(靈提)', '紐芬蘭犬', '西藏梗',
    '波利犬', '阿根廷杜告犬', '貝生吉犬', '貝靈頓梗', '波爾多獒犬', '小型獅子犬', '斯塔福郡鬥牛梗',
    '四國犬', '史賓格犬(激飛犬)', '伯瑞犬', '義大利獒犬', '巴西菲勒犬', '威瑪獵犬', '泰國脊背', '比特犬', '其他犬'
]

In [9]:
# Use animal_createtime as the date the dog was picked up.
# Split the '/'-separated string once with the vectorized .str accessor instead
# of three per-row lambdas; a missing date now yields NaN in year/month/day
# rather than raising inside the lambda.
createtime_parts = df['animal_createtime'].str.split('/')
df['year'] = createtime_parts.str[0]
df['month'] = createtime_parts.str[1]
df['day'] = createtime_parts.str[2]

In [13]:
def fuzzy_match_breed(breed: str) -> str:
    """Map a free-text breed name to the closest entry of ``breed_standard_list``.

    Two spelling variants are normalised first ('敖' -> '獒',
    '馬爾濟斯' -> '瑪爾濟斯'), then '犬' is appended and the best fuzzy match is
    taken. A match of '混種犬' is reported as '米克斯'.

    NOTE(review): an empty ``breed`` is queried as just '犬' — confirm the
    resulting match is what is wanted for records without a breed.
    """
    normalised = breed.replace('敖', '獒').replace('馬爾濟斯', '瑪爾濟斯')
    best_match = process.extractOne(normalised + '犬', breed_standard_list)[0]
    return '米克斯' if best_match == '混種犬' else best_match

# Missing breeds become '' so fuzzy_match_breed always receives a str.
df['animal_Variety'] = df['animal_Variety'].fillna('')
# Fuzzy-match each distinct raw name once, then broadcast the result to every
# row with .map(); this avoids repeating the same match for repeated names.
breed_lookup = {
    raw_breed: fuzzy_match_breed(raw_breed)
    for raw_breed in df['animal_Variety'].unique()
}
df['animal_Variety_converted'] = df['animal_Variety'].map(breed_lookup)

In [14]:
# Translate the numeric region / shelter codes into display names via the
# lookup dicts; a code absent from the dict gets no name (dict.get -> None).
df['animal_area'] = df['animal_area_pkid'].map(region_dict.get)
df['animal_shelter'] = df['animal_shelter_pkid'].map(shelter_dict.get)

## 每年分佈

2023 年明顯遠高於其他年份，但無法確定是因為流浪動物今年大幅增加，還是政府今年開始積極登記流浪動物，需待更多資訊才能確認此原因。

In [106]:
# Records per year: after count(), `animal_id` holds the number of non-null ids per group.
year_groupby = df.groupby(by='year', as_index=False).count()
fig = px.bar(
    data_frame=year_groupby,
    x='year',
    y='animal_id',
    title='每年分佈',
)
fig.show()

## 各月分佈

整體來看，會發現 9~12 月拾獲較多流浪狗，但若是拆成不同年份來看，就會注意到數據還是受 2023 年影響為主，所以增加的原因也是需要更多資訊才能確定。

In [109]:
# Records per calendar month across all years, with the count printed on each bar.
month_groupby = df.groupby(by='month', as_index=False).count()
fig = px.bar(
    data_frame=month_groupby,
    x='month',
    y='animal_id',
    title='每月分佈',
    text_auto=True,
)
fig.show()

In [80]:
# Monthly counts for each year on a 2x2 grid, filled row by row
# (2020, 2021 on the first row; 2022, 2023 on the second).
# Driving the panels from one list replaces four copy-pasted query/trace pairs,
# so changing the compared years is a one-line edit.
years_to_compare = ['2020', '2021', '2022', '2023']  # `year` holds strings
fig = make_subplots(rows=2, cols=2)
for position, year_label in enumerate(years_to_compare):
    grid_row, grid_col = divmod(position, 2)
    monthly_counts = (
        df[df['year'] == year_label]
        .groupby('month', as_index=False)
        .count()
    )
    fig.add_trace(
        go.Bar(
            x=monthly_counts['month'],
            y=monthly_counts['animal_id'],
            name=year_label,
        ),
        row=grid_row + 1,  # plotly subplot positions are 1-based
        col=grid_col + 1,
    )

fig.update_layout(height=600, title_text="2020 ~ 2023年每月分佈")
fig.show()

## 公母分佈

In [220]:
def draw_pie(
    data_df,
    target_col: str,
    title: str,
    filter_col: str = None,
    filter_val: str = None,
    width: int = 600,
    textfont_size: int = 24,
    color_by_target_col: bool = True,
) -> None:
    """Show a pie chart of how rows are distributed over ``target_col``.

    Slice sizes are the per-group count of non-null ``animal_id`` values.

    Args:
        data_df: DataFrame containing ``target_col``, ``animal_id`` and, when
            filtering, ``filter_col``. It is not modified.
        target_col: Column whose distinct values become the slices.
        title: Chart title.
        filter_col: Optional column to filter on before counting.
        filter_val: Value ``filter_col`` must equal. The filter is applied only
            when both ``filter_col`` and ``filter_val`` are truthy.
        width: Figure width passed to plotly.
        textfont_size: Font size of the labels drawn inside the slices.
        color_by_target_col: When True, ``target_col`` is also passed as
            plotly's ``color`` argument.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    subset = data_df
    if filter_col and filter_val:
        # Boolean mask instead of a string-built query(): values containing
        # quotes (or non-string values) are compared correctly, and nothing is
        # interpolated into an expression that pandas then evaluates.
        subset = subset[subset[filter_col] == filter_val]

    # Neither the mask nor groupby mutates the input, so no defensive copy is needed.
    counts = subset.groupby(target_col, as_index=False).count()

    pie_kwargs = dict(
        names=target_col,
        values="animal_id",
        title=title,
        width=width,
    )
    if color_by_target_col:
        pie_kwargs["color"] = target_col

    fig = px.pie(counts, **pie_kwargs)
    fig.update_traces(textposition="inside", textfont_size=textfont_size)
    fig.show()

In [221]:
# Sex ratio overall, then once per body-size class.
draw_pie(df, 'animal_sex', title='公母比例')
for pie_title, body_size in (
    ('公母比例（大型犬）', 'BIG'),
    ('公母比例（中型犬）', 'MEDIUM'),
    ('公母比例（小型犬）', 'SMALL'),
):
    draw_pie(
        df,
        'animal_sex',
        title=pie_title,
        filter_col="animal_bodytype",
        filter_val=body_size,
    )

## 流浪狗絕育比例

In [222]:
# Leave out rows whose animal_sterilization is 'N' before computing the ratios.
# NOTE(review): presumably 'N' means "not recorded" — confirm against the API docs.
animal_sterilization_exclude_N = df.query("animal_sterilization != 'N'")
draw_pie(animal_sterilization_exclude_N, 'animal_sterilization', title='絕育比例')
for pie_title, body_size in (
    ('絕育比例（大型犬）', 'BIG'),
    ('絕育比例（中型犬）', 'MEDIUM'),
    ('絕育比例（小型犬）', 'SMALL'),
):
    draw_pie(
        animal_sterilization_exclude_N,
        'animal_sterilization',
        title=pie_title,
        filter_col='animal_bodytype',
        filter_val=body_size,
    )

## 流浪狗疫苗注射比例

In [224]:
# Share of records per `animal_bacterin` (vaccination) value.
draw_pie(
    df,
    'animal_bacterin',
    title='疫苗注射比例',
)

## 流浪狗品種分佈

In [226]:
# Share of records per normalised breed; per-slice colouring by column is switched off.
draw_pie(
    df,
    'animal_Variety_converted',
    title='品種比例',
    color_by_target_col=False,
)

In [231]:
# Breed counts without the dominant '米克斯' (mixed) group, sorted ascending by
# count for the horizontal bar chart.
animal_Variety_exclude_mix = (
    df
    .query("animal_Variety_converted != '米克斯'")
    .groupby("animal_Variety_converted", as_index=False)
    .count()
    .sort_values(by="animal_id")
)
fig = px.bar(
    animal_Variety_exclude_mix,
    x="animal_id",
    y="animal_Variety_converted",
    orientation="h",
    title="品種比例（不含米克斯）",
    text_auto=True,
    height=900,
)
fig.update_traces(textfont_size=24, textposition='inside')
fig.show()

## 流浪狗體型分佈

In [232]:
# Share of records per body-size class.
draw_pie(
    df,
    'animal_bodytype',
    '體型比例',
)

## 年齡比例

多數為成犬，其中以大型犬的成犬與幼犬比例（98:2）最懸殊，代表流浪的大型犬幾乎都已是成犬。猜測大型犬長大後因體型大而不易飼養，導致成犬被遺棄的可能性較高。

In [238]:
# Age-group ratio overall, then once per body-size class.
draw_pie(df, 'animal_age', '年齡比例')
for pie_title, body_size in (
    ('年齡比例（大型犬）', 'BIG'),
    ('年齡比例（中型犬）', 'MEDIUM'),
    ('年齡比例（小型犬）', 'SMALL'),
):
    draw_pie(
        df,
        'animal_age',
        title=pie_title,
        filter_col='animal_bodytype',
        filter_val=body_size,
    )

## 流浪狗毛色比例

多數為小黑與小黃，前四名都是以黑、黃色為主的狗，佔了整體的四分之三。

In [235]:
# Coat-colour ratio overall, then once per body-size class.
draw_pie(df, 'animal_colour', '毛色比例', color_by_target_col=False)
for pie_title, body_size in (
    ('毛色比例（大型犬）', 'BIG'),
    ('毛色比例（中型犬）', 'MEDIUM'),
    ('毛色比例（小型犬）', 'SMALL'),
):
    draw_pie(
        df,
        'animal_colour',
        title=pie_title,
        filter_col='animal_bodytype',
        filter_val=body_size,
        color_by_target_col=False,
    )

In [248]:
# Bubble chart of coat colour against body size; bubble size is the record count.
tmp_df = df.copy()
tmp_groupby_df = (
    tmp_df
    .groupby(['animal_colour', 'animal_bodytype'], as_index=False)
    .count()
)
fig = px.scatter(
    data_frame=tmp_groupby_df,
    x='animal_bodytype',
    y='animal_colour',
    size='animal_id',
    color='animal_colour',
    width=600,
    height=600,
)
fig.show()

In [249]:
# Bubble chart of region against body size; bubble size is the record count.
tmp_df = df.copy()
tmp_groupby_df = (
    tmp_df
    .groupby(['animal_area', 'animal_bodytype'], as_index=False)
    .count()
)
fig = px.scatter(
    data_frame=tmp_groupby_df,
    x='animal_bodytype',
    y='animal_area',
    size='animal_id',
    color='animal_area',
    width=600,
    height=600,
)
fig.show()

In [265]:
# Bubble chart of region against coat colour; bubble size is the record count.
tmp_df = df.copy()
tmp_groupby_df = (
    tmp_df
    .groupby(['animal_area', 'animal_colour'], as_index=False)
    .count()
)
fig = px.scatter(
    data_frame=tmp_groupby_df,
    x='animal_colour',
    y='animal_area',
    size='animal_id',
    color='animal_colour',
    width=1000,
    height=800,
)
fig.show()

In [256]:
# Bubble chart of region against age group; bubble size is the record count.
tmp_df = df.copy()
tmp_groupby_df = (
    tmp_df
    .groupby(['animal_area', 'animal_age'], as_index=False)
    .count()
)
fig = px.scatter(
    data_frame=tmp_groupby_df,
    x='animal_age',
    y='animal_area',
    size='animal_id',
    color='animal_age',
    width=300,
    height=600,
)
fig.show()

In [264]:
# Bubble chart of region against breed, leaving out the '米克斯' (mixed) group;
# bubble size is the record count.
tmp_df = df.copy().query("animal_Variety_converted != '米克斯'")
tmp_groupby_df = (
    tmp_df
    .groupby(['animal_area', 'animal_Variety_converted'], as_index=False)
    .count()
)
fig = px.scatter(
    data_frame=tmp_groupby_df,
    x='animal_Variety_converted',
    y='animal_area',
    size='animal_id',
    color='animal_Variety_converted',
    width=1400,
    height=800,
)
fig.show()

In [270]:
# Overall coat-colour pie. The original cell issued this identical call twice,
# rendering the same chart two times; one call is enough.
draw_pie(df, 'animal_colour', '毛色比例', color_by_target_col=False)

## 各縣市拾獲數量分佈

In [159]:
# Records per region, sorted ascending by count for the horizontal bar chart.
animal_area = (
    df
    .groupby('animal_area', as_index=False)
    .count()
    .sort_values(by="animal_id")
)
fig = px.bar(
    data_frame=animal_area,
    x="animal_id",
    y="animal_area",
    orientation="h",
    title="各縣市分佈",
    text_auto=True,
    height=900,
)
fig.update_traces(textfont_size=24, textposition='inside')
fig.show()

In [160]:
# Records per shelter, sorted ascending by count for the horizontal bar chart.
animal_shelter = (
    df
    .groupby('animal_shelter', as_index=False)
    .count()
    .sort_values(by="animal_id")
)
fig = px.bar(
    data_frame=animal_shelter,
    x="animal_id",
    y="animal_shelter",
    orientation="h",
    title="各收容所分佈",
    text_auto=True,
    height=900,
)
fig.update_traces(textfont_size=24, textposition='inside')
fig.show()

## 各縣市收容所數量

In [16]:
# Number of distinct shelters per region: reduce to unique (region, shelter)
# pairs, count the shelters in each region, then sort ascending for the chart.
animal_area_shelter = (
    df[['animal_area', 'animal_shelter']]
    .copy()
    .drop_duplicates()
    .groupby('animal_area', as_index=False)
    .count()
    .sort_values(by="animal_shelter")
)
fig = px.bar(
    data_frame=animal_area_shelter,
    x="animal_shelter",
    y="animal_area",
    orientation="h",
    title="各縣市收容所數量",
    text_auto=True,
    height=900,
)
fig.update_traces(textfont_size=24, textposition='inside')
fig.show()