# 1. 데이터 탐색

In [None]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import DataFrame, Series
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
import csv

pd.set_option('display.max_rows', 30000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [None]:
df = pd.read_csv('data/train.csv')
print(np.shape(df))
df.tail()

In [None]:
df.describe()

In [None]:
len(df['Color'].unique())

### [데이터 탐색 결과]<br>
1) AnimalID : 동물의 일련번호로 primary key 역할을 한다 <font color="blue">(정수로 encoding 필요)</font><br><br>
2) Name : 동물의 이름. <font color="blue">(가장 많은 'Max'도 136마리에 불과하므로 binarize만 한다)</font><br><br>
3) DateTime : 결과가 행해진 시간. <font color="blue">(계절성을 띠는지 등의 변수 특성을 탐색할 필요가 있음)</font><br><br>
4) OutcomeType : 결과의 형태이자 <font color="red">Target</font>. 결과는 Adoption, Transfer, Return, Euthanasia(안락사), Died로 다섯 가지.<font color="blue">(One-Hot-Encoding필요)</font><br><br>
5) AnimalType : 동물의 종. dog와 cat 두가지로 나뉜다.<font color="blue">(dog면 1, cat이면 0으로 binarize필요)</font><br><br>
6) SexuponOutcome : 성별 및 중성화 여부. 수컷/암컷/중성화 수컷/중성화 암컷/unknown으로 크게 다섯 가지로 나뉜다.<font color="blue">(성의 중요도도 클 것이라 예상)</font><br><br>
7) AgeuponOutcome : 결과가 행해질 당시의 동물의 나이.<font color="blue">(매우 중요할 것으로 예상, 단위 통일 필요)</font><br><br>
8) Breed : 동물의 종<font color="blue">(개/고양이의 세부 종, <font color="red">1380종</font>으로 매우 다양)</font><br><br>
9) Color : 동물의 색 <font color="blue">(<font color="red">366가지</font>로 매우 다양)</font><br>

# 2. 데이터 전처리
- 전처리 된 독립변수를 df_pre에 새로 저장한다

## - 2.1 OutcomeType (One Hot Encoding)
- Label Binarizer를 통해 문자열을 OHE 처리

In [None]:
df["OutcomeType"].unique()

In [None]:
from sklearn.preprocessing import LabelBinarizer

# One-hot encode the target. The column names come from the fitted binarizer
# (`classes_`) rather than a hand-typed list, so each indicator column is
# always labelled with the class it actually represents.
outcome_binarizer = LabelBinarizer()
dfX = pd.DataFrame(outcome_binarizer.fit_transform(df["OutcomeType"]),
                   columns=outcome_binarizer.classes_, index=df.index)
# Keep the raw columns and append the indicator columns next to them
df_pre = pd.concat([df, dfX], axis=1)
df_pre.tail()

In [None]:
print(df['OutcomeType'].value_counts())

print(df_pre['Adoption'].value_counts())
print(df_pre['Transfer'].value_counts())
print(df_pre['Return_to_owner'].value_counts())
print(df_pre['Euthanasia'].value_counts())
print(df_pre['Died'].value_counts())

## - 2.2 Name (Binarize)

In [None]:
def Naming(x):
    """Return 1 when the animal has a name and 0 when the name is missing.

    A missing name is recognised as ``None`` or any float (the original check
    relied on missing values arriving as float NaN); every other value,
    including an empty string, counts as a name.
    """
    # isinstance also accepts float subclasses, unlike an exact type comparison
    if x is None or isinstance(x, float):
        return 0
    return 1

In [None]:
df_pre['Name'] = df['Name'].transform(Naming)

In [None]:
df_pre.tail()

In [None]:
print(df_pre['Name'].value_counts())


In [None]:
import seaborn as sns
sns.set(style="ticks")
# `factorplot` was renamed `catplot` in newer seaborn releases and later
# removed; use whichever one the installed version provides.
facet_plot = getattr(sns, "catplot", None) or sns.factorplot
# One count panel per outcome type, split by the binarized Name column
ax = facet_plot(x="Name", col="OutcomeType", data=df_pre, kind='count' )
ax.set_xticklabels(rotation=90, ha="right")

## - 2.3 Color (전처리)
- Color는 약 366종류로 다양하게 분포되어 있으며, 비슷하거나 mix된 경우도 많다. <br> 따라서 메인 색상(대표하는 색, 믹스된 경우 먼저 나온 색상을 사용)만을 남겨놓고 통합했다.<br><font color="blue">예시) 'Brown Tabby/White' --> 'Brown'</font><br>
- 그 결과 <font color="red">366종의 컬러를 25종으로</font> 줄일 수 있었다.

In [None]:
len(df['Color'].unique())

In [None]:
# 'A/B' = 'A'로 표기
def color_del(x):
    """Return the part of *x* before its first '/', or *x* itself if it has none.

    Example: 'Brown Tabby/White' -> 'Brown Tabby'.
    """
    return x[:x.index('/')] if '/' in x else x

In [None]:
df_pre['Color_pre'] = df['Color'].copy()

In [None]:
df_pre['Color_pre'] = df_pre['Color_pre'].apply(color_del)

In [None]:
df_pre.tail(3)

In [None]:
# 대표 컬러로 통합
def color_union(x):
    """Collapse a colour description to its representative colour.

    Returns the first family name (in the priority order below) that occurs
    as a substring of *x*; when none occurs, *x* is returned unchanged.
    """
    # Order matters: an earlier entry wins when several occur in x
    color_families = (
        'Black', 'Blue', 'Red', 'Silver', 'Brown', 'Orange', 'Cream',
        'Gray', 'Calico', 'Chocolate', 'Tortie', 'Yellow', 'Liver',
    )
    for family in color_families:
        if family in x:
            return family
    return x

In [None]:
df_pre['Color_pre'] = df_pre['Color_pre'].apply(color_union)

In [None]:
# Colours with 40 or fewer animals are merged into 'etc'
rare_colors = ['Agouti', 'Apricot', 'Lilac Point', 'Pink', 'Ruddy']
df_pre['Color_pre'] = df_pre['Color_pre'].replace(rare_colors, 'etc')

In [None]:
df_pre.groupby(['Color_pre']).count()

In [None]:
print(len(df['Color'].value_counts()))
print(len(df_pre['Color_pre'].value_counts()))

## - 2.4 AgeuponOutcome (단위 통일)
- 중구난방인 단위의 나이 데이터들을 모두 한 단위, day로 통일시켜 주었다.

In [None]:
def DayuponOutcome(age_str):
    """Convert an age string such as '2 years' or '3 weeks' to a number of days.

    *age_str* must be '<number> <unit>' separated by exactly one space.
    A month counts as 30 days and a year as 365. An unrecognised unit yields
    None.
    """
    days_per_unit = {
        'day': 1, 'days': 1,
        'week': 7, 'weeks': 7,
        'month': 30, 'months': 30,
        'year': 365, 'years': 365,
    }
    num, unit = age_str.split(' ')
    if unit not in days_per_unit:
        # The number is deliberately not parsed for an unknown unit
        return None
    return int(num) * days_per_unit[unit]

In [None]:
df_pre['DayuponOutcome'] = df_pre['AgeuponOutcome'].copy()
df_pre['DayuponOutcome'] = df_pre['DayuponOutcome'].fillna('0 day')
df_pre['DayuponOutcome'] = df_pre['DayuponOutcome'].apply(DayuponOutcome)

In [None]:
# Assign the filled column back instead of filling a selected column in place:
# an in-place fill on `df_pre[...]` may act on a temporary copy and leave
# df_pre unchanged (pandas Copy-on-Write behaviour).
df_pre['AgeuponOutcome'] = df_pre['AgeuponOutcome'].fillna('Unknown')

In [None]:
df_pre['DayuponOutcome'].describe()

In [None]:
def Age(x):
    """Bucket an age in days into two-year bands labelled by their upper bound.

    Returns 2 for 0-730 days, 4 for 731-1460 days, and so on up to 18 for
    5841-6570 days; anything older returns 20. The bands are contiguous and
    inclusive of their upper bound, so exact multiples of two years (730,
    1460, ...) land in their own band instead of falling through to 20.

    An age of 0 days is placed in band 2. Note that the notebook fills missing
    ages with '0 day' before calling this, so those rows also land in band 2.
    NaN and negative inputs keep the previous catch-all value of 20.
    """
    # NaN compares False against everything, so `not x >= 0` catches it too
    if not x >= 0:
        return 20
    for upper_years in range(2, 20, 2):
        if x <= upper_years * 365:
            return upper_years
    return 20

In [None]:
df_pre['AboutAge'] = df_pre['DayuponOutcome'].copy()
df_pre['AboutAge'] = df_pre['AboutAge'].apply(Age)

In [None]:
df_pre.tail(3)

In [None]:
plt.figure(figsize=(6,4))
ax = sns.countplot(x="AboutAge", data=df_pre)

In [None]:
sns.set(style="ticks")
# `factorplot` was renamed `catplot` in newer seaborn releases and later
# removed; use whichever one the installed version provides.
facet_plot = getattr(sns, "catplot", None) or sns.factorplot
# One count panel per outcome type, split by the two-year age band
ax = facet_plot(x="AboutAge", col="OutcomeType", data=df_pre, kind='count' )
ax.set_xticklabels(rotation=90, ha="right")

## - 2.5 Breed (전처리)
- Mix는 별도의 컬럼으로 만들어 binarize(mix는 1, 순종은 0)

In [None]:
def BreedMix(x):
    """Return 1 when the breed string marks a mix, otherwise 0.

    A breed counts as mixed when it contains '/' or the text 'Mix'.
    """
    return 1 if ('/' in x or 'Mix' in x) else 0

In [None]:
df_pre['BreedMix'] = df_pre['Breed'].apply(BreedMix)

In [None]:
def Nature_Breed(x):
    """Reduce a breed string to its primary breed name.

    - 'A/B' becomes 'A' (text before the first '/').
    - A trailing ' Mix' is removed: 'Pit Bull Mix' becomes 'Pit Bull'.
    - Anything else is returned unchanged.

    Only a genuine ' Mix' suffix is stripped; a name that merely contains
    ' Mix' somewhere else is no longer truncated by four characters.
    """
    if '/' in x:
        return x[:x.index('/')]
    mix_suffix = ' Mix'
    if x.endswith(mix_suffix):
        return x[:-len(mix_suffix)]
    return x

In [None]:
df_pre['Breed_pre'] = df_pre['Breed'].copy()

In [None]:
df_pre['Breed_pre'] = df_pre['Breed_pre'].apply(Nature_Breed)

In [None]:
df_pre.head(5)

In [None]:
df_pre.groupby(['AnimalType', 'Breed_pre']).count()

In [None]:
df_pre.groupby(['Breed_pre', 'AboutAge']).count()

## - 2.6 AnimalType (Binarize)
- dog는 1, cat은 0

In [None]:
encoder = LabelEncoder()
df_pre['AnimalType'] = encoder.fit_transform(df['AnimalType'])

In [None]:
df_pre.tail()

In [None]:
df_pre['AnimalType'].value_counts()

## - 2.7 SexuponOutcome
- NaN 값을 Unknown으로 처리

In [None]:
df_pre['SexuponOutcome'].value_counts()

In [None]:
# Assign the filled column back instead of filling a selected column in place:
# an in-place fill on `df_pre[...]` may act on a temporary copy and leave
# df_pre unchanged (pandas Copy-on-Write behaviour).
df_pre['SexuponOutcome'] = df_pre['SexuponOutcome'].fillna("Unknown")

In [None]:
df_pre['SexuponOutcome'].value_counts()

## - 2.8 AnimalID (string제거, 정수화)

In [None]:
df_pre['AnimalID'] = encoder.fit_transform(df['AnimalID'])

In [None]:
df_pre.tail()

## - 2.9 DateTime 전처리 및 분석

In [None]:
import re

In [None]:
def MakeYear(x):
    """Return the first run of four consecutive digits in ``str(x)`` as a string.

    Raises AttributeError when the text contains no such run.
    """
    text = str(x)
    return re.search(r'\d\d\d\d', text).group(0)

In [None]:
df_pre['Year'] = df_pre['DateTime'].apply(MakeYear)

In [None]:
def MakeMonth(x):
    """Return the two-digit month from a 'YYYY-MM-DD ...' style value, as a string.

    The month is taken as the two digits enclosed by hyphens ('-MM-') in
    ``str(x)``. Raises AttributeError when no such pattern is present.
    """
    # Capture the digits directly. The previous version searched the repr of
    # the match object, which also contains the match span and could return
    # those digits instead of the month.
    month_match = re.search(r'-(\d\d)-', str(x))
    return month_match.group(1)

In [None]:
df_pre['Month'] = df_pre['DateTime'].apply(MakeMonth)

In [None]:
df_pre['YearMonth'] = df_pre['Year'] + df_pre['Month']

In [None]:
def Integerize(x):
    """Return *x* converted with the built-in ``int``."""
    return int(x)

In [None]:
df_pre['YearMonth'] = df_pre['YearMonth'].apply(Integerize)

In [None]:
df_pre.tail()

In [None]:
import seaborn as sns
plt.figure(figsize=(6,3))
# X-axis categories in chronological order, 201310 through 201602; the same
# list drives both the bar order and the tick labels.
year_months = (
    [201310, 201311, 201312]
    + [year * 100 + month for year in (2014, 2015) for month in range(1, 13)]
    + [201601, 201602]
)
ax = sns.countplot(x="YearMonth", data=df_pre, order=year_months)
ax.set_xticklabels(labels=year_months, rotation=90, ha="right")

In [None]:
sns.set(style="ticks")
# `factorplot` was renamed `catplot` in newer seaborn releases and later
# removed; use whichever one the installed version provides.
facet_plot = getattr(sns, "catplot", None) or sns.factorplot
# One count panel per outcome type, by year-month
ax = facet_plot(x="YearMonth", col="OutcomeType", data=df_pre, kind='count' )
ax.set_xticklabels(rotation=90, ha="right")

In [None]:
sns.set(style="ticks")
# `factorplot` was renamed `catplot` in newer seaborn releases and later
# removed; use whichever one the installed version provides.
facet_plot = getattr(sns, "catplot", None) or sns.factorplot
# Year-month counts, one panel per value of the Transfer indicator
ax = facet_plot(x="YearMonth", col="Transfer", data=df_pre, kind='count' )
ax.set_xticklabels(rotation=90, ha="right")

In [None]:
sns.set(style="ticks")
# `factorplot` was renamed `catplot` in newer seaborn releases and later
# removed; use whichever one the installed version provides.
facet_plot = getattr(sns, "catplot", None) or sns.factorplot
# Year-month counts, one panel per value of the Adoption indicator
ax = facet_plot(x="YearMonth", col="Adoption", data=df_pre, kind='count' )
ax.set_xticklabels(rotation=90, ha="right")

In [None]:
sns.set(style="ticks")
# `factorplot` was renamed `catplot` in newer seaborn releases and later
# removed; use whichever one the installed version provides.
facet_plot = getattr(sns, "catplot", None) or sns.factorplot
# Year-month counts, one panel per value of the Euthanasia indicator
ax = facet_plot(x="YearMonth", col="Euthanasia", data=df_pre, kind='count' )
ax.set_xticklabels(rotation=90, ha="right")

### 2.9.1 DateTime을 계절로 변환하여 추가

In [None]:
df_pre['Season'] = df_pre['Month'].copy()

In [None]:
df_pre['Season'] = df_pre['Season'].apply(Integerize)

In [None]:
df_pre.tail()

In [None]:
def Seasonerize(x):
    """Map a month number to a season label.

    12, 1, 2 -> 'Winter'; 3-5 -> 'Spring'; 6-8 -> 'Summer'; 9-11 -> 'Automn';
    any other value -> 'Unknown'.
    """
    if x in (12, 1, 2):
        return 'Winter'
    # (exclusive lower bound, exclusive upper bound, label), checked in order.
    # NOTE: 'Automn' (sic) is kept exactly as spelled, since it is a data value.
    season_ranges = ((2, 6, 'Spring'), (5, 9, 'Summer'), (8, 12, 'Automn'))
    for low, high, season in season_ranges:
        if low < x < high:
            return season
    return 'Unknown'

In [None]:
df_pre['Season'] = df_pre['Season'].apply(Seasonerize)
df_pre.tail()

In [None]:
df_pre['Season'].value_counts()

### 2.9.2 DateTime을 분기로 변환하여 추가

In [None]:
df_pre['Quarter'] = df_pre['Month'].copy()

In [None]:
df_pre['Quarter'] = df_pre['Quarter'].apply(Integerize)

In [None]:
def Quarterize(x):
    """Map a month number to a quarter label.

    1-3 -> '1/4'; 4-6 -> '2/4'; 7-9 -> '3/4'; 10-12 -> '4/4';
    any other value -> 'Unknown'.
    """
    # (exclusive lower bound, exclusive upper bound, label), checked in order
    quarter_ranges = (
        (0, 4, '1/4'),
        (3, 7, '2/4'),
        (6, 10, '3/4'),
        (9, 13, '4/4'),
    )
    for low, high, quarter in quarter_ranges:
        if low < x < high:
            return quarter
    return 'Unknown'

In [None]:
df_pre['Quarter'] = df_pre['Quarter'].apply(Quarterize)
df_pre.tail()

In [None]:
df_pre['Quarter'].value_counts()

## - 2.10 성향 그룹 추가
- 동물의 성향이 Adoption에 영향을 미칠 것이다.
- 강아지의 성향 기준은 American Kennel Club(AKC), United Kennel Club (UKC)의 기준을 크롤링해서 dataframe화 했다.

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd


info_url = 'http://www.akc.org/public-education/resources/dog-breeds-sorted-groups/'
# Bound the wait, and fail loudly on an HTTP error instead of parsing an error page
response = requests.get(info_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content.decode('latin1'), 'lxml')

# Text of every <h2> heading on the page, in document order
groups = soup.select('h2')
breed_group = [group.get_text() for group in groups]

# Independent copy of the heading texts: breed_group's entries are replaced
# by breed lists in a later cell, which would also overwrite an alias.
group_element = list(breed_group)
group_element

In [None]:
group_lists = soup.select('ol')

dog_list = list()
# Replace the entry at each position of breed_group with the link texts of
# the <li> items in the <ol> found at the same position on the page
for num, group_list in enumerate(group_lists):
    breed_group[num] = [item.find('a').get_text() for item in group_list.select('li')]

In [None]:
breed_group

- Manchester Terrier가 중복기입 되어있어서 Toy Group에서 제외.

In [None]:
breed_group[2].remove("Manchester Terrier")

<br>- 기준에는 없지만 동물 수가 많아서 분류가 필요한 종들은 구글링을 통해서 추가로 분류해줬다.

In [None]:
breed_group[0].append('Catahoula')
breed_group[0].append('German Shepherd')
breed_group[2].append('Chihuahua Shorthair')
breed_group[3].append('Miniature Poodle')
breed_group[5].append('Jack Russell Terrier')
breed_group[5].append('Pit Bull')

In [None]:
def _breed_group_frame(breeds, group_name):
    """Build a two-column frame: the breed names plus a constant Group label."""
    return pd.DataFrame(breeds, columns=['Breed_pre']).assign(Group=group_name)


# One frame per scraped list, labelled by its position in breed_group
df_herding = _breed_group_frame(breed_group[0], "Herding Group")
df_hound = _breed_group_frame(breed_group[1], "Hound Group")
df_toy = _breed_group_frame(breed_group[2], "Toy Group")
df_non_sporting = _breed_group_frame(breed_group[3], "Non_Sporting Group")
df_sporting = _breed_group_frame(breed_group[4], "Sporting Group")
df_terrier = _breed_group_frame(breed_group[5], "Terrier Group")
df_working = _breed_group_frame(breed_group[6], "Working Group")
df_miscellaneous = _breed_group_frame(breed_group[7], "Miscellaneous Class")

# Four cat breeds, each given its own single-breed group
df_cat1 = pd.DataFrame([['Domestic Shorthair', 'cat1']], columns=['Breed_pre', 'Group'])
df_cat2 = pd.DataFrame([['Domestic Medium Hair', 'cat2']], columns=['Breed_pre', 'Group'])
df_cat3 = pd.DataFrame([['Domestic Longhair', 'cat3']], columns=['Breed_pre', 'Group'])
df_cat4 = pd.DataFrame([['Siamese', 'cat4']], columns=['Breed_pre', 'Group'])

In [None]:
df_group_all = pd.concat([df_herding, df_hound ,df_toy ,df_non_sporting , df_sporting ,df_terrier , df_working , df_miscellaneous , df_cat1 , df_cat2 , df_cat3 , df_cat4], axis=0, ignore_index=True)
df_group_all.tail()

In [None]:
df_pre = pd.merge(df_pre, df_group_all, on = 'Breed_pre', how = 'left')
df_pre.tail()

In [None]:
# Assign the filled column back instead of filling a selected column in place:
# an in-place fill on `df_pre[...]` may act on a temporary copy and leave
# df_pre unchanged (pandas Copy-on-Write behaviour).
df_pre['Group'] = df_pre['Group'].fillna("unknown")

In [None]:
print(df_pre['Group'].value_counts())

In [None]:
len(df_pre)

## - 2.11 크기 그룹 추가
- 동물의 크기또한 결과에 영향을 미칠 것이다.
- 강아지의 크기 기준은 American Kennel Club(AKC) 기준의 품종별 크기 분류(petplace.com 게시 자료)를 크롤링해서 dataframe화 했다.

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np

# Bound the wait, and fail loudly on an HTTP error instead of parsing an error page
res = requests.get('https://www.petplace.com/article/dogs/pet-care/american-kennel-club-akc-breeds-by-size', timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'html.parser')

breeds_size = soup.find_all(title="'Click here for more information. '")
breeds_size_group = soup.find_all('strong')

# Only the first 150 matching elements are used; the size classes below are
# fixed index ranges within them, so a shorter result means the page changed.
if len(breeds_size) < 150:
    raise IndexError('expected at least 150 breed links, found %d' % len(breeds_size))
breeds_size_list = [tag.get_text() for tag in breeds_size[:150]]

# Index ranges of each size class within breeds_size_list (hard-coded).
# Each slice is an independent list, so the edits below do not affect the others.
breeds_size_list_giant = breeds_size_list[0:17]
breeds_size_list_large = breeds_size_list[17:53]
breeds_size_list_medium = breeds_size_list[53:99]
breeds_size_list_small = breeds_size_list[99:138]
breeds_size_list_toy = breeds_size_list[138:150]

# Manchester Terrier is listed in both the small and the toy group by AKC,
# so it is removed from the toy group
breeds_size_list_toy.remove("Manchester Terrier")

# Breeds added by hand to the size lists
breeds_size_list_medium.append('Catahoula')
breeds_size_list_large.append('German Shepherd')
breeds_size_list_small.append('Chihuahua Shorthair')
breeds_size_list_small.append('Miniature Poodle')
breeds_size_list_small.append('Jack Russell Terrier')
breeds_size_list_medium.append('Pit Bull')

'''
print('Giant_group')
print(breeds_size_list_giant)
print('-'*50)
print('Large_group')
print(breeds_size_list_large)
print('-'*50)
print('Medium_group')
print(breeds_size_list_medium)
print('-'*50)
print('Small_group')
print(breeds_size_list_small)
print('-'*50)
print('Toy_group')
print(breeds_size_list_toy)
'''

In [None]:
def _breed_size_frame(breeds, size_name):
    """Build a two-column frame: the breed names plus a constant Size label."""
    return pd.DataFrame(breeds, columns=['Breed_pre']).assign(Size=size_name)


df_giant = _breed_size_frame(breeds_size_list_giant, "Giant")
df_large = _breed_size_frame(breeds_size_list_large, "Large")
df_medium = _breed_size_frame(breeds_size_list_medium, "Medium")
df_small = _breed_size_frame(breeds_size_list_small, "Small")
# Rebinds df_toy, which an earlier cell used for the Toy *Group* frame
df_toy = _breed_size_frame(breeds_size_list_toy, "Toy")

# Four cat breeds, each given its own size label
df_cat5 = pd.DataFrame([['Domestic Shorthair', 'cat1']], columns=['Breed_pre', 'Size'])
df_cat6 = pd.DataFrame([['Domestic Medium Hair', 'cat2']], columns=['Breed_pre', 'Size'])
df_cat7 = pd.DataFrame([['Domestic Longhair', 'cat3']], columns=['Breed_pre', 'Size'])
df_cat8 = pd.DataFrame([['Siamese', 'cat4']], columns=['Breed_pre', 'Size'])

In [None]:
df_size_all = pd.concat([df_giant, df_large, df_medium, df_small, df_toy, df_cat5 , df_cat6 , df_cat7 , df_cat8], axis=0, ignore_index=True)

In [None]:
df_size_all.tail()

In [None]:
df_pre = pd.merge(df_pre, df_size_all, on = 'Breed_pre', how = 'left')
df_pre.tail()

In [None]:
# Assign the filled column back instead of filling a selected column in place:
# an in-place fill on `df_pre[...]` may act on a temporary copy and leave
# df_pre unchanged (pandas Copy-on-Write behaviour).
df_pre['Size'] = df_pre['Size'].fillna("unknown")

In [None]:
print(df_pre['Size'].value_counts())
print(len(df_pre))

# 3. 모델링

In [None]:
df_pre.head(10)

# Label Encoding

In [None]:
encoder = LabelEncoder()

# Each listed column is replaced by its integer label codes, in this order;
# the single encoder is refitted per column and ends up fitted on the last one.
columns_to_encode = [
    'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color', 'Color_pre',
    'Breed_pre', 'Year', 'Month', 'YearMonth', 'Group', 'Size', 'Season',
    'Quarter', 'DateTime',
]
for column in columns_to_encode:
    df_pre[column] = encoder.fit_transform(df_pre[column])

In [None]:
df_pre.tail()

# df_pre - csv 파일로 저장 

In [None]:
# Write the preprocessed frame as comma-separated text, with missing values
# written as the literal 'NaN'.
# NOTE(review): the target is the absolute path '/data/df_pre.csv', while the
# input was read from the relative path 'data/train.csv' — confirm the leading
# slash is intended.
df_pre.to_csv('/data/df_pre.csv',sep=',', na_rep='NaN')