## IMPORTS

In [96]:
import pandas as pd
from sodapy import Socrata
import sklearn
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import datetime as dt
import os
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sn
import statsmodels.discrete.discrete_model as sm
pd.set_option('display.max_columns', None)

## EXTRACT

In [2]:
client = Socrata("data.austintexas.gov", None)



In [4]:
#AAC INTAKES DATA
in_result = client.get("wter-evkm", limit=150000)
intakes = pd.DataFrame(in_result)

In [3]:
#AAC OUTCOMES DATA 
out_result = client.get("9t4d-g238", limit=150000)
outcomes = pd.DataFrame(out_result)

In [6]:
#SHOW WHERE THE TWO DFS HAVE COMMON COLUMNS
col_intersect = sorted(list(set(intakes.columns).intersection(set(outcomes.columns))))
col_intersect.remove("datetime")

In [8]:
#MERGE INTAKES AND OUTCOMES TOGETHER ON ANIMAL IDS EXISTING IN BOTH DFS 
ddf = pd.merge(intakes, outcomes, on=col_intersect,suffixes=('_in', '_out') )
ddf = ddf[sorted(df.columns)]

In [64]:
df = ddf.copy()

## FILTER AND SORT

In [65]:
#ENSURE ALL DATETIME COLUMNS ARE PROPERLY FORMATTED
df['datetime_in'] = pd.to_datetime(df['datetime_in'])
df['datetime_out'] = pd.to_datetime(df['datetime_out'])
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])

In [66]:
#ONLY FOCUSING ON DOGS WITH OUTCOMES RELATED TO ADOPTION
df = df[df['animal_type']=='Dog']
df = df.loc[df.outcome_type.isin(['Adoption','Transfer','Rto-Adopt'])]

In [67]:
#NO EUTHANASIA REQUESTS
df = df.loc[df.intake_type!='Euthanasia Request']

In [68]:
#SORT VALUES SO EACH ANIMAL'S ENTRIES ARE TOGETHER
df = df.sort_values(['animal_id','datetime_in'])
df = df.reset_index(drop=True)

In [69]:
#DROP UNUSEFUL OR DUPLICATIVE COLUMNS
df = df.drop(columns=['monthyear','datetime2','found_location','animal_type'])
#AGE COLUMNS ARE NOT RELAIBLE SO REMOVING THEM AND RECALCULATING LATER
df = df.drop(columns=['age_upon_intake','age_upon_outcome'])

In [113]:
print("number of entries: ",len(df))
print("unique animal ids: ",df.animal_id.nunique())
print("number of times more than one entry for a single dog exists: ", len(df)-df.animal_id.nunique())

number of entries:  70036


AttributeError: 'DataFrame' object has no attribute 'animal_id'

## FEATURE ENGINEER

#### TIME BASED FEATURES

In [71]:
#LOOK FOR WHEN DATETIME OUT IS EARLIER THAN DATETIME IN AND SWITCH 
#FIRST MAKE COPIES OF THE DATE COLUMNS
df['dt_in'] = df['datetime_in']
df['dt_out'] = df['datetime_out']
#SWITCH 
df.loc[(df.dt_out < df.dt_in),'datetime_in'] = df.loc[(df.dt_out < df.dt_in),'dt_out']
df.loc[(df.dt_out < df.dt_in),'datetime_out'] = df.loc[(df.dt_out < df.dt_in),'dt_in']
#REMOVE COPY COLUMNS
df = df.drop(columns=['dt_in','dt_out'])

In [72]:
#UPDATE SOME OBVIOUS DATE DISCREPANCIES 
df.loc[df.animal_id.isin(['A728451', 'A728452', 'A728453', 'A728454', 'A728455', 'A728456','A728457','A728458', 'A728459']),'date_of_birth'] = '2016-06-01'
df.loc[df.animal_id.isin(['A680305','A680306','A680307','A680309','A680310','A680311','A680312','A680314','A680315']),'date_of_birth']='2014-05-29'
df.loc[df.animal_id=='A687107','date_of_birth']= '2016-02-12'
df.loc[df.animal_id.isin(['A731576','A731577','A731578','A731579','A731580','A731581','A731582','A731584','A731585']),'date_of_birth']='2016-07-23' 
df.loc[df.animal_id.isin(['A719808', 'A719809']), 'datetime_in'] = '2016-02-09'
df.loc[df.animal_id.isin(['A719808', 'A719809']), 'date_of_birth']= '2016-01-28'
df.loc[df.animal_id.isin(['A741271','A741272','A741274','A741275']),'date_of_birth']= '2016-10-16'
df.loc[df.animal_id.isin(['A741271','A741272','A741274','A741275']),'datetime_in']= '2016-11-30'
df.loc[df.animal_id=='A660928','datetime_in']  = '2014-12-01 13:19:00'

In [73]:
#OTHERWISE, DROP ENTIRES WHERE DATE OF BIRTH IS LESS THAN DATETIME IN 
df = df.loc[(df.date_of_birth < df.datetime_in)]

In [74]:
#CALCULATE AGE AT INTAKE AND OUTCOME
df['age_in'] = round((df['datetime_in'] - df['date_of_birth'])/ np.timedelta64(1, 'Y'),1)
df['age_out'] = round((df['datetime_out'] - df['date_of_birth'])/ np.timedelta64(1, 'Y'),1)

In [75]:
# CREATE A TRUE DURATION IN DAYS FOR IN/OUT 
df['duration_days']= (df.datetime_out - df.datetime_in).dt.days 

In [76]:
#PULL JUST MONTH TO SEE IF THAT HAS ANY FACTOR
df['month_in']= df.datetime_in.dt.month
df['month_out']= df.datetime_out.dt.month

In [78]:
df = df.reset_index(drop=True)

#### BREED FEATURE

In [79]:
#LOWER ALL BREED NAMES
ebreed = df["breed"].str.lower()

In [79]:
#CREATE DICTIONARY OF TERMS THAT NEED TO BE REMOVE OR UPDATED
replace_dict = {"mix":"",
                "hybrid":"",
                "unknown":"",
                "black/tan hound":"hound",
                "brittany":"brittany spaniel",
                "eng ": "english",
                "englishtoy":"english toy"
               }
#REPLACE THOSE TERMS
for k,v in replace_dict.items():
    ebreed = ebreed.str.replace(k,v)

In [79]:
#REMOVE TRAILING WHITESPACE
ebreed = ebreed.str.rstrip()
#SPLIT BREED UP INTO TWO COLUMNS
ebreed = ebreed.str.split("/",n=2,expand=True)
ebreed = ebreed.rename(columns={0:"primary",1:"secondary"})

In [79]:
#CREATE DICTIONARY OF TERMS THAT NEED TO BE UPDATED
#breed correction
b_cor = {"retr ":"retriever",
"span ":"spaniel",
"terr ":"terrier"}

#ADD TRAILING WHITESPACE TO ENSURE THE ABOVE TERM CORRECTION DOESN'T AFFECT OTHER TERMS (I.E. SPANISH)
ebreed.primary = [ebreed.primary[i].ljust(len(ebreed.primary[i])+1) for i in range(len(ebreed.primary))]
ebreed.secondary = ebreed.secondary.fillna("")
ebreed.secondary = pd.DataFrame([ebreed.secondary[i].ljust(len(ebreed.secondary[i])+1) for i in range(len(ebreed.secondary))])[0]

#APPLY TERM CORRECTION
for i in ebreed.columns:
    for k,v in b_cor.items():
        ebreed[i] = ebreed[i].str.replace(k,v)

In [79]:
#REMOVE TRAILING WHITESPACE AGAIN
for i in ebreed.columns:
    ebreed[i] = ebreed[i].str.rstrip()

In [79]:
#COMBINE TOGETHER 
ebreed['nbreed'] =  ebreed['primary']+ " " + ebreed['secondary']
ebreed['nbreed'] = ebreed['nbreed'].str.rstrip()
df['nbreed'] = ebreed['nbreed']

In [79]:
#CREATE DUMMIE DF FOR EACH WORD IN THE BREED COLUMN
breed_dummies = df.nbreed.str.split(n=7,expand=True).stack().str.get_dummies().sum(level=0)
#SHOW COUNTS OF EACH WORD 
b_word_counts = pd.DataFrame(breed_dummies.sum(axis=0)).sort_values([0], ascending=False).T
# breed_set = set([j for i in {i for i in ebreed.nbreed} for j in i.split()])
# breed_str = " ".join(set(" ".join({i for i in ebreed.nbreed}).split()))

  breed_dummies = df.nbreed.str.split(n=7,expand=True).stack().str.get_dummies().sum(level=0)


#### COLOR FEATURE

In [80]:
#CREATE DUMMY DF FOR EACH COLOR 
color_dummies = (df.color.str.split("/",expand=True)[0] +" "+ df.color.str.split("/",expand=True)[1]).str.split(expand=True).fillna("").stack().str.get_dummies().sum(level=0)
color_dummies.columns = color_dummies.columns.str.lower()+"_color"
#SHOW COUNTS FOR EACH COLOR 
color_count = pd.DataFrame(color_dummies.sum(axis=0)).sort_values([0],ascending=False)

  color_dummies = (df.color.str.split("/",expand=True)[0] +" "+ df.color.str.split("/",expand=True)[1]).str.split(expand=True).fillna("").stack().str.get_dummies().sum(level=0)


#### INTAKE TYPE FEATURES

In [81]:
#INTAKE TYPE DUMMIES 
df['stray'] = pd.get_dummies(df['intake_type'])['Stray']
df['surrendered'] = pd.get_dummies(df['intake_type'])['Owner Surrender']
df['public_assisted'] = pd.get_dummies(df['intake_type'])['Public Assist']

#### SEX FEATURE

In [81]:
#PULLING OUT JUST SEX PORTION AS STATUS DOESN'T LIKELY HAVE IMPACT ON ADOPTION
df['sex'] = df.sex_upon_intake.str.split(" ").str[-1]
#DUMMY COLUMN FOR SEX
df['is_male'] = pd.get_dummies(df.sex)['Male']

## DF CLEANUP

In [82]:
#DROP COLUMNS WHICH WILL NOT DIRECTLY CONTRIBUTE TO THE PREDICTION
df = df.drop(columns=['animal_id','age_out','breed', 
                       'color',  'date_of_birth', 'datetime_in', 
                       'datetime_out','sex_upon_intake',
                       'sex_upon_outcome','name','outcome_subtype',
                       'outcome_type','month_out','intake_condition','intake_type','sex','nbreed'])

df = df.reset_index(drop=True)

#### ADD ALL FEATURES TOGETHER

In [114]:
dog = pd.concat([df,breed_dummies,color_dummies],axis=1)

In [116]:
dog

Unnamed: 0,age_in,duration_days,month_in,stray,surrendered,public_assisted,is_male,affenpinscher,afghan,airedale,akbash,akita,alaskan,american,anatol,apso,argentino,australian,basenji,basset,bay,beagle,bearded,beauceron,bedlington,belgian,bernard,bernese,bichon,black,bloodhound,blue,bluetick,boerboel,bordeaux,border,borzoi,boston,boxer,boykin,briard,brindle,brittany,bruss,bull,bulldog,bullmastiff,cairn,canaan,canario,cane,cardigan,carolina,catahoula,cattle,cavalier,chesa,chihuahua,chin,chinese,chow,cirneco,clumber,coat,coated,cocker,collie,coonhound,corgi,corso,coton,crested,cur,dachshund,dalmatian,dandie,dane,de,dinmont,doberman,dog,dogo,dogue,duck,dutch,elkhound,english,entlebucher,eskimo,feist,field,finnish,flat,fox,foxhound,french,frise,german,giant,glen,golden,gordon,grand,great,greater,greyhound,griffon,hair,hairless,harrier,havanese,heeler,highland,hound,hovawart,husky,ibizan,imaal,inu,irish,italian,jack,japanese,jindo,kai,kangal,keeshond,kelpie,klee,kuvasz,labrador,lacy,lakeland,landseer,leonberger,lhasa,longhair,lowchen,malamute,malinois,maltese,manchester,mastiff,mexican,miniature,mountain,mouth,neapolitan,newfoundland,norfolk,norwegian,norwich,nova,of,old,otterhound,papillon,parson,patterdale,pbgv,pekingese,pembroke,pequeno,pharaoh,picardy,pinsch,pinscher,pit,plott,podengo,pointer,pointing,pomeranian,poodle,port,presa,pug,pyrenees,queensland,rat,redbone,retriever,rhod,ridgeback,rottweiler,rough,russell,saluki,samoyed,schipperke,schnauzer,scotia,scottish,sealyham,setter,sharpei,sheepdog,shepherd,shetland,shiba,shih,shorthair,siberian,silky,skye,smooth,soft,spaniel,spanish,spitz,springer,st.,staffordshire,stan,standard,sussex,swedish,swiss,tennesse,terrier,tervuren,tibetan,tolling,toy,treeing,tulear,tzu,vallhund,vendeen,vizsla,walker,water,weimaraner,welsh,west,wheaten,whippet,wire,wirehair,wirehaired,wolfhound,yorkshire,apricot_color,black_color,blue_color,brindle_color,brown_color,buff_color,chocolate_color,cream_color,fawn_color,gold_color,gray_color,liver_color,merle_color,orange_color,pink_color,red_color,ruddy_color,sable_color,silver_color,smoke_color,tan_color,tick_color,tiger_color,tortie_color,tricolor_color,white_color,yellow_color
0,10.0,4,4,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,16.0,49,10,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,17.3,9,9,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,15.3,23,10,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
4,15.0,7,6,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70031,6.0,3,9,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
70032,1.5,3,9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
70033,0.2,2,9,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
70034,8.0,0,9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [119]:
#SAVE DF 
dog.to_csv("aac_dog.csv", index=False)

## MODEL CREATION

In [92]:
X = dog.drop(columns='duration_days')
y = dog['duration_days']

In [97]:
X_cons = sn.add_constant(X)

  x = pd.concat(x[::order], 1)


#### TRAIN TEST SPLIT

In [98]:
#SPLIT DATA INTO TRAIN AND TEST
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, 
                                                    test_size = 0.3, random_state = 100)

In [100]:
#ADD CONSTANT TO GET AN INTERCEPT 
X_train_sm = sn.add_constant(X_train)

In [102]:
#FIT REGRESSION LINE USING `OLS`
lr = sn.OLS(y_train, X_train_sm).fit()

#PEEK THE PARAMETERS
lr.params

const             12.504655
age_in             3.885494
month_in           0.739149
stray             28.018172
surrendered       29.402651
                    ...    
tiger_color      -62.024852
tortie_color       0.000000
tricolor_color    -2.141509
white_color        6.075010
yellow_color     -26.424282
Length: 269, dtype: float64

In [103]:
lr.summary()

0,1,2,3
Dep. Variable:,duration_days,R-squared:,0.043
Model:,OLS,Adj. R-squared:,0.038
Method:,Least Squares,F-statistic:,9.97
Date:,"Mon, 06 Sep 2021",Prob (F-statistic):,1.4e-310
Time:,17:39:18,Log-Likelihood:,-336170.0
No. Observations:,49025,AIC:,672800.0
Df Residuals:,48806,BIC:,674700.0
Df Model:,218,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,12.5047,19.751,0.633,0.527,-26.208,51.218
age_in,3.8855,0.433,8.968,0.000,3.036,4.735
month_in,0.7391,0.303,2.442,0.015,0.146,1.332
stray,28.0182,19.292,1.452,0.146,-9.794,65.831
surrendered,29.4027,19.339,1.520,0.128,-8.503,67.308
public_assisted,166.5042,19.941,8.350,0.000,127.420,205.589
is_male,13.5200,2.110,6.407,0.000,9.384,17.656
affenpinscher,-44.7474,76.935,-0.582,0.561,-195.540,106.045
afghan,1.899e-13,6.99e-13,0.272,0.786,-1.18e-12,1.56e-12

0,1,2,3
Omnibus:,48018.205,Durbin-Watson:,1.988
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2134405.65
Skew:,4.933,Prob(JB):,0.0
Kurtosis:,33.782,Cond. No.,1.12e+16


In [104]:
lm = LinearRegression()

In [105]:
lm.fit(X,y)

LinearRegression()

In [109]:
lm = sn.OLS(X_cons.astype(float)).fit()

AttributeError: 'NoneType' object has no attribute 'shape'

In [111]:
regr = linear_model.LinearRegression()

In [112]:
regr.fit(X,y)

LinearRegression()

In [814]:
#######PREVIOUS BREED
# #PULL JUST THE BREED AND REMOVE "MIX"
# breed = pd.DataFrame(df["breed"].str.replace(" Mix",""))
# #SEPARATE INDIVIDUAL PARTS OF DOG'S BREED
# df['primary_breed'] = breed.breed.str.split("/", n = 2, expand = True)[0].str.rstrip()
# df['secondary_breed'] = breed.breed.str.split("/", n = 2, expand = True)[1].str.rstrip()
# df['tertiary_breed'] = breed.breed.str.split("/", n = 2, expand = True)[2].str.rstrip()

# pb_all = set(dog.primary_breed.values)
# sb_all = set(dog.secondary_breed.values)
# tb_all = set(dog.tertiary_breed.values)

# all_b = pb_all.union(sb_all,tb_all)

# breed_dummies = dog[['primary_breed','secondary_breed', 'tertiary_breed']].stack().str.get_dummies().sum(level=0)

# breed_counts = pd.DataFrame(breed_dummies.sum(axis=0))

# top50 = list(breed_counts.sort_values(0,ascending=False)[:50].index)

# top50_dummies = breed_dummies[top50]
#dog['breed'] = dog.primary_breed+ " "+ dog.secondary_breed.fillna("NAN") + " " + dog.tertiary_breed.fillna("NAN")


# ####PREVIOUS COLOR


########PREVIOUS COLOR
#SPLIT COLOR UP TO INDIVIDUALS
# df['color_1']= df["color"].str.split("/", n = 1, expand = True)[0]
# df['color_2']= df["color"].str.split("/", n = 1, expand = True)[1]
#color_split = (df.color.str.split("/",expand=True)[0] +" "+ df.color.str.split("/",expand=True)[1]).str.split(expand=True)
#df['color_comb'] = df.color.str.split("/",n=1,expand=True)[0] +" "+ df.color.str.split("/",n=1,expand=True)[1]


# #LOOK AT COLORS TO GET IDEA OF WHAT TO SET AS COLORS TO PREDICT ON 
# c1,c2 = set(dog.color_1.values), set(dog.color_2.values)
# all_c = [i for i in list(c1.union(c2)) if i!=None]
# c_all = []
# for i in all_c:
#     if len(i.split()) >1:
#         c_all.append(i.split()[0])
#     else:
#         c_all.append(i)

# dog['color'] = dog['color_1']+" " + dog.color_2.fillna("NAN")
# dog.color = dog.color.str.replace(' NAN', '')

# color_list = [i.title() for i in ['black','blue', 'cream', 'gold', 'gray', 'tan', 'white', 'yellow', 'tricolor', 'sable']]

# color_dummies = pd.DataFrame()
# color_dummies['brown_color'] = (dog.color.str.contains('Brown')==True) | (dog.color.str.contains('Chocolate')==True)
# color_dummies['brown_color'] = color_dummies.brown_color.astype(int)

# for i in color_list:
#     exec(f"color_dummies['{i.lower()}_color'] = dog.color.str.contains(i)==True")
#     exec(f"color_dummies['{i.lower()}_color'] = color_dummies['{i.lower()}_color'].astype(int)")

# dog_pred= pd.concat([dog.drop(columns=['color_1','color_2','color']),top50_dummies,color_dummies],axis=1)
# dog_pred = dog_pred.drop(columns=['primary_breed','secondary_breed','tertiary_breed'])
# dog_pred.columns = dog_pred.columns.str.lower()
# dog_pred.columns= dog_pred.columns.str.replace(" ","_")
# dog_num = dog.drop(columns=['primary_breed','secondary_breed','tertiary_breed','color_1','color_2','color'])