**1. Bring in base datasets and append columns from sentiment JSON**

In [38]:
#load relevant packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import json

#define path where files, folders, and subfolders exist
#NOTE(review): absolute, user-specific path - the notebook only runs on a machine with this exact folder
initPath = "/Users/jaredoeth/Document/petfinder-adoption-prediction"
#make initPath the working directory so the relative file names used below resolve inside it
os.chdir(initPath)

#loading base data sets
#dfTrain / dfTest are module-level frames that the feature functions below read and that get reassigned after each step
dfTrain = pd.read_csv('train.csv')
dfTest = pd.read_csv('test.csv')

#function for bringing in sentiment data
def dfAddSentiment(split):
    """Left-join description sentiment features onto the train or test dataframe.

    Reads every ``.json`` file in ``<initPath>/<split>_sentiment/``. The file
    name without its extension is used as the PetID, and the document
    magnitude, score, language, number of sentences and number of entities
    become the ``Desc_*`` columns.

    Parameters
    ----------
    split : str
        Either 'train' or 'test'. Selects the sentiment folder and the
        module-level dataframe (dfTrain or dfTest) that is joined onto.

    Returns
    -------
    pandas.DataFrame
        The base dataframe with the ``Desc_*`` columns appended and the
        'Description' column dropped. For 'train', 'AdoptionSpeed' is moved to
        the last position. Pets without a sentiment file get nulls.

    Raises
    ------
    ValueError
        If ``split`` is neither 'train' nor 'test'.
    """
    #fail fast: an unknown split used to print a message and then crash later with an unrelated unbound-variable error
    if split not in ('train', 'test'):
        raise ValueError("invalid function parameter: split must be 'train' or 'test', got %r" % (split,))

    newPath = initPath + '/'+split+'_sentiment/'
    tempList = []

    #cycle through JSON files and glean document sentiment and language
    for filename in os.listdir(newPath):
        if not filename.endswith(".json"):
            continue

        #JSON is UTF-8; do not rely on the platform default encoding
        with open(os.path.join(newPath, filename), encoding='utf-8') as f:
            data = json.load(f)

        petID = os.path.splitext(filename)[0]
        sentiment = data['documentSentiment']
        tempList.append([
            petID,
            sentiment['magnitude'],
            sentiment['score'],
            data['language'],
            len(data['sentences']),
            len(data['entities']),
        ])

    labels = ['PetID', 'Desc_Magnitude', 'Desc_Score', 'Desc_Language', 'Desc_numSentences', 'Desc_numEntities']
    df = pd.DataFrame(tempList, columns=labels)

    #left join sentiment to base dataframes
    dfBase = dfTrain if split == 'train' else dfTest
    dfOut = pd.merge(dfBase, df, on='PetID', how='left')

    #move target variable to end in train dataset
    if split == 'train':
        colList = dfOut.columns.tolist()
        colList.remove('AdoptionSpeed')
        colList.append('AdoptionSpeed')
        dfOut = dfOut[colList]

    #drop description text column
    dfOut = dfOut.drop(['Description'], axis=1)

    return dfOut

#dfAddSentiment reads the module-level dfTrain / dfTest, so each call replaces the frame it was built from
#NOTE(review): re-running this cell without reloading the base CSVs will fail, because 'Description' has already been dropped
dfTrain = dfAddSentiment('train')
dfTest = dfAddSentiment('test')

**2. Basic Image Statistics using method from https://www.kaggle.com/kaerunantoka/extract-image-features**

In [39]:
import glob
from PIL import Image as IMG

def getSize(filename):
    """Return the size of the file at ``filename`` in bytes."""
    return os.path.getsize(filename)

def getDimensions(filename):
    """Return the ``(width, height)`` size tuple of the image at ``filename``.

    The image is opened through a context manager so its file handle is
    released immediately. This function is applied to every image file, and
    leaving each one open can exhaust the process's file descriptors.
    """
    with IMG.open(filename) as img:
        return img.size

def getImageStats(split):
    """Left-join per-pet image file-size and dimension statistics onto a base dataframe.

    Every ``.jpg`` in ``<initPath>/<split>_images/`` is measured with getSize
    and getDimensions. The measurements are then aggregated per PetID with
    min, max, mean, median and sum, giving columns named ``<measure>_<agg>``
    (for example ``width_max``).

    Parameters
    ----------
    split : str
        Either 'train' or 'test'. Selects the image folder and the
        module-level dataframe (dfTrain or dfTest) that is joined onto.

    Returns
    -------
    pandas.DataFrame
        The base dataframe with the aggregate columns appended. For 'train',
        'AdoptionSpeed' is moved to the last position. Pets without images
        get nulls.

    Raises
    ------
    ValueError
        If ``split`` is neither 'train' nor 'test'.
    """
    #fail fast before touching the file system: an unknown split used to print a message and crash later on an unbound variable
    if split not in ('train', 'test'):
        raise ValueError("invalid function parameter: split must be 'train' or 'test', got %r" % (split,))

    imagePath = initPath + '/'+split+'_images/'
    image_files = sorted(glob.glob(imagePath+'*.jpg'))

    df_imgs = pd.DataFrame(image_files, columns=['image_filename'])

    #the part of the file name before the first '-' is taken as the PetID
    imgs_pets = df_imgs['image_filename'].apply(lambda x: os.path.basename(x).split('-')[0])
    df_imgs = df_imgs.assign(PetID=imgs_pets)

    df_imgs['image_size'] = df_imgs['image_filename'].apply(getSize)
    dims = df_imgs['image_filename'].apply(getDimensions)
    df_imgs['width'] = dims.apply(lambda x : x[0])
    df_imgs['height'] = dims.apply(lambda x : x[1])

    #aggregate to one row per pet
    stats = ['min', 'max', 'mean', 'median', "sum"]
    aggs = {
        'image_size': stats,
        'width': stats,
        'height': stats,
    }

    agg_imgs = df_imgs.groupby('PetID').agg(aggs)

    #flatten the (measure, aggregation) column pairs into single names such as image_size_min
    agg_imgs.columns = [
        k + '_' + agg for k in aggs.keys() for agg in aggs[k]
    ]
    agg_imgs = agg_imgs.reset_index()

    #left join image statistics to base dataframes
    dfBase = dfTrain if split == 'train' else dfTest
    dfOut = pd.merge(dfBase, agg_imgs, on='PetID', how='left')

    #move target variable to end in train dataset
    if split == 'train':
        colList = dfOut.columns.tolist()
        colList.remove('AdoptionSpeed')
        colList.append('AdoptionSpeed')
        dfOut = dfOut[colList]

    return(dfOut)

#getImageStats reads the module-level dfTrain / dfTest and each call replaces the frame it was joined onto
dfTrain = getImageStats('train')
dfTest = getImageStats('test')

**3. More complex image features using method from https://www.kaggle.com/teemingyi/image-statistics-for-petfinder**

In [40]:
from collections import defaultdict
from scipy.stats import itemfreq
from scipy import ndimage as ndi
import matplotlib.pyplot as plt
from skimage import feature
from PIL import Image as IMG
import operator
import cv2


def color_analysis(img):
    """Return ``(light_percent, dark_percent)`` for an image-like object.

    ``img`` must provide ``getdata()`` yielding one value per pixel. Pixels
    are tallied by colour and only the 1000 most frequent colours are
    considered. A colour counts as dark when each of its first three
    components is <= 20, and as light when each is >= 240. Both percentages
    are relative to the pixels covered by those top colours and are rounded
    to 2 decimals.
    """
    # tally how often each distinct colour occurs
    tally = {}
    for px in img.getdata():
        tally[px] = tally.get(px, 0) + 1

    # most frequent colours first, capped at pixel_limit entries
    pixel_limit = 1000
    ranked = sorted(tally.items(), key=operator.itemgetter(1), reverse = True)
    top_colors = ranked[:pixel_limit]

    shade_count = sum(count for _, count in top_colors)
    ## dull : too much darkness
    dark_shade = sum(count for color, count in top_colors if all(c <= 20 for c in color[:3]))
    ## bright : too much whiteness
    light_shade = sum(count for color, count in top_colors if all(c >= 240 for c in color[:3]))

    light_percent = round((float(light_shade)/shade_count)*100, 2)
    dark_percent = round((float(dark_shade)/shade_count)*100, 2)
    return light_percent, dark_percent

def perform_color_analysis(img):
    """Return ``(dark_percent, light_percent)`` for one image file.

    ``img`` is a file name that is appended to the module-level ``imagePath``.
    The image is split into a top and a bottom half, each half is passed to
    color_analysis, and the two results are averaged. If the analysis of
    either half raises, both values fall back to -1 so the caller can still
    build a complete column.
    """
    path = imagePath + img

    #context manager releases the file handle; this runs once per image and previously leaked a handle each time
    with IMG.open(path) as im: #.convert("RGB")
        # cut the images into two halves as complete average may give bias results
        width, height = im.size
        half_height = height/2
        im1 = im.crop((0, 0, width, half_height))
        im2 = im.crop((0, half_height, width, height))

        #analyse while the source image is still open
        try:
            light_percent1, dark_percent1 = color_analysis(im1)
            light_percent2, dark_percent2 = color_analysis(im2)
        except Exception:
            #deliberate best effort: -1 marks images that could not be analysed
            light_percent1, dark_percent1 = -1, -1
            light_percent2, dark_percent2 = -1, -1

    light_percent = (light_percent1 + light_percent2)/2
    dark_percent = (dark_percent1 + dark_percent2)/2

    return dark_percent, light_percent

def average_pixel_width(img):
    """Return the percentage of pixels that Canny edge detection marks as edges.

    ``img`` is a file name that is appended to the module-level ``imagePath``.
    The image is converted to mode 'L' and passed through ``feature.canny``
    with ``sigma=3``. The number of edge pixels is divided by width * height
    and scaled to 0-100.
    """
    path = imagePath + img

    #context manager releases the file handle; the pixel data is copied into an array before the file is closed
    with IMG.open(path) as im:
        width, height = im.size
        im_array = np.asarray(im.convert(mode='L'))

    edges_sigma1 = feature.canny(im_array, sigma=3)
    apw = (float(np.sum(edges_sigma1)) / (width*height))
    return apw*100

def get_blurrness_score(image):
    """Return the variance of the Laplacian of the grayscale image.

    ``image`` is a file name that is appended to the module-level
    ``imagePath``.

    Raises
    ------
    IOError
        If OpenCV cannot read the file. ``cv2.imread`` signals failure by
        returning None instead of raising, which previously surfaced as an
        opaque error inside ``cv2.cvtColor``.
    """
    path =  imagePath + image
    pixels = cv2.imread(path)
    if pixels is None:
        raise IOError("could not read image: " + path)

    gray = cv2.cvtColor(pixels, cv2.COLOR_BGR2GRAY)
    fm = cv2.Laplacian(gray, cv2.CV_64F).var()
    return fm

def getImageDetail(split):
    """Left-join dullness, whiteness, edge density and blurriness of each pet's first image.

    Only files whose name contains '-1.' in ``<initPath>/<split>_images/`` are
    analysed. The part of the file name before the first '-' is used as the
    PetID.

    Side effect: the module-level ``imagePath`` is set to the folder for
    ``split``. perform_color_analysis, average_pixel_width and
    get_blurrness_score build their file paths from that global, so it has to
    match the folder that was listed here.

    Parameters
    ----------
    split : str
        Either 'train' or 'test'. Selects the image folder and the
        module-level dataframe (dfTrain or dfTest) that is joined onto.

    Returns
    -------
    pandas.DataFrame
        The base dataframe with 'dullness', 'whiteness', 'average_pixel_width'
        and 'blurrness' appended. For 'train', 'AdoptionSpeed' is moved to the
        last position. Pets without a '-1' image get nulls.

    Raises
    ------
    ValueError
        If ``split`` is neither 'train' nor 'test'.
    """
    #a plain local assignment would shadow the global that the helper functions actually read
    global imagePath

    #fail fast: an unknown split used to print a message and crash later on an unbound variable
    if split not in ('train', 'test'):
        raise ValueError("invalid function parameter: split must be 'train' or 'test', got %r" % (split,))

    imagePath = initPath + '/'+split+'_images/'

    imgs = os.listdir(imagePath)
    features = pd.DataFrame(data=imgs, columns=['image'])

    #keep only display image or the one ending in "-1"
    #.copy() so the column assignments below write to an independent frame rather than a slice
    features = features.loc[['-1.' in x for x in features.image]].copy()

    features['dullness_whiteness'] = features['image'].apply(perform_color_analysis)
    features['dullness'] = features.dullness_whiteness.map(lambda x: x[0])
    features['whiteness'] = features.dullness_whiteness.map(lambda x: x[1])
    features['average_pixel_width'] = features['image'].apply(average_pixel_width)
    features['blurrness'] = features['image'].apply(get_blurrness_score)

    #create PetID from image name and drop image name and dullness_whiteness
    imgs_pets2 = features['image'].apply(lambda x: x.split('/')[-1].split('-')[0]) #PetID
    features.insert(0,'PetID',imgs_pets2)
    features = features.drop(['image','dullness_whiteness'], axis=1)

    #left join image detail features to base dataframes
    dfBase = dfTrain if split == 'train' else dfTest
    dfOut = pd.merge(dfBase, features, on='PetID', how='left')

    #move target variable to end in train dataset
    if split == 'train':
        colList = dfOut.columns.tolist()
        colList.remove('AdoptionSpeed')
        colList.append('AdoptionSpeed')
        dfOut = dfOut[colList]

    return(dfOut)

#perform_color_analysis, average_pixel_width and get_blurrness_score read this module-level imagePath,
#so it is pointed at the matching image folder before each getImageDetail call
imagePath = initPath + '/train_images/'
dfTrain = getImageDetail('train')

imagePath = initPath + '/test_images/'
dfTest = getImageDetail('test')

**Optional. Output to csv for review and copy dataframes**

In [41]:
#write the enriched frames to disk for manual review (tab-separated despite the .csv extension)
dfTrain.to_csv('dfTrain.csv', sep='\t', encoding='utf-8', index=False, header=True)
dfTest.to_csv('dfTest.csv', sep='\t', encoding='utf-8', index=False, header=True)

#copy data frames in case cleaning goes wrong
#.copy() is required: plain assignment only binds a second name to the same object,
#so any later in-place change to dfTrain / dfTest would also change the "backup"
dfTrainBackup = dfTrain.copy()
dfTestBackup = dfTest.copy()

In [42]:
#reset data frames to backup if cleaning goes wrong
#restore from copies so that in-place edits to the working frames cannot reach the backups
dfTrain = dfTrainBackup.copy()
dfTest = dfTestBackup.copy()

**4a. Check where nulls exist for clean-up**

In [43]:
#number of missing values per column in the train frame
print(dfTrain.isna().sum())

Type                      0
Name                   1257
Age                       0
Breed1                    0
Breed2                    0
Gender                    0
Color1                    0
Color2                    0
Color3                    0
MaturitySize              0
FurLength                 0
Vaccinated                0
Dewormed                  0
Sterilized                0
Health                    0
Quantity                  0
Fee                       0
State                     0
RescuerID                 0
VideoAmt                  0
PetID                     0
PhotoAmt                  0
Desc_Magnitude          551
Desc_Score              551
Desc_Language           551
Desc_numSentences       551
Desc_numEntities        551
image_size_min          341
image_size_max          341
image_size_mean         341
image_size_median       341
image_size_sum          341
width_min               341
width_max               341
width_mean              341
width_median        

**4b. Check column cardinality for categorical columns (before we do one hot encoding)**

In [44]:
#categorical columns whose cardinality is inspected before encoding
colList = ['Type', 'Name', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'State', 'RescuerID', 'Desc_Language']

#one [column, number of distinct values] pair per column
#dropna=False counts a missing value as its own category, matching len(Series.unique())
#this replaces one hand-written variable per column plus an eval() lookup on the variable name
data = [[col, dfTrain[col].nunique(dropna=False)] for col in colList]

dfUniques = pd.DataFrame(data, columns=['column','cardinality'])

print(dfUniques)


           column  cardinality
0            Type            2
1            Name         9061
2          Breed1          176
3          Breed2          135
4          Gender            3
5          Color1            7
6          Color2            7
7          Color3            6
8    MaturitySize            4
9       FurLength            3
10     Vaccinated            3
11       Dewormed            3
12     Sterilized            3
13         Health            3
14          State           14
15      RescuerID         5595
16  Desc_Language            5


**5. Preprocess data - impute values, encode categorical variables (label, binary, and one hot)**

In [45]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import category_encoders as ce
      
def imputeMean(df,col):
    """Replace nulls in ``df[col]`` with the column mean and return ``df``.

    The filled column is assigned back explicitly. Calling
    ``fillna(..., inplace=True)`` on ``df[col]`` acts on a temporary object,
    and under pandas Copy-on-Write it leaves ``df`` unchanged.
    """
    df[col] = df[col].fillna(df[col].mean())
    return df

def imputeMode(df,col):
    """Replace nulls in ``df[col]`` with the most frequent value and return ``df``.

    When several values tie, the first one returned by ``Series.mode`` is
    used. A column that holds only nulls has no mode and is left unchanged
    instead of raising. The filled column is assigned back explicitly, because
    ``fillna(..., inplace=True)`` on ``df[col]`` does not update ``df`` under
    pandas Copy-on-Write.
    """
    modes = df[col].mode()
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])
    return df

def imputeBlank(df,col):
    """Replace nulls in ``df[col]`` with an empty string and return ``df``."""
    filled = df[col].fillna('')
    df[col] = filled
    return df

def imputeNegOne(df,col):
    """Replace nulls in ``df[col]`` with -1 and return ``df``."""
    filled = df[col].fillna(-1)
    df[col] = filled
    return df

def labelEncode(df,col):
    """Overwrite ``df[col]`` with the codes from a freshly fitted LabelEncoder and return ``df``."""
    codes = LabelEncoder().fit_transform(df[col])
    df[col] = codes
    return df

def oneHotEncodeMult(df,cols):
    """Return a copy of ``df`` with every column in ``cols`` one-hot encoded.

    Each listed column is replaced by one indicator column per distinct value,
    prefixed with the column name. The first level of every column is dropped.
    """
    return pd.get_dummies(df, columns=cols, prefix=cols, drop_first=True)

def binaryEncodeMult(df,cols):
    """Return the result of fitting a category_encoders BinaryEncoder on ``cols`` and transforming ``df``."""
    return ce.BinaryEncoder(cols=cols).fit_transform(df)

def hasName(df):
    """Replace ``df['Name']`` with a 1/0 flag for "has a real name" and return ``df``.

    A name is treated as a placeholder (flag 0) when it is missing or when its
    lower-cased text contains 'not named', 'no name', 'unknown', 'dog', 'cat',
    'pup', 'kitt' or any digit 1-9. Every other name gets 1.
    """
    #lower-case, then replace nulls with "unknown"
    #the result is kept in a local Series: fillna(inplace=True) on df['Name'] does not update df under pandas Copy-on-Write
    names = df['Name'].str.lower().fillna('unknown')

    #when any of the following criteria is met show 0 else 1. goal is to represent pets that have actual name with 1s
    isPlaceholder = names.str.contains('not named|no name|unknown|dog|cat|pup|kitt|1|2|3|4|5|6|7|8|9', regex=True)
    df['Name'] = (~isPlaceholder)*1

    return df

def preprocess(df):
    """Drop identifier columns, impute nulls and encode the categorical columns.

    Steps, in order:
      1. drop PetID, RescuerID and the three image ``*_sum`` columns
      2. mean-impute the six health/appearance code columns
      3. fill nulls in the sentiment and image feature columns with -1
      4. fill nulls in Desc_Language with '' and turn Name into a 1/0 flag
      5. label-encode Desc_Language, one-hot encode the low-cardinality
         columns and binary-encode Breed2 / Breed1

    Returns the transformed dataframe.
    """
    #drop pet id, rescuer id, and photo dimension sums columns
    dropCols = ['PetID','width_sum','height_sum','image_size_sum','RescuerID']
    df = df.drop(dropCols, axis=1)

    #impute nulls with mean
    for col in ['Health','Sterilized','Dewormed','Vaccinated','FurLength','MaturitySize']:
        df = imputeMean(df,col)

    #impute nulls to -1 with goal to distinguish that these records do not have data
    negOneCols = [
        #description sentiment features
        'Desc_Magnitude','Desc_Score','Desc_numSentences','Desc_numEntities',
        #image features
        'image_size_min','image_size_max','image_size_mean','image_size_median',
        'width_min','width_max','width_mean','width_median',
        'height_min','height_max','height_mean','height_median',
        'dullness','whiteness','average_pixel_width','blurrness',
    ]
    for col in negOneCols:
        df = imputeNegOne(df,col)

    #impute nulls to blank category, will be label encoded shortly
    df = imputeBlank(df,'Desc_Language')

    #clean name column to 1's and 0's showing which records have pet name
    df = hasName(df)

    #NOTE(review): Desc_Language is label encoded here and one hot encoded right after - confirm the label step is still needed
    df = labelEncode(df,'Desc_Language')

    df = oneHotEncodeMult(df,['Desc_Language','State','Color3','Color2','Color1','Gender','Type'])

    #these columns have high cardinality, ranging from 135-9061 in our train dataset and would be detrimental to one hot encode because of the curse of dimensionality
    #use binary encoding instead so we do not end up with 1000s of dimension columns
    return binaryEncodeMult(df,['Breed2','Breed1'])

#reset dataframes to "backup" before running each time. this can be removed once cell is finalized
#.copy() is required: plain assignment would alias the backups, and the 'train' / 'AdoptionSpeed'
#columns added below would then be written into the backups as well
dfTrain = dfTrainBackup.copy()
dfTest = dfTestBackup.copy()

#combine into one dataset for pre-processing with indicator column
dfTrain['train']=1
dfTest['train']=0
#test rows have no target; add a null placeholder so both frames share the same columns
dfTest['AdoptionSpeed'] = np.nan
dfCombined = pd.concat([dfTrain, dfTest], sort=True)

#preprocess dataset together so that columns are aligned
dfCombined = preprocess(dfCombined)

#separate back into two dataframes
dfTrain = dfCombined[dfCombined.train == 1]
dfTest = dfCombined[dfCombined.train == 0]

#drop indicator column
dfTrain = dfTrain.drop('train', axis = 1)
dfTest = dfTest.drop('train', axis = 1)

#show resulting shape
print("dfTrain shape: " + str(dfTrain.shape))
print("dfTest shape: " + str(dfTest.shape))

#persist the preprocessed frames (tab-separated) so modelling can start from these files
dfTrain.to_csv('dfTrainPP.csv', sep='\t', encoding='utf-8', index=False, header=True)
dfTest.to_csv('dfTestPP.csv', sep='\t', encoding='utf-8', index=False, header=True)

dfTrain shape: (14993, 88)
dfTest shape: (3948, 88)


**6. Begin building models using dfTrain dataset**

In [46]:
#load relevant packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

#define path where files, folders, and subfolders exist
#NOTE(review): same absolute, user-specific path as in the first cell - repeated so this cell can run on a fresh kernel
initPath = "/Users/jaredoeth/Document/petfinder-adoption-prediction"
os.chdir(initPath)

#pull files back from CSV so all of the building/preprocessing doesnt need to be re-run
#the files are tab-separated, matching how they were written
dfTrain = pd.read_csv('dfTrainPP.csv', sep='\t', encoding='utf-8')
dfTest = pd.read_csv('dfTestPP.csv', sep='\t', encoding='utf-8')

In [47]:
#target stays a one-column DataFrame; every other column is a feature
y = dfTrain[['AdoptionSpeed']]
X = dfTrain.drop('AdoptionSpeed', axis = 1)

#convert all feature columns to float
Xcols = X.columns.tolist()
X = X.astype(float)

#split train data into train/test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#print shape of train/test variables
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# Normalizing the features
#the scaler is fitted on the training part only and then reused for the hold-out part
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

#wrap the scaled arrays in dataframes for inspection (columns are positional integers)
dfXtrain = pd.DataFrame(X_train)
dfXtest = pd.DataFrame(X_test)

dfXtrain.head()

(11994, 87) (11994, 1) (2999, 87) (2999, 1)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,77,78,79,80,81,82,83,84,85,86
0,0.0,-0.025835,-0.111008,-0.185262,-0.206163,-0.329709,2.151359,2.79164,0.509129,0.0,...,-0.547485,-0.577029,-0.261376,-0.212885,3.980641,-0.216497,-0.216286,-0.975618,-0.412765,1.091552
1,0.0,-0.025835,-0.111008,5.397748,-0.206163,-0.329709,-0.464823,2.79164,-1.964138,0.0,...,-0.547485,-0.577029,3.825905,-0.212885,-0.251216,-0.216497,-0.216286,-0.975618,-0.412765,-0.916127
2,0.0,-0.025835,-0.111008,-0.185262,-0.206163,-0.329709,-0.464823,-0.358212,0.509129,0.0,...,-0.547485,-0.577029,-0.261376,-0.212885,-0.251216,-0.216497,-0.216286,-0.975618,-0.412765,-0.916127
3,0.0,-0.025835,-0.111008,-0.185262,-0.206163,-0.329709,-0.464823,-0.358212,0.509129,0.0,...,1.826534,-0.577029,3.825905,-0.212885,-0.251216,-0.216497,-0.216286,1.024991,-0.412765,1.091552
4,0.0,-0.025835,-0.111008,-0.185262,-0.206163,-0.329709,-0.464823,-0.358212,0.509129,0.0,...,1.826534,1.733014,-0.261376,-0.212885,-0.251216,-0.216497,-0.216286,-0.975618,-0.412765,-0.916127
