## 1. Imports

In [1]:
import sys, os, re, random
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
import seaborn as sns
from itertools import chain

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

join_path = os.path.join
ls = os.listdir
exists = os.path.exists
bname = os.path.basename
dname = os.path.dirname
find = re.findall
mapFn = lambda x, y: list(map(x, y))

### 1.1 Helper Functions

In [2]:
def has_null_values(df):
    """Checks for missing values

        performs isnull() operation on Pandas DataFrame to check if missing values exist

        Args:
            df (pd.core.frame.DataFrame): input dataframe to check for missing values

        Returns:
            _ (bool): returns True if DataFrame has missing values
    """
    
    if not isinstance(df, pd.core.frame.DataFrame): raise TypeError('Invalid Type : df should be a DataFrame')
    return df.isnull().values.any()

def replace_and_drop_na(df, removal_type='col'):
    """Drops missing values
    
        performs replace inf to nans and then drops all rows or columns with nans in the dataframe
        
        Args:
            df (pd.core.frame.DataFrame): input dataframe to perform add operation on its columns
            removal_type (str): Can be 'row' | 'col'; Column-wise Cleaning or Row-Wise Cleaning
            
        Returns:
            df (pd.core.frame.DataFrame): Updated existing dataframe withour missing values
    """
    
    if not isinstance(df, pd.core.frame.DataFrame): raise TypeError('Invalid Type : df should be a DataFrame')
    
    df = df.replace([np.inf, -np.inf], np.nan)
    
    if removal_type == 'col':
        df = df.dropna(how="any", axis=1)
    elif  removal_type == 'row':
        df = df.dropna(subset=df.columns, how="any")
    else:
        raise ValueError('Invalid Value : removal_type can only be \'row\' or \'col\' ')
    
    if has_null_values(df): 
        raise ValueError('Invalid Value : Dataframe still contains Nan Values')
    else:
        return df

## 2. Define Base Directory and Sub File Paths

In [3]:
base_dataset_dir = '../00_dataset/'
ls(base_dataset_dir)

['breed_labels.csv',
 '.DS_Store',
 'test.csv',
 'color_labels.csv',
 'train.csv',
 'state_labels.csv']

In [4]:
# dataset paths
train_csv, test_csv, breed_labels, color_labels, state_labels = \
['train.csv', 'test.csv', 'breed_labels.csv', 'color_labels.csv', 'state_labels.csv']

In [5]:
[train_csv_path, test_csv_path, breed_labels_path, color_labels_path, state_labels_path] = mapFn(lambda x: join_path(base_dataset_dir, x), 
                                                                                                 [train_csv, test_csv, breed_labels, color_labels, state_labels])
[train_csv_path, test_csv_path, breed_labels_path, color_labels_path, state_labels_path]

['../00_dataset/train.csv',
 '../00_dataset/test.csv',
 '../00_dataset/breed_labels.csv',
 '../00_dataset/color_labels.csv',
 '../00_dataset/state_labels.csv']

In [6]:
["{} exists...".format(path) for path in [train_csv_path, test_csv_path, breed_labels_path, color_labels_path, state_labels_path] if exists(path)]

['../00_dataset/train.csv exists...',
 '../00_dataset/test.csv exists...',
 '../00_dataset/breed_labels.csv exists...',
 '../00_dataset/color_labels.csv exists...',
 '../00_dataset/state_labels.csv exists...']

## 3. Load Data

In [7]:
raw_train_data = pd.read_csv(train_csv_path)
raw_test_data =  pd.read_csv(test_csv_path)

In [8]:
raw_train_data.shape, raw_test_data.shape

((14993, 24), (3972, 23))

In [9]:
raw_train_data.columns

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed'],
      dtype='object')

In [10]:
raw_train_data.dtypes

Type               int64
Name              object
Age                int64
Breed1             int64
Breed2             int64
Gender             int64
Color1             int64
Color2             int64
Color3             int64
MaturitySize       int64
FurLength          int64
Vaccinated         int64
Dewormed           int64
Sterilized         int64
Health             int64
Quantity           int64
Fee                int64
State              int64
RescuerID         object
VideoAmt           int64
Description       object
PetID             object
PhotoAmt         float64
AdoptionSpeed      int64
dtype: object

In [11]:
raw_train_data.columns

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed'],
      dtype='object')

In [12]:
target_features = set(['Type','Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3',
                    'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health','Fee', 'State',
                    'VideoAmt', 'PhotoAmt'])

target_predictor_feature = set(['AdoptionSpeed'])

target_features, target_predictor_feature

({'Age',
  'Breed1',
  'Breed2',
  'Color1',
  'Color2',
  'Color3',
  'Dewormed',
  'Fee',
  'FurLength',
  'Gender',
  'Health',
  'MaturitySize',
  'PhotoAmt',
  'State',
  'Sterilized',
  'Type',
  'Vaccinated',
  'VideoAmt'},
 {'AdoptionSpeed'})

### 3.1 Only Select Relevant Features and Drop rest of the columns

In [13]:
"Total {} number of Features for Training and {} feature to Predict".format(len(target_features), len(target_predictor_feature))

'Total 18 number of Features for Training and 1 feature to Predict'

In [14]:
"Learning to predict {}".format(target_predictor_feature)

"Learning to predict {'AdoptionSpeed'}"

In [15]:
train_data = raw_train_data[target_features.union(target_predictor_feature)]
train_data.shape

(14993, 19)

In [16]:
if not has_null_values(train_data):
    print ("No Missing Values Found.")
else:
    
    train_data = replace_and_drop_na(train_data, removal_type='row')
    
    if not has_null_values(train_data):
        print ("Missing Values Removed.")
    else:
        raise ValueError('\tInvalid Value : Dataframe still contains Nan Values')

No Missing Values Found.


### 3.2 Get Features and Labels 

In [17]:
train_data_X = train_data[target_features]
train_data_y = train_data[target_predictor_feature]

train_data.shape, train_data.shape

((14993, 19), (14993, 19))

In [18]:
X, y = train_data_X, train_data_y
split_ratio = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio, random_state=32)

In [19]:
X.describe()

Unnamed: 0,FurLength,Breed1,Breed2,Color2,Type,Sterilized,Vaccinated,Gender,State,Color3,Color1,Age,Health,MaturitySize,Fee,VideoAmt,Dewormed,PhotoAmt
count,14993.0,14993.0,14993.0,14993.0,14993.0,14993.0,14993.0,14993.0,14993.0,14993.0,14993.0,14993.0,14993.0,14993.0,14993.0,14993.0,14993.0,14993.0
mean,1.467485,265.272594,74.009738,3.222837,1.457614,1.914227,1.731208,1.776162,41346.028347,1.882012,2.234176,10.452078,1.036617,1.862002,21.259988,0.05676,1.558727,3.889215
std,0.59907,60.056818,123.011575,2.742562,0.498217,0.566172,0.667649,0.681592,32.444153,2.984086,1.745225,18.15579,0.199535,0.547959,78.414548,0.346185,0.695817,3.48781
min,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,41324.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
25%,1.0,265.0,0.0,0.0,1.0,2.0,1.0,1.0,41326.0,0.0,1.0,2.0,1.0,2.0,0.0,0.0,1.0,2.0
50%,1.0,266.0,0.0,2.0,1.0,2.0,2.0,2.0,41326.0,0.0,2.0,3.0,1.0,2.0,0.0,0.0,1.0,3.0
75%,2.0,307.0,179.0,6.0,2.0,2.0,2.0,2.0,41401.0,5.0,3.0,12.0,1.0,2.0,0.0,0.0,2.0,5.0
max,3.0,307.0,307.0,7.0,2.0,3.0,3.0,3.0,41415.0,7.0,7.0,255.0,3.0,4.0,3000.0,8.0,3.0,30.0


In [20]:
y.describe()

Unnamed: 0,AdoptionSpeed
count,14993.0
mean,2.516441
std,1.177265
min,0.0
25%,2.0
50%,2.0
75%,4.0
max,4.0


In [21]:
print("Training Rows : {}, Features: {} \
\nTesting Rows : {}, Features: {}".format(X_train.shape[0], X_train.shape[1], X_test.shape[0], X_test.shape[1]))

Training Rows : 11994, Features: 18 
Testing Rows : 2999, Features: 18


In [22]:
# X_train = StandardScaler().fit_transform(X_train)
# X_test = StandardScaler().fit_transform(X_test)

### 3.3 Save Relevent Features to a new file

In [23]:
def saveCSVFile(dframe, filename, basedir):
    df_path = join_path(basedir, filename+".csv")

    dframe.to_csv(df_path, index=False)
    print("Saved at {}".format(df_path))

for data, fname in zip([X_train, X_test, y_train, y_test], ["X_train", "X_test", "y_train", "y_test"]):
    saveCSVFile(data, fname, basedir=base_dataset_dir)

Saved at ../00_dataset/X_train.csv
Saved at ../00_dataset/X_test.csv
Saved at ../00_dataset/y_train.csv
Saved at ../00_dataset/y_test.csv
