Exploring the text data

In [247]:
## Imports 

import pandas as pd
import numpy as np

In [248]:
# reading in the dataset 
df = pd.read_csv(filepath_or_buffer='data/train.csv.zip')
df.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,2,Nibble,3,299,0,1,1,7,0,1,...,1,1,100,41326,8480853f516546f6cf33aa88cd76c379,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,1,1,0,41401,3082c7125d8fb66f7dd4bff4192c8b14,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0
2,1,Brisco,1,307,0,1,2,7,0,2,...,1,1,0,41326,fa90fa5b1ee11c86938398b60abc32cb,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3
3,1,Miko,4,307,0,2,1,2,0,2,...,1,1,150,41401,9238e4f44c71a75282e62f7136c6b240,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2
4,1,Hunter,1,307,0,1,1,0,0,2,...,1,1,0,41326,95481e953f8aed9ec3d16fc4509537e8,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2


In [249]:
# looking at the name column
# no of unique names
print('Number of unique names: ', df['Name'].nunique())

Number of unique names:  9060


In [250]:
# Looking at the count of names 

df['Name'].value_counts()[:20]

Baby       66
Lucky      64
No Name    54
Brownie    54
Mimi       52
Blackie    49
Puppy      45
Max        39
Kitty      39
Kittens    39
Oreo       36
Coco       35
Tiger      34
Angel      33
Milo       32
Snowy      30
Bobby      30
Lucy       29
Kiki       29
Lily       28
Name: Name, dtype: int64

In [251]:
# function that changes column with name to 1, 0 otherwise

def name_to_num(name):
    """Adds 1 to field where there is name 0 otherwise"""
    list_no_name = ['No Name', 'No Name Yet', np.nan]
    if name in list_no_name:
        return 0
    else:
        return 1

In [252]:
## Using lambda function which is equivalent to above ```name_to_num`` function defined above

# create a list of items that are not names
list_no_name = ['No Name', 'No Name Yet', np.nan]

df['has_name'] = df['Name'].apply(lambda x: 0 if x in list_no_name else 1)

In [253]:
df['Name'][:10]

0                     Nibble
1                No Name Yet
2                     Brisco
3                       Miko
4                     Hunter
5                        NaN
6                      BULAT
7    Siu Pak & Her 6 Puppies
8                        NaN
9                      Kitty
Name: Name, dtype: object

In [254]:
df['has_name'][:10]

0    1
1    0
2    1
3    1
4    1
5    0
6    1
7    1
8    0
9    1
Name: has_name, dtype: int64

In [255]:
df.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed,has_name
0,2,Nibble,3,299,0,1,1,7,0,1,...,1,100,41326,8480853f516546f6cf33aa88cd76c379,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2,1
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,1,0,41401,3082c7125d8fb66f7dd4bff4192c8b14,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0,0
2,1,Brisco,1,307,0,1,2,7,0,2,...,1,0,41326,fa90fa5b1ee11c86938398b60abc32cb,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3,1
3,1,Miko,4,307,0,2,1,2,0,2,...,1,150,41401,9238e4f44c71a75282e62f7136c6b240,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2,1
4,1,Hunter,1,307,0,1,1,0,0,2,...,1,0,41326,95481e953f8aed9ec3d16fc4509537e8,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2,1


In [256]:
# processing the Description field

# number of empty / nan values
print(df['Description'].isna().sum())

# fill in the nans
df['Description'].fillna(value='unknown', inplace=True)

# after filling the nans
print(df['Description'].isna().sum())


12
0


In [257]:
## using tf-idf 

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_feature = tfidf.fit_transform(df['Description'])

In [258]:
# shape and type of matrix returned after tf-idf

print(tfidf_feature.shape)
print(type(tfidf_feature))

(14993, 21209)
<class 'scipy.sparse.csr.csr_matrix'>


In [259]:
# Using SVD to reduce the dimension of the matrix
from sklearn.decomposition import TruncatedSVD

In [260]:
# initializing the Singular Value Decomposition (SVD)
svd = TruncatedSVD(n_components=10,)
# fit the svd (from 21209 features to 100)
tfidf_transformed = svd.fit_transform(tfidf_feature)

In [261]:
# shape of the reduced matrix

tfidf_transformed.shape

(14993, 10)

In [262]:
## convert the reduced representation of the text to pandas dataframe 
# and concat with the origin dataframe

# creating the dataframe 
pd_tfidf_feature = pd.DataFrame(tfidf_transformed,)

# print a sample
pd_tfidf_feature.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.296693,-0.023356,-0.02497,0.083754,-0.042916,-0.042829,-0.057302,-0.074569,0.010798,-0.044249
1,0.164431,-0.044863,0.009544,0.019379,-0.034178,-0.11329,-0.046621,-0.101494,0.151155,-0.012257
2,0.353918,0.004322,0.164858,-0.02922,-0.075341,-0.03859,0.025263,0.016447,-0.098587,-0.058509
3,0.196249,0.077626,0.028074,-0.065923,0.060653,0.040699,0.117537,0.129143,0.039155,0.184599
4,0.382785,0.033763,-0.002267,0.243186,0.156808,-0.070848,-0.021812,-0.010267,-0.003879,0.028778


In [263]:

# concat the original dataframe and new tfidf data frame
df_merged = pd.concat((df, pd_tfidf_feature), axis=1)

# print a sample 
df_merged.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,0,1,2,3,4,5,6,7,8,9
0,2,Nibble,3,299,0,1,1,7,0,1,...,0.296693,-0.023356,-0.02497,0.083754,-0.042916,-0.042829,-0.057302,-0.074569,0.010798,-0.044249
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,0.164431,-0.044863,0.009544,0.019379,-0.034178,-0.11329,-0.046621,-0.101494,0.151155,-0.012257
2,1,Brisco,1,307,0,1,2,7,0,2,...,0.353918,0.004322,0.164858,-0.02922,-0.075341,-0.03859,0.025263,0.016447,-0.098587,-0.058509
3,1,Miko,4,307,0,2,1,2,0,2,...,0.196249,0.077626,0.028074,-0.065923,0.060653,0.040699,0.117537,0.129143,0.039155,0.184599
4,1,Hunter,1,307,0,1,1,0,0,2,...,0.382785,0.033763,-0.002267,0.243186,0.156808,-0.070848,-0.021812,-0.010267,-0.003879,0.028778


In [264]:
## making a copy of the dataframe 

df_merged_copy = df_merged.copy()

In [265]:
# original columns
df.columns

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed',
       'has_name'],
      dtype='object')

In [266]:
# preprocessing the merged dataframe

# labels
y_train = df_merged_copy['AdoptionSpeed']

# drop columns
df_merged_copy.drop(labels=['Name', 'RescuerID', 'Description', 'PetID', 'AdoptionSpeed'], axis=1, inplace=True)

In [267]:
# checking  the columns after dropped
" ".join([column for column in df_merged_copy.columns[:20]])

'Type Age Breed1 Breed2 Gender Color1 Color2 Color3 MaturitySize FurLength Vaccinated Dewormed Sterilized Health Quantity Fee State VideoAmt PhotoAmt has_name'

In [268]:
# nan values 
df_merged_copy.isna().sum().sum()

0

In [53]:
#  Splitting the dataset into train and valid set
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(df_merged_copy, y_train, test_size=.1,)

print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

(13493, 30) (13493,) (1500, 30) (1500,)


In [25]:
X_train.head()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,...,0,1,2,3,4,5,6,7,8,9
11149,2,2,264,0,1,6,7,0,1,3,...,0.295768,0.091698,0.053648,0.04141,-0.087898,-0.069476,-0.090899,0.100437,0.009199,-0.031519
14084,1,6,307,307,2,1,2,0,3,1,...,0.179218,0.069842,0.039715,0.021224,-0.068036,-0.024527,0.005424,0.098092,-0.010082,0.017998
9893,1,2,78,307,2,1,2,0,2,2,...,0.281837,0.10063,0.591992,-0.201477,0.323178,-0.038805,4.1e-05,-0.130282,-0.089962,-0.106651
1755,1,6,307,0,2,2,0,0,2,1,...,0.372696,-0.053865,0.074256,0.035328,-0.092539,-0.119329,-0.000666,-0.018333,0.06497,0.019881
11221,2,48,264,0,3,1,7,0,3,3,...,0.502665,0.027807,-0.150795,-0.213498,-0.1396,0.014983,-0.163924,-0.049051,-0.016582,-0.023489


In [26]:
X_valid.head()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,...,0,1,2,3,4,5,6,7,8,9
14348,1,12,307,0,2,2,5,7,2,1,...,0.10006,-0.025903,-0.013654,-0.031693,0.030195,-0.021928,-0.007572,0.019118,-0.025521,-0.028654
13282,1,1,307,0,1,1,7,0,1,1,...,0.211085,-0.038232,-0.019594,0.148007,0.073709,-0.129218,-0.030921,-0.018384,0.019706,0.041353
11748,1,2,307,0,1,2,3,0,1,2,...,0.208321,-0.057273,-0.040036,0.083704,-0.034966,0.00857,0.015686,-0.064222,0.112667,-0.035133
8334,2,2,299,292,3,1,7,0,1,1,...,0.244213,-0.023244,0.05582,0.066195,-0.169158,0.024853,-0.117421,0.041996,0.120526,-0.159204
4108,2,4,292,0,2,7,0,0,1,1,...,0.271581,-0.054324,0.010841,-0.070391,0.072502,0.165806,-0.019613,0.048718,0.063998,0.0535


In [27]:
# selecting the categorical features 

cat_cols = np.array(['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
            'Color3', 'State', 'has_name'])

In [28]:
# # Using column transform to onehot encode multiple columns 

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# initializing the column transform object

ct = ColumnTransformer(
        [('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)], remainder='passthrough')

# calling the transform 

X_train = ct.fit_transform(X_train,)
X_valid = ct.transform(X_valid)

In [377]:
X_train.shape, X_valid.shape

((13493, 367), (1500, 367))

In [378]:
## Let's train gradient boosted model and see if 
## the extra work helped 

from sklearn.ensemble import GradientBoostingClassifier

# initialize the model
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)

GradientBoostingClassifier()

In [379]:
from sklearn.metrics import cohen_kappa_score
# evaluating gradient boosting model

gb_predictions_valid = gb_model.predict(X_valid)
cohen_kappa_gb = cohen_kappa_score(gb_predictions_valid, y_valid, weights='quadratic')

print(f"For gradient boosted model cohen kappa: {cohen_kappa_gb}")

For gradient boosted model cohen kappa: 0.36798904289283063


It did not help! Infact got a bit worse :( 

In [382]:
## Let's try random forest
from sklearn.ensemble import RandomForestClassifier

# initialize the model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

RandomForestClassifier()

In [383]:
# evaluate the model 
# The performance got a bit better for random forest.

rf_predictions_valid = rf_model.predict(X_valid)
cohen_kappa_gb = cohen_kappa_score(rf_predictions_valid, y_valid, weights='quadratic')

print(f"For Random Forest model cohen kappa: {cohen_kappa_gb}")

For Random Forest model cohen kappa: 0.37850691052044183


### Using Embedding layer for categorical variable

In [173]:
import torch
import torch.nn as nn

In [169]:
## working with the dataframe that has been tfidf processed (df_merged)
df_merged.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,0,1,2,3,4,5,6,7,8,9
0,2,Nibble,3,299,0,1,1,7,0,1,...,0.296693,-0.023353,-0.025036,0.083695,-0.042744,-0.042249,-0.056209,-0.06781,0.06006,-0.033553
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,0.164431,-0.044861,0.009412,0.019295,-0.034182,-0.112116,-0.039178,-0.038505,0.190779,0.010158
2,1,Brisco,1,307,0,1,2,7,0,2,...,0.353918,0.004326,0.164793,-0.029221,-0.075373,-0.039862,0.034072,-0.02247,-0.099708,-0.075114
3,1,Miko,4,307,0,2,1,2,0,2,...,0.196249,0.077629,0.028146,-0.065843,0.060371,0.040468,0.116648,0.14167,-0.034158,0.163069
4,1,Hunter,1,307,0,1,1,0,0,2,...,0.382785,0.033777,-0.00239,0.243384,0.15736,-0.071793,-0.026796,-0.019213,0.015855,0.044306


In [269]:
## Let's make a copy 
df_embedding = df_merged_copy.copy()
df_embedding.head()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,...,0,1,2,3,4,5,6,7,8,9
0,2,3,299,0,1,1,7,0,1,1,...,0.296693,-0.023356,-0.02497,0.083754,-0.042916,-0.042829,-0.057302,-0.074569,0.010798,-0.044249
1,2,1,265,0,1,1,2,0,2,2,...,0.164431,-0.044863,0.009544,0.019379,-0.034178,-0.11329,-0.046621,-0.101494,0.151155,-0.012257
2,1,1,307,0,1,2,7,0,2,2,...,0.353918,0.004322,0.164858,-0.02922,-0.075341,-0.03859,0.025263,0.016447,-0.098587,-0.058509
3,1,4,307,0,2,1,2,0,2,1,...,0.196249,0.077626,0.028074,-0.065923,0.060653,0.040699,0.117537,0.129143,0.039155,0.184599
4,1,1,307,0,1,1,0,0,2,1,...,0.382785,0.033763,-0.002267,0.243186,0.156808,-0.070848,-0.021812,-0.010267,-0.003879,0.028778


In [270]:
# categorical columns
cat_cols = np.array(['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
            'Color3', 'State', 'has_name'])

In [271]:
## creating a neural network that taken in a column and returns embeddings for a specified dimension
   
def embedding_layer(emb_dim, input):
    """Returns a embedding layer for specified dimensions"""
    return torch.nn.Embedding(*emb_dim)(input).detach().numpy()


In [272]:
# determining the size of the embedding dimensions

embedding_sizes = [(df_embedding[cols].max() + 1, min(50, (df_embedding[cols].nunique()+1)//2)) for cols in cat_cols]
embedding_sizes

[(3, 1),
 (308, 50),
 (308, 50),
 (4, 2),
 (8, 4),
 (8, 4),
 (8, 3),
 (41416, 7),
 (2, 1)]

In [273]:
# create a list of embeddings for the categorical columns in the dataframe
list_of_embeddings = [pd.DataFrame(embedding_layer(dimensions, (torch.tensor(df_embedding[cols])))) for cols, dimensions in zip(cat_cols, embedding_sizes)]

In [274]:
# sanity check to look at the dimensions 
[item.shape for item in list_of_embeddings]

[(14993, 1),
 (14993, 50),
 (14993, 50),
 (14993, 2),
 (14993, 4),
 (14993, 4),
 (14993, 3),
 (14993, 7),
 (14993, 1)]

In [275]:
## add the categorical embeddings 

# temporary dataframe to hold the dataframe of embeddings
temp_dataframe = pd.concat(list_of_embeddings, axis=1)

# check the shape 
print(temp_dataframe.shape)

# print shape before merge
print("Before merge", df_embedding.shape)

# merge dataframes 
df_embedding = pd.concat((df_embedding, temp_dataframe), axis=1)

# print shape after merge
print("After merge", df_embedding.shape)

(14993, 122)
Before merge (14993, 30)
After merge (14993, 152)


In [276]:
## looking at a sample
df_embedding.head()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,...,1,2,0,1.1,2.1,3,4,5,6,0.1
0,2,3,299,0,1,1,7,0,1,1,...,0.316677,0.029539,-0.664638,-0.528773,0.155386,-0.27624,-0.156129,-1.624417,1.499093,-1.723887
1,2,1,265,0,1,1,2,0,2,2,...,0.316677,0.029539,1.697833,-0.124666,-0.604973,-0.269865,1.604037,0.112886,0.60116,-0.055802
2,1,1,307,0,1,2,7,0,2,2,...,0.316677,0.029539,-0.664638,-0.528773,0.155386,-0.27624,-0.156129,-1.624417,1.499093,-1.723887
3,1,4,307,0,2,1,2,0,2,1,...,0.316677,0.029539,1.697833,-0.124666,-0.604973,-0.269865,1.604037,0.112886,0.60116,-1.723887
4,1,1,307,0,1,1,0,0,2,1,...,0.316677,0.029539,-0.664638,-0.528773,0.155386,-0.27624,-0.156129,-1.624417,1.499093,-1.723887


In [277]:
# columns before dropping the categorical columns 
print("columns before dropping cat cols", df_embedding.columns[:35])

## drop the categorical columns that have been transformed
[df_embedding.drop(columns=[cat_col], inplace=True) for cat_col in cat_cols]

# columns after the drop
print("Columns after dropping cat cols", df_embedding.columns[:35])

columns before dropping cat cols Index([        'Type',          'Age',       'Breed1',       'Breed2',
             'Gender',       'Color1',       'Color2',       'Color3',
       'MaturitySize',    'FurLength',   'Vaccinated',     'Dewormed',
         'Sterilized',       'Health',     'Quantity',          'Fee',
              'State',     'VideoAmt',     'PhotoAmt',     'has_name',
                    0,              1,              2,              3,
                    4,              5,              6,              7,
                    8,              9,              0,              0,
                    1,              2,              3],
      dtype='object')
Columns after dropping cat cols Index([         'Age', 'MaturitySize',    'FurLength',   'Vaccinated',
           'Dewormed',   'Sterilized',       'Health',     'Quantity',
                'Fee',     'VideoAmt',     'PhotoAmt',              0,
                    1,              2,              3,              4,
     

In [278]:
df_embedding.head()

Unnamed: 0,Age,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,VideoAmt,...,1,2,0,1.1,2.1,3,4,5,6,0.1
0,3,1,1,2,2,2,1,1,100,0,...,0.316677,0.029539,-0.664638,-0.528773,0.155386,-0.27624,-0.156129,-1.624417,1.499093,-1.723887
1,1,2,2,3,3,3,1,1,0,0,...,0.316677,0.029539,1.697833,-0.124666,-0.604973,-0.269865,1.604037,0.112886,0.60116,-0.055802
2,1,2,2,1,1,2,1,1,0,0,...,0.316677,0.029539,-0.664638,-0.528773,0.155386,-0.27624,-0.156129,-1.624417,1.499093,-1.723887
3,4,2,1,1,1,2,1,1,150,0,...,0.316677,0.029539,1.697833,-0.124666,-0.604973,-0.269865,1.604037,0.112886,0.60116,-1.723887
4,1,2,1,2,2,2,1,1,0,0,...,0.316677,0.029539,-0.664638,-0.528773,0.155386,-0.27624,-0.156129,-1.624417,1.499093,-1.723887


In [279]:
X_train, X_valid, y_train, y_valid = train_test_split(df_embedding, y_train, test_size=.1,)

print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

(13493, 143) (13493,) (1500, 143) (1500,)


In [280]:
## start with decision tree
from sklearn.tree import DecisionTreeClassifier

model_decision_tree = DecisionTreeClassifier()
model_decision_tree.fit(X_train, y_train)

DecisionTreeClassifier()

In [284]:
from sklearn.metrics import cohen_kappa_score
# evaluating gradient boosting model

tree_predictions_valid = model_decision_tree.predict(X_valid)
cohen_kappa_tree = cohen_kappa_score(tree_predictions_valid, y_valid, weights='quadratic')

print(f"For decision tree model cohen kappa: {cohen_kappa_tree}")

For decision tree model cohen kappa: 0.20805261844608713


In [282]:
## random forest model
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

model_rf = RandomForestClassifier()
model_gb = GradientBoostingClassifier()

model_rf.fit(X_train, y_train)
model_gb.fit(X_train, y_train)

GradientBoostingClassifier()

In [285]:
from sklearn.metrics import cohen_kappa_score
# evaluating gradient boosting model

random_forest_predictions_valid = model_rf.predict(X_valid)
cohen_kappa_rf = cohen_kappa_score(random_forest_predictions_valid, y_valid, weights='quadratic')

gb_predictions_valid = model_gb.predict(X_valid)
cohen_kappa_gb = cohen_kappa_score(gb_predictions_valid, y_valid, weights='quadratic')

print(f"For random forest model cohen kappa: {cohen_kappa_rf}")
print(f"For gradient boosted model cohen kappa: {cohen_kappa_gb}")

For gradient boosted model cohen kappa: 0.41037282136124054
For gradient boosted model cohen kappa: 0.3754180081812951
