Exploring the text data

In [323]:
## Imports 

import pandas as pd
import numpy as np

In [324]:
# reading in the dataset 
df = pd.read_csv(filepath_or_buffer='data/train.csv.zip')
df.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,2,Nibble,3,299,0,1,1,7,0,1,...,1,1,100,41326,8480853f516546f6cf33aa88cd76c379,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,1,1,0,41401,3082c7125d8fb66f7dd4bff4192c8b14,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0
2,1,Brisco,1,307,0,1,2,7,0,2,...,1,1,0,41326,fa90fa5b1ee11c86938398b60abc32cb,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3
3,1,Miko,4,307,0,2,1,2,0,2,...,1,1,150,41401,9238e4f44c71a75282e62f7136c6b240,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2
4,1,Hunter,1,307,0,1,1,0,0,2,...,1,1,0,41326,95481e953f8aed9ec3d16fc4509537e8,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2


In [325]:
# looking at the name column
# no of unique names
print('Number of unique names: ', df['Name'].nunique())

Number of unique names:  9060


In [326]:
# Looking at the count of names 

df['Name'].value_counts()[:20]

Baby       66
Lucky      64
No Name    54
Brownie    54
Mimi       52
Blackie    49
Puppy      45
Max        39
Kitty      39
Kittens    39
Oreo       36
Coco       35
Tiger      34
Angel      33
Milo       32
Snowy      30
Bobby      30
Lucy       29
Kiki       29
Lily       28
Name: Name, dtype: int64

In [327]:
# function that changes column with name to 1, 0 otherwise

def name_to_num(name):
    """Adds 1 to field where there is name 0 otherwise"""
    list_no_name = ['No Name', 'No Name Yet', np.nan]
    if name in list_no_name:
        return 0
    else:
        return 1

In [328]:
## Using lambda function which is equivalent to above ```name_to_num`` function defined above

# create a list of items that are not names
list_no_name = ['No Name', 'No Name Yet', np.nan]

df['has_name'] = df['Name'].apply(lambda x: 0 if x in list_no_name else 1)

In [329]:
df['Name'][:10]

0                     Nibble
1                No Name Yet
2                     Brisco
3                       Miko
4                     Hunter
5                        NaN
6                      BULAT
7    Siu Pak & Her 6 Puppies
8                        NaN
9                      Kitty
Name: Name, dtype: object

In [330]:
df['has_name'][:10]

0    1
1    0
2    1
3    1
4    1
5    0
6    1
7    1
8    0
9    1
Name: has_name, dtype: int64

In [331]:
df.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed,has_name
0,2,Nibble,3,299,0,1,1,7,0,1,...,1,100,41326,8480853f516546f6cf33aa88cd76c379,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2,1
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,1,0,41401,3082c7125d8fb66f7dd4bff4192c8b14,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0,0
2,1,Brisco,1,307,0,1,2,7,0,2,...,1,0,41326,fa90fa5b1ee11c86938398b60abc32cb,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3,1
3,1,Miko,4,307,0,2,1,2,0,2,...,1,150,41401,9238e4f44c71a75282e62f7136c6b240,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2,1
4,1,Hunter,1,307,0,1,1,0,0,2,...,1,0,41326,95481e953f8aed9ec3d16fc4509537e8,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2,1


In [332]:
# processing the Description field

# number of empty / nan values
print(df['Description'].isna().sum())

# fill in the nans
df['Description'].fillna(value='unknown', inplace=True)

# after filling the nans
print(df['Description'].isna().sum())


12
0


In [333]:
## using tf-idf 

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_feature = tfidf.fit_transform(df['Description'])

In [334]:
# shape and type of matrix returned after tf-idf

print(tfidf_feature.shape)
print(type(tfidf_feature))

(14993, 21209)
<class 'scipy.sparse.csr.csr_matrix'>


In [361]:
# Using SVD to reduce the dimension of the matrix
from sklearn.decomposition import TruncatedSVD

In [362]:
# initializing the Singular Value Decomposition (SVD)
svd = TruncatedSVD(n_components=10,)
# fit the svd (from 21209 features to 100)
tfidf_transformed = svd.fit_transform(tfidf_feature)

In [363]:
# shape of the reduced matrix

tfidf_transformed.shape

(14993, 10)

In [364]:
## convert the reduced representation of the text to pandas dataframe 
# and concat with the origin dataframe

# creating the dataframe 
pd_tfidf_feature = pd.DataFrame(tfidf_transformed,)

# print a sample
pd_tfidf_feature.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.296693,-0.023355,-0.024915,0.083739,-0.042705,-0.041932,-0.057693,-0.072269,0.012351,-0.037335
1,0.164431,-0.044863,0.009561,0.019383,-0.034198,-0.112217,-0.042319,-0.087259,0.155545,-0.007163
2,0.353918,0.004332,0.164885,-0.029036,-0.075377,-0.040406,0.028782,0.00894,-0.111161,-0.059544
3,0.196249,0.077631,0.028067,-0.065867,0.060662,0.04055,0.118451,0.136922,0.031153,0.167772
4,0.382785,0.033767,-0.002274,0.243149,0.15689,-0.071376,-0.023647,-0.008367,-0.012171,0.02388


In [365]:

# concat the original dataframe and new tfidf data frame
df_merged = pd.concat((df, pd_tfidf_feature), axis=1)

# print a sample 
df_merged.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,0,1,2,3,4,5,6,7,8,9
0,2,Nibble,3,299,0,1,1,7,0,1,...,0.296693,-0.023355,-0.024915,0.083739,-0.042705,-0.041932,-0.057693,-0.072269,0.012351,-0.037335
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,0.164431,-0.044863,0.009561,0.019383,-0.034198,-0.112217,-0.042319,-0.087259,0.155545,-0.007163
2,1,Brisco,1,307,0,1,2,7,0,2,...,0.353918,0.004332,0.164885,-0.029036,-0.075377,-0.040406,0.028782,0.00894,-0.111161,-0.059544
3,1,Miko,4,307,0,2,1,2,0,2,...,0.196249,0.077631,0.028067,-0.065867,0.060662,0.04055,0.118451,0.136922,0.031153,0.167772
4,1,Hunter,1,307,0,1,1,0,0,2,...,0.382785,0.033767,-0.002274,0.243149,0.15689,-0.071376,-0.023647,-0.008367,-0.012171,0.02388


In [366]:
## making a copy of the dataframe 

df_merged_copy = df_merged.copy()

In [367]:
# original columns
df.columns

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed',
       'has_name'],
      dtype='object')

In [368]:
# preprocessing the merged dataframe

# labels
y_train = df_merged_copy['AdoptionSpeed']

# drop columns
df_merged_copy.drop(labels=['Name', 'RescuerID', 'Description', 'PetID', 'AdoptionSpeed'], axis=1, inplace=True)

In [369]:
# checking  the columns after dropped
" ".join([column for column in df_merged_copy.columns[:20]])

'Type Age Breed1 Breed2 Gender Color1 Color2 Color3 MaturitySize FurLength Vaccinated Dewormed Sterilized Health Quantity Fee State VideoAmt PhotoAmt has_name'

In [370]:
# nan values 
df_merged_copy.isna().sum().sum()

0

In [371]:
#  Splitting the dataset into train and valid set
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(df_merged_copy, y_train, test_size=.1,)

print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

(13493, 30) (13493,) (1500, 30) (1500,)


In [372]:
X_train.head()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,...,0,1,2,3,4,5,6,7,8,9
12639,2,1,243,0,2,1,0,0,2,1,...,0.499458,-0.069735,-0.127685,-0.211249,-0.037631,-0.063862,-0.107513,-0.028376,0.036021,-0.045684
8872,1,2,307,0,2,2,0,0,2,1,...,0.201685,-0.00355,0.232965,0.043037,-0.217624,0.088845,0.093565,-0.096044,-0.09845,0.163077
3035,1,1,307,0,2,5,0,0,1,1,...,0.168443,-0.01088,0.13159,0.033975,0.095102,-0.05491,-0.011309,0.072975,0.238198,-0.059057
3168,2,4,247,266,1,1,0,0,2,1,...,0.101751,-0.008777,-0.003592,0.00829,0.025449,0.11069,-0.064738,-0.03026,0.028457,0.001293
7710,1,12,307,0,1,1,0,0,2,2,...,0.27891,-0.04652,-0.089246,0.196865,0.14521,-0.111274,0.122008,-0.063144,-0.011674,0.004313


In [373]:
X_valid.head()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,...,0,1,2,3,4,5,6,7,8,9
6965,2,4,266,0,1,3,0,0,2,1,...,0.163543,0.064376,-0.084055,0.07148,0.057419,0.095271,-0.040266,-0.002415,-0.028865,-0.030883
338,1,36,218,218,2,2,3,5,2,3,...,0.256211,-0.067567,-0.030333,-0.101022,-0.015382,-0.102185,-0.008681,0.11704,0.001643,0.012861
11625,1,4,307,0,2,1,2,0,2,2,...,0.394703,-0.074948,-0.149425,-0.266087,0.093174,0.019825,0.053151,0.024489,-0.047592,-0.067953
12199,2,4,266,0,2,1,7,0,2,1,...,0.374167,-0.061604,0.006885,-0.131256,0.02644,0.017065,-0.071495,-0.005847,0.080475,0.021662
2616,1,36,307,0,1,1,0,0,2,2,...,0.262832,-0.041814,0.073695,0.113533,0.050699,-0.17457,0.055222,-0.121176,0.116654,0.067194


In [374]:
# selecting the categorical features 

cat_cols = np.array(['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
            'Color3', 'State', 'has_name'])

In [375]:
# # Using column transform to onehot encode multiple columns 

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# initializing the column transform object

ct = ColumnTransformer(
        [('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)], remainder='passthrough')

# calling the transform 

X_train = ct.fit_transform(X_train,)
X_valid = ct.transform(X_valid)

In [377]:
X_train.shape, X_valid.shape

((13493, 367), (1500, 367))

In [378]:
## Let's train gradient boosted model and see if 
## the extra work helped 

from sklearn.ensemble import GradientBoostingClassifier

# initialize the model
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)

GradientBoostingClassifier()

In [379]:
from sklearn.metrics import cohen_kappa_score
# evaluating gradient boosting model

gb_predictions_valid = gb_model.predict(X_valid)
cohen_kappa_gb = cohen_kappa_score(gb_predictions_valid, y_valid, weights='quadratic')

print(f"For gradient boosted model cohen kappa: {cohen_kappa_gb}")

For gradient boosted model cohen kappa: 0.36798904289283063


It did not help! Infact got a bit worse :( 

In [382]:
## Let's try random forest
from sklearn.ensemble import RandomForestClassifier

# initialize the model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

RandomForestClassifier()

In [383]:
# evaluate the model 
# The performance got a bit better for random forest.

rf_predictions_valid = rf_model.predict(X_valid)
cohen_kappa_gb = cohen_kappa_score(rf_predictions_valid, y_valid, weights='quadratic')

print(f"For Random Forest model cohen kappa: {cohen_kappa_gb}")

For Random Forest model cohen kappa: 0.37850691052044183
