In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
%matplotlib inline 
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from numpy import asarray
from numpy import mean
from numpy import std
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,10-07-2016 00:00,21-09-2016 16:25,2.0,Brown Tabby,0.8,7.78,13,9,0,1
1,ANSL_66892,21-11-2013 00:00,27-12-2018 17:47,1.0,White,0.72,14.19,13,9,0,2
2,ANSL_69750,28-09-2014 00:00,19-10-2016 08:24,,Brown,0.15,40.9,15,4,2,4
3,ANSL_71623,31-12-2016 00:00,25-01-2019 18:30,1.0,White,0.62,17.82,0,1,0,2
4,ANSL_57969,28-09-2017 00:00,19-11-2017 09:38,2.0,Black,0.5,11.06,18,4,0,1


In [4]:
test.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2
0,ANSL_75005,2005-08-17 00:00:00,2017-09-07 15:35:00,0.0,Black,0.87,42.73,0,7
1,ANSL_76663,2018-11-15 00:00:00,2019-05-08 17:24:00,1.0,Orange Tabby,0.06,6.71,0,1
2,ANSL_58259,2012-10-11 00:00:00,2018-04-02 16:51:00,1.0,Black,0.24,41.21,0,7
3,ANSL_67171,2015-02-13 00:00:00,2018-04-06 07:25:00,1.0,Black,0.29,8.46,7,1
4,ANSL_72871,2017-01-18 00:00:00,2018-04-26 13:42:00,1.0,Brown,0.71,30.92,0,7


In [5]:
train['pet_category'].value_counts()

2    10621
1     7184
4      941
0       88
Name: pet_category, dtype: int64

In [6]:
train['breed_category'].value_counts()

0    9000
1    8357
2    1477
Name: breed_category, dtype: int64

In [7]:
train['condition'].value_counts()

1.0    6819
0.0    6281
2.0    4257
Name: condition, dtype: int64

In [8]:
test['condition'].value_counts()

1.0    2928
0.0    2685
2.0    1840
Name: condition, dtype: int64

In [9]:
train['condition'].unique()

array([ 2.,  1., nan,  0.])

In [10]:
train.isnull().sum()

pet_id               0
issue_date           0
listing_date         0
condition         1477
color_type           0
length(m)            0
height(cm)           0
X1                   0
X2                   0
breed_category       0
pet_category         0
dtype: int64

In [11]:
test.isnull().sum()

pet_id            0
issue_date        0
listing_date      0
condition       619
color_type        0
length(m)         0
height(cm)        0
X1                0
X2                0
dtype: int64

In [12]:
train.groupby(['breed_category', 'pet_category']).size()

breed_category  pet_category
0               0                 30
                1               3195
                2               5692
                4                 83
1               0                  7
                1               3406
                2               4869
                4                 75
2               0                 51
                1                583
                2                 60
                4                783
dtype: int64

# date_time

In [13]:
train['issue_date'] = pd.to_datetime(train['issue_date'])
train['listing_date'] = pd.to_datetime(train['listing_date'])
train['duration'] = (train['listing_date'] - train['issue_date']).dt.days  

In [14]:
train['duration'] = np.absolute(np.array(train.duration).flatten())
train['duration'].head()

0      16
1    1862
2     752
3     755
4      52
Name: duration, dtype: int64

In [15]:
test['issue_date'] = pd.to_datetime(test['issue_date'])
test['listing_date'] = pd.to_datetime(test['listing_date'])
test['duration'] = (test['listing_date'] - test['issue_date']).dt.days  


In [16]:
test['duration'] = np.absolute(np.array(test.duration).flatten())

# length_height

In [17]:
print(len(train[train['length(m)'] == 0]))
print(len(test[test['length(m)']==0]))

93
44


In [18]:
train['length(cm)'] = train['length(m)'].apply(lambda x: x*100)
test['length(cm)'] = test['length(m)'].apply(lambda x: x*100)

In [19]:
# replace all 0 length with mean of lengths
val = train['length(cm)'].mean()
train['length(cm)'] = train['length(cm)'].replace(to_replace=0, value=val)
test['length(cm)'] = test['length(cm)'].replace(to_replace=0, value=val)


In [20]:
#new feature
train['ratio_len_height'] = train['length(cm)']/train['height(cm)']
test['ratio_len_height'] = test['length(cm)']/test['height(cm)']


In [21]:
train.groupby(['length(cm)', 'pet_category']).size()

length(cm)  pet_category
1.0         1                65
            2                90
            4                10
2.0         0                 3
            1                67
            2                99
            4                 7
3.0         1                64
            2               100
            4                12
4.0         1                65
            2               111
            4                11
5.0         1                58
            2               103
            4                10
6.0         0                 2
            1                74
            2               115
            4                 3
7.0         1                76
            2               106
            4                 2
8.0         0                 1
            1                86
            2               115
            4                14
9.0         1                74
            2               101
            4                 5
               

# clubbing train and test

In [22]:
df = pd.concat([train,test],axis=0,sort=False)

In [23]:
df

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category,duration,length(cm),ratio_len_height
0,ANSL_69903,2016-10-07,2016-09-21 16:25:00,2.0,Brown Tabby,0.80,7.78,13,9,0.0,1.0,16,80.0,10.282776
1,ANSL_66892,2013-11-21,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2.0,1862,72.0,5.073996
2,ANSL_69750,2014-09-28,2016-10-19 08:24:00,,Brown,0.15,40.90,15,4,2.0,4.0,752,15.0,0.366748
3,ANSL_71623,2016-12-31,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2.0,755,62.0,3.479237
4,ANSL_57969,2017-09-28,2017-11-19 09:38:00,2.0,Black,0.50,11.06,18,4,0.0,1.0,52,50.0,4.520796
5,ANSL_52200,2017-10-25,2019-10-03 13:18:00,0.0,Red,0.92,29.56,0,7,1.0,2.0,708,92.0,3.112314
6,ANSL_75444,2018-06-11,2019-09-04 16:00:00,2.0,Brown Tabby,0.14,40.24,0,1,0.0,1.0,450,14.0,0.347913
7,ANSL_52759,2018-05-06,2018-02-09 17:04:00,0.0,White,0.15,25.48,7,1,1.0,1.0,86,15.0,0.588697
8,ANSL_74632,2015-10-10,2016-04-08 16:29:00,2.0,Brown,0.05,38.22,13,9,1.0,2.0,181,5.0,0.130822
9,ANSL_56464,2014-02-18,2017-03-17 11:38:00,2.0,Brown,0.55,21.26,13,9,1.0,2.0,1123,55.0,2.587018


# missing values filling

In [24]:
df['condition'].fillna(-1, inplace=True)

# categorical values 

In [25]:
#df['color_number'] = LabelEncoder().fit_transform(df['color_type'])

In [26]:
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    return(res)

In [27]:
df = encode_and_bind(df, 'color_type')

# drop non required cols

In [28]:
df = df.drop(['pet_category','breed_category','pet_id','issue_date','listing_date','color_type',
              'length(m)']
             ,axis=1)

In [29]:
df.tail()

Unnamed: 0,condition,height(cm),X1,X2,duration,length(cm),ratio_len_height,color_type_Agouti,color_type_Apricot,color_type_Black,...,color_type_Silver Lynx Point,color_type_Silver Tabby,color_type_Tan,color_type_Torbie,color_type_Tortie,color_type_Tortie Point,color_type_Tricolor,color_type_White,color_type_Yellow,color_type_Yellow Brindle
8067,2.0,36.08,13,9,394,82.0,2.272727,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8068,0.0,27.54,13,9,798,49.0,1.77923,0,0,0,...,0,0,1,0,0,0,0,0,0,0
8069,0.0,37.19,0,7,393,98.0,2.635117,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8070,-1.0,23.83,0,2,387,79.0,3.315149,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8071,0.0,24.51,0,1,392,64.0,2.611179,0,0,1,...,0,0,0,0,0,0,0,0,0,0


# splitting train-test

In [30]:
X_train = df[:18834]

In [31]:
X_train.shape

(18834, 63)

In [32]:
Y = train['breed_category'].values

In [33]:
Y.shape

(18834,)

In [34]:
Z = train['pet_category'].values

In [35]:
Z.shape

(18834,)

In [36]:
trainX = pd.concat([X_train,train['pet_category']],axis=1)

In [37]:
trainX.shape

(18834, 64)

In [38]:
X_test = df[18834:]

In [39]:
X_test.shape

(8072, 63)

# MODEL_1 PET CATEGORY

In [40]:
new_train, X_valid, y_train, y_valid = train_test_split(X_train, Z, test_size=0.2, random_state=42)

In [41]:
classifier=XGBClassifier(silent=False, 
                      scale_pos_weight=1,
                      learning_rate=0.4,  
                      colsample_bytree = 0.4,
                      subsample = 0.8,
                      objective='binary:logistic', 
                      n_estimators=1000, 
                      reg_alpha = 0.3,
                      max_depth=4, 
                      gamma=4)

classifier.fit(new_train,y_train)
classifier.score(new_train,y_train)

Parameters: { scale_pos_weight, silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




0.8950686931705051

In [42]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score
y_pred_1=classifier.predict(X_valid)
print(confusion_matrix(y_valid,y_pred_1))
print(accuracy_score(y_valid,y_pred_1))
print(classification_report(y_valid,y_pred_1))
f1 = f1_score(y_valid,y_pred_1,average='weighted')
print(f1)

[[   9    2    4    7]
 [   0 1166  229    6]
 [   0  144 2005    1]
 [   0    9   33  152]]
0.8845234934961508
              precision    recall  f1-score   support

           0       1.00      0.41      0.58        22
           1       0.88      0.83      0.86      1401
           2       0.88      0.93      0.91      2150
           4       0.92      0.78      0.84       194

    accuracy                           0.88      3767
   macro avg       0.92      0.74      0.80      3767
weighted avg       0.89      0.88      0.88      3767

0.8831934541877808


In [43]:
params = {        
          'min_child_weight': [1,2,3,4 ,5,6,7,8,9,10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [1,2,3, 4, 5],
          }

In [46]:
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.model_selection import StratifiedKFold
folds = 5
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search_1 = RandomizedSearchCV(classifier, 
                                   param_distributions=params, 
                                   n_iter=param_comb, 
                                   n_jobs=4, 
                                   cv=skf.split(new_train,y_train),
                                   verbose=3,
                                   random_state=1001 )

In [None]:
random_search_1.fit(new_train,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
random_search_1.score(new_train,y_train)

In [None]:
random_search_1.best_params_

In [None]:

y_pred_1 = random_search_1.predict(X_valid) 
print(accuracy_score(y_valid,y_pred_1))
print(classification_report(y_valid, y_pred_1))
f1 = f1_score(y_valid,y_pred_1,average='weighted')
print(f1)

In [None]:
pred_1 = random_search_1.predict(X_test)
pred_1 = np.array(pred_1).flatten()
pred_1[:5]

In [None]:
result = pd.concat([test['pet_id'],pd.Series(pred_1)],axis=1)
result.columns = ['pet_id','pet_category']
result['pet_category'].value_counts()

In [None]:
X_test['pet_category'] = pred_1

In [None]:
X_test.head()

# MODEL_2 Breed Category

In [None]:
trainX.shape,Y.shape,X_test.shape

In [None]:
train_new, validX, train_y, validy = train_test_split(trainX, Y, test_size=0.2, random_state=44)

In [None]:
classifier_2 = XGBClassifier(silent=False, 
                      scale_pos_weight=1,
                      learning_rate=0.4,  
                      colsample_bytree = 0.4,
                      subsample = 0.8,
                      objective='binary:logistic', 
                      n_estimators=1000, 
                      reg_alpha = 0.3,
                      max_depth=4, 
                      gamma=4)

classifier_2.fit(train_new,train_y)
classifier_2.score(train_new,train_y)

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score

y_pred=classifier_2.predict(validX)
print(confusion_matrix(validy,y_pred))
print(accuracy_score(validy,y_pred))
print(classification_report(validy,y_pred))
print(f1_score(validy,y_pred,average='weighted'))


In [None]:
params_rs = {        
          'min_child_weight': [1,2,3,4 ,5,6,7,8,9,10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [1,2,3, 4, 5],
          }

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

folds = 5
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 100)

random_search_2 = RandomizedSearchCV(classifier_2, 
                                   param_distributions=params_rs,
                                   n_iter=param_comb, 
                                   n_jobs=4, 
                                   cv=skf.split(train_new,train_y), 
                                   verbose=3, 
                                   random_state=1001 )


In [None]:
random_search_2.fit(train_new,train_y)
random_search_2.score(train_new,train_y)

In [None]:
random_search_2.best_params_

In [None]:
y_pred = random_search_2.predict(validX)
print(confusion_matrix(validy,y_pred))
print(accuracy_score(validy,y_pred))
print(classification_report(validy,y_pred))
f2 = f1_score(validy,y_pred,average='weighted')
print(f2)

In [None]:
pred_2 = random_search_2.predict(X_test)
pred_2 = np.array(pred_2).flatten()
pred_2[:5]

In [None]:
result = pd.concat([test['pet_id'],pd.Series(pred_2)],axis=1)
result.columns = ['pet_id','breed_category']
result['breed_category'].value_counts()

In [None]:
accuracy=100*((f1+f2)/2)
accuracy

In [None]:
results_df = pd.DataFrame(data={'pet_id':test['pet_id'], 'breed_category':pred_2,'pet_category':pred_1})
results_df.to_csv('FINAL.csv', index=False)