In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder , OneHotEncoder
from sklearn.metrics import f1_score
import random
random.seed(2020)

import warnings
warnings.filterwarnings('ignore')

In [2]:
#get some with pet_id
def id_bins(s):
    if s['pet_id'] <= 63355:
        return 'OLD'
    if s['pet_id'] <= 70150:
        return 'MID'
    else:
        return 'NEW'

def height_bins(s):
    
    if s['height(cm)'] <=27.36:
        return 'SHORT'
    else:
        return 'TALL'

In [3]:
def accuracy(y1,y2):
    f1 = f1_score(y1[:,[0]],y2.iloc[:,0],average='weighted')
    f2 = f1_score(y1[:,[1]],y2.iloc[:,1],average='weighted')
    s=(f1+f2)/2
    print('accuracy',s)

In [4]:
train = pd.read_csv('train.csv',parse_dates=['issue_date','listing_date'])
test = pd.read_csv('test.csv',parse_dates=['issue_date','listing_date'])
train.shape , test.shape

((18834, 11), (8072, 9))

In [5]:
#shuffling the training dataset
train = train.sample(frac=1).reset_index(drop=True)

#combining train and test data
all_data = train.append(test)

#filling the missing the value of condition
all_data.fillna(3,inplace=True)


In [6]:
#new features with X1 and X2 
#all_data['ADD_X1X2'] = all_data['X1'] + all_data['X2']
#all_data['SUB_X1X2'] = all_data['X1'] - all_data['X2']
#all_data['MUL_X1X2'] = all_data['X1'] * all_data['X2']

#

all_data['no_of_days'] =abs((all_data['issue_date'] - all_data['listing_date']).dt.days)
all_data['area'] = all_data['height(cm)'] * all_data['length(m)']*100

In [132]:
#dates features
all_data['listing_day'] = all_data['listing_date'].dt.day
all_data['listing_month'] = all_data['listing_date'].dt.month
all_data['listing_dayofweek'] = all_data['listing_date'].dt.dayofweek
all_data['listing_weekday'] = all_data['listing_date'].dt.weekday
all_data['listing_hour'] = all_data['listing_date'].dt.hour
all_data['issue_day'] = all_data['issue_date'].dt.day
all_data['issue_month'] = all_data['listing_date'].dt.month
all_data['issue_dayofweek'] = all_data['listing_date'].dt.dayofweek
all_data['issue_weekday'] = all_data['listing_date'].dt.weekday
all_data['issue_hour'] = all_data['listing_date'].dt.hour

In [133]:
#all_data['id_bins'] = all_data.apply(id_bins,axis=1)    
#all_data['height_bin'] = all_data.apply(height_bins,axis=1)

all_data['pet_id'] = all_data['pet_id'].apply(lambda x: int(x.split('_')[1]))

In [7]:
cat = list(all_data.select_dtypes('object'))
print('categorical values',cat)
for i in cat:
    l = LabelEncoder()
    l.fit(all_data[i])
    all_data[i]= l.transform(all_data[i])
    
all_data = pd.get_dummies(all_data,
                          columns=['color_type',#'id_bins', 
                                   #'height_bin'
                                  ],
                          drop_first=True)
all_data.shape

categorical values ['pet_id', 'color_type']


(26906, 67)

In [8]:
train.color_type.value_counts()

Black                4620
White                2453
Brown                1791
Brown Tabby          1687
Tan                  1349
Blue                  852
Orange Tabby          791
Red                   526
Brown Brindle         496
Tricolor              469
Blue Tabby            386
Tortie                366
Calico                343
Gray                  307
Chocolate             259
Torbie                242
Cream Tabby           191
Sable                 167
Cream                 162
Fawn                  159
Yellow                143
Buff                  125
Lynx Point            117
Blue Merle            104
Seal Point             78
Black Brindle          66
Gray Tabby             65
Black Tabby            55
Flame Point            52
Brown Merle            39
Orange                 39
Black Smoke            32
Gold                   31
Tortie Point           26
Silver                 24
Red Tick               23
Blue Tick              21
Blue Point             20
Lilac Point 

In [9]:
y = train[['breed_category','pet_category']]
drop = ['breed_category','pet_category','pet_id',
        'issue_date', 'listing_date'
       ]
all_data.drop(drop,axis=1,inplace=True)
test_pet_id = test['pet_id']
cols= list(all_data.columns)

In [10]:
train = all_data.iloc[:train.shape[0],:]
test = all_data.iloc[train.shape[0]:,:]
train.shape , test.shape

((18834, 62), (8072, 62))

In [14]:
X = train[cols]
xtrain , xtest , ytrain , ytest = model_selection.train_test_split(X,y,
                                                                   random_state=2020,test_size=0.2)
print('training data', xtrain.shape,ytrain.shape)
print('validation data ',xtest.shape,ytest.shape)

training data (15067, 62) (15067, 2)
validation data  (3767, 62) (3767, 2)


In [15]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
clf = MultiOutputClassifier(KNeighborsClassifier(n_neighbors=9)).fit(xtrain,ytrain)
p = clf.predict(xtest)
accuracy(p,ytest),clf.score(xtrain,np.array(ytrain)),clf.score(xtest,np.array(ytest))

accuracy 0.5817795349995412


(None, 0.4633968275038163, 0.3342182107778073)

In [16]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(xtrain,ytrain)
pr = model.predict(xtest)
accuracy(pr,ytest)

accuracy 0.892529398052152


In [17]:
import xgboost as xg
xgb = MultiOutputClassifier(xg.XGBClassifier(random_state=2020,
                                            max_depth=50,
                                            ))
xgb.fit(xtrain,ytrain)
pl = xgb.predict(xtest)
accuracy(pl,ytest)

accuracy 0.9074723200395072


In [34]:
import lightgbm as lgb 
lg = MultiOutputClassifier(lgb.LGBMClassifier(random_state=2020,
                                             num_iterations=400,n_estimators= 200,
                                              max_depth=100,learning_rate= 0.10,
                                              bosting='gbdt',bagging_fraction=0.7))

lg.fit(xtrain,ytrain)
plg = lg.predict(xtest)
accuracy(plg,ytest)

accuracy 0.9030504139313794


In [21]:
ypred = lg.predict(test[cols])

In [22]:
df1 = pd.DataFrame(test_pet_id,columns=['pet_id'])
df2 = pd.DataFrame(ypred,columns=['breed_category','pet_category'])
df = pd.concat([df1,df2],axis=1)
df.to_csv('submission.csv',index=False)

In [17]:
l = lgb.LGBMClassifier(random_state=10)
l.fit(xtrain,ytrain.iloc[:,[0]])
d= {'columns':xtrain.columns,'importance':l.feature_importances_}
df = pd.DataFrame(data=d)
df.sort_values(by='importance').tail(20)

Unnamed: 0,columns,importance
74,id_bins_1,44
20,color_type_2,45
55,color_type_37,47
75,id_bins_2,141
4,X2,147
56,color_type_38,152
39,color_type_21,164
67,color_type_49,185
68,color_type_50,195
3,X1,260


In [18]:
l2 = lgb.LGBMClassifier(random_state=10)
l2.fit(xtrain,ytrain.iloc[:,[1]])
d= {'columns':xtrain.columns,'importance':l2.feature_importances_}
df = pd.DataFrame(data=d)
df.sort_values(by='importance',ascending=False).head(20)

Unnamed: 0,columns,importance
6,no_of_days,1717
2,height(cm),804
12,issue_day,765
7,listing_day,732
5,area_occupied,692
8,listing_month,687
1,length(m),644
3,X1,603
11,listing_hour,595
0,condition,492
