# k-NN

In [52]:
import numpy as np
import pandas as pd
import joblib
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [22]:
data = pd.read_csv("train.csv")

In [23]:
data.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,,Brown,0.15,40.9,15,4,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1


In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18834 entries, 0 to 18833
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   pet_id          18834 non-null  object 
 1   issue_date      18834 non-null  object 
 2   listing_date    18834 non-null  object 
 3   condition       17357 non-null  float64
 4   color_type      18834 non-null  object 
 5   length(m)       18834 non-null  float64
 6   height(cm)      18834 non-null  float64
 7   X1              18834 non-null  int64  
 8   X2              18834 non-null  int64  
 9   breed_category  18834 non-null  float64
 10  pet_category    18834 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 1.6+ MB


## Impute Missing Data

In [25]:
data["condition"].value_counts()

1.0    6819
0.0    6281
2.0    4257
Name: condition, dtype: int64

In [26]:
# Used most frequent value of condition to fill in NaN entries
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(data[["condition"]])
data["condition"] = imputer.transform(data[["condition"]])

In [27]:
data.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,1.0,Brown,0.15,40.9,15,4,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1


In [28]:
data["condition"].value_counts()

1.0    8296
0.0    6281
2.0    4257
Name: condition, dtype: int64

## Encode Data

### Encode Independent variable

In [29]:
label_encoder = LabelEncoder()
data["color_type"] = label_encoder.fit_transform(data["color_type"])

In [30]:
data.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,18,0.8,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,53,0.72,14.19,13,9,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,1.0,15,0.15,40.9,15,4,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,53,0.62,17.82,0,1,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,2,0.5,11.06,18,4,0.0,1


In [31]:
color_encoder = OneHotEncoder()
condition_encoder = OneHotEncoder()
color_type_1h = color_encoder.fit_transform(data[["color_type"]])
condition_1h = condition_encoder.fit_transform(data[["condition"]])

In [32]:
color_type_1h

<18834x56 sparse matrix of type '<class 'numpy.float64'>'
	with 18834 stored elements in Compressed Sparse Row format>

In [33]:
condition_1h

<18834x3 sparse matrix of type '<class 'numpy.float64'>'
	with 18834 stored elements in Compressed Sparse Row format>

In [34]:
features = data[['length(m)', 'height(cm)', 'X1', 'X2']].values
X = np.concatenate((features, color_type_1h.toarray(), condition_1h.toarray()), axis = 1)

In [35]:
X.shape

(18834, 63)

### Encode Independent variable (Not Needed for sklearn k-NN Model)

In [36]:
y = data[["breed_category", "pet_category"]].values

In [37]:
print(y)

[[0. 1.]
 [0. 2.]
 [2. 4.]
 ...
 [1. 1.]
 [1. 2.]
 [1. 2.]]


## Scale Data

In [38]:
sc_x = StandardScaler()

# Scale the first 4 columns of features ['length(m)', 'height(cm)', 'X1', 'X2']
sc_x.fit(X[:,0:4])
X[:,0:4] = sc_x.transform(X[:, 0:4])

In [42]:
test_data = pd.read_csv("test.csv")

In [43]:
test_data.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2
0,ANSL_75005,2005-08-17 00:00:00,2017-09-07 15:35:00,0.0,Black,0.87,42.73,0,7
1,ANSL_76663,2018-11-15 00:00:00,2019-05-08 17:24:00,1.0,Orange Tabby,0.06,6.71,0,1
2,ANSL_58259,2012-10-11 00:00:00,2018-04-02 16:51:00,1.0,Black,0.24,41.21,0,7
3,ANSL_67171,2015-02-13 00:00:00,2018-04-06 07:25:00,1.0,Black,0.29,8.46,7,1
4,ANSL_72871,2017-01-18 00:00:00,2018-04-26 13:42:00,1.0,Brown,0.71,30.92,0,7


In [44]:
test_data["condition"].value_counts()

1.0    2928
0.0    2685
2.0    1840
Name: condition, dtype: int64

In [45]:
test_data["condition"] = imputer.transform(test_data[["condition"]])

In [46]:
test_data["condition"].value_counts()

1.0    3547
0.0    2685
2.0    1840
Name: condition, dtype: int64

In [47]:
test_data["color_type"] = label_encoder.transform(test_data["color_type"])

In [48]:
test_data.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2
0,ANSL_75005,2005-08-17 00:00:00,2017-09-07 15:35:00,0.0,2,0.87,42.73,0,7
1,ANSL_76663,2018-11-15 00:00:00,2019-05-08 17:24:00,1.0,38,0.06,6.71,0,1
2,ANSL_58259,2012-10-11 00:00:00,2018-04-02 16:51:00,1.0,2,0.24,41.21,0,7
3,ANSL_67171,2015-02-13 00:00:00,2018-04-06 07:25:00,1.0,2,0.29,8.46,7,1
4,ANSL_72871,2017-01-18 00:00:00,2018-04-26 13:42:00,1.0,15,0.71,30.92,0,7


In [49]:
test_color_type_1h = color_encoder.transform(test_data[["color_type"]])
test_condition_1h = condition_encoder.transform(test_data[["condition"]])

In [50]:
test_features = test_data[['length(m)', 'height(cm)', 'X1', 'X2']].values
X_test = np.concatenate((test_features, test_color_type_1h.toarray(), test_condition_1h.toarray()), axis = 1)

In [53]:
knn_multi_clf = joblib.load("knn_multi_clf.pkl")

In [63]:
y_test_pred = knn_multi_clf.predict(X_test)

In [65]:
import sys
np.set_printoptions(threshold=sys.maxsize)
print(y_test_pred)

[[0. 2.]
 [0. 1.]
 [0. 2.]
 [0. 1.]
 [0. 2.]
 [0. 1.]
 [1. 1.]
 [1. 2.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 2.]
 [0. 2.]
 [1. 1.]
 [1. 2.]
 [1. 2.]
 [1. 2.]
 [0. 2.]
 [0. 1.]
 [0. 1.]
 [0. 2.]
 [0. 1.]
 [0. 2.]
 [0. 2.]
 [1. 2.]
 [0. 2.]
 [1. 1.]
 [0. 1.]
 [0. 1.]
 [1. 2.]
 [0. 1.]
 [0. 1.]
 [1. 1.]
 [1. 2.]
 [0. 1.]
 [0. 1.]
 [1. 1.]
 [0. 2.]
 [1. 2.]
 [0. 2.]
 [0. 1.]
 [1. 2.]
 [0. 2.]
 [1. 2.]
 [1. 1.]
 [0. 2.]
 [0. 1.]
 [1. 2.]
 [1. 1.]
 [0. 2.]
 [1. 2.]
 [2. 1.]
 [0. 2.]
 [0. 2.]
 [1. 1.]
 [0. 1.]
 [1. 1.]
 [1. 1.]
 [1. 2.]
 [0. 2.]
 [1. 2.]
 [0. 2.]
 [0. 2.]
 [0. 2.]
 [0. 1.]
 [0. 2.]
 [2. 1.]
 [0. 1.]
 [1. 1.]
 [1. 1.]
 [1. 2.]
 [0. 2.]
 [1. 1.]
 [0. 2.]
 [1. 2.]
 [0. 2.]
 [1. 1.]
 [0. 1.]
 [0. 2.]
 [1. 2.]
 [0. 2.]
 [1. 2.]
 [0. 1.]
 [1. 1.]
 [0. 2.]
 [1. 2.]
 [0. 2.]
 [0. 2.]
 [0. 2.]
 [0. 2.]
 [0. 1.]
 [0. 2.]
 [1. 2.]
 [0. 2.]
 [0. 2.]
 [0. 1.]
 [0. 2.]
 [1. 1.]
 [1. 2.]
 [0. 1.]
 [0. 2.]
 [1. 1.]
 [0. 2.]
 [0. 1.]
 [0. 2.]
 [1. 1.]
 [1. 1.]
 [0. 1.]
 [1. 1.]
 [0. 2.]
 [1. 1.]
 

In [66]:
y_test_pred_df = pd.DataFrame(y_test_pred)

In [67]:
y_test_pred_df.head()

Unnamed: 0,0,1
0,0.0,2.0
1,0.0,1.0
2,0.0,2.0
3,0.0,1.0
4,0.0,2.0


In [68]:
y_out = test_data[["pet_id"]].join(y_test_pred_df)

In [69]:
y_out.head()

Unnamed: 0,pet_id,0,1
0,ANSL_75005,0.0,2.0
1,ANSL_76663,0.0,1.0
2,ANSL_58259,0.0,2.0
3,ANSL_67171,0.0,1.0
4,ANSL_72871,0.0,2.0


In [76]:
y_out.columns = ['pet_id', "breed_category", "pet_category"]

In [77]:
y_out.head()

Unnamed: 0,pet_id,breed_category,pet_category
0,ANSL_75005,0.0,2.0
1,ANSL_76663,0.0,1.0
2,ANSL_58259,0.0,2.0
3,ANSL_67171,0.0,1.0
4,ANSL_72871,0.0,2.0


In [78]:
y_out.to_csv('output.csv',index=False)