In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
%matplotlib inline 
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from numpy import asarray
from numpy import mean
from numpy import std
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train['pet_category'].value_counts()

In [None]:
train['breed_category'].value_counts()

In [None]:
train['condition'].value_counts()

In [None]:
test['condition'].value_counts()

In [None]:
train['condition'].unique()

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

In [None]:
train.groupby(['breed_category', 'pet_category']).size()

# date_time

In [None]:
train['issue_date'] = pd.to_datetime(train['issue_date'])
train['listing_date'] = pd.to_datetime(train['listing_date'])
train['duration'] = (train['listing_date'] - train['issue_date']).dt.days  

In [None]:
train['duration'] = np.absolute(np.array(train.duration).flatten())
train['duration'].head()

In [None]:
test['issue_date'] = pd.to_datetime(test['issue_date'])
test['listing_date'] = pd.to_datetime(test['listing_date'])
test['duration'] = (test['listing_date'] - test['issue_date']).dt.days  


In [None]:
test['duration'] = np.absolute(np.array(test.duration).flatten())

# length_height

In [None]:
print(len(train[train['length(m)'] == 0]))
print(len(test[test['length(m)']==0]))

In [None]:
train['length(cm)'] = train['length(m)'].apply(lambda x: x*100)
test['length(cm)'] = test['length(m)'].apply(lambda x: x*100)

In [None]:
# Replace every 0 length with the mean of the non-zero training lengths.
# The zeros are the values being replaced, so they are excluded from the
# average; including them would pull the fill value towards 0.
# The statistic is computed on train only and reused unchanged for test.
val = train.loc[train['length(cm)'] != 0, 'length(cm)'].mean()
train['length(cm)'] = train['length(cm)'].replace(to_replace=0, value=val)
test['length(cm)'] = test['length(cm)'].replace(to_replace=0, value=val)


In [None]:
#new feature
train['ratio_len_height'] = train['length(cm)']/train['height(cm)']
test['ratio_len_height'] = test['length(cm)']/test['height(cm)']


In [None]:
train.groupby(['length(cm)', 'pet_category']).size()

# Combining train and test

In [None]:
df = pd.concat([train,test],axis=0,sort=False)

In [None]:
df

# Filling missing values

In [None]:
# Mark missing `condition` values with the sentinel -1.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: the in-place form mutates an
# intermediate object, which pandas warns about and which leaves `df` untouched
# when Copy-on-Write is enabled.
df['condition'] = df['condition'].fillna(-1)

# categorical values 

In [None]:
#df['color_number'] = LabelEncoder().fit_transform(df['color_type'])

In [None]:
def encode_and_bind(original_dataframe, feature_to_encode):
    """Return a copy of the frame with one-hot columns for one feature appended.

    Parameters
    ----------
    original_dataframe : pandas.DataFrame
        Frame containing the column to encode; it is not modified.
    feature_to_encode : str
        Name of the column to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        All original columns (including `feature_to_encode` itself) followed by
        the indicator columns produced by `pd.get_dummies`.
    """
    # Select with a list so get_dummies receives a one-column DataFrame.
    single_column = original_dataframe[[feature_to_encode]]
    indicator_columns = pd.get_dummies(single_column)
    return pd.concat([original_dataframe, indicator_columns], axis=1)

In [None]:
# Append one indicator column per color_type value to the combined frame.
df = encode_and_bind(original_dataframe=df, feature_to_encode='color_type')

# Drop columns that are not required

In [None]:
df = df.drop(['pet_category','breed_category','pet_id','issue_date','listing_date','color_type',
              'length(m)']
             ,axis=1)

In [None]:
df.tail()

# splitting train-test

In [None]:
X_train = df[:18834]

In [None]:
X_train.shape

In [None]:
Y = train['breed_category'].values

In [None]:
Y.shape

In [None]:
Z = train['pet_category'].values

In [None]:
Z.shape

In [None]:
trainX = pd.concat([X_train,train['pet_category']],axis=1)

In [None]:
trainX.shape

In [None]:
X_test = df[18834:]

In [None]:
X_test.shape

# MODEL_1 PET CATEGORY

In [None]:
# Hold out 20% of the training rows to validate the pet_category model.
new_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    Z,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Baseline XGBoost model for the pet_category target, fitted on the training split.
# NOTE(review): `silent` and objective='binary:logistic' are passed through
# unchanged - confirm both suit the installed xgboost version and the number of
# pet_category classes.
classifier = XGBClassifier(
    objective='binary:logistic',
    n_estimators=1000,
    learning_rate=0.4,
    max_depth=4,
    gamma=4,
    reg_alpha=0.3,
    subsample=0.8,
    colsample_bytree=0.4,
    scale_pos_weight=1,
    silent=False,
).fit(new_train, y_train)

# Score on the same rows the model was fitted on.
classifier.score(new_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

# Evaluate the baseline pet_category model on the held-out validation split.
y_pred_1 = classifier.predict(X_valid)
print(confusion_matrix(y_true=y_valid, y_pred=y_pred_1))
print(accuracy_score(y_true=y_valid, y_pred=y_pred_1))
print(classification_report(y_true=y_valid, y_pred=y_pred_1))
# Weighted F1 is kept in `f1`; it is combined with model 2's score at the end.
f1 = f1_score(y_true=y_valid, y_pred=y_pred_1, average='weighted')
print(f1)

In [None]:
# Candidate hyper-parameter values sampled by the randomized search for model 1.
params = {
    'min_child_weight': list(range(1, 11)),
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': list(range(1, 6)),
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.model_selection import StratifiedKFold
folds = 5
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

# Randomized hyper-parameter search for the pet_category model.
# - The estimator is `classifier`, the XGBoost model defined for this target;
#   the name `model_1` is never defined in this notebook and raised NameError.
# - `cv` receives the splitter itself rather than `skf.split(...)`. The latter
#   is a one-shot generator, so fitting the search a second time (for example
#   when the cell is re-run) finds no folds left. The splitter is seeded, so it
#   produces the same folds on every fit.
random_search_1 = RandomizedSearchCV(classifier,
                                   param_distributions=params,
                                   n_iter=param_comb,
                                   n_jobs=4,
                                   cv=skf,
                                   verbose=3,
                                   random_state=1001 )

In [None]:
random_search_1.fit(new_train,y_train)

In [None]:
random_search_1.score(new_train,y_train)

In [None]:
random_search_1.best_params_

In [None]:

y_pred_1 = random_search_1.predict(X_valid) 
print(accuracy_score(y_valid,y_pred_1))
print(classification_report(y_valid, y_pred_1))
f1 = f1_score(y_valid,y_pred_1,average='weighted')
print(f1)

In [None]:
pred_1 = random_search_1.predict(X_test)
pred_1 = np.array(pred_1).flatten()
pred_1[:5]

In [None]:
result = pd.concat([test['pet_id'],pd.Series(pred_1)],axis=1)
result.columns = ['pet_id','pet_category']
result['pet_category'].value_counts()

In [None]:
X_test['pet_category'] = pred_1

In [None]:
X_test.head()

# MODEL_2 Breed Category

In [None]:
trainX.shape,Y.shape,X_test.shape

In [None]:
train_new, validX, train_y, validy = train_test_split(trainX, Y, test_size=0.2, random_state=44)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline XGBoost model for the breed_category target, fitted on the training split.
# NOTE(review): `silent` and objective='binary:logistic' are passed through
# unchanged - confirm both suit the installed xgboost version and the number of
# breed_category classes.
classifier_2 = XGBClassifier(
    objective='binary:logistic',
    n_estimators=1000,
    learning_rate=0.4,
    max_depth=4,
    gamma=4,
    reg_alpha=0.3,
    subsample=0.8,
    colsample_bytree=0.4,
    scale_pos_weight=1,
    silent=False,
).fit(train_new, train_y)

# Score on the same rows the model was fitted on.
classifier_2.score(train_new, train_y)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

# Evaluate the baseline breed_category model on the held-out validation split.
y_pred = classifier_2.predict(validX)
print(confusion_matrix(y_true=validy, y_pred=y_pred))
print(accuracy_score(y_true=validy, y_pred=y_pred))
print(classification_report(y_true=validy, y_pred=y_pred))
print(f1_score(y_true=validy, y_pred=y_pred, average='weighted'))


In [None]:
# Candidate hyper-parameter values sampled by the randomized search for model 2.
params_rs = {
    'min_child_weight': list(range(1, 11)),
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': list(range(1, 6)),
}

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

folds = 5
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 100)

# Randomized hyper-parameter search for the breed_category model.
# - The estimator is `classifier_2`, the XGBoost model defined for this target;
#   the name `model_2` is never defined in this notebook and raised NameError.
# - `cv` receives the splitter itself rather than `skf.split(...)`. The latter
#   is a one-shot generator, so fitting the search a second time finds no folds
#   left. The splitter is seeded, so it produces the same folds on every fit.
random_search_2 = RandomizedSearchCV(classifier_2,
                                   param_distributions=params_rs,
                                   n_iter=param_comb,
                                   n_jobs=4,
                                   cv=skf,
                                   verbose=3,
                                   random_state=1001 )


In [None]:
random_search_2.fit(train_new,train_y)
random_search_2.score(train_new,train_y)

In [None]:
random_search_2.fit(train_new,train_y)
random_search_2.score(train_new,train_y)

In [None]:
random_search_2.best_params_

In [None]:
y_pred=random_search_2.predict(validX)
print(confusion_matrix(validy,y_pred))
print(accuracy_score(validy,y_pred))
print(classification_report(validy,y_pred))
f2 = f1_score(validy,y_pred,average='weighted')
print(f2)

In [None]:
pred_2 = random_search_2.predict(X_test)
pred_2 = np.array(pred_2).flatten()
pred_2[:5]

In [None]:
result = pd.concat([test['pet_id'],pd.Series(pred_2)],axis=1)
result.columns = ['pet_id','breed_category']
result['breed_category'].value_counts()

In [None]:
accuracy=100*((f1+f2)/2)
accuracy

In [None]:
results_df = pd.DataFrame(data={'pet_id':test['pet_id'], 'breed_category':pred_2,'pet_category':pred_1})
results_df.to_csv('FINAL.csv', index=False)