In [1]:
# Import the required packages
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import linear_model, model_selection
from matplotlib.colors import ListedColormap
from sklearn.linear_model import LinearRegression, LogisticRegression, Perceptron, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, r2_score

from sklearn.metrics import accuracy_score, f1_score, precision_score, confusion_matrix, recall_score, mean_squared_error, classification_report
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
%matplotlib inline

In [2]:
# load the given labels
breed = pd.read_csv('../data/breed_labels.csv')
color = pd.read_csv('../data/color_labels.csv')
state = pd.read_csv('../data/state_labels.csv')

In [3]:
original_df = pd.read_csv('../data/train.csv')

In [4]:
original_df

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,Description,AdoptionSpeed,PID
0,2,3,299,0,1,1,7,0,1,1,2,2,2,1,1,100,41326,Nibble is a 3+ month old ball of cuteness. He ...,2,0
1,1,4,307,0,2,1,2,0,2,1,1,1,2,1,1,150,41401,"Good guard dog, very alert, active, obedience ...",2,3
2,1,1,307,0,1,1,0,0,2,1,2,2,2,1,1,0,41326,This handsome yet cute boy is up for adoption....,2,4
3,2,3,266,0,2,5,6,0,2,1,2,2,2,1,1,0,41326,This is a stray kitten that came to my house. ...,2,5
4,2,12,264,264,1,1,0,0,2,3,2,2,3,1,1,300,41326,anyone within the area of ipoh or taiping who ...,1,6
5,2,2,265,0,2,6,0,0,2,2,2,2,2,1,1,0,41326,"healthy and active, feisty kitten found in nei...",1,8
6,1,2,307,0,1,1,2,7,2,1,2,1,2,1,1,0,41401,"For serious adopter, please do sms or call for...",1,10
7,2,2,265,0,3,1,6,7,1,2,2,2,3,1,7,0,41326,Hi Pet Lovers! This is my first posting and I ...,1,13
8,1,3,307,0,2,2,5,7,2,2,3,3,3,1,1,0,41401,"Lost Dog Found (Bandar Menjalara, Kepong／Taman...",2,14
9,1,78,218,205,1,1,7,0,2,2,3,3,3,1,1,0,41326,We moved out of our apartment to a landed home...,4,15


In [5]:
original_df.columns

Index(['Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'Description',
       'AdoptionSpeed', 'PID'],
      dtype='object')

In [6]:
def transform_data(train_data_fname, test_data_fname):
    def transform_columns(df):
        df = df.drop(["Description"], axis=1)
        df.Type = df.Type.replace({1: 'Dog', 2: 'Cat'})
        df.Gender = df.Gender.replace({1:'Male', 2:'Female', 3:'Mixed'})
        df.MaturitySize = df.MaturitySize.replace({1:'S', 2:'M', 3:'L', 4:'XL', 0:'N/A'})
        df.FurLength = df.FurLength.replace({1:'S', 2:'M', 3:'L', 0:'N/A'})
        df.Vaccinated = df.Vaccinated.replace({1:'T', 2:'N', 3:'N/A'})
        df.Dewormed = df.Dewormed.replace({1:'T', 2:'F', 3:'N/A'})
        df.Sterilized = df.Sterilized.replace({1:'T', 2:'F', 3:'N/A'})
        df.Health = df.Health.replace({1:'Healthy', 2: 'MinorInjury', 3:'SeriousInjury', 0: 'N/A'})
        df.Color1 = df.Color1.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        df.Color2 = df.Color2.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        df.Color3 = df.Color3.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        df.Breed1 = df.Breed1.replace(dict(list(zip(breed.BreedID, breed.BreedName)) + [(0, "N/A")]))
        df.Breed2 = df.Breed2.replace(dict(list(zip(breed.BreedID, breed.BreedName)) + [(0, "N/A")]))
        return df
    
    df_train = pd.read_csv(train_data_fname)
    df_train = transform_columns(df_train)
    df_test = pd.read_csv(test_data_fname)
    df_test = transform_columns(df_test)
    
    df = pd.concat([df_train, df_test], sort=True)

    # set dummy variables for everything
    # except from Age, Quantity, Fee
    df = pd.get_dummies(df)
    # get train and test back
    n = len(df_train)
    df_train = df.iloc[:n]
    df_test = df.iloc[n:]
    
    y = df_train['AdoptionSpeed']
    X = df_train.drop('AdoptionSpeed', axis=1)
    yy = None
    XX = df_test.drop('AdoptionSpeed', axis=1)

    return X, y, XX, yy

In [7]:
X, y, XX, yy = transform_data("../data/train.csv", "../data/test.csv")

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [9]:
XX.head()

Unnamed: 0,Age,Fee,PID,Quantity,State,Breed1_Abyssinian,Breed1_Affenpinscher,Breed1_Airedale Terrier,Breed1_Akita,Breed1_American Bulldog,...,MaturitySize_S,MaturitySize_XL,Sterilized_F,Sterilized_N/A,Sterilized_T,Type_Cat,Type_Dog,Vaccinated_N,Vaccinated_N/A,Vaccinated_T
0,1,0,1,1,41401,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
1,1,0,2,1,41326,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1
2,0,0,7,6,41326,0,0,0,0,0,...,0,0,1,0,0,0,1,1,0,0
3,12,0,9,1,41326,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
4,3,50,11,1,41326,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,1


In [10]:
from sklearn import linear_model
from sklearn.linear_model import SGDClassifier

np.random.seed(14) 

In [12]:
predictoras2=XX[['Age', 'Sterilized_T', 'Vaccinated_T', 'Gender_Male', 'Dewormed_T']]

In [13]:
predic_train=X_train[['Age', 'Sterilized_T', 'Vaccinated_T', 'Gender_Male', 'Dewormed_T']]

In [14]:
predic_valid=X_valid[['Age', 'Sterilized_T', 'Vaccinated_T', 'Gender_Male', 'Dewormed_T']]

In [18]:
forest=RandomForestClassifier(n_estimators=25)

In [19]:
forest2=forest.fit(predic_train, y_train)

In [20]:
print('Exactitud para entrenamiento: %.2f' %  accuracy_score(y_train, forest.predict(predic_train)))
print('Exactitud para validación: %.2f' % accuracy_score(y_valid, forest.predict(predic_valid)))

Exactitud para entrenamiento: 0.40
Exactitud para validación: 0.32


In [23]:
#Accuracy otra forma de calcularlo
forest2.score(predic_train, y_train)


0.3997569866342649

In [24]:
#Accuracy
forest2.score(predic_valid, y_valid)

0.3168503937007874

In [26]:
model = ExtraTreesClassifier()

In [28]:
model.fit(predic_train, y_train)



ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [29]:
print(model.feature_importances_)

[0.78727386 0.10506576 0.04270425 0.02950057 0.03545556]


In [30]:
predictions2 = forest2.predict(predictoras2)

In [31]:
predicciones_df = pd.DataFrame(predictions2, columns=['Pred'])
predicciones_df.head() # predicciones de las primeras 5 lineas

Unnamed: 0,Pred
0,1.0
1,2.0
2,1.0
3,4.0
4,2.0


In [32]:
yy = predictions2.astype(np.int)

In [33]:
submissionRF2 = pd.DataFrame(list(zip(XX.PID, yy.astype(np.int))), columns=["PID", "AdoptionSpeed"])

In [34]:
submissionRF2.to_csv("../data/submission6.csv", header=True, index=False)

In [35]:
submissionRF2['AdoptionSpeed'].value_counts()

2    2174
4    1338
1     498
3     374
0      27
Name: AdoptionSpeed, dtype: int64

In [36]:
submissionRF2.describe()

Unnamed: 0,PID,AdoptionSpeed
count,4411.0,4411.0
mean,7541.519157,2.566311
std,4369.698456,1.055957
min,1.0,0.0
25%,3704.5,2.0
50%,7560.0,2.0
75%,11344.0,4.0
max,14990.0,4.0
