# Modelling PetFinder.my with Logistic Regression

Model: (Multinomial) Logistic Regression.

Evaluation metric: Quadratic Kappa

## References

* [Mathematical Explanation of Naive Bayes](https://towardsdatascience.com/a-mathematical-explanation-of-naive-bayes-in-5-minutes-44adebcdb5f8/)
* [Gaussian NB Guide](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB)
* [Multinomial NB Guide](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB)
* [Naive Bayes: A Baseline Model Guide](https://www.kdnuggets.com/2019/04/naive-bayes-baseline-model-machine-learning-classification-performance.html/2)
* [Multinomial Naive Bayes Classifier for Text Analysis](https://towardsdatascience.com/multinomial-naive-bayes-classifier-for-text-analysis-python-8dd6825ece67)
* [Naive Bayes Classification using Scikit-learn](https://www.datacamp.com/community/tutorials/naive-bayes-scikit-learn)

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.naive_bayes import MultinomialNB

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
SOURCE_DATA_FOLDER = "../data/source/petfinder-adoption-prediction/"
TRAIN_IMAGE_FOLDER = SOURCE_DATA_FOLDER + "train_images/"
TRAIN_METADATA_FOLDER = SOURCE_DATA_FOLDER + "train_metadata/"
TRAIN_SENTIMENT_FOLDER = SOURCE_DATA_FOLDER + "train_sentiment/"

BREED_LABELS = SOURCE_DATA_FOLDER + "breed_labels.csv"
COLOR_LABELS = SOURCE_DATA_FOLDER + "color_labels.csv"
STATE_LABELS = SOURCE_DATA_FOLDER + "state_labels.csv"
TRAIN_TABULAR = SOURCE_DATA_FOLDER + "train/train.csv"
TEST_TABULAR = SOURCE_DATA_FOLDER + "test/test.csv"

## Load Data

In [4]:
train_tabular_df = pd.read_csv(TRAIN_TABULAR)
test_tabular_df = pd.read_csv(TEST_TABULAR)

## Pre-Processing

In [5]:
def preprocessing(df):

    transformed_df = df.copy()

    # Change Name to 1 - has name, 0 - has no name
    transformed_df['Name'] = transformed_df['Name'].fillna(0)
    transformed_df['Name'].replace({"No Name": 0, "No Name Yet": 0, "Unknown": 0},inplace=True)
    transformed_df.loc[transformed_df['Name'] !=0, 'Name'] = 1

    # Create age bins
    age_bins = pd.cut(transformed_df['Age'], bins=[0,3,6,12,24,300], include_lowest=True)
    age_bin_dummies = pd.get_dummies(age_bins)
    age_bin_dummies.columns = ["is_age_0_3", "is_age_3_6", "is_age_6_12","is_age_12_24", "is_age_24_300"]
    transformed_df = pd.concat([transformed_df, age_bin_dummies], axis=1)

    # One-hot encode dummy variables
    dummy_cols = [
        'Type',
        'Gender',
        'MaturitySize',
        'Vaccinated',
        'Dewormed',
        'Sterilized',
        'Health',
        'FurLength',
        'State',
        'Breed1',
        'Breed2',
        'Color1',
        'Color2',
        'Color3',
    ]

    transformed_df = pd.get_dummies(transformed_df, columns=dummy_cols)
    
    # Get rid of all non-encoded columns except Quantity
    transformed_df.drop(columns=["VideoAmt", "PhotoAmt", "RescuerID", "Description", "PetID", "Age", "Fee"], 
                        inplace=True)
    
    return transformed_df

## Final model: Multinomial NB with One-Hot Encoding

In [6]:
first_train = preprocessing(train_tabular_df)
first_test = preprocessing(test_tabular_df)
first_test["AdoptionSpeed"] = None
train, test = first_train.align(first_test, join='inner', axis=1)
test.drop(columns=["AdoptionSpeed"], inplace=True)

In [7]:
X_train = train.drop(columns=["AdoptionSpeed"])
y_train = train["AdoptionSpeed"]

In [8]:
# Now do GridSearchCV on the model
params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0, ], }
quadratic_kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')
grid = GridSearchCV(MultinomialNB(), params, refit = True, verbose = True, scoring=quadratic_kappa_scorer)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    5.4s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [0.01, 0.1, 0.5, 1.0, 10.0]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(cohen_kappa_score, weights=quadratic),
             verbose=True)

In [9]:
print(grid.best_params_)

{'alpha': 0.01}


In [10]:
# Run through model
clf = MultinomialNB(alpha=0.1)
clf.fit(X_train, y_train)
y_pred = clf.predict(test)

print("Training score:", cohen_kappa_score(clf.predict(X_train), y_train, weights='quadratic'))

Training score: 0.29174263004198075


In [11]:
submission = pd.DataFrame.from_dict({'PetID': test_tabular_df['PetID'],
                                     'AdoptionSpeed': y_pred})
print(submission.shape)
submission.head()

(3972, 2)


Unnamed: 0,PetID,AdoptionSpeed
0,e2dfc2935,2
1,f153b465f,4
2,3c90f3f54,1
3,e02abc8a3,4
4,09f0df7d1,4


In [12]:
submission.to_csv("../data/final/jinhao-submission.csv", index=False)

## ALL ROUGH TESTING/WORKING CODE BELOW

In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from scipy import stats
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import classification_report

### 1. GaussianNB

In [28]:
train_df = pd.read_csv(TRAIN_TABULAR)

# Cleaning
# Add quantity of RescuerID
train_df["RescuerID_count"] = train_df.groupby("RescuerID")["RescuerID"].transform("count")
# sns.countplot(x="RescuerID_count", data=train_processed)
# plt.title("RescuerID Count Distribution")

# Change Name to 1 - has name, 0 - has no name
train_df['Name'] = train_df['Name'].fillna(0)
train_df['Name'].replace({"No Name": 0, "No Name Yet": 0, "Unknown": 0},inplace=True)
train_df.loc[train_df['Name'] !=0, 'Name'] = 1

# Outlier removal
train_df = train_df[(np.abs(stats.zscore(train_df['Age'])) < 3)]
train_df = train_df[(np.abs(stats.zscore(train_df['PhotoAmt'])) < 3)]
train_df = train_df[(np.abs(stats.zscore(train_df['Quantity'])) < 3)]

# Normalise numerical columns
columns_to_normalize = ['Age', 'MaturitySize', 'FurLength', 'Quantity', 'PhotoAmt', 'Fee', 'VideoAmt']
x = train_df[columns_to_normalize].values
scaler = MinMaxScaler()
x_scaled = scaler.fit_transform(x)
train_temp = pd.DataFrame(x_scaled, columns=columns_to_normalize, index=train_df.index)
train_df[columns_to_normalize] = train_temp

# Get rid of unused columns
data_all = train_df.drop(columns=["RescuerID", "Description", "PetID"])

# Consider removing non-categorical rows -> Age, Quantity, Fee, RescuerID_count, VideoAmt, PhotoAmt
data_categorical = train_df.drop(columns=["RescuerID", "Description", "PetID", "Name", "VideoAmt", "PhotoAmt",
                                          "Age", "Quantity", "Fee", "RescuerID_count"])

### 1.1: data_all:

In [29]:
# Split into train test 20%
train, test = train_test_split(data_all, test_size=0.2, random_state=42, shuffle=True)
X_train = train.drop(columns=["AdoptionSpeed"])
y_train = train["AdoptionSpeed"]
X_test = test.drop(columns=["AdoptionSpeed"])
y_test = test["AdoptionSpeed"]

# Run through model
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Model Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Kappa:", cohen_kappa_score(y_test, y_pred, weights='quadratic'))
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Accuracy: 0.3445679455001793
Kappa: 0.170580504512022


Predicted,0,1,2,3,4,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,25,4,8,52,90
1,0,147,132,40,278,597
2,4,134,155,110,359,762
3,1,79,127,105,270,582
4,4,82,57,62,553,758
All,10,467,475,325,1512,2789


In [30]:
# Now do GridSearchCV on the model
params = {'var_smoothing': np.logspace(0,-9, num=100)}
quadratic_kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')
grid = GridSearchCV(GaussianNB(), params, refit = True, verbose = True, scoring=quadratic_kappa_scorer)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   11.0s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=GaussianNB(priors=None, var_smoothing=1e-09),
             iid='deprecated', n_jobs=None,
             param_grid={'var_smoothing': array([1.00000000e+00, 8.11130831e-01, 6.57933225e-01, 5.33669923e-01,
       4.32876128e-01, 3.51119173e-01, 2.84803587e-01, 2.31012970e-01,
       1.87381742e-01, 1.51991108e-01, 1.23284674e-01, 1.00000000e-01,
       8.11130831e-02,...
       2.31012970e-08, 1.87381742e-08, 1.51991108e-08, 1.23284674e-08,
       1.00000000e-08, 8.11130831e-09, 6.57933225e-09, 5.33669923e-09,
       4.32876128e-09, 3.51119173e-09, 2.84803587e-09, 2.31012970e-09,
       1.87381742e-09, 1.51991108e-09, 1.23284674e-09, 1.00000000e-09])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(cohen_kappa_score, weights=quadratic),
             verbose=True)

In [31]:
print(grid.best_params_)

{'var_smoothing': 3.5111917342151277e-07}


In [32]:
# Run through model optimal
gnb = GaussianNB(var_smoothing=3.5111917342151277e-07)
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Model Accuracy optimal
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Kappa:", cohen_kappa_score(y_test, y_pred, weights='quadratic'))
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Accuracy: 0.3607027608461814
Kappa: 0.1976118079639656


Predicted,0,1,2,3,4,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,27,7,6,49,90
1,2,177,140,44,234,597
2,0,155,188,104,315,762
3,2,91,130,105,254,582
4,2,107,72,42,535,758
All,7,557,537,301,1387,2789


### 1.2: data_categorical:


In [33]:
# Split into train test 20%
train, test = train_test_split(data_categorical, test_size=0.2, random_state=42, shuffle=True)
X_train = train.drop(columns=["AdoptionSpeed"])
y_train = train["AdoptionSpeed"]
X_test = test.drop(columns=["AdoptionSpeed"])
y_test = test["AdoptionSpeed"]

# Run through model
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Model Accuracy
print("Kappa:", cohen_kappa_score(y_test, y_pred, weights='quadratic'))
print("Accuracy:", accuracy_score(y_test, y_pred))
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Kappa: 0.20178287543869167
Accuracy: 0.3363212621011115


Predicted,0,1,2,3,4,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,27,30,8,24,90
1,1,141,310,40,105,597
2,0,134,411,59,158,762
3,2,88,276,63,153,582
4,3,84,304,45,322,758
All,7,474,1331,215,762,2789


In [34]:
# Now do GridSearchCV on the model
params = {'var_smoothing': np.logspace(0,-9, num=100)}
quadratic_kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')
grid = GridSearchCV(GaussianNB(), params, refit = True, verbose = True, scoring=quadratic_kappa_scorer)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    3.3s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=GaussianNB(priors=None, var_smoothing=1e-09),
             iid='deprecated', n_jobs=None,
             param_grid={'var_smoothing': array([1.00000000e+00, 8.11130831e-01, 6.57933225e-01, 5.33669923e-01,
       4.32876128e-01, 3.51119173e-01, 2.84803587e-01, 2.31012970e-01,
       1.87381742e-01, 1.51991108e-01, 1.23284674e-01, 1.00000000e-01,
       8.11130831e-02,...
       2.31012970e-08, 1.87381742e-08, 1.51991108e-08, 1.23284674e-08,
       1.00000000e-08, 8.11130831e-09, 6.57933225e-09, 5.33669923e-09,
       4.32876128e-09, 3.51119173e-09, 2.84803587e-09, 2.31012970e-09,
       1.87381742e-09, 1.51991108e-09, 1.23284674e-09, 1.00000000e-09])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(cohen_kappa_score, weights=quadratic),
             verbose=True)

In [35]:
print(grid.best_params_)

{'var_smoothing': 6.579332246575682e-08}


In [36]:
# Run through model v2
gnb = GaussianNB(var_smoothing=6.579332246575682e-08)
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Model Accuracy v2
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Kappa:", cohen_kappa_score(y_test, y_pred, weights='quadratic'))
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Accuracy: 0.33381140193617787
Kappa: 0.20163722884283353


Predicted,0,1,2,3,4,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,27,30,8,24,90
1,1,133,318,40,105,597
2,0,133,411,59,159,762
3,2,86,276,63,155,582
4,3,81,306,45,323,758
All,7,460,1341,215,766,2789


### 2. Multinomial NB with NLP


In [45]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

In [46]:
def transform_string_merge(df):
    # This function turns all the relevent columns to their string counterparts and combines everything into one string
    # First transform all the relevant numerical columns to strings
    df["Type"] = df["Type"].apply(lambda x: "Dog" if x == 1 else "Cat")
    
    breed = pd.read_csv("/Users/Jinhao 1/Documents/git/COMP9417-GroupProject21T2/data/source/petfinder-adoption-prediction/BreedLabels.csv")
    breed_dict = dict(zip(breed["BreedID"], breed["BreedName"]))
    df["Breed1"] = df["Breed1"].map(breed_dict)
    df["Breed2"] = df["Breed2"].map(breed_dict)
    
    gender_dict = {1:"Male", 2:"Female", 3:"Mixed"}
    df["Gender"] = df["Gender"].map(gender_dict)
    
    color = pd.read_csv("/Users/Jinhao 1/Documents/git/COMP9417-GroupProject21T2/data/source/petfinder-adoption-prediction/ColorLabels.csv")
    color_dict = dict(zip(color["ColorID"], color["ColorName"]))
    df["Color1"] = df["Color1"].map(color_dict)
    df["Color2"] = df["Color2"].map(color_dict)
    df["Color3"] = df["Color3"].map(color_dict)
    
    maturity_dict = {1:"Small", 2:"Medium", 3:"Large", 4:"Extra Large", 0:"Not Specified"}
    df["MaturitySize"] = df["MaturitySize"].map(maturity_dict)
    
    fur_dict = {1:"Short", 2:"Medium", 3:"Long", 0:"Not Specified"}
    df["FurLength"] = df["FurLength"].map(fur_dict)

    binary_dict = {1:"Yes", 2:"No", 3:"Not Sure"}
    df["Vaccinated"] = df["Vaccinated"].map(binary_dict)
    df["Dewormed"] = df["Dewormed"].map(binary_dict)
    df["Sterilized"] = df["Sterilized"].map(binary_dict)

    health_dict = {1:"Healthy", 2:"Minor Injury", 3:"Serious Injury", 0:"Not Specified"}
    df["Health"] = df["Health"].map(health_dict)

    state = pd.read_csv("/Users/Jinhao 1/Documents/git/COMP9417-GroupProject21T2/data/source/petfinder-adoption-prediction/StateLabels.csv")
    state_dict = dict(zip(state["StateID"], state["StateName"]))
    df["State"] = df["State"].map(state_dict)

    # More clean-up
    df = df.fillna('')
    df = df.where(pd.notnull(df), None)
    
    # Turn all columns into strings and then combine in new column as one string
    all_columns = list(df) # Creates list of all column headers
    df[all_columns] = df[all_columns].astype(str)
    
    # Combine all relevant columns as one string
    df["x_string"] = df[['Name', 
                         'Age', 
                         'Breed1', 
                         'Breed2', 
                         'Gender', 
                         'Color1', 
                         'Color2', 
                         'Color3', 
                         'MaturitySize', 
                         'FurLength', 
                         'Vaccinated', 
                         'Dewormed', 
                         'Sterilized', 
                         'Health', 
                         'Fee', 
                         'State', 
                         'Description']].agg(' '.join, axis=1)
    
    df = df[df.x_string.map(lambda x: x.isascii())]
    df_new = df[["x_string", "AdoptionSpeed"]]
    
    return df_new

In [47]:
def tokenize(data):
    tokenized_docs = [word_tokenize(doc) for doc in data]
    alpha_tokens = [[t.lower() for t in doc if t.isalpha() == True] for doc in tokenized_docs]
    lemmatizer = WordNetLemmatizer()
    lem_tokens = [[lemmatizer.lemmatize(alpha) for alpha in doc] for doc in alpha_tokens]
    X_stem_as_string = [" ".join(x_t) for x_t in lem_tokens]
    return X_stem_as_string

In [48]:
df = pd.read_csv(TRAIN_TABULAR)

# Use transform function
df_transformed = transform_string_merge(df)

# Split processed df into train and split
train, test = train_test_split(df_transformed, test_size=0.2, random_state=42, shuffle=True)

In [49]:
vct = CountVectorizer(stop_words='english', lowercase=False)
svd = TruncatedSVD(n_components=200, random_state=42)
tfvec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), lowercase=False)

preprocessing_pipe = Pipeline([
    ('vectorizer', tfvec),
    ('svd', svd)   
])

In [50]:
X_train_tk = tokenize(train["x_string"])
X_test_tk = tokenize(test["x_string"])

In [51]:
lsa_train = preprocessing_pipe.fit_transform(X_train_tk)
lsa_train.shape

(11438, 200)

In [52]:
# Multinomial NB
mb = MultinomialNB(alpha=1)
pipe = Pipeline([
    ('vectorizer', tfvec),
    ('mb', mb)
])
pipe.fit(X_train_tk, train["AdoptionSpeed"])
y_pred = pipe.predict(X_test_tk)

# Model Accuracy
print("Accuracy:", accuracy_score(test["AdoptionSpeed"], y_pred))
print("Kappa:", cohen_kappa_score(test["AdoptionSpeed"], y_pred, weights='quadratic'))
print(classification_report(test["AdoptionSpeed"], y_pred))

Accuracy: 0.39090909090909093
Kappa: 0.2177138737405876
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        70
           1       0.46      0.04      0.07       583
           2       0.36      0.55      0.43       779
           3       0.62      0.07      0.13       645
           4       0.40      0.79      0.54       783

    accuracy                           0.39      2860
   macro avg       0.37      0.29      0.23      2860
weighted avg       0.44      0.39      0.31      2860



  _warn_prf(average, modifier, msg_start, len(result))


### 3. Multinomial NB with one-hot encoding


In [53]:
df = pd.read_csv(TRAIN_TABULAR)

transformed_df = df.copy()

# Change Name to 1 - has name, 0 - has no name
transformed_df['Name'] = transformed_df['Name'].fillna(0)
transformed_df['Name'].replace({"No Name": 0, "No Name Yet": 0, "Unknown": 0},inplace=True)
transformed_df.loc[transformed_df['Name'] !=0, 'Name'] = 1

# RescuerID Counts
rescuerid_counts = pd.DataFrame(transformed_df.groupby(["RescuerID"]).size(), columns=["RescuerID_Count"]).reset_index()
transformed_df = pd.merge(transformed_df, rescuerid_counts, on="RescuerID")

# create age bins
age_bins = pd.cut(transformed_df['Age'], bins=[0,3,6,12,24,300], include_lowest=True)
age_bin_dummies = pd.get_dummies(age_bins)
age_bin_dummies.columns = ["is_age_0_3", "is_age_3_6", "is_age_6_12","is_age_12_24", "is_age_24_300"]
transformed_df = pd.concat([transformed_df, age_bin_dummies], axis=1)

# one-hot encode dummy variables

dummy_cols = [
    'Type',
    'Gender',
    'MaturitySize',
    'Vaccinated',
    'Dewormed',
    'Sterilized',
    'Health',
    'FurLength',
    'State',
    'Breed1',
    'Breed2',
    'Color1',
    'Color2',
    'Color3',
]

transformed_df = pd.get_dummies(transformed_df, columns=dummy_cols)

In [54]:
transformed_df.drop(columns=["VideoAmt", "PhotoAmt", "RescuerID", "Description", "PetID",
                             "Age", "Fee", "RescuerID_Count"], inplace=True)

# Included - Quantity

In [55]:
# Split into train test 20%
train, test = train_test_split(transformed_df, test_size=0.2, random_state=42, shuffle=True)
X_train = train.drop(columns=["AdoptionSpeed"])
y_train = train["AdoptionSpeed"]
X_test = test.drop(columns=["AdoptionSpeed"])
y_test = test["AdoptionSpeed"]

# Run through model
clf_all = MultinomialNB(alpha=1)
clf_all.fit(X_train, y_train)
y_pred = clf_all.predict(X_test)

# Model Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Kappa:", cohen_kappa_score(y_test, y_pred, weights='quadratic'))
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Accuracy: 0.35978659553184394
Kappa: 0.2562594914543773


Predicted,0,1,2,3,4,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,30,20,14,16,81
1,2,206,208,70,132,618
2,5,193,298,89,225,810
3,5,121,222,116,185,649
4,5,114,181,83,458,841
All,18,664,929,372,1016,2999


In [56]:
# Now do GridSearchCV on the model
params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0, ], }
quadratic_kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')
grid = GridSearchCV(MultinomialNB(), params, refit = True, verbose = 3, scoring=quadratic_kappa_scorer)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] alpha=0.01 ......................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......................... alpha=0.01, score=0.261, total=   0.3s
[CV] alpha=0.01 ......................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] .......................... alpha=0.01, score=0.279, total=   0.2s
[CV] alpha=0.01 ......................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV] .......................... alpha=0.01, score=0.243, total=   0.2s
[CV] alpha=0.01 ......................................................
[CV] .......................... alpha=0.01, score=0.279, total=   0.3s
[CV] alpha=0.01 ......................................................
[CV] .......................... alpha=0.01, score=0.286, total=   0.3s
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.259, total=   0.2s
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.278, total=   0.2s
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.246, total=   0.2s
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.282, total=   0.2s
[CV] alpha=0.1 .......................................................
[CV] .

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    6.9s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [0.01, 0.1, 0.5, 1.0, 10.0]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(cohen_kappa_score, weights=quadratic),
             verbose=3)

In [57]:
print(grid.best_params_)

{'alpha': 0.1}


In [58]:
# Run through model v2
clf_all = MultinomialNB(alpha=0.1)
clf_all.fit(X_train, y_train)
y_pred = clf_all.predict(X_test)

# Model Accuracy v2
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Kappa:", cohen_kappa_score(y_test, y_pred, weights='quadratic'))
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Accuracy: 0.3567855951983995
Kappa: 0.25695836024345053


Predicted,0,1,2,3,4,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,30,20,16,15,81
1,8,203,206,75,126,618
2,5,194,291,97,223,810
3,2,124,213,125,185,649
4,7,116,178,89,451,841
All,22,667,908,402,1000,2999


### 4. Bernoulli NB with one-hot encoding


In [59]:
 from sklearn.naive_bayes import BernoulliNB

In [60]:
df = pd.read_csv(TRAIN_TABULAR)

transformed_df = df.copy()

# Change Name to 1 - has name, 0 - has no name
transformed_df['Name'] = transformed_df['Name'].fillna(0)
transformed_df['Name'].replace({"No Name": 0, "No Name Yet": 0, "Unknown": 0},inplace=True)
transformed_df.loc[transformed_df['Name'] !=0, 'Name'] = 1

# RescuerID Counts
rescuerid_counts = pd.DataFrame(transformed_df.groupby(["RescuerID"]).size(), columns=["RescuerID_Count"]).reset_index()
transformed_df = pd.merge(transformed_df, rescuerid_counts, on="RescuerID")

# create age bins
age_bins = pd.cut(transformed_df['Age'], bins=[0,3,6,12,24,300], include_lowest=True)
age_bin_dummies = pd.get_dummies(age_bins)
age_bin_dummies.columns = ["is_age_0_3", "is_age_3_6", "is_age_6_12","is_age_12_24", "is_age_24_300"]
transformed_df = pd.concat([transformed_df, age_bin_dummies], axis=1)

# one-hot encode dummy variables

dummy_cols = [
    'Type',
    'Gender',
    'MaturitySize',
    'Vaccinated',
    'Dewormed',
    'Sterilized',
    'Health',
    'FurLength',
    'State',
    'Breed1',
    'Breed2',
    'Color1',
    'Color2',
    'Color3',
]

transformed_df = pd.get_dummies(transformed_df, columns=dummy_cols)

In [61]:
transformed_df.drop(columns=["VideoAmt", "PhotoAmt", "RescuerID", "Description", "PetID",
                             "Age", "Fee", "RescuerID_Count"], inplace=True)

# Included - Quantity

In [62]:
# Split into train test 20%
train, test = train_test_split(transformed_df, test_size=0.2, random_state=42, shuffle=True)
X_train = train.drop(columns=["AdoptionSpeed"])
y_train = train["AdoptionSpeed"]
X_test = test.drop(columns=["AdoptionSpeed"])
y_test = test["AdoptionSpeed"]

# Run through model
clf_all = BernoulliNB(alpha=1)
clf_all.fit(X_train, y_train)
y_pred = clf_all.predict(X_test)

# Model Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)
print("Kappa:", cohen_kappa_score(y_test, y_pred, weights='quadratic'))

Accuracy: 0.36178726242080694
Kappa: 0.23784268585208956


In [63]:
# Now do GridSearchCV on the model
params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0, ], }
quadratic_kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')
grid = GridSearchCV(MultinomialNB(), params, refit = True, verbose = 3, scoring=quadratic_kappa_scorer)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] alpha=0.01 ......................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......................... alpha=0.01, score=0.261, total=   0.3s
[CV] alpha=0.01 ......................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] .......................... alpha=0.01, score=0.279, total=   0.2s
[CV] alpha=0.01 ......................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV] .......................... alpha=0.01, score=0.243, total=   0.3s
[CV] alpha=0.01 ......................................................
[CV] .......................... alpha=0.01, score=0.279, total=   0.5s
[CV] alpha=0.01 ......................................................
[CV] .......................... alpha=0.01, score=0.286, total=   0.3s
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.259, total=   0.2s
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.278, total=   0.2s
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.246, total=   0.3s
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.282, total=   0.2s
[CV] alpha=0.1 .......................................................
[CV] .

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    6.4s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [0.01, 0.1, 0.5, 1.0, 10.0]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(cohen_kappa_score, weights=quadratic),
             verbose=3)

In [64]:
print(grid.best_params_)

{'alpha': 0.1}


In [65]:
# Run through model
clf_all = BernoulliNB(alpha=0.1)
clf_all.fit(X_train, y_train)
y_pred = clf_all.predict(X_test)

# Model Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Kappa:", cohen_kappa_score(y_test, y_pred, weights='quadratic'))
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Accuracy: 0.35745248416138714
Kappa: 0.24366698325412572


Predicted,0,1,2,3,4,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3,29,17,14,18,81
1,15,236,161,77,129,618
2,11,225,256,106,212,810
3,6,149,167,133,194,649
4,15,147,148,87,444,841
All,50,786,749,417,997,2999
