# Alex's Scratchpad

### Feature Engineering Notes
* NLP on pet description (use given sentiment analysis first?)
* Dog breed type categorizing
* Pure vs mixed class
* Breed encoding by # datapoints > threshold

In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, RidgeClassifierCV
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import make_scorer
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from quadratic_weighted_kappa import quadratic_weighted_kappa

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train = pd.read_csv('data_minus_images/train.csv')

In [3]:
train.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,2,Nibble,3,299,0,1,1,7,0,1,...,1,1,100,41326,8480853f516546f6cf33aa88cd76c379,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,1,1,0,41401,3082c7125d8fb66f7dd4bff4192c8b14,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0
2,1,Brisco,1,307,0,1,2,7,0,2,...,1,1,0,41326,fa90fa5b1ee11c86938398b60abc32cb,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3
3,1,Miko,4,307,0,2,1,2,0,2,...,1,1,150,41401,9238e4f44c71a75282e62f7136c6b240,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2
4,1,Hunter,1,307,0,1,1,0,0,2,...,1,1,0,41326,95481e953f8aed9ec3d16fc4509537e8,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2


In [4]:
# Read sentiment file and extract score
def extract_sentiment(petID):
    try:
        with open('data_minus_images/train_sentiment/'+petID+'.json') as target:
            sentiment = json.load(target)
            magnitude, score = sentiment['documentSentiment'].values()
            language = sentiment['language']      
    except:
        magnitude, score, language = 0, 0, None
    return [magnitude, score, language]

In [5]:
extract_sentiment(train.iloc[0].PetID)

[2.4, 0.3, 'en']

In [6]:
def load_sentiments(pet_df):
    output = pet_df.set_index('PetID')
    output['magnitude'] = 0
    output['score'] = 0
    output['language'] = None
    for pet in output.index.values:
        sentiments = extract_sentiment(pet)
        output.loc[pet,['magnitude','score','language']] = sentiments
    return output.reset_index()

In [7]:
small_train = train.iloc[:100]
small_train

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,2,Nibble,3,299,0,1,1,7,0,1,...,1,1,100,41326,8480853f516546f6cf33aa88cd76c379,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,1,1,0,41401,3082c7125d8fb66f7dd4bff4192c8b14,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0
2,1,Brisco,1,307,0,1,2,7,0,2,...,1,1,0,41326,fa90fa5b1ee11c86938398b60abc32cb,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3
3,1,Miko,4,307,0,2,1,2,0,2,...,1,1,150,41401,9238e4f44c71a75282e62f7136c6b240,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2
4,1,Hunter,1,307,0,1,1,0,0,2,...,1,1,0,41326,95481e953f8aed9ec3d16fc4509537e8,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2
5,2,,3,266,0,2,5,6,0,2,...,1,1,0,41326,22fe332bf9c924d4718005891c63fbed,0,This is a stray kitten that came to my house. ...,d24c30b4b,2.0,2
6,2,BULAT,12,264,264,1,1,0,0,2,...,1,1,300,41326,1e0b5a458b5b77f5af581d57ebf570b3,0,anyone within the area of ipoh or taiping who ...,1caa6fcdb,3.0,1
7,1,Siu Pak & Her 6 Puppies,0,307,0,2,1,2,7,2,...,1,6,0,41326,1fba5f6e5480946254590d48f9c5198d,0,Siu Pak just give birth on 13/6/10 to 6puppies...,97aa9eeac,9.0,3
8,2,,2,265,0,2,6,0,0,2,...,1,1,0,41326,d8af7afece71334473575c9f70daf00d,0,"healthy and active, feisty kitten found in nei...",c06d167ca,6.0,1
9,2,Kitty,12,265,0,2,1,7,0,2,...,1,1,0,41326,1f3f36e4b18e94855b3e88af0852fdc4,0,"Very manja and gentle stray cat found, we woul...",7a0942d61,2.0,4


In [8]:
test = load_sentiments(small_train)
test.head()

Unnamed: 0,PetID,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,...,Fee,State,RescuerID,VideoAmt,Description,PhotoAmt,AdoptionSpeed,magnitude,score,language
0,86e1089a3,2,Nibble,3,299,0,1,1,7,0,...,100,41326,8480853f516546f6cf33aa88cd76c379,0,Nibble is a 3+ month old ball of cuteness. He ...,1.0,2,2.4,0.3,en
1,6296e909a,2,No Name Yet,1,265,0,1,1,2,0,...,0,41401,3082c7125d8fb66f7dd4bff4192c8b14,0,I just found it alone yesterday near my apartm...,2.0,0,0.7,-0.2,en
2,3422e4906,1,Brisco,1,307,0,1,2,7,0,...,0,41326,fa90fa5b1ee11c86938398b60abc32cb,0,Their pregnant mother was dumped by her irresp...,7.0,3,3.7,0.2,en
3,5842f1ff5,1,Miko,4,307,0,2,1,2,0,...,150,41401,9238e4f44c71a75282e62f7136c6b240,0,"Good guard dog, very alert, active, obedience ...",8.0,2,0.9,0.9,en
4,850a43f90,1,Hunter,1,307,0,1,1,0,0,...,0,41326,95481e953f8aed9ec3d16fc4509537e8,0,This handsome yet cute boy is up for adoption....,3.0,2,3.7,0.6,en


### Add Sentiments

In [9]:
add_sentiments = load_sentiments(train)

### Add dog breed group, purebred label

In [10]:
breeds = pd.read_csv('data_minus_images/breed_labels.csv')

In [11]:
encoding = pd.read_csv('breed_group_encoding.csv', header=None,encoding = "ISO-8859-1")
encoding.set_index(0,inplace=True)
group_dict = encoding.to_dict()[1]

In [12]:
# Add breedname for group processing
add_breeds = pd.merge(add_sentiments,breeds.drop(columns=['Type']),left_on='Breed1',right_on='BreedID').drop(columns=['BreedID'])
# Preload with empty columns
add_breeds['BreedGroup'] = 'MISC'
add_breeds['BreedGroupID'] = 8

In [13]:
add_breeds.columns

Index(['PetID', 'Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1',
       'Color2', 'Color3', 'MaturitySize', 'FurLength', 'Vaccinated',
       'Dewormed', 'Sterilized', 'Health', 'Quantity', 'Fee', 'State',
       'RescuerID', 'VideoAmt', 'Description', 'PhotoAmt', 'AdoptionSpeed',
       'magnitude', 'score', 'language', 'BreedName', 'BreedGroup',
       'BreedGroupID'],
      dtype='object')

In [14]:
group_ID=pd.DataFrame.from_dict({'MIXED':0,
                                  'HERDING':1,
                                  'HOUND':2,
                                  'TOY':3,
                                  'NON-SPORTING':4,
                                  'SPORTING':5,
                                  'TERRIER':6,
                                  'WORKING':7,
                                  'MISC':8,
                                  'FSS':9},orient='index',columns=['BreedID'])
group_ID

Unnamed: 0,BreedID
MIXED,0
HERDING,1
HOUND,2
TOY,3
NON-SPORTING,4
SPORTING,5
TERRIER,6
WORKING,7
MISC,8
FSS,9


In [15]:
# TODO: fix mixed breed label to account for multiple animals
def group_encoding(row):
    try:
        if row['Type']==2:
            row['BreedGroup']='CAT'
            row['BreedGroupID']=-1
        elif row['Breed1']==307 or row['Breed2']!=0:
            row['BreedGroup']='MIXED'
            row['BreedGroupID']=0
        else:
            group = group_dict[row['BreedName']]
            row['BreedGroup'] = group
            row['BreedGroupID'] = group_ID.loc[group].BreedID
    except:
        pass
    return row

In [16]:
add_groups = add_breeds.apply(group_encoding,axis=1)
# Add purebred flag
add_groups['purebred']=~((add_groups['BreedGroup']=='MIXED')|(add_groups['Breed2']!=0))*1

### Add Description Length

In [17]:
add_groups['desc_len'] = add_groups['Description'].apply(lambda x: len(str(x)))
add_groups['start_cap'] = add_groups['Description'].apply(lambda x: 1*(str(x)[0]!=str(x)[0].lower()))
add_groups[['Description','desc_len','start_cap']]

Unnamed: 0,Description,desc_len,start_cap
0,Nibble is a 3+ month old ball of cuteness. He ...,359,1
1,"Manja, comel ,dan memahami perasaan org.",40,1
2,"This tabby was found by the clinic, has been s...",707,1
3,Found her when she's just about a weeks old at...,130,1
4,My cat's offspring Cham is lively and loves pl...,99,1
5,"Looking for a playful, caring and affectionate...",869,1
6,Ashley and Ginger have been left to fend for t...,587,1
7,rescued a kitten at housing area. we are unabl...,144,0
8,Milk Milk & Mio Mio are cheerful kitten.They w...,163,1
9,KaKa is one of the 3 siblings with ChaoChao & ...,408,1


### Add Color Count, Name Flag

In [18]:
add_groups['num_colors']=1*(add_groups['Color1']>0)+1*(add_groups['Color2']>0)+1*(add_groups['Color3']>0)
add_groups['has_name']=~pd.isnull(add_groups['Name'])*1

### Dummify categoricals

In [19]:
dummify = [
    'Gender'
    ,'Type'
]
add_groups.columns

Index(['PetID', 'Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1',
       'Color2', 'Color3', 'MaturitySize', 'FurLength', 'Vaccinated',
       'Dewormed', 'Sterilized', 'Health', 'Quantity', 'Fee', 'State',
       'RescuerID', 'VideoAmt', 'Description', 'PhotoAmt', 'AdoptionSpeed',
       'magnitude', 'score', 'language', 'BreedName', 'BreedGroup',
       'BreedGroupID', 'purebred', 'desc_len', 'start_cap', 'num_colors',
       'has_name'],
      dtype='object')

In [20]:
dummies = pd.get_dummies(add_groups[dummify],columns=dummify)
add_dummies = pd.concat([add_groups,dummies],axis=1)

### Drop Columns and Fit Models

In [21]:
model_train = add_dummies
model_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14988 entries, 0 to 14987
Data columns (total 40 columns):
PetID            14988 non-null object
Type             14988 non-null int64
Name             13732 non-null object
Age              14988 non-null int64
Breed1           14988 non-null int64
Breed2           14988 non-null int64
Gender           14988 non-null int64
Color1           14988 non-null int64
Color2           14988 non-null int64
Color3           14988 non-null int64
MaturitySize     14988 non-null int64
FurLength        14988 non-null int64
Vaccinated       14988 non-null int64
Dewormed         14988 non-null int64
Sterilized       14988 non-null int64
Health           14988 non-null int64
Quantity         14988 non-null int64
Fee              14988 non-null int64
State            14988 non-null int64
RescuerID        14988 non-null object
VideoAmt         14988 non-null int64
Description      14976 non-null object
PhotoAmt         14988 non-null float64
AdoptionSpe

In [22]:
seed = 10
drop_columns = [
                'Name'
                ,'Breed1'
                ,'Breed2'
                ,'RescuerID'
                ,'Description'
                ,'PetID'
                ,'AdoptionSpeed'
                ,'language'
                ,'BreedGroup'
                ,'BreedName'
                ,'Color1'
                ,'Color2'
                ,'Color3'
                ,'State'
               ]
target = 'AdoptionSpeed' 
X = model_train.drop(columns=drop_columns+dummify)
y = model_train[target].astype('str')

sclr = StandardScaler()
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=seed)
X_train = sclr.fit_transform(X_train)
X_val = sclr.transform(X_val)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [23]:
def quadratic_weighted_kappa_scorer(estimator, X, y):
    pred_class = np.argmax(estimator.predict(X),axis=1)
    y_class = np.argmax(y,axis=1)
    score = quadratic_weighted_kappa(pred_class,y_class,min_rating=0,max_rating=4)
    return score

In [24]:
log = LogisticRegression(multi_class='ovr')
log.fit(X_train, y_train)
log_pred = log.predict(X_val)

ridge_cv = RidgeClassifierCV(alphas=(0.1,.5,1,2,5,10),scoring=quadratic_weighted_kappa_scorer, cv=10)
ridge_cv.fit(X_train, y_train)
ridge_pred = ridge_cv.predict(X_val)



In [25]:
gbc = GradientBoostingClassifier(subsample=0.9, max_features=X_train.shape[1]//3, random_state=seed, verbose=1,learning_rate=0.1, n_estimators=100, max_leaf_nodes=1000, min_samples_split=5)
gbc.fit(X_train, y_train)
gbc_pred = gbc.predict(X_val)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1       16628.4170          33.8151            8.78s
         2       16359.6968          26.9287            7.55s
         3       16119.0103          24.5354            8.68s
         4       15919.4016          19.9505            8.68s
         5       15732.6781          19.2223            9.96s
         6       15566.3465          17.0646            9.53s
         7       15423.3063          11.5131            9.63s
         8       15289.4637          11.5905           10.19s
         9       15192.3989          10.6279           10.33s
        10       15087.4590           8.5543           10.17s
        20       14410.3009           2.5288            7.67s
        30       14129.3355           0.7426            6.76s
        40       13864.3072          -0.5806            5.83s
        50       13670.0951          -0.6643            4.88s
        60       13494.7569           0.1713            3.96s
       

In [26]:
print(quadratic_weighted_kappa(log_pred,y_val))
print(quadratic_weighted_kappa(ridge_pred,y_val))
print(quadratic_weighted_kappa(gbc_pred,y_val))

0.21047077500935585
0.20115001207557204
0.3091648917675981


In [27]:
gbc_pred

array(['1', '4', '1', ..., '4', '4', '4'], dtype=object)

### Gradient Boosting hyperparameter tuning

Workflow notes:
* Define starting parameters: high learning rate (default = 0.1 is fine) and tree params
* Tune number of trees (trade-off with learning rate)
* Tune tree parameters


In [31]:
# Starting params
learning_rate = 0.1 # default
min_samples_split = int(X_train.shape[0]//100)
min_samples_leaf = int(min_samples_split//10)
max_depth = 5 # small sample size
max_features = 'sqrt'
subsample=0.8
first_gbc = GradientBoostingClassifier(min_samples_split=min_samples_split,
                                       min_samples_leaf=min_samples_leaf,
                                       max_depth=max_depth,
                                       max_features=max_features,
                                       subsample=subsample,
                                       random_state=seed)

In [32]:
def quadratic_weighted_kappa_gbc_scorer(estimator, X, y):
    pred_class = estimator.predict(X)
    y_class = y
    score = quadratic_weighted_kappa(pred_class,y_class,min_rating=0,max_rating=4)
    return score

In [33]:
trees = {'n_estimators':range(20,81,10)}#[20,30,40,50,60,70,80]}
GSCV_1 = GridSearchCV(estimator = first_gbc,
                      param_grid = trees,
                      scoring=quadratic_weighted_kappa_gbc_scorer,
                      n_jobs=-1, iid=False, cv=5, verbose=1)
GSCV_1.fit(X_train, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   49.9s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=11, min_sa...      subsample=0.8, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'n_estimators': range(20, 81, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=<function quadratic_weighted_kappa_gbc_scorer at 0x000001BF09AE8620>,
       verbose=1)

In [34]:
# Optimal tree count:
GSCV_1.best_params_, GSCV_1.best_score_

({'n_estimators': 80}, 0.32883233131970924)

In [35]:
n_estimators = GSCV_1.best_params_['n_estimators']
forest_sizes = {'max_depth':range(5,16,2),
                'min_samples_split':range(200,1001,200)}
second_gbc = GradientBoostingClassifier(min_samples_split=min_samples_split,
                                       min_samples_leaf=min_samples_leaf,
                                       n_estimators=n_estimators,
                                       max_depth=max_depth,
                                       max_features=max_features,
                                       subsample=subsample,
                                       random_state=seed)
GSCV_2 = GridSearchCV(estimator = second_gbc,
                      param_grid = forest_sizes,
                      scoring=quadratic_weighted_kappa_gbc_scorer,
                      n_jobs=-1, iid=False, cv=5, verbose=1)
GSCV_2.fit(X_train,y_train)
GSCV_2.best_params_, GSCV_2.best_score_

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  6.6min finished


({'max_depth': 11, 'min_samples_split': 600}, 0.3358124253432269)

In [36]:
# Tune features and leaf_sizes
max_depth, min_samples_split = GSCV_2.best_params_.values()
tree_params = {'max_features':range(7,20,4),'min_samples_leaf':range(30,71,10)}
last_gbc = GradientBoostingClassifier(min_samples_split=min_samples_split,
                                       n_estimators=n_estimators,
                                       max_depth=max_depth,
                                       subsample=subsample,
                                       random_state=seed)
GSCV_3 = GridSearchCV(estimator = last_gbc,
                      param_grid = tree_params,
                      scoring=quadratic_weighted_kappa_gbc_scorer,
                      n_jobs=-1, iid=False, cv=5, verbose=1)
GSCV_3.fit(X_train,y_train)
GSCV_3.best_params_, GSCV_3.best_score_

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  9.0min finished


({'max_features': 11, 'min_samples_leaf': 40}, 0.33720269988507134)

In [37]:
max_features, min_samples_leaf = GSCV_3.best_params_.values()