## Pet finder Prediction

In [1]:
import gc
import glob
import os

import json
import matplotlib.pyplot as plt
import pprint
import numpy as np
import pandas as pd
from sklearn.externals.joblib import Parallel, delayed
from tqdm import tqdm_notebook
from PIL import Image
#import cv2

%matplotlib inline



In [2]:
import cv2

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
pd.options.display.max_rows= 128
pd.options.display.max_columns = 128

In [5]:
plt.rcParams['figure.figsize'] = (12,9)

In [6]:
os.listdir('./input/petfinder-adoption-prediction/test')

['sample_submission.csv', 'test.csv']

In [7]:
df_train = pd.read_csv('./input/petfinder-adoption-prediction/train/train.csv')
df_test  = pd.read_csv('./input/petfinder-adoption-prediction/test/test.csv')
sample_submission = pd.read_csv('./input/petfinder-adoption-prediction/test/sample_submission.csv')

##### load mapping dictionaries:

In [8]:
labels_breed = pd.read_csv('./input/petfinder-adoption-prediction/breed_labels.csv')
labels_state = pd.read_csv('./input/petfinder-adoption-prediction/color_labels.csv')
labels_color = pd.read_csv('./input/petfinder-adoption-prediction/state_labels.csv')

In [9]:
gc.collect()

11

In [10]:
train_image_files = glob.glob('./input/petfinder-adoption-prediction/train_images/*jpg')
train_metadata_files = glob.glob('./input/petfinder-adoption-prediction/train_metadata/*json')
train_sentiment_files = glob.glob('./input/petfinder-adoption-prediction/train_sentiment/*json')


test_image_files     = glob.glob('./input/petfinder-adoption-prediction/test_images/*jpg')
test_metadata_files  = glob.glob('./input/petfinder-adoption-prediction/test_metadata/*json')
test_sentiment_files = glob.glob('./input/petfinder-adoption-prediction/test_sentiment/*json')

In [11]:
df_train.columns

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed'],
      dtype='object')

##### For Training Set

In [12]:
train_image_files[-1]

'./input/petfinder-adoption-prediction/train_images\\fffd9b5a8-1.jpg'

In [13]:
## Image file
df_train_id = df_train['PetID']
df_train_imgs = pd.DataFrame(train_image_files)  ## from glob.glob
df_train_imgs.columns = ['image_filenames']

df_train_pets = df_train_imgs['image_filenames'].apply(lambda x: x.split('\\')[-1].split('-')[0])
df_train_imgs = df_train_imgs.assign(PetID=df_train_pets)


print('fraction pets with images  : {:0.2f} %' .format(df_train_imgs['PetID'].nunique()*100/ df_train['PetID'].nunique()))

fraction pets with images  : 97.73 %


In [14]:
## Manually verify
df_train_imgs.head(2)

Unnamed: 0,image_filenames,PetID
0,./input/petfinder-adoption-prediction/train_im...,0008c5398
1,./input/petfinder-adoption-prediction/train_im...,0008c5398


In [15]:
train_metadata_files[-1]

'./input/petfinder-adoption-prediction/train_metadata\\fffd9b5a8-1.json'

In [16]:
## Metadata files

df_train_id = df_train['PetID']
df_train_metadata = pd.DataFrame(train_metadata_files)  ## from glob.glob
df_train_metadata.columns = ['metadata_filenames']

df_train_pets = df_train_metadata['metadata_filenames'].apply(lambda x: x.split('\\')[-1].split('-')[0])
df_train_metadata['PetID'] = df_train_pets


print('fraction pets with metadata  : {:0.2f} %' .format(df_train_metadata['PetID'].nunique()*100/ df_train['PetID'].nunique()))

fraction pets with metadata  : 97.73 %


In [17]:
## Manually verify
df_train_metadata.head(2)

Unnamed: 0,metadata_filenames,PetID
0,./input/petfinder-adoption-prediction/train_me...,0008c5398
1,./input/petfinder-adoption-prediction/train_me...,0008c5398


In [18]:
## Sentiment files

df_train_id = df_train['PetID']
df_train_sentiment = pd.DataFrame(train_sentiment_files)  ## from glob.glob
df_train_sentiment.columns = ['sentiment_filenames']

df_train_pets = df_train_sentiment['sentiment_filenames'].apply(lambda x: x.split('\\')[-1].split('.')[0])
df_train_sentiment['PetID'] = df_train_pets


print('fraction pets with sentiment  : {:0.2f} %' 
      .format(df_train_sentiment['PetID'].nunique()*100/ df_train['PetID'].nunique()))

fraction pets with sentiment  : 96.32 %


In [19]:
## Manually verify
df_train_sentiment.head(2)

Unnamed: 0,sentiment_filenames,PetID
0,./input/petfinder-adoption-prediction/train_se...,0008c5398
1,./input/petfinder-adoption-prediction/train_se...,000a290e4


##### For Test Set

In [20]:
## Image file
df_test_id = df_test['PetID']
df_test_imgs = pd.DataFrame(test_image_files)  ## from glob.glob
df_test_imgs.columns = ['image_filenames']

df_test_pets = df_test_imgs['image_filenames'].apply(lambda x: x.split('\\')[-1].split('-')[0])
df_test_imgs = df_test_imgs.assign(PetID=df_test_pets)


print('fraction pets with images  : {:0.2f} %' .format(df_test_imgs['PetID'].nunique()*100/ df_test['PetID'].nunique()))

fraction pets with images  : 96.78 %


In [21]:
## Metadata files

df_test_id = df_test['PetID']
df_test_metadata = pd.DataFrame(test_metadata_files)  ## from glob.glob
df_test_metadata.columns = ['metadata_filenames']

df_test_pets = df_test_metadata['metadata_filenames'].apply(lambda x: x.split('\\')[-1].split('-')[0])
df_test_metadata['PetID'] = df_test_pets


print('fraction pets with metadata  : {:0.2f} %' .format(df_test_metadata['PetID'].nunique()*100/ df_test['PetID'].nunique()))

fraction pets with metadata  : 96.78 %


In [22]:
## Sentiment files

df_test_id = df_test['PetID']
df_test_sentiment = pd.DataFrame(test_sentiment_files)  ## from glob.glob
df_test_sentiment.columns = ['sentiment_filenames']

df_test_pets = df_test_sentiment['sentiment_filenames'].apply(lambda x: x.split('\\')[-1].split('.')[0])
df_test_sentiment['PetID'] = df_test_pets


print('fraction pets with sentiment  : {:0.2f} %' 
      .format(df_test_sentiment['PetID'].nunique()*100/ df_test['PetID'].nunique()))

fraction pets with sentiment  : 96.63 %


In [23]:
## train_imgs_pets

class PetFinderParser(object):
    
    
    def __init__(self , debug = False):
        self.debug = debug
        self.sentence_sep = ' '
        self.extract_sentiment_text = ' '
    
    
    def open_metadata_file(self,filename):
        ''' 
        Load Metadata File
        '''
        with open(filename,'r', encoding = 'utf8') as f:
            metadata_file = json.load(f)
        return metadata_file
        
    def open_sentiment_file(self, filename):
        ''' 
        Load Sentiment
        '''
        with open(filename,'r', encoding = 'utf8') as f:
            sentiment_file = json.load(f)
        return sentiment_file
        
        
    def open_image_file(self,filename):
        '''
        Load Images
        '''
        with open(filename, 'r') as  f:
            image = np.asarray(Image.open(filename))
        return image
    
    
    def parse_sentiment_file(self,file):
        """
        Parse sentiment file. Output DF with sentiment features.
        """
        
        file_sentiment = file['documentSentiment']  ## This has magnitude and score
        file_entities = [x['name'] for x in file['entities']]
        file_entities = self.sentence_sep.join(file_entities)
        
        
        if self.extract_sentiment_text:
            file_sentences_text = [x['text']['content'] for x in file['sentences']]
            file_sentences_text = self.sentence_sep.join(file_sentences_text)
        
        file_sentences_sentiment = [x['sentiment'] for x in file['sentences']]
        file_sentences_sentiment = pd.DataFrame.from_dict(file_sentences_sentiment , orient = 'columns').sum()
        file_sentences_sentiment = file_sentences_sentiment.add_prefix('document_').to_dict()
        file_sentiment.update(file_sentences_sentiment)
        df_sentiment = pd.DataFrame.from_dict(file_sentiment, orient = 'index').T  ## here we have 4 columns
        if self.extract_sentiment_text:
            df_sentiment['text'] = file_sentences_text 
        
        df_sentiment['entities'] = file_entities 
        df_sentiment = df_sentiment.add_prefix('sentiment_')
        return df_sentiment
    
    
    def parse_metadata_file(self,file):
        '''
        Parse Metadata File
        '''
        if 'labelAnnotations' in  file.keys():
            file_annots = file['labelAnnotations'][: int(len(file['labelAnnotations'])*0.3)]
            file_top_score = np.asarray([x['score'] for x in file_annots]).mean()
            file_top_desc = [x['description'] for x in file_annots]
        else:
            file_top_score = np.nan
            file_top_desc = [' ']
        
        file_colors = file['imagePropertiesAnnotation']['dominantColors']['colors']
        file_colors_score = np.asarray([x['score'] for x in file_colors]).mean()
        file_colors_pixelFrac = np.asarray([x['pixelFraction'] for x in file_colors]).mean()
        
        file_crop = file['cropHintsAnnotation']['cropHints']
        
        file_crop_conf = np.asarray([x['confidence'] for x in file_crop]).mean()
        if 'importanceFraction' in file_crop[0].keys():
            file_crop_importance = np.asarray([x['importanceFraction'] for x in file_crop]).mean()
        else:
            file_crop_importance = np.nan
            
        df_metadata = {
            'annots_score':file_top_score,
            'annots_top_desc' : self.sentence_sep.join(file_top_desc),
            
            'colors_score' :  file_colors_score,
            'colors_pixelFrac' : file_colors_pixelFrac,
            
            'crop_conf' : file_crop_conf ,
            'crop_importance' : file_crop_importance
            
        }
        df_metadata = pd.DataFrame.from_dict(df_metadata, orient = 'index').T
        
        df_metadata = df_metadata.add_prefix('metadata_')
        return df_metadata
        
# Helper function for parallel data processing:

def extract_additional_features(pet_id, mode = 'train'):
    sentiment_filename = './input/petfinder-adoption-prediction/{}_sentiment/{}.json'.format(mode, pet_id)
    try:
        sentiment_file = pet_finder_parser.open_sentiment_file(sentiment_filename)
        df_sentiment   = pet_finder_parser.parse_sentiment_file(sentiment_file) 
        df_sentiment['PetID'] =pet_id
    except FileNotFoundError:
        print('abcd')
        df_sentiment = []
    
    dfs_metadata = []
    metadata_files = sorted(glob.glob('./input/petfinder-adoption-prediction/{}_metadata/{}*.json'.format(mode, pet_id)))
    if len(metadata_files) > 0:
        for f in metadata_files:
            metadata_file = pet_finder_parser.open_metadata_file(f)
            df_metadata   = pet_finder_parser.parse_metadata_file(metadata_file) 
            df_metadata['PetID'] = pet_id
            dfs_metadata.append(df_metadata)
        dfs_metadata = pd.concat(dfs_metadata, ignore_index = True, sort = False)
        
    dfs = [df_sentiment, dfs_metadata]
    
    return dfs

pet_finder_parser = PetFinderParser()

In [24]:
##unique Pet Ids

debug = True

train_pet_id = df_train['PetID'].unique()
test_pet_id  = df_test['PetID'].unique()

if debug == True:
    train_pet_id = train_pet_id[:1000]
    test_pet_id  = test_pet_id[:500]
    
dfs_train = Parallel(n_jobs = 8, verbose =1)(
            delayed(extract_additional_features)(i, mode = 'train') for i in train_pet_id)


train_dfs_sentiment = [x[0] for x in dfs_train if isinstance(x[0], pd.DataFrame)]
train_dfs_metadata  = [x[1] for x in dfs_train if isinstance(x[1], pd.DataFrame)]


train_dfs_sentiment = pd.concat(train_dfs_sentiment, ignore_index = True, sort = False)
train_dfs_metadata  = pd.concat(train_dfs_metadata, ignore_index = True, sort = False)

print(train_dfs_sentiment.shape,train_dfs_metadata.shape )

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    9.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   27.9s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:  1.0min
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:  1.7min
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:  2.2min finished


(949, 7) (3996, 7)


In [25]:
train_dfs_sentiment.columns

Index(['sentiment_magnitude', 'sentiment_score',
       'sentiment_document_magnitude', 'sentiment_document_score',
       'sentiment_text', 'sentiment_entities', 'PetID'],
      dtype='object')

In [26]:
# Test set:
# Parallel processing of data:
dfs_test = Parallel(n_jobs=8, verbose=1)(
    delayed(extract_additional_features)(i, mode='test') for i in test_pet_id)

# Extract processed data and format them as DFs:
test_dfs_sentiment = [x[0] for x in dfs_test if isinstance(x[0], pd.DataFrame)]
test_dfs_metadata = [x[1] for x in dfs_test if isinstance(x[1], pd.DataFrame)]

test_dfs_sentiment = pd.concat(test_dfs_sentiment, ignore_index=True, sort=False)
test_dfs_metadata = pd.concat(test_dfs_metadata, ignore_index=True, sort=False)

print(test_dfs_sentiment.shape, test_dfs_metadata.shape)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.5s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    9.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   26.4s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   31.7s finished


(497, 7) (1792, 7)


In [27]:
# Extend aggregates and improve column naming

aggregates = ['mean', 'sum']

train_metadata_desc = train_dfs_metadata.groupby(['PetID'])['metadata_annots_top_desc'].unique()
train_metadata_desc = train_metadata_desc.reset_index()
train_metadata_desc['metadata_annots_top_desc'] = train_metadata_desc['metadata_annots_top_desc'].apply(lambda x: ' '.join(x))

                            

prefix = 'metadata'

train_metadata_gr = train_dfs_metadata.drop(['metadata_annots_top_desc'], axis =1)

### convert the column type to float

for i in train_metadata_gr.columns:
    if i not in  'PetID':
        train_metadata_gr[i] = train_metadata_gr[i].astype(float)
    
train_metadata_gr = train_metadata_gr.groupby(['PetID']).agg(aggregates)

train_metadata_gr.columns = pd.Index(['{}_{}_{}'.format(
            prefix, c[0], c[1].upper()) for c in train_metadata_gr.columns.tolist()])

train_metadata_gr = train_metadata_gr.reset_index()

##Train sentiment

train_sentiment_desc = train_dfs_sentiment.groupby(['PetID'])['sentiment_entities'].unique()
train_sentiment_desc = train_sentiment_desc.reset_index()
train_sentiment_desc[
    'sentiment_entities'] = train_sentiment_desc[
    'sentiment_entities'].apply(lambda x: ' '.join(x))

prefix = 'sentiment'
##print(train_sentiment_gr.columns)
train_sentiment_gr = train_dfs_sentiment.drop(['sentiment_entities','sentiment_text'], axis=1)


for i in train_sentiment_gr.columns:
    if ('PetID' not in i) :
        train_sentiment_gr[i] = train_sentiment_gr[i].astype(float)
train_sentiment_gr = train_sentiment_gr.groupby(['PetID']).agg(aggregates)
train_sentiment_gr.columns = pd.Index(['{}_{}_{}'.format(
            prefix, c[0], c[1].upper()) for c in train_sentiment_gr.columns.tolist()])
train_sentiment_gr = train_sentiment_gr.reset_index()

In [28]:
train_metadata_gr.head(1)

Unnamed: 0,PetID,metadata_metadata_annots_score_MEAN,metadata_metadata_annots_score_SUM,metadata_metadata_colors_score_MEAN,metadata_metadata_colors_score_SUM,metadata_metadata_colors_pixelFrac_MEAN,metadata_metadata_colors_pixelFrac_SUM,metadata_metadata_crop_conf_MEAN,metadata_metadata_crop_conf_SUM,metadata_metadata_crop_importance_MEAN,metadata_metadata_crop_importance_SUM
0,0052dcf47,0.938122,2.814365,0.087819,0.263457,0.060522,0.181565,0.8,2.4,1.0,3.0


In [29]:
train_metadata_gr.head(1)

Unnamed: 0,PetID,metadata_metadata_annots_score_MEAN,metadata_metadata_annots_score_SUM,metadata_metadata_colors_score_MEAN,metadata_metadata_colors_score_SUM,metadata_metadata_colors_pixelFrac_MEAN,metadata_metadata_colors_pixelFrac_SUM,metadata_metadata_crop_conf_MEAN,metadata_metadata_crop_conf_SUM,metadata_metadata_crop_importance_MEAN,metadata_metadata_crop_importance_SUM
0,0052dcf47,0.938122,2.814365,0.087819,0.263457,0.060522,0.181565,0.8,2.4,1.0,3.0


In [30]:
# Test
test_metadata_desc = test_dfs_metadata.groupby(['PetID'])['metadata_annots_top_desc'].unique()
test_metadata_desc = test_metadata_desc.reset_index()
test_metadata_desc[
    'metadata_annots_top_desc'] = test_metadata_desc[
    'metadata_annots_top_desc'].apply(lambda x: ' '.join(x))

prefix = 'metadata'
test_metadata_gr = test_dfs_metadata.drop(['metadata_annots_top_desc'], axis=1)
for i in test_metadata_gr.columns:
    if 'PetID' not in i:
        test_metadata_gr[i] = test_metadata_gr[i].astype(float)
test_metadata_gr = test_metadata_gr.groupby(['PetID']).agg(aggregates)
test_metadata_gr.columns = pd.Index(['{}_{}_{}'.format(
            prefix, c[0], c[1].upper()) for c in test_metadata_gr.columns.tolist()])
test_metadata_gr = test_metadata_gr.reset_index()


test_sentiment_desc = test_dfs_sentiment.groupby(['PetID'])['sentiment_entities'].unique()
test_sentiment_desc = test_sentiment_desc.reset_index()
test_sentiment_desc[
    'sentiment_entities'] = test_sentiment_desc[
    'sentiment_entities'].apply(lambda x: ' '.join(x))

prefix = 'sentiment'
test_sentiment_gr = test_dfs_sentiment.drop(['sentiment_entities','sentiment_text'], axis=1)
for i in test_sentiment_gr.columns:
    if 'PetID' not in i:
        test_sentiment_gr[i] = test_sentiment_gr[i].astype(float)
test_sentiment_gr = test_sentiment_gr.groupby(['PetID']).agg(aggregates)
test_sentiment_gr.columns = pd.Index(['{}_{}_{}'.format(
            prefix, c[0], c[1].upper()) for c in test_sentiment_gr.columns.tolist()])
test_sentiment_gr = test_sentiment_gr.reset_index()


##### merge processed DFs with base train/test DF

In [31]:
## Train Merges

train_proc = df_train.copy()
train_proc = train_proc.merge(train_sentiment_gr, how='left', on='PetID')
train_proc = train_proc.merge(train_sentiment_desc, how='left', on='PetID')
train_proc = train_proc.merge(train_metadata_gr, how='left', on='PetID')
train_proc = train_proc.merge(train_metadata_desc, how='left', on='PetID')

# Test merges:
test_proc = df_test.copy()
test_proc = test_proc.merge(
    test_sentiment_gr, how='left', on='PetID')
test_proc = test_proc.merge(
    test_metadata_gr, how='left', on='PetID')
test_proc = test_proc.merge(
    test_metadata_desc, how='left', on='PetID')
test_proc = test_proc.merge(
    test_sentiment_desc, how='left', on='PetID')


print(train_proc.shape, test_proc.shape)
assert train_proc.shape[0] == df_train.shape[0]
assert test_proc.shape[0] == df_test.shape[0]

(14993, 44) (3948, 43)


In [32]:
#train_metadata_gr['PetID'].nunique()

In [33]:
#df_train['PetID'].nunique()

In [34]:
train_breed_main = train_proc[['Breed1']].merge(
    labels_breed, how='left',
    left_on='Breed1', right_on='BreedID',
    suffixes=('', '_main_breed'))

train_breed_main = train_breed_main.iloc[:, 2:]  ## all the columns except the first 2 i.e. Breed1 and BreedID
train_breed_main = train_breed_main.add_prefix('main_breed_')
train_breed_main.head(1)

train_breed_second = train_proc[['Breed2']].merge(
    labels_breed, how='left',
    left_on='Breed2', right_on='BreedID',
    suffixes=('', '_second_breed'))

train_breed_second = train_breed_second.iloc[:, 2:]
train_breed_second = train_breed_second.add_prefix('second_breed_')

train_proc = pd.concat(
    [train_proc, train_breed_main, train_breed_second], axis=1)

##test

test_breed_main = test_proc[['Breed1']].merge(
    labels_breed, how='left',
    left_on='Breed1', right_on='BreedID',
    suffixes=('', '_main_breed'))

test_breed_main = test_breed_main.iloc[:, 2:]
test_breed_main = test_breed_main.add_prefix('main_breed_')

test_breed_second = test_proc[['Breed2']].merge(
    labels_breed, how='left',
    left_on='Breed2', right_on='BreedID',
    suffixes=('', '_second_breed'))

test_breed_second = test_breed_second.iloc[:, 2:]
test_breed_second = test_breed_second.add_prefix('second_breed_')


test_proc = pd.concat(
    [test_proc, test_breed_main, test_breed_second], axis=1)

print(train_proc.shape, test_proc.shape)

(14993, 48) (3948, 47)


##### concatenate train & test:

In [35]:
#concatenate train & test:

#Inspect NaN structure of the processed data: AdoptionSpeed is the target column

In [36]:
X = pd.concat([train_proc, test_proc], ignore_index=True, sort=False)
print('NaN structure:\n{}'.format(np.sum(pd.isnull(X))))

NaN structure:
Type                                               0
Name                                            1560
Age                                                0
Breed1                                             0
Breed2                                             0
Gender                                             0
Color1                                             0
Color2                                             0
Color3                                             0
MaturitySize                                       0
FurLength                                          0
Vaccinated                                         0
Dewormed                                           0
Sterilized                                         0
Health                                             0
Quantity                                           0
Fee                                                0
State                                              0
RescuerID                      


extract different column types:

    integer columns are usually categorical features, which do not need encoding
    float columns are numerical features
    object columns are categorical features, which should be encoded



In [37]:
type(X['Age'][0])

numpy.int64

In [38]:
column_types = X.dtypes
int_cols = column_types[(column_types == 'int64') | (column_types == 'int32')]
float_cols = column_types[column_types == 'float']
cat_cols = column_types[column_types == 'object']

print('\tinteger columns:\n{}'.format(int_cols))
print('\n\tfloat columns:\n{}'.format(float_cols))
print('\n\t to encode categorical columns:\n{}'.format(cat_cols))

	integer columns:
Type            int64
Age             int64
Breed1          int64
Breed2          int64
Gender          int64
Color1          int64
Color2          int64
Color3          int64
MaturitySize    int64
FurLength       int64
Vaccinated      int64
Dewormed        int64
Sterilized      int64
Health          int64
Quantity        int64
Fee             int64
State           int64
VideoAmt        int64
dtype: object

	float columns:
PhotoAmt                                       float64
AdoptionSpeed                                  float64
sentiment_sentiment_magnitude_MEAN             float64
sentiment_sentiment_magnitude_SUM              float64
sentiment_sentiment_score_MEAN                 float64
sentiment_sentiment_score_SUM                  float64
sentiment_sentiment_document_magnitude_MEAN    float64
sentiment_sentiment_document_magnitude_SUM     float64
sentiment_sentiment_document_score_MEAN        float64
sentiment_sentiment_document_score_SUM         float64
metad

In [39]:
# Copy original X DF for easier experimentation,
# all feature engineering will be performed on this one:
X_temp = X.copy()

# Names are all unique, so they can be dropped by default
# Same goes for PetID, it shouldn't be used as a feature
# RescuerID will also be dropped as a feature based on this column will be extracted independently
to_drop_columns = ['PetID', 'Name', 'RescuerID']


# Select subsets of columns:
text_columns = ['Description', 'metadata_annots_top_desc', 'sentiment_entities']
categorical_columns = ['main_breed_BreedName', 'second_breed_BreedName']



In [40]:
# Count RescuerID occurrences:
rescuer_count = X.groupby(['RescuerID'])['PetID'].count().reset_index()
rescuer_count.columns = ['RescuerID', 'RescuerID_COUNT']

# Merge as another feature onto main DF:
X_temp = X_temp.merge(rescuer_count, how='left', on='RescuerID')

In [47]:
# Factorize categorical columns:
for i in categorical_columns:
    X_temp.loc[:, i] = pd.factorize(X_temp.loc[:, i])[0]
    #print(i)
    #print(X_temp.loc[:, i][0])
    #print(pd.factorize(X_temp.loc[:, i])[0])
    

In [48]:
# Subset text features:
X_text = X_temp[text_columns]

for i in X_text.columns:
    X_text.loc[:, i] = X_text.loc[:, i].fillna('<MISSING>')


In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import SparsePCA, TruncatedSVD, LatentDirichletAllocation, NMF

n_components = 5
text_features = []


# Generate text features:
for i in X_text.columns:
    
    # Initialize decomposition methods:
    print('generating features from: {}'.format(i))
    svd_ = TruncatedSVD(
        n_components=n_components, random_state=1337)
    nmf_ = NMF(
        n_components=n_components, random_state=1337)
    
    tfidf_col = TfidfVectorizer().fit_transform(X_text.loc[:, i].values)
    svd_col = svd_.fit_transform(tfidf_col)
    svd_col = pd.DataFrame(svd_col)
    svd_col = svd_col.add_prefix('SVD_{}_'.format(i))
    
    nmf_col = nmf_.fit_transform(tfidf_col)
    nmf_col = pd.DataFrame(nmf_col)
    nmf_col = nmf_col.add_prefix('NMF_{}_'.format(i))
    
    text_features.append(svd_col)
    text_features.append(nmf_col)

    
# Combine all extracted features:
text_features = pd.concat(text_features, axis=1)

# Concatenate with main DF:
X_temp = pd.concat([X_temp, text_features], axis=1)

# Remove raw text columns:
for i in X_text.columns:
    X_temp = X_temp.drop(i, axis=1)

generating features from: Description
generating features from: metadata_annots_top_desc
generating features from: sentiment_entities


In [50]:
# Remove unnecessary columns:
X_temp = X_temp.drop(to_drop_columns, axis=1)

# Check final df shape:
print('X shape: {}'.format(X_temp.shape))

X shape: (18941, 73)


train/test split:

In [53]:
# Split into train and test again:
X_train = X_temp.loc[np.isfinite(X_temp.AdoptionSpeed), :]
X_test = X_temp.loc[~np.isfinite(X_temp.AdoptionSpeed), :]

# Remove missing target column from test:
X_test = X_test.drop(['AdoptionSpeed'], axis=1)


print('X_train shape: {}'.format(X_train.shape))
print('X_test shape: {}'.format(X_test.shape))

assert X_train.shape[0] == df_train.shape[0]
assert X_test.shape[0] == df_test.shape[0]


# Check if columns between the two DFs are the same:
train_cols = X_train.columns.tolist()
train_cols.remove('AdoptionSpeed')

test_cols = X_test.columns.tolist()

assert np.all(train_cols == test_cols)

X_train shape: (14993, 73)
X_test shape: (3948, 72)


train and test NaN structure:

In [54]:
np.sum(pd.isnull(X_train))

Type                                               0
Age                                                0
Breed1                                             0
Breed2                                             0
Gender                                             0
Color1                                             0
Color2                                             0
Color3                                             0
MaturitySize                                       0
FurLength                                          0
Vaccinated                                         0
Dewormed                                           0
Sterilized                                         0
Health                                             0
Quantity                                           0
Fee                                                0
State                                              0
VideoAmt                                           0
PhotoAmt                                      

In [55]:
np.sum(pd.isnull(X_test))

Type                                              0
Age                                               0
Breed1                                            0
Breed2                                            0
Gender                                            0
Color1                                            0
Color2                                            0
Color3                                            0
MaturitySize                                      0
FurLength                                         0
Vaccinated                                        0
Dewormed                                          0
Sterilized                                        0
Health                                            0
Quantity                                          0
Fee                                               0
State                                             0
VideoAmt                                          0
PhotoAmt                                          0
sentiment_se

model training:

In [57]:
import scipy as sp

from collections import Counter
from functools import partial
from math import sqrt

from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix


def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Returns the counts of each type of rating that a rater made
    """
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings

def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa
    axquadratic_weighted_kappa calculates the quadratic weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.
    quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.
    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.
    quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    rater_a = y
    rater_b = y_pred
    min_rating=None
    max_rating=None
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4

        ll = quadratic_weighted_kappa(y, X_p)
        return -ll

    
    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')
        
        
    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4
        return X_p

    
    def coefficients(self):
        return self.coef_['x']
    
    
def rmse(actual, predicted):
    return sqrt(mean_squared_error(actual, predicted))

In [None]:
import lightgbm as lgb

params = {'application': 'regression',
          'boosting': 'gbdt',
          'metric': 'rmse',
          'num_leaves': 70,
          'max_depth': 9,
          'learning_rate': 0.01,
          'bagging_fraction': 0.85,
          'feature_fraction': 0.8,
          'min_split_gain': 0.02,
          'min_child_samples': 150,
          'min_child_weight': 0.02,
          'lambda_l2': 0.0475,
          'verbosity': -1,
          'data_random_seed': 17}

# Additional parameters:
early_stop = 500
verbose_eval = 100
num_rounds = 10000
n_splits = 5

In [None]:
from sklearn.model_selection import StratifiedKFold


kfold = StratifiedKFold(n_splits=n_splits, random_state=1337)


oof_train = np.zeros((X_train.shape[0]))
oof_test = np.zeros((X_test.shape[0], n_splits))


i = 0
for train_index, valid_index in kfold.split(X_train, X_train['AdoptionSpeed'].values):
    
    X_tr = X_train.iloc[train_index, :]
    X_val = X_train.iloc[valid_index, :]
    
    y_tr = X_tr['AdoptionSpeed'].values
    X_tr = X_tr.drop(['AdoptionSpeed'], axis=1)
    
    y_val = X_val['AdoptionSpeed'].values
    X_val = X_val.drop(['AdoptionSpeed'], axis=1)
    
    print('\ny_tr distribution: {}'.format(Counter(y_tr)))
    
    d_train = lgb.Dataset(X_tr, label=y_tr)
    d_valid = lgb.Dataset(X_val, label=y_val)
    watchlist = [d_train, d_valid]
    
    print('training LGB:')
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval,
                      early_stopping_rounds=early_stop)
    
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)
    
    oof_train[valid_index] = val_pred
    oof_test[:, i] = test_pred
    
    i += 1

In [None]:
plt.hist(oof_train)

In [None]:
# Compute QWK based on OOF train predictions:
optR = OptimizedRounder()
optR.fit(oof_train, X_train['AdoptionSpeed'].values)
coefficients = optR.coefficients()
pred_test_y_k = optR.predict(oof_train, coefficients)
print("\nValid Counts = ", Counter(X_train['AdoptionSpeed'].values))
print("Predicted Counts = ", Counter(pred_test_y_k))
print("Coefficients = ", coefficients)
qwk = quadratic_weighted_kappa(X_train['AdoptionSpeed'].values, pred_test_y_k)
print("QWK = ", qwk)



In [None]:
# Manually adjusted coefficients:

coefficients_ = coefficients.copy()

coefficients_[0] = 1.645
coefficients_[1] = 2.115
coefficients_[3] = 2.84

train_predictions = optR.predict(oof_train, coefficients_).astype(int)
print('train pred distribution: {}'.format(Counter(train_predictions)))

test_predictions = optR.predict(oof_test.mean(axis=1), coefficients_)
print('test pred distribution: {}'.format(Counter(test_predictions)))

In [None]:
# Distribution inspection of original target and predicted train and test:

print("True Distribution:")
print(pd.value_counts(X_train['AdoptionSpeed'], normalize=True).sort_index())
print("\nTrain Predicted Distribution:")
print(pd.value_counts(train_predictions, normalize=True).sort_index())
print("\nTest Predicted Distribution:")
print(pd.value_counts(test_predictions, normalize=True).sort_index())



###################################################################################

In [None]:
#### Start from here Start Start start begin beginning continue

In [None]:
# Generate submission:

submission = pd.DataFrame({'PetID': test['PetID'].values, 'AdoptionSpeed': test_predictions.astype(np.int32)})
submission.head()
submission.to_csv('submission.csv', index=False)

In [175]:
train_matadata_gr.head()

Unnamed: 0_level_0,metadata_annots_score,metadata_annots_score,metadata_colors_score,metadata_colors_score,metadata_colors_pixelFrac,metadata_colors_pixelFrac,metadata_crop_conf,metadata_crop_conf,metadata_crop_importance,metadata_crop_importance
Unnamed: 0_level_1,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum
PetID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
0052dcf47,0.938122,2.814365,0.087819,0.263457,0.060522,0.181565,0.8,2.4,1.0,3.0
00a1f270a,0.944613,4.723066,0.095528,0.477641,0.066992,0.334961,0.8,4.0,1.0,5.0
015c75c9e,0.934615,2.803844,0.060296,0.180887,0.033252,0.099756,0.8,2.4,1.0,3.0
0162634c2,0.93437,2.803109,0.088806,0.266418,0.059992,0.179977,0.8,2.4,1.0,3.0
01ba4e94d,0.992804,0.992804,0.069252,0.069252,0.023777,0.023777,0.8,0.8,1.0,1.0


In [82]:
train_dfs_metadata.agg(['mean', 'sum'])

Unnamed: 0,metadata_annots_score,metadata_annots_top_desc,metadata_colors_score,metadata_colors_pixelFrac,metadata_crop_conf,metadata_crop_importance,PetID
mean,0.940978,,0.086139,0.066387,0.802077,1.002172,
sum,3746.034541,cat black catcat whiskers small to medium size...,344.211896,265.280945,3205.099804,4004.679995,86e1089a36296e909a6296e909a3422e49063422e49063...


In [154]:
abcd = train_dfs_metadata.groupby(['PetID'])['metadata_annots_top_desc'].unique().reset_index()

In [156]:
abcd.head()

Unnamed: 0,PetID,metadata_annots_top_desc
0,0052dcf47,"[dog breed dog dog like mammal, dog dog like m..."
1,00a1f270a,[cat small to medium sized cats cat like mamma...
2,015c75c9e,"[dog dog breed dog like mammal, dog dog like m..."
3,0162634c2,"[cat whiskers small to medium sized cats, dog ..."
4,01ba4e94d,[cat]


In [157]:
abcd['metadata_annots_top_desc'][0]

array(['dog breed dog dog like mammal', 'dog dog like mammal dog breed',
       'dog dog breed dog like mammal'], dtype=object)

In [158]:
abcd

abcd[
    'metadata_annots_top_desc'] = abcd[
    'metadata_annots_top_desc'].apply(lambda x: ' '.join(x))

In [159]:
abcd

Unnamed: 0,PetID,metadata_annots_top_desc
0,0052dcf47,dog breed dog dog like mammal dog dog like mam...
1,00a1f270a,cat small to medium sized cats cat like mammal...
2,015c75c9e,dog dog breed dog like mammal dog dog like mam...
3,0162634c2,cat whiskers small to medium sized cats dog br...
4,01ba4e94d,cat
5,022d2d16e,cat dragon li mammal
6,026ebf139,dog dog breed dog like mammal
7,02d6020a9,dog dog breed dog like mammal dog dog like mam...
8,0392c0c81,dog breed group fauna dog like mammal dog bree...
9,039509c40,dog dog like mammal dog breed dog like mammal ...


In [31]:
train_metadata_desc = train_dfs_metadata.groupby(['PetID'])['metadata_annots_top_desc'].unique()

KeyError: 'Column not found: metadata_annots_top_desc'

###########################

In [55]:
with open('./input/petfinder-adoption-prediction/train_sentiment/000a290e4.json','r') as f:
    file = json.load(f)

In [56]:
file['sentences']

[{'text': {'content': 'went to teluk kumba kuanthai restaurant saw this female puppies alone by the beach..',
   'beginOffset': -1},
  'sentiment': {'magnitude': 0.1, 'score': 0.1}},
 {'text': {'content': 'Adopters must vaccinate, spay and keep puppy indoors/fenced Call/WhatsApp: Address: teluk kumba',
   'beginOffset': -1},
  'sentiment': {'magnitude': 0.5, 'score': 0.5}}]

In [57]:
file.keys()

dict_keys(['sentences', 'tokens', 'entities', 'documentSentiment', 'language', 'categories'])

In [64]:
file['sentences'][0].keys()

dict_keys(['text', 'sentiment'])

In [65]:
file['sentences'][0]['text']

{'content': 'went to teluk kumba kuanthai restaurant saw this female puppies alone by the beach..',
 'beginOffset': -1}

In [66]:
file['sentences'][0]['sentiment']

{'magnitude': 0.1, 'score': 0.1}

In [67]:
file['tokens']

[]

In [71]:
file['entities'][0].keys()

dict_keys(['name', 'type', 'metadata', 'salience', 'mentions'])

In [72]:
file['entities'][0]['metadata']

{}

In [73]:
file['entities'][0]['name']

'restaurant'

In [74]:
file['entities'][0]['type']

'LOCATION'

In [75]:
file['entities'][0]['salience']

0.26085824

In [77]:
file['entities'][0]['mentions'][0]

{'text': {'content': 'restaurant', 'beginOffset': -1}, 'type': 'COMMON'}

In [62]:
file['documentSentiment']

{'magnitude': 0.6, 'score': 0.3}

In [46]:
file_sentiment = file['documentSentiment']
file_sentiment

{'magnitude': 0.6, 'score': 0.3}

In [47]:
file_sentences_sentiment = [x['sentiment'] for x in file['sentences']]

In [48]:
file_sentences_sentiment 

[{'magnitude': 0.1, 'score': 0.1}, {'magnitude': 0.5, 'score': 0.5}]

In [57]:
file_sentences_sentiment = pd.DataFrame.from_dict(
            file_sentences_sentiment, orient='columns').sum()
file_sentences_sentiment

magnitude    0.6
score        0.6
dtype: float64

In [58]:
file_sentences_sentiment = file_sentences_sentiment.add_prefix('document_').to_dict()
file_sentences_sentiment 

{'document_magnitude': 0.6, 'document_score': 0.6}

In [59]:
file_sentiment.update(file_sentences_sentiment)
file_sentiment

{'magnitude': 0.6,
 'score': 0.3,
 'document_magnitude': 0.6,
 'document_score': 0.6}

In [60]:
df_sentiment = pd.DataFrame.from_dict(file_sentiment, orient='index').T
df_sentiment 

Unnamed: 0,magnitude,score,document_magnitude,document_score
0,0.6,0.3,0.6,0.6


In [29]:
sentiment_file.keys()

dict_keys(['sentences', 'tokens', 'entities', 'documentSentiment', 'language', 'categories'])

In [34]:
sentiment_file['documentSentiment']

{'magnitude': 0.6, 'score': 0.3}

In [35]:
sentiment_file['entities']

[{'name': 'restaurant',
  'type': 'LOCATION',
  'metadata': {},
  'salience': 0.26085824,
  'mentions': [{'text': {'content': 'restaurant', 'beginOffset': -1},
    'type': 'COMMON'}]},
 {'name': 'puppies',
  'type': 'OTHER',
  'metadata': {},
  'salience': 0.20370758,
  'mentions': [{'text': {'content': 'puppies', 'beginOffset': -1},
    'type': 'COMMON'}]},
 {'name': 'beach',
  'type': 'LOCATION',
  'metadata': {},
  'salience': 0.18226475,
  'mentions': [{'text': {'content': 'beach', 'beginOffset': -1},
    'type': 'COMMON'}]},
 {'name': 'Call',
  'type': 'OTHER',
  'metadata': {'wikipedia_url': 'https://en.wikipedia.org/wiki/Telephone_call',
   'mid': '/m/024j49'},
  'salience': 0.13754916,
  'mentions': [{'text': {'content': 'WhatsApp', 'beginOffset': -1},
    'type': 'COMMON'},
   {'text': {'content': 'Call', 'beginOffset': -1}, 'type': 'PROPER'}]},
 {'name': 'teluk kumba',
  'type': 'OTHER',
  'metadata': {},
  'salience': 0.09735242,
  'mentions': [{'text': {'content': 'Address'

In [33]:
sentiment_file['sentences']

[{'text': {'content': 'went to teluk kumba kuanthai restaurant saw this female puppies alone by the beach..',
   'beginOffset': -1},
  'sentiment': {'magnitude': 0.1, 'score': 0.1}},
 {'text': {'content': 'Adopters must vaccinate, spay and keep puppy indoors/fenced Call/WhatsApp: Address: teluk kumba',
   'beginOffset': -1},
  'sentiment': {'magnitude': 0.5, 'score': 0.5}}]

In [30]:
sentiment_file

{'sentences': [{'text': {'content': 'went to teluk kumba kuanthai restaurant saw this female puppies alone by the beach..',
    'beginOffset': -1},
   'sentiment': {'magnitude': 0.1, 'score': 0.1}},
  {'text': {'content': 'Adopters must vaccinate, spay and keep puppy indoors/fenced Call/WhatsApp: Address: teluk kumba',
    'beginOffset': -1},
   'sentiment': {'magnitude': 0.5, 'score': 0.5}}],
 'tokens': [],
 'entities': [{'name': 'restaurant',
   'type': 'LOCATION',
   'metadata': {},
   'salience': 0.26085824,
   'mentions': [{'text': {'content': 'restaurant', 'beginOffset': -1},
     'type': 'COMMON'}]},
  {'name': 'puppies',
   'type': 'OTHER',
   'metadata': {},
   'salience': 0.20370758,
   'mentions': [{'text': {'content': 'puppies', 'beginOffset': -1},
     'type': 'COMMON'}]},
  {'name': 'beach',
   'type': 'LOCATION',
   'metadata': {},
   'salience': 0.18226475,
   'mentions': [{'text': {'content': 'beach', 'beginOffset': -1},
     'type': 'COMMON'}]},
  {'name': 'Call',
   

######################  metadata

In [78]:
with open('./input/petfinder-adoption-prediction/train_metadata/000a290e4-1.json','r') as f:
    file = json.load(f)

In [79]:
file

{'labelAnnotations': [{'mid': '/m/0bt9lr',
   'description': 'dog',
   'score': 0.96414083,
   'topicality': 0.96414083},
  {'mid': '/m/0kpmf',
   'description': 'dog breed',
   'score': 0.9419755,
   'topicality': 0.9419755},
  {'mid': '/m/01z5f',
   'description': 'dog like mammal',
   'score': 0.92154,
   'topicality': 0.92154},
  {'mid': '/m/02xl47d',
   'description': 'dog breed group',
   'score': 0.8994595,
   'topicality': 0.8994595},
  {'mid': '/m/0393qn',
   'description': 'phalÃ¨ne',
   'score': 0.71789825,
   'topicality': 0.71789825},
  {'mid': '/m/01lrl',
   'description': 'carnivoran',
   'score': 0.7058321,
   'topicality': 0.7058321},
  {'mid': '/m/01pkw7',
   'description': 'papillon',
   'score': 0.6653916,
   'topicality': 0.6653916},
  {'mid': '/m/03yl64',
   'description': 'companion dog',
   'score': 0.6042771,
   'topicality': 0.6042771},
  {'mid': '/m/0fxnkq',
   'description': 'moscow watchdog',
   'score': 0.6030931,
   'topicality': 0.6030931},
  {'mid': '/m

In [80]:
file.keys()

dict_keys(['labelAnnotations', 'imagePropertiesAnnotation', 'cropHintsAnnotation'])

In [122]:
file['labelAnnotations'][3]

{'mid': '/m/02xl47d',
 'description': 'dog breed group',
 'score': 0.8994595,
 'topicality': 0.8994595}

In [115]:
print(int(len(file['labelAnnotations'])*0.3))
file['labelAnnotations'][: int(len(file['labelAnnotations'])*0.3)]

3


[{'mid': '/m/0bt9lr',
  'description': 'dog',
  'score': 0.96414083,
  'topicality': 0.96414083},
 {'mid': '/m/0kpmf',
  'description': 'dog breed',
  'score': 0.9419755,
  'topicality': 0.9419755},
 {'mid': '/m/01z5f',
  'description': 'dog like mammal',
  'score': 0.92154,
  'topicality': 0.92154}]

In [101]:
file['imagePropertiesAnnotation']['dominantColors']['colors'][0]

{'color': {'red': 155, 'green': 112, 'blue': 91},
 'score': 0.13871339,
 'pixelFraction': 0.015743626}

In [108]:
file['cropHintsAnnotation']['cropHints'][0]

{'boundingPoly': {'vertices': [{},
   {'x': 359},
   {'x': 359, 'y': 479},
   {'y': 479}]},
 'confidence': 0.79999995,
 'importanceFraction': 1}

In [69]:
df_train['Description'].head()

0    Nibble is a 3+ month old ball of cuteness. He ...
1    I just found it alone yesterday near my apartm...
2    Their pregnant mother was dumped by her irresp...
3    Good guard dog, very alert, active, obedience ...
4    This handsome yet cute boy is up for adoption....
Name: Description, dtype: object

In [138]:
df_gender = df_train['Description']

In [139]:
if ' he' in 'she is good looking':
    print('found')


In [140]:
gender = []

for i,text in df_gender.items():
    
    text =str(text).lower()
    #print(type(text)
    #break
    if (' she ' in text) or  (' her ' in text) or (' girl ' in text):
        gender.append('F')
    elif (' he ' in text) or  (' him ' in text) or (' boy ' in text) or (' his ' in text):
        gender.append('M')
    else:
        gender.append('U')

In [141]:
i

'gender'

In [142]:
len(gender)

14994

In [143]:
df_gender = pd.DataFrame(gender)
df_gender.columns = ['gender']
#df_gender = df_train['Description']

In [144]:
#df_gender

In [145]:
df_gender['gender'].value_counts()

U    7202
F    4806
M    2986
Name: gender, dtype: int64

In [178]:
df1 = pd.read_csv('D:/Project data/Histopathologic Cancer Detection/dense201_upd.csv')

df2 = pd.read_csv('D:/Project data/Histopathologic Cancer Detection/sub0.2_0.8.csv')

##df3 = pd.read_csv('D:/Project data/Histopathologic Cancer Detection/nasnet_upd.csv')

df3 = pd.read_csv('D:/Project data/Histopathologic Cancer Detection/resnet34_fastai_9633.csv')

In [179]:
df1.columns = ['id', 'label1']
df2.columns = ['id', 'label2']
df3.columns = ['id', 'label3']

In [180]:
df = df3.merge(df1.merge(df2, on= 'id'), on= 'id')

In [181]:
df.head()

Unnamed: 0,id,label3,label1,label2
0,0b2ea2a822ad23fdb1b5dd26653da899fbd2c0d5,0.00251,0.000115218,0.000230444
1,95596b92e5066c5c52466c90b69ff089b39f2737,0.15181,0.2909326,0.06332633
2,248e6738860e2ebcf6258cdc1f32f299e0c76914,0.001777,2.59e-09,9.6349e-09
3,2c35657e312966e9294eac6841726ff3a748febf,0.002632,8.52e-05,0.0001725525
4,145782eb7caa1c516acbe2eda34d9a3f31c41fd6,0.062357,0.007916833,0.01750345


In [184]:
df['label'] = .5*df['label1'] + .90*df['label2'] + .5*df['label3']

In [185]:
df[['id','label']].to_csv('D:/Project data/Histopathologic Cancer Detection/5_dense201_90_9754sub_5_resnet34.csv'
                          ,index = False)

In [60]:
## create the file outside the loop
with open("./guru1991.txt","w+") as f:
    f.write('I am savings the loss here \n')
    
## Inside the loop of the epochs write to the file
epoch =2.00
val_loss = .0922
with open("./guru1991.txt","a+") as f:
    f.write(f"epoch  {epoch} validation loss {val_loss:0.2f} \n")
## Now download the file to gmail drive


In [62]:
epoch =2.00
val_loss = .0922
with open("./guru1991.txt","a+") as f:
    f.write(f"epoch  {epoch} validation loss {val_loss:0.2f} \n")