Lots of work on classifiers
kboone committed May 20, 2019
1 parent eccbf1c commit 2ed77c9
Showing 3 changed files with 230 additions and 33 deletions.
201 changes: 193 additions & 8 deletions avocado/classifier.py
@@ -1,38 +1,115 @@
import numpy as np
import os
import pandas as pd

from .settings import settings
# Assumed import: write() below references AvocadoException and logger, which
# appear to live in the package's utils module.
from .utils import AvocadoException, logger

def get_classifier_path(tag):
    """Get the path to where a classifier should be stored on disk

    Parameters
    ----------
    tag : str
        The unique tag for the classifier.
    """
    classifier_directory = settings['classifier_directory']
    classifier_path = os.path.join(classifier_directory,
                                   'classifier_%s.pkl' % tag)

    return classifier_path
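For reference, with the "classifier_directory": "./classifiers" setting added to avocado_settings.json in this commit, a tag maps to a path like this (the tag name is illustrative):

# get_classifier_path('lgbm_v1') -> './classifiers/classifier_lgbm_v1.pkl'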


class Classifier():
    """Classifier used to classify the different objects in a dataset."""
    def train(self, dataset):
        """Train the classifier on a dataset

        This needs to be implemented in subclasses.

        Parameters
        ----------
        dataset : :class:`Dataset`
            The dataset to use for training.
        """
        raise NotImplementedError

    def predict(self, dataset):
        """Generate predictions for a dataset

        This needs to be implemented in subclasses.

        Parameters
        ----------
        dataset : :class:`Dataset`
            The dataset to generate predictions for.

        Returns
        -------
        predictions : :class:`pandas.DataFrame`
            A pandas DataFrame with the predicted probabilities of each class
            for every object.
        """
        raise NotImplementedError

    def write(self, tag, overwrite=False):
        """Write a trained classifier to disk

        TODO: Figure out API

        Parameters
        ----------
        tag : str
            A unique tag used to identify the classifier.
        overwrite : bool (optional)
            If a classifier with the same tag already exists on disk and this
            is True, overwrite it. Otherwise, raise an AvocadoException.
        """
        import pickle

        path = get_classifier_path(tag)

        # Make the containing directory if it doesn't exist yet.
        directory = os.path.dirname(path)
        os.makedirs(directory, exist_ok=True)

        # Handle if the file already exists.
        if os.path.exists(path):
            if overwrite:
                logger.warning("Overwriting %s..." % path)
                os.remove(path)
            else:
                raise AvocadoException(
                    "Classifier %s already exists! Can't write." % path
                )

        # Write the classifier to a pickle file
        with open(path, 'wb') as output_file:
            pickle.dump(self, output_file)

    @classmethod
    def load(cls, tag):
        """Load a classifier that was previously saved to disk

        TODO: Figure out API

        Parameters
        ----------
        tag : str
            A unique tag used to identify the classifier to load.
        """
        import pickle

        path = get_classifier_path(tag)

        # Read the classifier back from the pickle file.
        with open(path, 'rb') as input_file:
            classifier = pickle.load(input_file)

        return classifier
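A minimal sketch of the intended save/load round trip, assuming an already-trained classifier instance; the tag 'lgbm_v1' and the dataset variable are hypothetical:

# Hypothetical usage; `classifier` is a trained LightGBMClassifier and
# `dataset` is an avocado Dataset.
classifier.write('lgbm_v1')                  # writes ./classifiers/classifier_lgbm_v1.pkl
classifier.write('lgbm_v1', overwrite=True)  # replaces an existing file instead of raising
restored = Classifier.load('lgbm_v1')
predictions = restored.predict(dataset)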


class LightGBMClassifier(Classifier):
    """Feature based classifier using LightGBM to classify objects.

    This uses a weighted multi-class logarithmic loss that normalizes for the
    total counts of each class. This classifier is optimized for the metric
    used in the PLAsTiCC Kaggle challenge.

    Parameters
    ----------
    featurizer : :class:`Featurizer`
@@ -148,12 +225,64 @@ def train(self, dataset, num_folds=None, random_state=None, **kwargs):

            classifiers.append(classifier)

        # Statistics on out-of-sample predictions
        total_logloss = weighted_multi_logloss(
            object_classes, predictions, self.class_weights
        )
        print('Total weighted log-loss: %.5f ' % total_logloss)

        # Original sample only (no augments)
        if 'reference_object_id' in dataset.metadata:
            original_mask = dataset.metadata['reference_object_id'].isnull()
            original_logloss = weighted_multi_logloss(
                object_classes[original_mask],
                predictions[original_mask],
                self.class_weights
            )
            print('Original weighted log-loss: %.5f ' % original_logloss)

        self.importances = importances
        self.train_predictions = predictions
        self.train_classes = object_classes
        self.classifiers = classifiers

        return classifiers

    def predict(self, dataset):
        """Generate predictions for a dataset

        Parameters
        ----------
        dataset : :class:`Dataset`
            The dataset to generate predictions for.

        Returns
        -------
        predictions : :class:`pandas.DataFrame`
            A pandas DataFrame with the predicted probabilities of each class
            for every object.
        """
        features = dataset.select_features(self.featurizer)

        # Average the predictions over the classifiers from each fold.
        predictions = 0

        for classifier in self.classifiers:
            # Turn each fold's raw scores into probabilities with a softmax.
            fold_scores = classifier.predict_proba(
                features, raw_score=True,
                num_iteration=classifier.best_iteration_
            )

            exp_scores = np.exp(fold_scores)

            fold_predictions = exp_scores / np.sum(exp_scores, axis=1)[:, None]
            predictions += fold_predictions

        predictions /= len(self.classifiers)

        predictions = pd.DataFrame(predictions, index=features.index,
                                   columns=self.train_predictions.columns)

        return predictions
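As a sanity check on the averaging scheme above, here is a self-contained toy sketch (the raw scores are made up, not from the commit): each fold's raw scores go through a softmax independently, and the averaged probability rows still sum to one.

import numpy as np

# Toy raw scores for 2 objects x 3 classes from two hypothetical folds.
fold_raw_scores = [
    np.array([[2.0, 0.5, -1.0], [0.1, 0.2, 0.3]]),
    np.array([[1.5, 1.0, -0.5], [0.0, 0.4, 0.2]]),
]

predictions = 0
for raw_scores in fold_raw_scores:
    exp_scores = np.exp(raw_scores)
    predictions += exp_scores / np.sum(exp_scores, axis=1)[:, None]
predictions /= len(fold_raw_scores)

print(predictions.sum(axis=1))  # -> [1. 1.]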


def fit_lightgbm_classifier(train_features, train_classes, train_weights,
                            validation_features, validation_classes,
@@ -217,3 +346,59 @@ def fit_lightgbm_classifier(train_features, train_classes, train_weights,
    classifier.fit(train_features, train_classes, **fit_params)

    return classifier


def weighted_multi_logloss(true_classes, predictions, class_weights=None,
                           return_class_contributions=False):
    """Evaluate a weighted multi-class logloss function.

    Parameters
    ----------
    true_classes : `pandas.Series`
        A pandas series with the true class for each object
    predictions : `pandas.DataFrame`
        A pandas data frame with the predicted probabilities of each class for
        every object. There should be one column for each class.
    class_weights : dict (optional)
        The weights to use for each class. If not specified, flat weights are
        assumed for each class.
    return_class_contributions : bool (optional)
        If True, return a pandas Series with the contributions from each
        class. Otherwise, return the sum over all classes (default).

    Returns
    -------
    logloss : float or `pandas.Series`
        By default, return the weighted multi-class logloss over all classes.
        If return_class_contributions is True, this returns a pandas Series
        with the individual contributions to the logloss from each class
        instead.
    """
    class_loglosses = []
    sum_weights = 0

    for class_name in predictions.columns:
        class_mask = true_classes == class_name

        class_count = np.sum(class_mask)
        class_predictions = predictions[class_name][class_mask]

        # Mean negative log-probability assigned to the true class.
        class_logloss = -np.sum(np.log(class_predictions)) / class_count

        if class_weights is not None:
            weight = class_weights.get(class_name, 1)
        else:
            weight = 1

        class_loglosses.append(weight * class_logloss)
        sum_weights += weight

    class_loglosses = pd.Series(
        np.array(class_loglosses) / sum_weights,
        index=predictions.columns
    )

    if return_class_contributions:
        return class_loglosses
    else:
        return np.sum(class_loglosses)
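A quick usage sketch of the metric with made-up classes and probabilities; the two-class frame below is purely illustrative:

import pandas as pd

true_classes = pd.Series(['snia', 'snii', 'snia'])
predictions = pd.DataFrame({'snia': [0.8, 0.3, 0.6],
                            'snii': [0.2, 0.7, 0.4]})

# Flat weights: average of the per-class mean negative log-probabilities.
print(weighted_multi_logloss(true_classes, predictions))

# Per-class contributions rather than the weighted sum.
print(weighted_multi_logloss(true_classes, predictions,
                             return_class_contributions=True))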
60 changes: 35 additions & 25 deletions avocado/plasticc.py
@@ -18,6 +18,9 @@
plasticc_end_time = 60675
plasticc_bands = ['lsstu', 'lsstg', 'lsstr', 'lssti', 'lsstz', 'lssty']

plasticc_kaggle_weights = {6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1,
                           64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1,
                           99: 2}
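These weights mirror the Kaggle metric, which double-weights classes 15, 64, and 99, and they are shaped to plug directly into weighted_multi_logloss. A hedged sketch (true_classes and predictions are hypothetical, with the integer PLAsTiCC class IDs as column names):

# Hypothetical evaluation against the PLAsTiCC metric weights.
logloss = weighted_multi_logloss(true_classes, predictions,
                                 class_weights=plasticc_kaggle_weights)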


class PlasticcAugmentor(Augmentor):
    """Implementation of an Augmentor for the PLAsTiCC dataset"""
@@ -205,7 +208,7 @@ def _augment_metadata(self, reference_object):
        # Choose whether the new object will be in the DDF or not.
        if reference_object.metadata['ddf']:
            # Most observations are WFD observations, so generate more of
            # those. The DDF and WFD samples are effectively completely
            # different, so this ratio doesn't really matter.
            augmented_metadata['ddf'] = np.random.rand() > 0.8
        else:
@@ -648,20 +651,20 @@ def select_features(self, raw_features):
            rf['min_flux_lssti'])
        )
        features['max_flux_ratio_red'] = (
            np.abs(rf['max_flux_lssty']) /
            (np.abs(rf['max_flux_lssty']) + np.abs(rf['max_flux_lssti']))
        )
        features['max_flux_ratio_blue'] = (
            np.abs(rf['max_flux_lsstg']) /
            (np.abs(rf['max_flux_lssti']) + np.abs(rf['max_flux_lsstg']))
        )

        features['min_flux_ratio_red'] = (
            np.abs(rf['min_flux_lssty']) /
            (np.abs(rf['min_flux_lssty']) + np.abs(rf['min_flux_lssti']))
        )
        features['min_flux_ratio_blue'] = (
            np.abs(rf['min_flux_lsstg']) /
            (np.abs(rf['min_flux_lssti']) + np.abs(rf['min_flux_lsstg']))
        )
@@ -673,26 +676,34 @@ def select_features(self, raw_features):
        features['time_fwd_max_0.5'] = rf['time_fwd_max_0.5_lssti']
        features['time_fwd_max_0.2'] = rf['time_fwd_max_0.2_lssti']

        features['time_fwd_max_0.5_ratio_red'] = (
            rf['time_fwd_max_0.5_lssty']
            / (rf['time_fwd_max_0.5_lssty'] + rf['time_fwd_max_0.5_lssti']))
        features['time_fwd_max_0.5_ratio_blue'] = (
            rf['time_fwd_max_0.5_lsstg']
            / (rf['time_fwd_max_0.5_lsstg'] + rf['time_fwd_max_0.5_lssti']))
        features['time_fwd_max_0.2_ratio_red'] = (
            rf['time_fwd_max_0.2_lssty']
            / (rf['time_fwd_max_0.2_lssty'] + rf['time_fwd_max_0.2_lssti']))
        features['time_fwd_max_0.2_ratio_blue'] = (
            rf['time_fwd_max_0.2_lsstg']
            / (rf['time_fwd_max_0.2_lsstg'] + rf['time_fwd_max_0.2_lssti']))

        features['time_bwd_max_0.5'] = rf['time_bwd_max_0.5_lssti']
        features['time_bwd_max_0.2'] = rf['time_bwd_max_0.2_lssti']

        features['time_bwd_max_0.5_ratio_red'] = (
            rf['time_bwd_max_0.5_lssty']
            / (rf['time_bwd_max_0.5_lssty'] + rf['time_bwd_max_0.5_lssti']))
        features['time_bwd_max_0.5_ratio_blue'] = (
            rf['time_bwd_max_0.5_lsstg']
            / (rf['time_bwd_max_0.5_lsstg'] + rf['time_bwd_max_0.5_lssti']))
        features['time_bwd_max_0.2_ratio_red'] = (
            rf['time_bwd_max_0.2_lssty']
            / (rf['time_bwd_max_0.2_lssty'] + rf['time_bwd_max_0.2_lssti']))
        features['time_bwd_max_0.2_ratio_blue'] = (
            rf['time_bwd_max_0.2_lsstg']
            / (rf['time_bwd_max_0.2_lsstg'] + rf['time_bwd_max_0.2_lssti']))

        features['frac_s2n_5'] = rf['count_s2n_5'] / rf['count']
        features['frac_s2n_-5'] = rf['count_s2n_-5'] / rf['count']
@@ -723,10 +734,9 @@ def select_features(self, raw_features):
            rf['peaks_pos_lssty_frac_2']
        ]

        with np.warnings.catch_warnings():
            np.warnings.filterwarnings('ignore', r'All-NaN slice encountered')
            features['peak_frac_2'] = np.nanmedian(all_peak_pos_frac_2, axis=0)
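The warning filter above is needed because np.nanmedian both returns NaN and emits a RuntimeWarning when a slice is entirely NaN, which happens for objects with no detected peaks in any band. A toy illustration (np.warnings mirrors the commit's usage; on newer NumPy import the standard warnings module instead):

import numpy as np

stacked = np.array([[0.4, np.nan],
                    [np.nan, np.nan]])  # second column is all NaN

with np.warnings.catch_warnings():
    np.warnings.filterwarnings('ignore', r'All-NaN slice encountered')
    print(np.nanmedian(stacked, axis=0))  # -> [0.4 nan], warning suppressed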

        features['total_s2n'] = np.sqrt(
            rf['total_s2n_lsstu']**2 +
2 changes: 2 additions & 0 deletions avocado_settings.json
@@ -9,6 +9,8 @@
"features_directory": "./features",
"features_tag": "features_v1",

"classifier_directory": "./classifiers",

"RAW_DATA_DIR": "./data",
"RAW_TRAINING_PATH": "./data/training_set.csv",
"RAW_TRAINING_METADATA_PATH": "./data/training_set_metadata.csv",
