
Commit

Merge branch 'master' into prithag
prithagupta committed Feb 15, 2018
2 parents 0ed1b13 + 9d1cfff commit c9246af
Showing 3 changed files with 338 additions and 6 deletions.
17 changes: 15 additions & 2 deletions README.md
@@ -6,6 +6,17 @@ Introduction
-------------
CS-Rank is a Python package for context-sensitive ranking algorithms.

We implement the following new object ranking architectures:

* FATE (First aggregate then evaluate)
* FETA (First evaluate then aggregate)

In addition, we offer these benchmark algorithms:

* Expected Rank Regression
* RankNet
* RankSVM

Check out our [interactive notebooks](https://mybinder.org/v2/gh/kiudee/cs-ranking/master?filepath=notebooks) to quickly find out what our package can do.


@@ -15,13 +26,15 @@ As a simple "Hello World!"-example we will try to learn the Medoid problem:
```python
import csrank as cs
from csrank import SyntheticDatasetGenerator
gen = SyntheticDatasetGenerator(dataset_type='medoid')
gen = SyntheticDatasetGenerator(dataset_type='medoid',
n_objects=5,
n_features=2)
X_train, Y_train, X_test, Y_test = gen.get_single_train_test_split()
```
All our learning algorithms are implemented using the scikit-learn estimator API.
Fitting our FATE-Network algorithm is as simple as calling the `fit` method:
```python
fate = cs.FATEObjectRanker()
fate = cs.FATEObjectRanker(n_object_features=2)
fate.fit(X_train, Y_train)
```
Predictions can then be obtained using:
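A minimal sketch of the prediction step, assuming the scikit-learn-style `predict` method described above:
```python
# Hedged sketch: predicted rankings for the held-out test split,
# assuming the scikit-learn-style predict() interface.
Y_pred = fate.predict(X_test)
```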
36 changes: 32 additions & 4 deletions csrank/fate_ranking.py
@@ -23,7 +23,7 @@
from csrank.dyadranking.contextual_ranking import ContextualRanker
from csrank.labelranking.label_ranker import LabelRanker
from csrank.layers import DeepSet
from csrank.losses import hinged_rank_loss
from csrank.losses import hinged_rank_loss, smooth_rank_loss
from csrank.metrics import zero_one_rank_loss_for_scores_ties, \
zero_one_rank_loss_for_scores
from csrank.objectranking.object_ranker import ObjectRanker
@@ -594,9 +594,37 @@ def set_tunable_parameter_ranges(cls, param_ranges_dict):

class FATEObjectRanker(FATEObjectRankingCore, ObjectRanker):

def __init__(self, loss_function=hinged_rank_loss, metrics=None,
def __init__(self, n_object_features,
n_hidden_set_layers=2,
n_hidden_set_units=32,
loss_function=smooth_rank_loss,
metrics=None,
**kwargs):
FATEObjectRankingCore.__init__(self, **kwargs)
""" Create a FATE-network architecture for object ranking.
Training complexity is quadratic in the number of objects and
prediction complexity is only linear.
Parameters
----------
n_object_features : int
Dimensionality of the feature space of each object
n_hidden_set_layers : int
Number of hidden layers for the context representation
n_hidden_set_units : int
Number of hidden units in each layer of the context representation
loss_function : function
Differentiable loss function for the score vector
metrics : list
List of evaluation metrics (can be non-differentiable)
**kwargs
Keyword arguments for the hidden units
"""
FATEObjectRankingCore.__init__(self,
n_object_features=n_object_features,
n_hidden_set_layers=n_hidden_set_layers,
n_hidden_set_units=n_hidden_set_units,
**kwargs)
self.loss_function = loss_function
self.logger = logging.getLogger(GENERAL_OBJECT_RANKER)
if metrics is None:
@@ -647,7 +675,7 @@ def predict(self, X, **kwargs):
class FATELabelRanker(FATERankingCore, LabelRanker):
def __init__(self, loss_function=hinged_rank_loss, metrics=None,
**kwargs):
FATEObjectRankingCore.__init__(self, label_ranker=True, **kwargs)
super().__init__(label_ranker=True, **kwargs)
self.loss_function = loss_function
self.logger = logging.getLogger(GENERAL_LABEL_RANKER)
if metrics is None:
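A hedged usage sketch of the new `FATEObjectRanker` constructor, built only from the parameters documented in the docstring above; the concrete values are illustrative, not defaults taken from the source:
```python
from csrank.fate_ranking import FATEObjectRanker
from csrank.metrics import zero_one_rank_loss_for_scores

# Illustrative values; only the parameter names come from the docstring above.
ranker = FATEObjectRanker(n_object_features=2,      # dimensionality of each object's features
                          n_hidden_set_layers=2,    # hidden layers of the context representation
                          n_hidden_set_units=32,    # hidden units per set layer
                          metrics=[zero_one_rank_loss_for_scores])
ranker.fit(X_train, Y_train)  # X_train, Y_train as in the README example above
```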
291 changes: 291 additions & 0 deletions notebooks/GeneralizationOfExperiments.ipynb
@@ -0,0 +1,291 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/prithag/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
" from ._conv import register_converters as _register_converters\n",
"Using TensorFlow backend.\n",
"/home/prithag/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6\n",
" return f(*args, **kwds)\n"
]
}
],
"source": [
"import inspect\n",
"import os\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from docopt import docopt\n",
"from csrank.fate_ranking import FATEObjectRanker\n",
"from csrank.objectranking.feta_ranker import FETANetwork\n",
"from csrank.callbacks import DebugOutput\n",
"from csrank.metrics import zero_one_rank_loss_for_scores\n",
"from csrank.util import rename_file_if_exist, configure_logging_numpy_keras, get_tensor_value\n",
"from csrank.dataset_reader import SyntheticDatasetGenerator"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Defining the Constants"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"MODEL = \"aModel\"\n",
"ERROR_OUTPUT_STRING = 'Out of sample error {} : {} for n_objects {}'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Generate the Medoid sythentic dataset for defined number of objects"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def generate_dataset(n_objects=5, random_state=42):\n",
" parameters = {\"n_features\": 2, \"n_objects\": n_objects, \"n_train_instances\": 10000, \"n_test_instances\": 100000,\n",
" \"dataset_type\": \"medoid\",\"random_state\":random_state}\n",
" generator = SyntheticDatasetGenerator(**parameters)\n",
" return generator.get_single_train_test_split()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Fit the given ranker and predict on rankings with different sizes and check the zero one rank loss for them"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_evaluation_result(gor, X_train, Y_train, epochs):\n",
" gor.fit(X_train, Y_train, log_callbacks=[DebugOutput(delta=10)], verbose=False, epochs=epochs)\n",
" eval_results = {}\n",
" for n_objects in np.arange(3, 15):\n",
" _, _, X_test, Y_test = generate_dataset(n_objects=n_objects, random_state=seed + n_objects * 5)\n",
" y_pred_scores = gor.predict_scores(X_test, batch_size=X_test.shape[0])\n",
" metric_loss = get_tensor_value(zero_one_rank_loss_for_scores(Y_test, y_pred_scores))\n",
" logger.info(ERROR_OUTPUT_STRING.format(\"zero_one_rank_loss\", str(np.mean(metric_loss)), n_objects))\n",
" eval_results[n_objects] = metric_loss\n",
" return eval_results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Initialize the log file path and the dataframe path."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"n_objects = 5\n",
"dirname = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))\n",
"log_path = os.path.join(dirname, \"logs\", \"generalizing_mean_{}.log\".format(n_objects))\n",
"df_path = os.path.join(dirname, \"logs\", \"generalizing_mean_{}.csv\".format(n_objects))\n",
"random_state = np.random.RandomState(seed=42)\n",
"seed = random_state.randint(2 ** 32)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Initialize tensorflow and keras with the seed and initialize the log file path"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"rows_list = []\n",
"logger = configure_logging_numpy_keras(seed=seed, log_path=log_path)\n",
"\n",
"X_train, Y_train, _, _ = generate_dataset(n_objects=n_objects, random_state=seed)\n",
"n_instances, n_objects, n_features = X_train.shape\n",
"\n",
"epochs = 50\n",
"params = {\"n_objects\": n_objects, \n",
" \"n_features\": n_features, \n",
" \"n_object_features\": n_features, \n",
" \"use_early_stopping\": True, \n",
" \"metrics\":[zero_one_rank_loss_for_scores]}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Evaluate the FETANetwork with best parameters and check the generalization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"logger.info(\"############################# With Best Parameters FETA ##############################\")\n",
"best_point = [1, 16, 4.2054947998521569e-05, 2.6263496065703243e-10, 777]\n",
"gor = FETANetwork(**params)\n",
"gor.set_tunable_parameter_ranges({})\n",
"gor.set_tunable_parameters(best_point)\n",
"result = get_evaluation_result(gor, X_train, Y_train, epochs)\n",
"result[MODEL] = \"FETARanker\"\n",
"rows_list.append(result)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Evaluate the FATEObjectRanker with best parameters and check the generalization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from csrank.losses import smooth_rank_loss\n",
"logger.info(\"############################# With Best Parameters FATE ##############################\")\n",
"best_point = [1003, 0.0002908115170179143, 16, 132, 6, 247, 3.4195015492773324e-05]\n",
"gor = FATEObjectRanker(**params)\n",
"gor.set_tunable_parameter_ranges({})\n",
"gor.set_tunable_parameters(best_point)\n",
"result = get_evaluation_result(gor, X_train, Y_train, epochs)\n",
"result[MODEL] = \"FATERanker\"\n",
"rows_list.append(result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.DataFrame(rows_list)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"cols = list(df.columns.values)\n",
"cols = cols[-7:] + cols[:-7]\n",
"MODEL = \"aModel\"\n",
"for x in ['Unnamed: 0', 'aModel']:\n",
" if x in cols:\n",
" cols.remove(x)\n",
" cols.insert(0, x)\n",
"df = df[cols]\n",
"#del df['Unnamed: 0']\n",
"df = df.set_index(MODEL).T"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df.to_csv(df_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
