From f0a9d2750b33fe2185f1358f2e368734e2997c76 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Wed, 29 Aug 2018 09:03:43 -0500 Subject: [PATCH 01/94] Work in progrss --- examples/Untitled.ipynb | 159 +++++++++++++++ examples/new-api-sandbox.ipynb | 265 ++++++++++++++++++++++--- optimus/profiler/templates/__init__.py | 0 optimus/tasks.py | 11 + requirements.txt | 1 + 5 files changed, 409 insertions(+), 27 deletions(-) create mode 100644 examples/Untitled.ipynb create mode 100644 optimus/profiler/templates/__init__.py create mode 100644 optimus/tasks.py diff --git a/examples/Untitled.ipynb b/examples/Untitled.ipynb new file mode 100644 index 000000000..96808952e --- /dev/null +++ b/examples/Untitled.ipynb @@ -0,0 +1,159 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting optimuspyspark\n", + " Downloading https://files.pythonhosted.org/packages/ea/25/acae4eeaaa82b2e97a0e48b329e897b9fed09c43a46009d8d8dcea6c9985/optimuspyspark-2.0.4-py3-none-any.whl (65kB)\n", + "Requirement already satisfied: pygments>=2.2.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (2.2.0)\n", + "Collecting tensorflow==1.6.0 (from optimuspyspark)\n", + " Using cached https://files.pythonhosted.org/packages/56/7d/a0e3ae33e8034be8e7d6b99a7f512c6e71b2180603fc3e0cfb6047b7374f/tensorflow-1.6.0-cp36-cp36m-win_amd64.whl\n", + "Collecting keras==2.1.5 (from optimuspyspark)\n", + " Downloading https://files.pythonhosted.org/packages/ba/65/e4aff762b8696ec0626a6654b1e73b396fcc8b7cc6b98d78a1bc53b85b48/Keras-2.1.5-py2.py3-none-any.whl (334kB)\n", + "Collecting setuptools==40.2.0 (from optimuspyspark)\n", + " Downloading https://files.pythonhosted.org/packages/66/e8/570bb5ca88a8bcd2a1db9c6246bb66615750663ffaaeada95b04ffe74e12/setuptools-40.2.0-py2.py3-none-any.whl (568kB)\n", + "Requirement already satisfied: 
matplotlib==2.2.3 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (2.2.3)\n", + "Requirement already satisfied: h5py>=2.7.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (2.7.1)\n", + "Requirement already satisfied: pytest==3.7.2 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (3.7.2)\n", + "Collecting numpy==1.15.1 (from optimuspyspark)\n", + " Downloading https://files.pythonhosted.org/packages/fb/7d/f8b97d97809f184d90faf320fa8e2e7eac994844c5e6c57adbed1283e9e9/numpy-1.15.1-cp36-none-win_amd64.whl (13.5MB)\n", + "Requirement already satisfied: ipython==6.5.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (6.5.0)\n", + "Collecting fastnumbers==2.1.1 (from optimuspyspark)\n", + " Downloading https://files.pythonhosted.org/packages/7c/5d/a80733c011cc683de70de5d8c9f5e481cc986428519d14849edfcabb95ae/fastnumbers-2.1.1-cp36-cp36m-win_amd64.whl\n", + "Requirement already satisfied: nose==1.3.7 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (1.3.7)\n", + "Requirement already satisfied: pyspark==2.3.1 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (2.3.1)\n", + "Requirement already satisfied: tabulate==0.8.2 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (0.8.2)\n", + "Requirement already satisfied: seaborn==0.9.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (0.9.0)\n", + "Requirement already satisfied: Jinja2==2.10 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (2.10)\n", + "Requirement already satisfied: six>=1.10.0 in 
c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (1.11.0)\n", + "Requirement already satisfied: python-dateutil==2.7.3 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (2.7.3)\n", + "Requirement already satisfied: deprecated==1.2.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (1.2.0)\n", + "Collecting findspark==1.3.0 (from optimuspyspark)\n", + " Downloading https://files.pythonhosted.org/packages/b1/c8/e6e1f6a303ae5122dc28d131b5a67c5eb87cbf8f7ac5b9f87764ea1b1e1e/findspark-1.3.0-py2.py3-none-any.whl\n", + "Requirement already satisfied: multipledispatch==0.6.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (0.6.0)\n", + "Requirement already satisfied: pillow==5.2.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (5.2.0)\n", + "Collecting pyarrow==0.10.0 (from optimuspyspark)\n", + " Downloading https://files.pythonhosted.org/packages/42/58/4d29b02e9b422cc65f65c1afe5c4710d7e1827c2c1f651bcc19cec042d92/pyarrow-0.10.0-cp36-cp36m-win_amd64.whl (3.6MB)\n", + "Requirement already satisfied: flask==1.0.2 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (1.0.2)\n", + "Collecting tensorboard<1.7.0,>=1.6.0 (from tensorflow==1.6.0->optimuspyspark)\n", + " Using cached https://files.pythonhosted.org/packages/b0/67/a8c91665987d359211dcdca5c8b2a7c1e0876eb0702a4383c1e4ff76228d/tensorboard-1.6.0-py3-none-any.whl\n", + "Requirement already satisfied: protobuf>=3.4.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from tensorflow==1.6.0->optimuspyspark) (3.6.1)\n", + "Requirement already satisfied: wheel>=0.26 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from 
tensorflow==1.6.0->optimuspyspark) (0.31.1)\n", + "Requirement already satisfied: termcolor>=1.1.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from tensorflow==1.6.0->optimuspyspark) (1.1.0)\n", + "Requirement already satisfied: gast>=0.2.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from tensorflow==1.6.0->optimuspyspark) (0.2.0)\n", + "Requirement already satisfied: absl-py>=0.1.6 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from tensorflow==1.6.0->optimuspyspark) (0.4.0)\n", + "Requirement already satisfied: astor>=0.6.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from tensorflow==1.6.0->optimuspyspark) (0.7.1)\n", + "Requirement already satisfied: grpcio>=1.8.6 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from tensorflow==1.6.0->optimuspyspark) (1.14.1)\n", + "Requirement already satisfied: pyyaml in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from keras==2.1.5->optimuspyspark) (3.12)\n", + "Requirement already satisfied: scipy>=0.14 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from keras==2.1.5->optimuspyspark) (1.1.0)\n", + "Requirement already satisfied: cycler>=0.10 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from matplotlib==2.2.3->optimuspyspark) (0.10.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from matplotlib==2.2.3->optimuspyspark) (2.2.0)\n", + "Requirement already satisfied: pytz in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from matplotlib==2.2.3->optimuspyspark) (2018.4)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in 
c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from matplotlib==2.2.3->optimuspyspark) (1.0.1)\n", + "Requirement already satisfied: py>=1.5.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from pytest==3.7.2->optimuspyspark) (1.5.3)\n", + "Requirement already satisfied: attrs>=17.4.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from pytest==3.7.2->optimuspyspark) (18.1.0)\n", + "Requirement already satisfied: more-itertools>=4.0.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from pytest==3.7.2->optimuspyspark) (4.1.0)\n", + "Requirement already satisfied: atomicwrites>=1.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from pytest==3.7.2->optimuspyspark) (1.1.5)\n", + "Requirement already satisfied: pluggy>=0.7 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from pytest==3.7.2->optimuspyspark) (0.7.1)\n", + "Requirement already satisfied: colorama in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from pytest==3.7.2->optimuspyspark) (0.3.9)\n", + "Requirement already satisfied: decorator in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from ipython==6.5.0->optimuspyspark) (4.3.0)\n", + "Requirement already satisfied: jedi>=0.10 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from ipython==6.5.0->optimuspyspark) (0.12.0)\n", + "Requirement already satisfied: traitlets>=4.2 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from ipython==6.5.0->optimuspyspark) (4.3.2)\n", + "Requirement already satisfied: pickleshare in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from ipython==6.5.0->optimuspyspark) (0.7.4)\n", + "Requirement already satisfied: 
prompt-toolkit<2.0.0,>=1.0.15 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from ipython==6.5.0->optimuspyspark) (1.0.15)\n", + "Requirement already satisfied: backcall in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from ipython==6.5.0->optimuspyspark) (0.1.0)\n", + "Requirement already satisfied: simplegeneric>0.8 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from ipython==6.5.0->optimuspyspark) (0.8.1)\n", + "Requirement already satisfied: py4j==0.10.7 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from pyspark==2.3.1->optimuspyspark) (0.10.7)\n", + "Requirement already satisfied: pandas>=0.15.2 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from seaborn==0.9.0->optimuspyspark) (0.23.0)\n", + "Requirement already satisfied: MarkupSafe>=0.23 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from Jinja2==2.10->optimuspyspark) (1.0)\n", + "Requirement already satisfied: wrapt<2,>=1 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from deprecated==1.2.0->optimuspyspark) (1.10.11)\n", + "Requirement already satisfied: Werkzeug>=0.14 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from flask==1.0.2->optimuspyspark) (0.14.1)\n", + "Requirement already satisfied: click>=5.1 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from flask==1.0.2->optimuspyspark) (6.7)\n", + "Requirement already satisfied: itsdangerous>=0.24 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from flask==1.0.2->optimuspyspark) (0.24)\n", + "Requirement already satisfied: bleach==1.5.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from 
tensorboard<1.7.0,>=1.6.0->tensorflow==1.6.0->optimuspyspark) (1.5.0)\n", + "Requirement already satisfied: markdown>=2.6.8 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from tensorboard<1.7.0,>=1.6.0->tensorflow==1.6.0->optimuspyspark) (2.6.11)\n", + "Requirement already satisfied: html5lib==0.9999999 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from tensorboard<1.7.0,>=1.6.0->tensorflow==1.6.0->optimuspyspark) (0.9999999)\n", + "Requirement already satisfied: parso>=0.2.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from jedi>=0.10->ipython==6.5.0->optimuspyspark) (0.2.0)\n", + "Requirement already satisfied: ipython_genutils in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from traitlets>=4.2->ipython==6.5.0->optimuspyspark) (0.2.0)\n", + "Requirement already satisfied: wcwidth in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from prompt-toolkit<2.0.0,>=1.0.15->ipython==6.5.0->optimuspyspark) (0.1.7)\n", + "Installing collected packages: numpy, tensorboard, tensorflow, keras, setuptools, fastnumbers, findspark, pyarrow, optimuspyspark\n", + " Found existing installation: numpy 1.14.4\n", + " Uninstalling numpy-1.14.4:\n", + " Successfully uninstalled numpy-1.14.4\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "keras-preprocessing 1.0.2 has requirement keras>=2.1.6, but you'll have keras 2.1.5 which is incompatible.\n", + "keras-applications 1.0.4 has requirement keras>=2.1.6, but you'll have keras 2.1.5 which is incompatible.\n", + "Could not install packages due to an EnvironmentError: [WinError 5] Access is denied: 
'C:\\\\Users\\\\ARGENI~1\\\\AppData\\\\Local\\\\Temp\\\\pip-uninstall-r4y_p5ei\\\\users\\\\argenisleon\\\\appdata\\\\local\\\\continuum\\\\anaconda3\\\\lib\\\\site-packages\\\\numpy\\\\.libs\\\\libopenblas.bnvrk7633hsx7yvo2tadgr4a5kekxjaw.gfortran-win_amd64.dll'\n", + "Consider using the `--user` option or check the permissions.\n", + "\n", + "You are using pip version 10.0.1, however version 18.0 is available.\n", + "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n" + ] + } + ], + "source": [ + "!pip install optimuspyspark" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'optimus'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0moptimus\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mOptimus\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'optimus'" + ] + } + ], + "source": [ + "from optimus import Optimus" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/new-api-sandbox.ipynb b/examples/new-api-sandbox.ipynb index 8fcfe267e..4bfc8625d 100644 --- 
a/examples/new-api-sandbox.ipynb +++ b/examples/new-api-sandbox.ipynb @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -828,42 +828,253 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| id| firstName| lastName|billingId| product|price| birth|dummyCol|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| 1| Luis| Alvarez$$%!| 123| Cake| 10|1980/07/07| never|\n", - "| 2| André| Ampère| 423| piza| 8|1950/07/08| gonna|\n", - "| 3| NiELS| Böhr//((%%| 551| pizza| 8|1990/07/09| give|\n", - "| 4| PAUL| dirac$| 521| pizza| 8|1954/07/10| you|\n", - "| 5| Albert| Einstein| 634| pizza| 8|1990/07/11| up|\n", - "| 6| Galileo| GALiLEI| 672| arepa| 5|1930/08/12| never|\n", - "| 7| CaRL| Ga%%%uss| 323| taco| 3|1970/07/13| gonna|\n", - "| 8| David| H$$$ilbert| 624| taaaccoo| 3|1950/07/14| let|\n", - "| 9| Johannes| KEPLER| 735| taco| 3|1920/04/22| you|\n", - "| 10| JaMES| M$$ax%%well| 875| taco| 3|1923/03/12| down|\n", - "| 11| Isaac| Newton| 992| pasta| 9|1999/02/15| never |\n", - "| 12| Emmy%%| Nöether$| 234| pasta| 9|1993/12/08| gonna|\n", - "| 13| Max!!!| Planck!!!| 111|hamburguer| 4|1994/01/04| run |\n", - "| 14| Fred| Hoy&&&le| 553| pizzza| 8|1997/06/27| around|\n", - "| 15|((( Heinrich )))))| Hertz| 116| pizza| 8|1956/11/30| and|\n", - "| 16| William| Gilbert###| 886| BEER| 2|1958/03/26| desert|\n", - "| 17| Marie| CURIE| 912| Rice| 1|2000/03/22| you|\n", - "| 18| Arthur| COM%%%pton| 812| 110790| 5|1899/01/01| #|\n", - "| 19| JAMES| Chadwick| 467| null| 10|1921/05/03| #|\n", - 
"+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", + "Collecting celery\n", + " Downloading https://files.pythonhosted.org/packages/e8/58/2a0b1067ab2c12131b5c089dfc579467c76402475c5231095e36a43b749c/celery-4.2.1-py2.py3-none-any.whl (401kB)\n", + "Requirement already satisfied: pytz>dev in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from celery) (2018.4)\n", + "Collecting billiard<3.6.0,>=3.5.0.2 (from celery)\n", + " Downloading https://files.pythonhosted.org/packages/87/ac/9b3cc065557ad5769d0626fd5dba0ad1cb40e3a72fe6acd3d081b4ad864e/billiard-3.5.0.4.tar.gz (150kB)\n", + "Collecting kombu<5.0,>=4.2.0 (from celery)\n", + " Downloading https://files.pythonhosted.org/packages/97/61/65838c7da048e56d549e358ac19c0979c892e17dc6186610c49531d35b70/kombu-4.2.1-py2.py3-none-any.whl (177kB)\n", + "Collecting amqp<3.0,>=2.1.4 (from kombu<5.0,>=4.2.0->celery)\n", + " Downloading https://files.pythonhosted.org/packages/7f/cf/12d4611fc67babd4ae250c9e8249c5650ae1933395488e9e7e3562b4ff24/amqp-2.3.2-py2.py3-none-any.whl (48kB)\n", + "Collecting vine>=1.1.3 (from amqp<3.0,>=2.1.4->kombu<5.0,>=4.2.0->celery)\n", + " Downloading https://files.pythonhosted.org/packages/10/50/5b1ebe42843c19f35edb15022ecae339fbec6db5b241a7a13c924dabf2a3/vine-1.1.4-py2.py3-none-any.whl\n", + "Building wheels for collected packages: billiard\n", + " Running setup.py bdist_wheel for billiard: started\n", + " Running setup.py bdist_wheel for billiard: finished with status 'done'\n", + " Stored in directory: C:\\Users\\argenisleon\\AppData\\Local\\pip\\Cache\\wheels\\43\\a5\\a3\\d5513d0baa904819b2ee9c58ff594a0c834cd6fcd19d43d9ce\n", + "Successfully built billiard\n", + "Installing collected packages: billiard, vine, amqp, kombu, celery\n", + "Successfully installed amqp-2.3.2 billiard-3.5.0.4 celery-4.2.1 kombu-4.2.1 vine-1.1.4\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You are using pip 
version 10.0.1, however version 18.0 is available.\n", + "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n" + ] + } + ], + "source": [ + "!pip install celery" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "sys.path.append(\"..\")\n", + "from optimus.tasks import add\n", + "task = add.delay(4, 4)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "task\n", + "from celery import Celery\n", + "app = Celery('tasks', backend='rpc://', broker='pyamqp://')\n", + "result = add.delay(4, 4)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "!pip install " + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<__main__.Request at 0x2cac51e3fd0>" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "['https://httpbin.org/get', 'https://httpbin.org/get']\n" + ] + } + ], + "source": [ + "import requests\n", + "from threading import Timer\n", + "import time\n", + "\n", + "class Queue:\n", + " def __init__(self, seconds, rate):\n", + " self.seconds = seconds\n", + " self.rate = rate\n", + " self.iqueue = []\n", + " t = Timer(1, self.check)\n", + " t.start()\n", + " self.t = t\n", + " \n", + " def request(self, url):\n", + " # Can be call it or need to wait\n", + " self.iqueue.append(Request(url))\n", + " return \n", + " \n", + " #return requests.get(url)\n", + " def check(self):\n", + " for i in iqueue:\n", + " \n", + " print(self.t)\n", + " print(self.iqueue)\n", + " \n", + " \n", + "class Request:\n", + " def __init__(self):\n", + " 
self.flag= True\n", + " def run(): \n", + " while self.flag: \n", + " time.sleep(1)\n", + " \n", + " pass\n", + " \n", + "q = Queue(60, 12)\n", + "q.request(\"https://httpbin.org/get\")\n", + "q.request(\"https://httpbin.org/get\")" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'RDD' object has no attribute 'forEachPartition'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrdd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mforEachPartition\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m: 'RDD' object has no attribute 'forEachPartition'" + ] + } + ], + "source": [ + "def f(iterator):\n", + " for x in iterator:\n", + " print(x)\n", + " \n", + "df.rdd.forEachPartition(f)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [], + "source": [ + "rdd1 = op.sc.parallelize([1, 2, 3, 4, 5])\n", + "rdd1.foreachPartition(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "['https://httpbin.org/get', 'https://httpbin.org/get']\n", + "\n", + "['https://httpbin.org/get', 'https://httpbin.org/get']\n" + ] + } + ], + "source": [ + "q.check()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "queue = Queue(60, 1)\n", + "def func(val, args):\n", + " r = queue.request(\"https://httpbin.org/get\")\n", + " return(r.text)\n", + " \n", + "df1= df.limit(1)\n", + "df1.cols.apply(\"dummyCol\", func,\"str\").table()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"args\": {}, \n", + " \"headers\": {\n", + " \"Accept\": \"*/*\", \n", + " \"Accept-Encoding\": \"gzip, deflate\", \n", + " \"Connection\": \"close\", \n", + " \"Host\": \"httpbin.org\", \n", + " \"User-Agent\": \"python-requests/2.18.4\"\n", + " }, \n", + " \"origin\": \"201.141.36.106\", \n", + " \"url\": \"https://httpbin.org/get\"\n", + "}\n", "\n" ] } ], "source": [ - "df.show()" + "r = requests.get(\"https://httpbin.org/get\")\n", + "print(r.text)" ] }, { diff --git a/optimus/profiler/templates/__init__.py b/optimus/profiler/templates/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/optimus/tasks.py b/optimus/tasks.py new file mode 100644 index 000000000..45f04299c --- /dev/null +++ b/optimus/tasks.py @@ -0,0 +1,11 @@ +from celery import Celery + +app = Celery('tasks', broker='pyamqp://guest@localhost//') + +# To run +# Install erglang +# Install rabiitmq +# >> celery -A tasks worker --loglevel=info +@app.task +def add(x, y): + return x + y diff --git a/requirements.txt b/requirements.txt index 04f530ce5..f09e1cd89 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +celery fastnumbers==2.1.1 multipledispatch==0.6.0 python_dateutil==2.7.3 From e4289ba98915cc8eed299f043caca0302299df12 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Wed, 29 Aug 2018 16:07:42 -0500 Subject: [PATCH 02/94] Testing Redis --- examples/Untitled.ipynb | 159 -- examples/new-api-sandbox.ipynb | 3481 +------------------------------- 2 files changed, 34 insertions(+), 3606 deletions(-) delete mode 100644 
examples/Untitled.ipynb diff --git a/examples/Untitled.ipynb b/examples/Untitled.ipynb deleted file mode 100644 index 96808952e..000000000 --- a/examples/Untitled.ipynb +++ /dev/null @@ -1,159 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting optimuspyspark\n", - " Downloading https://files.pythonhosted.org/packages/ea/25/acae4eeaaa82b2e97a0e48b329e897b9fed09c43a46009d8d8dcea6c9985/optimuspyspark-2.0.4-py3-none-any.whl (65kB)\n", - "Requirement already satisfied: pygments>=2.2.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (2.2.0)\n", - "Collecting tensorflow==1.6.0 (from optimuspyspark)\n", - " Using cached https://files.pythonhosted.org/packages/56/7d/a0e3ae33e8034be8e7d6b99a7f512c6e71b2180603fc3e0cfb6047b7374f/tensorflow-1.6.0-cp36-cp36m-win_amd64.whl\n", - "Collecting keras==2.1.5 (from optimuspyspark)\n", - " Downloading https://files.pythonhosted.org/packages/ba/65/e4aff762b8696ec0626a6654b1e73b396fcc8b7cc6b98d78a1bc53b85b48/Keras-2.1.5-py2.py3-none-any.whl (334kB)\n", - "Collecting setuptools==40.2.0 (from optimuspyspark)\n", - " Downloading https://files.pythonhosted.org/packages/66/e8/570bb5ca88a8bcd2a1db9c6246bb66615750663ffaaeada95b04ffe74e12/setuptools-40.2.0-py2.py3-none-any.whl (568kB)\n", - "Requirement already satisfied: matplotlib==2.2.3 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (2.2.3)\n", - "Requirement already satisfied: h5py>=2.7.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (2.7.1)\n", - "Requirement already satisfied: pytest==3.7.2 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (3.7.2)\n", - "Collecting numpy==1.15.1 (from optimuspyspark)\n", - " Downloading 
https://files.pythonhosted.org/packages/fb/7d/f8b97d97809f184d90faf320fa8e2e7eac994844c5e6c57adbed1283e9e9/numpy-1.15.1-cp36-none-win_amd64.whl (13.5MB)\n", - "Requirement already satisfied: ipython==6.5.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (6.5.0)\n", - "Collecting fastnumbers==2.1.1 (from optimuspyspark)\n", - " Downloading https://files.pythonhosted.org/packages/7c/5d/a80733c011cc683de70de5d8c9f5e481cc986428519d14849edfcabb95ae/fastnumbers-2.1.1-cp36-cp36m-win_amd64.whl\n", - "Requirement already satisfied: nose==1.3.7 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (1.3.7)\n", - "Requirement already satisfied: pyspark==2.3.1 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (2.3.1)\n", - "Requirement already satisfied: tabulate==0.8.2 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (0.8.2)\n", - "Requirement already satisfied: seaborn==0.9.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (0.9.0)\n", - "Requirement already satisfied: Jinja2==2.10 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (2.10)\n", - "Requirement already satisfied: six>=1.10.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (1.11.0)\n", - "Requirement already satisfied: python-dateutil==2.7.3 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (2.7.3)\n", - "Requirement already satisfied: deprecated==1.2.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (1.2.0)\n", - "Collecting findspark==1.3.0 (from optimuspyspark)\n", - " Downloading 
https://files.pythonhosted.org/packages/b1/c8/e6e1f6a303ae5122dc28d131b5a67c5eb87cbf8f7ac5b9f87764ea1b1e1e/findspark-1.3.0-py2.py3-none-any.whl\n", - "Requirement already satisfied: multipledispatch==0.6.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (0.6.0)\n", - "Requirement already satisfied: pillow==5.2.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (5.2.0)\n", - "Collecting pyarrow==0.10.0 (from optimuspyspark)\n", - " Downloading https://files.pythonhosted.org/packages/42/58/4d29b02e9b422cc65f65c1afe5c4710d7e1827c2c1f651bcc19cec042d92/pyarrow-0.10.0-cp36-cp36m-win_amd64.whl (3.6MB)\n", - "Requirement already satisfied: flask==1.0.2 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from optimuspyspark) (1.0.2)\n", - "Collecting tensorboard<1.7.0,>=1.6.0 (from tensorflow==1.6.0->optimuspyspark)\n", - " Using cached https://files.pythonhosted.org/packages/b0/67/a8c91665987d359211dcdca5c8b2a7c1e0876eb0702a4383c1e4ff76228d/tensorboard-1.6.0-py3-none-any.whl\n", - "Requirement already satisfied: protobuf>=3.4.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from tensorflow==1.6.0->optimuspyspark) (3.6.1)\n", - "Requirement already satisfied: wheel>=0.26 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from tensorflow==1.6.0->optimuspyspark) (0.31.1)\n", - "Requirement already satisfied: termcolor>=1.1.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from tensorflow==1.6.0->optimuspyspark) (1.1.0)\n", - "Requirement already satisfied: gast>=0.2.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from tensorflow==1.6.0->optimuspyspark) (0.2.0)\n", - "Requirement already satisfied: absl-py>=0.1.6 in 
c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from tensorflow==1.6.0->optimuspyspark) (0.4.0)\n", - "Requirement already satisfied: astor>=0.6.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from tensorflow==1.6.0->optimuspyspark) (0.7.1)\n", - "Requirement already satisfied: grpcio>=1.8.6 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from tensorflow==1.6.0->optimuspyspark) (1.14.1)\n", - "Requirement already satisfied: pyyaml in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from keras==2.1.5->optimuspyspark) (3.12)\n", - "Requirement already satisfied: scipy>=0.14 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from keras==2.1.5->optimuspyspark) (1.1.0)\n", - "Requirement already satisfied: cycler>=0.10 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from matplotlib==2.2.3->optimuspyspark) (0.10.0)\n", - "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from matplotlib==2.2.3->optimuspyspark) (2.2.0)\n", - "Requirement already satisfied: pytz in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from matplotlib==2.2.3->optimuspyspark) (2018.4)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from matplotlib==2.2.3->optimuspyspark) (1.0.1)\n", - "Requirement already satisfied: py>=1.5.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from pytest==3.7.2->optimuspyspark) (1.5.3)\n", - "Requirement already satisfied: attrs>=17.4.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from pytest==3.7.2->optimuspyspark) (18.1.0)\n", - "Requirement already 
satisfied: more-itertools>=4.0.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from pytest==3.7.2->optimuspyspark) (4.1.0)\n", - "Requirement already satisfied: atomicwrites>=1.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from pytest==3.7.2->optimuspyspark) (1.1.5)\n", - "Requirement already satisfied: pluggy>=0.7 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from pytest==3.7.2->optimuspyspark) (0.7.1)\n", - "Requirement already satisfied: colorama in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from pytest==3.7.2->optimuspyspark) (0.3.9)\n", - "Requirement already satisfied: decorator in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from ipython==6.5.0->optimuspyspark) (4.3.0)\n", - "Requirement already satisfied: jedi>=0.10 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from ipython==6.5.0->optimuspyspark) (0.12.0)\n", - "Requirement already satisfied: traitlets>=4.2 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from ipython==6.5.0->optimuspyspark) (4.3.2)\n", - "Requirement already satisfied: pickleshare in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from ipython==6.5.0->optimuspyspark) (0.7.4)\n", - "Requirement already satisfied: prompt-toolkit<2.0.0,>=1.0.15 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from ipython==6.5.0->optimuspyspark) (1.0.15)\n", - "Requirement already satisfied: backcall in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from ipython==6.5.0->optimuspyspark) (0.1.0)\n", - "Requirement already satisfied: simplegeneric>0.8 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from ipython==6.5.0->optimuspyspark) (0.8.1)\n", - 
"Requirement already satisfied: py4j==0.10.7 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from pyspark==2.3.1->optimuspyspark) (0.10.7)\n", - "Requirement already satisfied: pandas>=0.15.2 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from seaborn==0.9.0->optimuspyspark) (0.23.0)\n", - "Requirement already satisfied: MarkupSafe>=0.23 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from Jinja2==2.10->optimuspyspark) (1.0)\n", - "Requirement already satisfied: wrapt<2,>=1 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from deprecated==1.2.0->optimuspyspark) (1.10.11)\n", - "Requirement already satisfied: Werkzeug>=0.14 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from flask==1.0.2->optimuspyspark) (0.14.1)\n", - "Requirement already satisfied: click>=5.1 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from flask==1.0.2->optimuspyspark) (6.7)\n", - "Requirement already satisfied: itsdangerous>=0.24 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from flask==1.0.2->optimuspyspark) (0.24)\n", - "Requirement already satisfied: bleach==1.5.0 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from tensorboard<1.7.0,>=1.6.0->tensorflow==1.6.0->optimuspyspark) (1.5.0)\n", - "Requirement already satisfied: markdown>=2.6.8 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from tensorboard<1.7.0,>=1.6.0->tensorflow==1.6.0->optimuspyspark) (2.6.11)\n", - "Requirement already satisfied: html5lib==0.9999999 in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from tensorboard<1.7.0,>=1.6.0->tensorflow==1.6.0->optimuspyspark) (0.9999999)\n", - "Requirement already satisfied: parso>=0.2.0 in 
c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from jedi>=0.10->ipython==6.5.0->optimuspyspark) (0.2.0)\n", - "Requirement already satisfied: ipython_genutils in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from traitlets>=4.2->ipython==6.5.0->optimuspyspark) (0.2.0)\n", - "Requirement already satisfied: wcwidth in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from prompt-toolkit<2.0.0,>=1.0.15->ipython==6.5.0->optimuspyspark) (0.1.7)\n", - "Installing collected packages: numpy, tensorboard, tensorflow, keras, setuptools, fastnumbers, findspark, pyarrow, optimuspyspark\n", - " Found existing installation: numpy 1.14.4\n", - " Uninstalling numpy-1.14.4:\n", - " Successfully uninstalled numpy-1.14.4\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "keras-preprocessing 1.0.2 has requirement keras>=2.1.6, but you'll have keras 2.1.5 which is incompatible.\n", - "keras-applications 1.0.4 has requirement keras>=2.1.6, but you'll have keras 2.1.5 which is incompatible.\n", - "Could not install packages due to an EnvironmentError: [WinError 5] Access is denied: 'C:\\\\Users\\\\ARGENI~1\\\\AppData\\\\Local\\\\Temp\\\\pip-uninstall-r4y_p5ei\\\\users\\\\argenisleon\\\\appdata\\\\local\\\\continuum\\\\anaconda3\\\\lib\\\\site-packages\\\\numpy\\\\.libs\\\\libopenblas.bnvrk7633hsx7yvo2tadgr4a5kekxjaw.gfortran-win_amd64.dll'\n", - "Consider using the `--user` option or check the permissions.\n", - "\n", - "You are using pip version 10.0.1, however version 18.0 is available.\n", - "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n" - ] - } - ], - "source": [ - "!pip install optimuspyspark" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'optimus'", - "output_type": "error", - "traceback": [ - 
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0moptimus\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mOptimus\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'optimus'" - ] - } - ], - "source": [ - "from optimus import Optimus" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/new-api-sandbox.ipynb b/examples/new-api-sandbox.ipynb index 4bfc8625d..175571fe9 100644 --- a/examples/new-api-sandbox.ipynb +++ b/examples/new-api-sandbox.ipynb @@ -828,208 +828,49 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting celery\n", - " Downloading https://files.pythonhosted.org/packages/e8/58/2a0b1067ab2c12131b5c089dfc579467c76402475c5231095e36a43b749c/celery-4.2.1-py2.py3-none-any.whl (401kB)\n", - "Requirement already satisfied: pytz>dev in c:\\users\\argenisleon\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages (from celery) (2018.4)\n", - "Collecting billiard<3.6.0,>=3.5.0.2 (from celery)\n", - " Downloading https://files.pythonhosted.org/packages/87/ac/9b3cc065557ad5769d0626fd5dba0ad1cb40e3a72fe6acd3d081b4ad864e/billiard-3.5.0.4.tar.gz 
(150kB)\n", - "Collecting kombu<5.0,>=4.2.0 (from celery)\n", - " Downloading https://files.pythonhosted.org/packages/97/61/65838c7da048e56d549e358ac19c0979c892e17dc6186610c49531d35b70/kombu-4.2.1-py2.py3-none-any.whl (177kB)\n", - "Collecting amqp<3.0,>=2.1.4 (from kombu<5.0,>=4.2.0->celery)\n", - " Downloading https://files.pythonhosted.org/packages/7f/cf/12d4611fc67babd4ae250c9e8249c5650ae1933395488e9e7e3562b4ff24/amqp-2.3.2-py2.py3-none-any.whl (48kB)\n", - "Collecting vine>=1.1.3 (from amqp<3.0,>=2.1.4->kombu<5.0,>=4.2.0->celery)\n", - " Downloading https://files.pythonhosted.org/packages/10/50/5b1ebe42843c19f35edb15022ecae339fbec6db5b241a7a13c924dabf2a3/vine-1.1.4-py2.py3-none-any.whl\n", - "Building wheels for collected packages: billiard\n", - " Running setup.py bdist_wheel for billiard: started\n", - " Running setup.py bdist_wheel for billiard: finished with status 'done'\n", - " Stored in directory: C:\\Users\\argenisleon\\AppData\\Local\\pip\\Cache\\wheels\\43\\a5\\a3\\d5513d0baa904819b2ee9c58ff594a0c834cd6fcd19d43d9ce\n", - "Successfully built billiard\n", - "Installing collected packages: billiard, vine, amqp, kombu, celery\n", - "Successfully installed amqp-2.3.2 billiard-3.5.0.4 celery-4.2.1 kombu-4.2.1 vine-1.1.4\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "You are using pip version 10.0.1, however version 18.0 is available.\n", - "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n" - ] - } - ], - "source": [ - "!pip install celery" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "sys.path.append(\"..\")\n", - "from optimus.tasks import add\n", - "task = add.delay(4, 4)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "task\n", - "from celery import Celery\n", - "app = Celery('tasks', backend='rpc://', broker='pyamqp://')\n", - "result = add.delay(4, 4)" - 
] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "!pip install " - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "<__main__.Request at 0x2cac51e3fd0>" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "['https://httpbin.org/get', 'https://httpbin.org/get']\n" - ] - } - ], - "source": [ - "import requests\n", - "from threading import Timer\n", - "import time\n", - "\n", - "class Queue:\n", - " def __init__(self, seconds, rate):\n", - " self.seconds = seconds\n", - " self.rate = rate\n", - " self.iqueue = []\n", - " t = Timer(1, self.check)\n", - " t.start()\n", - " self.t = t\n", - " \n", - " def request(self, url):\n", - " # Can be call it or need to wait\n", - " self.iqueue.append(Request(url))\n", - " return \n", - " \n", - " #return requests.get(url)\n", - " def check(self):\n", - " for i in iqueue:\n", - " \n", - " print(self.t)\n", - " print(self.iqueue)\n", - " \n", - " \n", - "class Request:\n", - " def __init__(self):\n", - " self.flag= True\n", - " def run(): \n", - " while self.flag: \n", - " time.sleep(1)\n", - " \n", - " pass\n", - " \n", - "q = Queue(60, 12)\n", - "q.request(\"https://httpbin.org/get\")\n", - "q.request(\"https://httpbin.org/get\")" - ] - }, - { - "cell_type": "code", - "execution_count": 73, + "execution_count": 2, "metadata": {}, "outputs": [ { - "ename": "AttributeError", - "evalue": "'RDD' object has no attribute 'forEachPartition'", + "ename": "ConnectionError", + "evalue": "Error 10061 connecting to localhost:6379. 
No connection could be made because the target machine actively refused it.", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrdd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mforEachPartition\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m: 'RDD' object has no attribute 'forEachPartition'" - ] - } - ], - "source": [ - "def f(iterator):\n", - " for x in iterator:\n", - " print(x)\n", - " \n", - "df.rdd.forEachPartition(f)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [], - "source": [ - "rdd1 = op.sc.parallelize([1, 2, 3, 4, 5])\n", - "rdd1.foreachPartition(f)" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "['https://httpbin.org/get', 'https://httpbin.org/get']\n", - "\n", - "['https://httpbin.org/get', 'https://httpbin.org/get']\n" + "\u001b[1;31mConnectionRefusedError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\redis\\connection.py\u001b[0m in \u001b[0;36mconnect\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 483\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 484\u001b[1;33m \u001b[0msock\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_connect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 485\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\redis\\connection.py\u001b[0m in \u001b[0;36m_connect\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 540\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0merr\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 541\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0merr\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 542\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"socket.getaddrinfo returned an empty list\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\redis\\connection.py\u001b[0m in \u001b[0;36m_connect\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 528\u001b[0m \u001b[1;31m# connect\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 529\u001b[1;33m \u001b[0msock\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msocket_address\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 530\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mConnectionRefusedError\u001b[0m: [WinError 10061] No connection could be made because the target machine actively refused it", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mConnectionError\u001b[0m Traceback (most recent call last)", + 
"\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\redis\\client.py\u001b[0m in \u001b[0;36mexecute_command\u001b[1;34m(self, *args, **options)\u001b[0m\n\u001b[0;32m 666\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 667\u001b[1;33m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 668\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse_response\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconnection\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcommand_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\redis\\connection.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 609\u001b[0m \u001b[1;34m\"Pack and send a command to the Redis server\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 610\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_packed_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpack_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 611\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\redis\\connection.py\u001b[0m in \u001b[0;36msend_packed_command\u001b[1;34m(self, command)\u001b[0m\n\u001b[0;32m 584\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 
585\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 586\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\redis\\connection.py\u001b[0m in \u001b[0;36mconnect\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 488\u001b[0m \u001b[0me\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 489\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_error_message\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0me\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 490\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mConnectionError\u001b[0m: Error 10061 connecting to localhost:6379. 
No connection could be made because the target machine actively refused it.", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mConnectionRefusedError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\redis\\connection.py\u001b[0m in \u001b[0;36mconnect\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 483\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 484\u001b[1;33m \u001b[0msock\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_connect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 485\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\redis\\connection.py\u001b[0m in \u001b[0;36m_connect\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 540\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0merr\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 541\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0merr\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 542\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"socket.getaddrinfo returned an empty list\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\redis\\connection.py\u001b[0m in \u001b[0;36m_connect\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 528\u001b[0m \u001b[1;31m# connect\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 529\u001b[1;33m 
\u001b[0msock\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msocket_address\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 530\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mConnectionRefusedError\u001b[0m: [WinError 10061] No connection could be made because the target machine actively refused it", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mConnectionError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mredis\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0mr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mredis\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mStrictRedis\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhost\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'localhost'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mport\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m6379\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdb\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'foo'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'bar'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\redis\\client.py\u001b[0m in \u001b[0;36mset\u001b[1;34m(self, name, value, ex, px, nx, xx)\u001b[0m\n\u001b[0;32m 1169\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mxx\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1170\u001b[0m \u001b[0mpieces\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'XX'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1171\u001b[1;33m 
\u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'SET'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0mpieces\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1172\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1173\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__setitem__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\redis\\client.py\u001b[0m in \u001b[0;36mexecute_command\u001b[1;34m(self, *args, **options)\u001b[0m\n\u001b[0;32m 671\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mretry_on_timeout\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0me\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mTimeoutError\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 672\u001b[0m \u001b[1;32mraise\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 673\u001b[1;33m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 674\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse_response\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconnection\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcommand_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 675\u001b[0m \u001b[1;32mfinally\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + 
"\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\redis\\connection.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 608\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 609\u001b[0m \u001b[1;34m\"Pack and send a command to the Redis server\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 610\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_packed_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpack_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 611\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 612\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mcan_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\redis\\connection.py\u001b[0m in \u001b[0;36msend_packed_command\u001b[1;34m(self, command)\u001b[0m\n\u001b[0;32m 583\u001b[0m \u001b[1;34m\"Send an already packed command to the Redis server\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 584\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 585\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 586\u001b[0m 
\u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 587\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\redis\\connection.py\u001b[0m in \u001b[0;36mconnect\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 487\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 488\u001b[0m \u001b[0me\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 489\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_error_message\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0me\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 490\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 491\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sock\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msock\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mConnectionError\u001b[0m: Error 10061 connecting to localhost:6379. No connection could be made because the target machine actively refused it." 
] } ], "source": [ - "q.check()" + "import redis\n", + "r = redis.StrictRedis(host='localhost', port=6379, db=0)\n", + "r.set('foo', 'bar')" ] }, { @@ -1038,3261 +879,7 @@ "metadata": {}, "outputs": [], "source": [ - "queue = Queue(60, 1)\n", - "def func(val, args):\n", - " r = queue.request(\"https://httpbin.org/get\")\n", - " return(r.text)\n", - " \n", - "df1= df.limit(1)\n", - "df1.cols.apply(\"dummyCol\", func,\"str\").table()" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"args\": {}, \n", - " \"headers\": {\n", - " \"Accept\": \"*/*\", \n", - " \"Accept-Encoding\": \"gzip, deflate\", \n", - " \"Connection\": \"close\", \n", - " \"Host\": \"httpbin.org\", \n", - " \"User-Agent\": \"python-requests/2.18.4\"\n", - " }, \n", - " \"origin\": \"201.141.36.106\", \n", - " \"url\": \"https://httpbin.org/get\"\n", - "}\n", - "\n" - ] - } - ], - "source": [ - "r = requests.get(\"https://httpbin.org/get\")\n", - "print(r.text)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---+---------+--------+---------+-----------------+-----+----------+----------+-------------+\n", - "| id|firstname|lastname|billingid| product|price| birth| new_date|years_between|\n", - "+---+---------+--------+---------+-----------------+-----+----------+----------+-------------+\n", - "| 10| james| maxwell| 875| taco| 3|1923/03/12|12-03-1923| 95.4355|\n", - "| 11| isaac| newton| 992| pasta| 9|1999/02/15|15-02-1999| 19.5108|\n", - "| 12| emmy| noether| 234| pasta| 9|1993/12/08|08-12-1993| 24.6962|\n", - "| 13| max| planck| 111| hamburguer| 4|1994/01/04|04-01-1994| 24.6237|\n", - "| 14| fred| hoyle| 553| pizza| 8|1997/06/27|27-06-1997| 21.1452|\n", - "| 15| heinrich| hertz| 116| pizza| 8|1956/11/30|30-11-1956| 61.7204|\n", - "| 16| william| gilbert| 886| BEER| 
2|1958/03/26|26-03-1958| 60.3978|\n", - "| 17| marie| curie| 912| Rice| 1|2000/03/22|22-03-2000| 18.4086|\n", - "| 18| arthur| compton| 812|this was a number| 5|1899/01/01|01-01-1899| 119.6317|\n", - "| 19| james|chadwick| 467| null| 10|1921/05/03|03-05-1921| 97.293|\n", - "| 7| carl| gauss| 323| taco| 3|1970/07/13|13-07-1970| 48.0995|\n", - "| 8| david| hilbert| 624| taco| 3|1950/07/14|14-07-1950| 68.0968|\n", - "| 9| johannes| kepler| 735| taco| 3|1920/04/22|22-04-1920| 98.3253|\n", - "+---+---------+--------+---------+-----------------+-----+----------+----------+-------------+\n", - "\n" - ] - } - ], - "source": [ - "# This is a custom function\n", - "def num_to_string(value, arg):\n", - " return \"this was a number\"\n", - " \n", - "df\\\n", - " .rows.sort(\"product\",\"desc\")\\\n", - " .cols.lower([\"firstName\",\"lastName\"])\\\n", - " .cols.date_transform(\"birth\", \"new_date\", \"yyyy/MM/dd\", \"dd-MM-YYYY\")\\\n", - " .cols.years_between(\"birth\", \"years_between\", \"yyyy/MM/dd\")\\\n", - " .cols.remove_accents(\"lastName\")\\\n", - " .cols.remove_special_chars([\"firstName\",\"lastName\"])\\\n", - " .cols.replace(\"product\",\"taaaccoo\",\"taco\")\\\n", - " .cols.replace(\"product\",[\"piza\",\"pizzza\"],\"pizza\")\\\n", - " .rows.drop(df[\"id\"]<7)\\\n", - " .cols.drop(\"dummyCol\")\\\n", - " .cols.rename(str.lower)\\\n", - " .cols.apply_by_dtypes(\"product\",num_to_string,\"string\", data_type=\"integer\")\\\n", - " .cols.trim(\"*\")\\\n", - " .rows.sort(\"id\")\\\n", - " .show()" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| id| firstName| lastName|billingId| product|price| birth|dummyCol|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| 1| Luis| Alvarez$$%!| 123| Cake| 
10|1980/07/07| never|\n", - "| 2| André| Ampère| 423| piza| 8|1950/07/08| gonna|\n", - "| 3| NiELS| Böhr//((%%| 551| pizza| 8|1990/07/09| give|\n", - "| 4| PAUL| dirac$| 521| pizza| 8|1954/07/10| you|\n", - "| 5| Albert| Einstein| 634| pizza| 8|1990/07/11| up|\n", - "| 6| Galileo| GALiLEI| 672| arepa| 5|1930/08/12| never|\n", - "| 7| CaRL| Ga%%%uss| 323| taco| 3|1970/07/13| gonna|\n", - "| 8| David| H$$$ilbert| 624| taaaccoo| 3|1950/07/14| let|\n", - "| 9| Johannes| KEPLER| 735| taco| 3|1920/04/22| you|\n", - "| 10| JaMES| M$$ax%%well| 875| taco| 3|1923/03/12| down|\n", - "| 11| Isaac| Newton| 992| pasta| 9|1999/02/15| never |\n", - "| 12| Emmy%%| Nöether$| 234| pasta| 9|1993/12/08| gonna|\n", - "| 13| Max!!!| Planck!!!| 111|hamburguer| 4|1994/01/04| run |\n", - "| 14| Fred| Hoy&&&le| 553| pizzza| 8|1997/06/27| around|\n", - "| 15|((( Heinrich )))))| Hertz| 116| pizza| 8|1956/11/30| and|\n", - "| 16| William| Gilbert###| 886| BEER| 2|1958/03/26| desert|\n", - "| 17| Marie| CURIE| 912| Rice| 1|2000/03/22| you|\n", - "| 18| Arthur| COM%%%pton| 812| 110790| 5|1899/01/01| #|\n", - "| 19| JAMES| Chadwick| 467| null| 10|1921/05/03| #|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| id| firstName| lastName|billingid| product|price| birth|dummyCol|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| 1| Luis| Alvarez$$%!| 126| Cake| 10|1980/07/07| never|\n", - "| 2| André| Ampère| 426| piza| 8|1950/07/08| gonna|\n", - "| 3| NiELS| Böhr//((%%| 554| pizza| 8|1990/07/09| give|\n", - "| 4| PAUL| dirac$| 524| pizza| 8|1954/07/10| you|\n", - "| 5| Albert| Einstein| 637| pizza| 8|1990/07/11| up|\n", - "| 6| Galileo| GALiLEI| 675| arepa| 5|1930/08/12| never|\n", - "| 7| CaRL| Ga%%%uss| 326| taco| 3|1970/07/13| gonna|\n", - "| 8| David| H$$$ilbert| 627| 
taaaccoo| 3|1950/07/14| let|\n", - "| 9| Johannes| KEPLER| 738| taco| 3|1920/04/22| you|\n", - "| 10| JaMES| M$$ax%%well| 878| taco| 3|1923/03/12| down|\n", - "| 11| Isaac| Newton| 995| pasta| 9|1999/02/15| never |\n", - "| 12| Emmy%%| Nöether$| 237| pasta| 9|1993/12/08| gonna|\n", - "| 13| Max!!!| Planck!!!| 114|hamburguer| 4|1994/01/04| run |\n", - "| 14| Fred| Hoy&&&le| 556| pizzza| 8|1997/06/27| around|\n", - "| 15|((( Heinrich )))))| Hertz| 119| pizza| 8|1956/11/30| and|\n", - "| 16| William| Gilbert###| 889| BEER| 2|1958/03/26| desert|\n", - "| 17| Marie| CURIE| 915| Rice| 1|2000/03/22| you|\n", - "| 18| Arthur| COM%%%pton| 815| 110790| 5|1899/01/01| #|\n", - "| 19| JAMES| Chadwick| 470| null| 10|1921/05/03| #|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "\n" - ] - } - ], - "source": [ - "df.show()\n", - "def func(value, args):\n", - " return value +args[0] + args[1]\n", - "\n", - "\n", - "df.cols.apply(\"billingid\",func,\"int\", [1,2]).show()\n", - "\n", - "#df.cols.apply([\"num\", \"new_col_1\"], func, \"int\", 32 ,\"udf\").table()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| id| firstName| lastName|billingid| product|price| birth|dummyCol|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| 1| Luis| Alvarez$$%!| 6.15| Cake| 10|1980/07/07| never|\n", - "| 2| André| Ampère| 21.15| piza| 8|1950/07/08| gonna|\n", - "| 3| NiELS| Böhr//((%%| 27.55| pizza| 8|1990/07/09| give|\n", - "| 4| PAUL| dirac$| 26.05| pizza| 8|1954/07/10| you|\n", - "| 5| Albert| Einstein| 31.7| pizza| 8|1990/07/11| up|\n", - "| 6| Galileo| GALiLEI| 33.6| arepa| 5|1930/08/12| never|\n", - "| 7| CaRL| Ga%%%uss| 16.15| taco| 3|1970/07/13| gonna|\n", - "| 
8| David| H$$$ilbert| 31.2| taaaccoo| 3|1950/07/14| let|\n", - "| 9| Johannes| KEPLER| 36.75| taco| 3|1920/04/22| you|\n", - "| 10| JaMES| M$$ax%%well| 43.75| taco| 3|1923/03/12| down|\n", - "| 11| Isaac| Newton| 49.6| pasta| 9|1999/02/15| never |\n", - "| 12| Emmy%%| Nöether$| 11.7| pasta| 9|1993/12/08| gonna|\n", - "| 13| Max!!!| Planck!!!| 5.55|hamburguer| 4|1994/01/04| run |\n", - "| 14| Fred| Hoy&&&le| 27.65| pizzza| 8|1997/06/27| around|\n", - "| 15|((( Heinrich )))))| Hertz| 5.8| pizza| 8|1956/11/30| and|\n", - "| 16| William| Gilbert###| 44.3| BEER| 2|1958/03/26| desert|\n", - "| 17| Marie| CURIE| 45.6| Rice| 1|2000/03/22| you|\n", - "| 18| Arthur| COM%%%pton| 40.6| 110790| 5|1899/01/01| #|\n", - "| 19| JAMES| Chadwick| 23.35| null| 10|1921/05/03| #|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "\n" - ] - } - ], - "source": [ - "from pyspark.sql import functions as F\n", - "\n", - "def func(col_name, args):\n", - " return F.col(col_name)/20\n", - "\n", - "df.cols.apply_expr(\"billingid\", func, 20).show()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Row(double=1, single=1),\n", - " Row(double=4, single=2),\n", - " Row(double=9, single=3),\n", - " Row(double=16, single=4),\n", - " Row(double=25, single=5)]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pyspark.sql import SparkSession\n", - "\n", - "spark = SparkSession \\\n", - " .builder \\\n", - " .appName(\"Python Spark SQL basic example\") \\\n", - " .config(\"spark.some.config.option\", \"some-value\") \\\n", - " .getOrCreate()\n", - "\n", - "sc = spark.sparkContext\n", - "\n", - "spark.createDataFrame(sc.parallelize(range(1, 6)).map(lambda i: Row(single=i, double=i ** 2))).collect()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cheat Sheet" - ] - }, - { 
- "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "df = op.create.df(\n", - " [\n", - " (\"words\", \"str\", True),\n", - " (\"num\", \"int\", True),\n", - " ],\n", - " [\n", - " (\" I like fish \", 1),\n", - " (\" zombies\", 2, ),\n", - " (\"simpsons cat lady\", 2),\n", - " (None, 3)\n", - " ])" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
Viewing 9 of 9
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
id
\n", - "
(int)
\n", - "\n", - "
\n", - "
firstName
\n", - "
(string)
\n", - "\n", - "
\n", - "
lastName
\n", - "
(string)
\n", - "\n", - "
\n", - "
billingId
\n", - "
(int)
\n", - "\n", - "
\n", - "
product
\n", - "
(string)
\n", - "\n", - "
\n", - "
price
\n", - "
(int)
\n", - "\n", - "
\n", - "
birth
\n", - "
(string)
\n", - "\n", - "
\n", - "
dummyCol
\n", - "
(string)
\n", - "\n", - "
\n", - " 2\n", - " \n", - " André\n", - " \n", - " Ampère\n", - " \n", - " 423\n", - " \n", - " piza\n", - " \n", - " 8\n", - " \n", - " 1950/07/08\n", - " \n", - " gonna\n", - "
\n", - " 7\n", - " \n", - " CaRL\n", - " \n", - " Ga%%%uss\n", - " \n", - " 323\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1970/07/13\n", - " \n", - " gonna\n", - "
\n", - " 9\n", - " \n", - " Johannes\n", - " \n", - " KEPLER\n", - " \n", - " 735\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1920/04/22\n", - " \n", - " you\n", - "
\n", - " 10\n", - " \n", - " JaMES\n", - " \n", - " M$$ax%%well\n", - " \n", - " 875\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1923/03/12\n", - " \n", - " down\n", - "
\n", - " 11\n", - " \n", - " Isaac\n", - " \n", - " Newton\n", - " \n", - " 992\n", - " \n", - " pasta\n", - " \n", - " 9\n", - " \n", - " 1999/02/15\n", - " \n", - " never⸱\n", - "
\n", - " 14\n", - " \n", - " Fred\n", - " \n", - " Hoy&&&le\n", - " \n", - " 553\n", - " \n", - " pizzza\n", - " \n", - " 8\n", - " \n", - " 1997/06/27\n", - " \n", - " around\n", - "
\n", - " 16\n", - " \n", - " William\n", - " \n", - " Gilbert###\n", - " \n", - " 886\n", - " \n", - " BEER\n", - " \n", - " 2\n", - " \n", - " 1958/03/26\n", - " \n", - " desert\n", - "
\n", - " 17\n", - " \n", - " Marie\n", - " \n", - " CURIE\n", - " \n", - " 912\n", - " \n", - " Rice\n", - " \n", - " 1\n", - " \n", - " 2000/03/22\n", - " \n", - " you\n", - "
\n", - " 18\n", - " \n", - " Arthur\n", - " \n", - " COM%%%pton\n", - " \n", - " 812\n", - " \n", - " 110790\n", - " \n", - " 5\n", - " \n", - " 1899/01/01\n", - " \n", - " #\n", - "
\n", - "
Viewing 9 of 9
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df.sample_by(10, False).table()" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------+--------------------+---------+\n", - "| firstName| lastName|billingId|\n", - "+--------------------+--------------------+---------+\n", - "| Luis| Alvarez$$%!| 123|\n", - "| André| Ampère| 423|\n", - "| NiELS| Böhr//((%%| 551|\n", - "| PAUL| dirac$| 521|\n", - "| Albert| Einstein| 634|\n", - "| Galileo| GALiLEI| 672|\n", - "| CaRL| Ga%%%uss| 323|\n", - "| David| H$$$ilbert| 624|\n", - "| Johannes| KEPLER| 735|\n", - "| JaMES| M$$ax%%well| 875|\n", - "| Isaac| Newton| 992|\n", - "| Emmy%%| Nöether$| 234|\n", - "| Max!!!| Planck!!!| 111|\n", - "| Fred| Hoy&&&le| 553|\n", - "|((( Heinrich )))))| Hertz| 116|\n", - "| William| Gilbert###| 886|\n", - "| Marie| CURIE| 912|\n", - "| Arthur| COM%%%pton| 812|\n", - "| JAMES| Chadwick| 467|\n", - "+--------------------+--------------------+---------+\n", - "\n" - ] - } - ], - "source": [ - "df.cols.select([\"firstName\",2,3]).show()" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "value = [\"1\"]\n", - "bool(value) and isinstance(value, list) and all(isinstance(elem, (int, str)) for elem in value)" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "DataFrame[id: int, firstName: string, lastName: string, billingId: int, product: string, price: int, birth: string, dummyCol: string]" - ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - 
"execution_count": 85, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
Viewing 19 of 19
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - "
\n", - "
id
\n", - "
(int)
\n", - "\n", - "
\n", - "
firstName
\n", - "
(string)
\n", - "\n", - "
\n", - "
lastName
\n", - "
(string)
\n", - "\n", - "
\n", - "
billingId
\n", - "
(int)
\n", - "\n", - "
\n", - "
product
\n", - "
(string)
\n", - "\n", - "
\n", - "
price
\n", - "
(int)
\n", - "\n", - "
\n", - "
birth
\n", - "
(string)
\n", - "\n", - "
\n", - "
dummyCol
\n", - "
(string)
\n", - "\n", - "
\n", - "
b
\n", - "
(double)
\n", - "\n", - "
\n", - " 1\n", - " \n", - " Luis\n", - " \n", - " Alvarez$$%!\n", - " \n", - " 123\n", - " \n", - " Cake\n", - " \n", - " 10\n", - " \n", - " 1980/07/07\n", - " \n", - " never\n", - " \n", - " 0.0\n", - "
\n", - " 2\n", - " \n", - " André\n", - " \n", - " Ampère\n", - " \n", - " 423\n", - " \n", - " piza\n", - " \n", - " 8\n", - " \n", - " 1950/07/08\n", - " \n", - " gonna\n", - " \n", - " 0.0\n", - "
\n", - " 3\n", - " \n", - " NiELS\n", - " \n", - " Böhr//((%%\n", - " \n", - " 551\n", - " \n", - " pizza\n", - " \n", - " 8\n", - " \n", - " 1990/07/09\n", - " \n", - " give\n", - " \n", - " 1.0\n", - "
\n", - " 4\n", - " \n", - " PAUL\n", - " \n", - " dirac$\n", - " \n", - " 521\n", - " \n", - " pizza\n", - " \n", - " 8\n", - " \n", - " 1954/07/10\n", - " \n", - " you\n", - " \n", - " 1.0\n", - "
\n", - " 5\n", - " \n", - " Albert\n", - " \n", - " Einstein\n", - " \n", - " 634\n", - " \n", - " pizza\n", - " \n", - " 8\n", - " \n", - " 1990/07/11\n", - " \n", - " up\n", - " \n", - " 1.0\n", - "
\n", - " 6\n", - " \n", - " Galileo\n", - " \n", - " ⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱GALiLEI\n", - " \n", - " 672\n", - " \n", - " arepa\n", - " \n", - " 5\n", - " \n", - " 1930/08/12\n", - " \n", - " never\n", - " \n", - " 2.0\n", - "
\n", - " 7\n", - " \n", - " CaRL\n", - " \n", - " Ga%%%uss\n", - " \n", - " 323\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1970/07/13\n", - " \n", - " gonna\n", - " \n", - " 0.0\n", - "
\n", - " 8\n", - " \n", - " David\n", - " \n", - " H$$$ilbert\n", - " \n", - " 624\n", - " \n", - " taaaccoo\n", - " \n", - " 3\n", - " \n", - " 1950/07/14\n", - " \n", - " let\n", - " \n", - " 1.0\n", - "
\n", - " 9\n", - " \n", - " Johannes\n", - " \n", - " KEPLER\n", - " \n", - " 735\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1920/04/22\n", - " \n", - " you\n", - " \n", - " 2.0\n", - "
\n", - " 10\n", - " \n", - " JaMES\n", - " \n", - " M$$ax%%well\n", - " \n", - " 875\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1923/03/12\n", - " \n", - " down\n", - " \n", - " 2.0\n", - "
\n", - " 11\n", - " \n", - " Isaac\n", - " \n", - " Newton\n", - " \n", - " 992\n", - " \n", - " pasta\n", - " \n", - " 9\n", - " \n", - " 1999/02/15\n", - " \n", - " never⸱\n", - " \n", - " 2.0\n", - "
\n", - " 12\n", - " \n", - " Emmy%%\n", - " \n", - " Nöether$\n", - " \n", - " 234\n", - " \n", - " pasta\n", - " \n", - " 9\n", - " \n", - " 1993/12/08\n", - " \n", - " gonna\n", - " \n", - " 0.0\n", - "
\n", - " 13\n", - " \n", - " Max!!!\n", - " \n", - " Planck!!!\n", - " \n", - " 111\n", - " \n", - " hamburguer\n", - " \n", - " 4\n", - " \n", - " 1994/01/04\n", - " \n", - " run⸱\n", - " \n", - " 0.0\n", - "
\n", - " 14\n", - " \n", - " Fred\n", - " \n", - " Hoy&&&le\n", - " \n", - " 553\n", - " \n", - " pizzza\n", - " \n", - " 8\n", - " \n", - " 1997/06/27\n", - " \n", - " around\n", - " \n", - " 1.0\n", - "
\n", - " 15\n", - " \n", - " (((⸱⸱⸱Heinrich⸱)))))\n", - " \n", - " Hertz\n", - " \n", - " 116\n", - " \n", - " pizza\n", - " \n", - " 8\n", - " \n", - " 1956/11/30\n", - " \n", - " and\n", - " \n", - " 0.0\n", - "
\n", - " 16\n", - " \n", - " William\n", - " \n", - " Gilbert###\n", - " \n", - " 886\n", - " \n", - " BEER\n", - " \n", - " 2\n", - " \n", - " 1958/03/26\n", - " \n", - " desert\n", - " \n", - " 2.0\n", - "
\n", - " 17\n", - " \n", - " Marie\n", - " \n", - " CURIE\n", - " \n", - " 912\n", - " \n", - " Rice\n", - " \n", - " 1\n", - " \n", - " 2000/03/22\n", - " \n", - " you\n", - " \n", - " 2.0\n", - "
\n", - " 18\n", - " \n", - " Arthur\n", - " \n", - " COM%%%pton\n", - " \n", - " 812\n", - " \n", - " 110790\n", - " \n", - " 5\n", - " \n", - " 1899/01/01\n", - " \n", - " #\n", - " \n", - " 2.0\n", - "
\n", - " 19\n", - " \n", - " JAMES\n", - " \n", - " Chadwick\n", - " \n", - " 467\n", - " \n", - " null\n", - " \n", - " 10\n", - " \n", - " 1921/05/03\n", - " \n", - " #\n", - " \n", - " 1.0\n", - "
\n", - "
Viewing 19 of 19
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from pyspark.sql import functions as F\n", - "c = \"firstName\"\n", - "df.cols.qcut(\"billingId\",\"b\",3).table()" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| id| firstName| lastName|billingId| product|price| birth|dummyCol|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| 1| Luis| Alvarez$$%!| 200| Cake| 10|1980/07/07| never|\n", - "| 2| André| Ampère| 400| piza| 8|1950/07/08| gonna|\n", - "| 3| NiELS| Böhr//((%%| 400| pizza| 8|1990/07/09| give|\n", - "| 4| PAUL| dirac$| 400| pizza| 8|1954/07/10| you|\n", - "| 5| Albert| Einstein| 400| pizza| 8|1990/07/11| up|\n", - "| 6| Galileo| GALiLEI| 400| arepa| 5|1930/08/12| never|\n", - "| 7| CaRL| Ga%%%uss| 323| taco| 3|1970/07/13| gonna|\n", - "| 8| David| H$$$ilbert| 400| taaaccoo| 3|1950/07/14| let|\n", - "| 9| Johannes| KEPLER| 400| taco| 3|1920/04/22| you|\n", - "| 10| JaMES| M$$ax%%well| 400| taco| 3|1923/03/12| down|\n", - "| 11| Isaac| Newton| 400| pasta| 9|1999/02/15| never |\n", - "| 12| Emmy%%| Nöether$| 234| pasta| 9|1993/12/08| gonna|\n", - "| 13| Max!!!| Planck!!!| 200|hamburguer| 4|1994/01/04| run |\n", - "| 14| Fred| Hoy&&&le| 400| pizzza| 8|1997/06/27| around|\n", - "| 15|((( Heinrich )))))| Hertz| 200| pizza| 8|1956/11/30| and|\n", - "| 16| William| Gilbert###| 400| BEER| 2|1958/03/26| desert|\n", - "| 17| Marie| CURIE| 400| Rice| 1|2000/03/22| you|\n", - "| 18| Arthur| COM%%%pton| 400| 110790| 5|1899/01/01| #|\n", - "| 19| JAMES| Chadwick| 400| null| 10|1921/05/03| #|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "\n" - ] - } - ], - "source": [ - 
"df.cols.clip(\"billingId\",200, 400).show()" - ] - }, - { - "cell_type": "code", - "execution_count": 89, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| id| firstName| lastName|billingId| product|price| birth|dummyCol|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| 1| Luis| Alvarez$$%!| 123| Cake| 10|1980/07/07| never|\n", - "| 2| André| Ampère| 423| piza| 8|1950/07/08| gonna|\n", - "| 3| NiELS| Böhr//((%%| 551| pizza| 8|1990/07/09| give|\n", - "| 4| PAUL| dirac$| 521| pizza| 8|1954/07/10| you|\n", - "| 5| Albert| Einstein| 634| pizza| 8|1990/07/11| up|\n", - "| 6| Galileo| GALiLEI| 672| arepa| 5|1930/08/12| never|\n", - "| 7| CaRL| Ga%%%uss| 323| taco| 3|1970/07/13| gonna|\n", - "| 8| David| H$$$ilbert| 624| taaaccoo| 3|1950/07/14| let|\n", - "| 9| Johannes| KEPLER| 735| taco| 3|1920/04/22| you|\n", - "| 10| JaMES| M$$ax%%well| 875| taco| 3|1923/03/12| down|\n", - "| 11| Isaac| Newton| 992| pasta| 9|1999/02/15| never |\n", - "| 12| Emmy%%| Nöether$| 234| pasta| 9|1993/12/08| gonna|\n", - "| 13| Max!!!| Planck!!!| 111|hamburguer| 4|1994/01/04| run |\n", - "| 14| Fred| Hoy&&&le| 553| pizzza| 8|1997/06/27| around|\n", - "| 15|((( Heinrich )))))| Hertz| 116| pizza| 8|1956/11/30| and|\n", - "| 16| William| Gilbert###| 886| BEER| 2|1958/03/26| desert|\n", - "| 17| Marie| CURIE| 912| Rice| 1|2000/03/22| you|\n", - "| 18| Arthur| COM%%%pton| 812| 110790| 5|1899/01/01| #|\n", - "| 19| JAMES| Chadwick| 467| null| 10|1921/05/03| #|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "\n" - ] - } - ], - "source": [ - "df.cols.abs(\"billingId\").show()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
Viewing 100 of 569
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
label
\n", - "
1 (double)
\n", - "\n", - "
\n", - "
prediction
\n", - "
2 (double)
\n", - "\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - "
Viewing 100 of 569
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from optimus import Optimus\n", - "\n", - "op = Optimus()\n", - "df_cancer =op.load.url(\"https://raw.githubusercontent.com/ironmussa/Optimus/master/tests/data_cancer.csv\")\n", - "\n", - "columns = ['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',\n", - " 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean',\n", - " 'fractal_dimension_mean']\n", - "\n", - "df_model, rf_model = op.ml.random_forest(df_cancer, columns, \"diagnosis\")\n", - "df_model.cols.select([\"label\",\"prediction\"]).table()" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "value = [{'name': 'Asuka 881627', 'id': '4336', 'nametype': 'Valid', 'recclass': 'L6', 'mass (g)': '42.68', 'fall': 'Found', 'year': '01/01/1988 12:00:00 AM', 'reclat': '-72.000000', 'reclong': '26.000000', 'GeoLocation': '(-72.000000, 26.000000)'}, {'name': 'Dhofar 1401', 'id': '35491', 'nametype': 'Valid', 'recclass': 'LL~6', 'mass (g)': '42.03', 'fall': 'Found', 'year': '01/01/2001 12:00:00 AM', 'reclat': '19.191350', 'reclong': '54.655450', 'GeoLocation': '(19.191350, 54.655450)'}, {'name': 'Elephant Moraine 87745', 'id': '8295', 'nametype': 'Valid', 'recclass': 'H5', 'mass (g)': '123', 'fall': 'Found', 'year': '01/01/1987 12:00:00 AM', 'reclat': '-76.183330', 'reclong': '157.166670', 'GeoLocation': '(-76.183330, 157.166670)'}, {'name': 'Frontier Mountain 90153', 'id': '10545', 'nametype': 'Valid', 'recclass': 'H4-6', 'mass (g)': '8.5', 'fall': 'Found', 'year': '01/01/1990 12:00:00 AM', 'reclat': '-72.954040', 'reclong': '160.538110', 'GeoLocation': '(-72.954040, 160.538110)'}, {'name': 'Larkman Nunatak 06750', 'id': '48858', 'nametype': 'Valid', 'recclass': 'L5', 'mass (g)': '66.8', 'fall': 'Found', 'year': '01/01/2006 12:00:00 AM', 'reclat': None, 'reclong': None, 
'GeoLocation': None}, {'name': 'MacAlpine Hills 02539', 'id': '14845', 'nametype': 'Valid', 'recclass': 'LL6', 'mass (g)': '96.6', 'fall': 'Found', 'year': '01/01/2002 12:00:00 AM', 'reclat': '-84.216670', 'reclong': '160.500000', 'GeoLocation': '(-84.216670, 160.500000)'}, {'name': 'Miller Range 07124', 'id': '51180', 'nametype': 'Valid', 'recclass': 'L5', 'mass (g)': '378.6', 'fall': 'Found', 'year': '01/01/2007 12:00:00 AM', 'reclat': '0.000000', 'reclong': '0.000000', 'GeoLocation': '(0.000000, 0.000000)'}, {'name': 'Northwest Africa 4032', 'id': '34304', 'nametype': 'Valid', 'recclass': 'Eucrite-pmict', 'mass (g)': '10.5', 'fall': 'Found', 'year': '01/01/2004 12:00:00 AM', 'reclat': None, 'reclong': None, 'GeoLocation': None}, {'name': 'Northwest Africa 5953', 'id': '50839', 'nametype': 'Valid', 'recclass': 'LL4', 'mass (g)': '1450', 'fall': 'Found', 'year': '01/01/2005 12:00:00 AM', 'reclat': '0.000000', 'reclong': '0.000000', 'GeoLocation': '(0.000000, 0.000000)'}, {'name': 'Pecora Escarpment 91310', 'id': '18601', 'nametype': 'Valid', 'recclass': 'L5', 'mass (g)': '134.69999999999999', 'fall': 'Found', 'year': '01/01/1991 12:00:00 AM', 'reclat': '-85.682450', 'reclong': '-68.745390', 'GeoLocation': '(-85.682450, -68.745390)'}, {'name': 'Tungsten Mountain 006', 'id': '24077', 'nametype': 'Valid', 'recclass': 'L6', 'mass (g)': '8.800000000000001', 'fall': 'Found', 'year': '01/01/2001 12:00:00 AM', 'reclat': '39.684360', 'reclong': '-117.620180', 'GeoLocation': '(39.684360, -117.620180)'}]" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'name': 'Asuka 881627',\n", - " 'id': '4336',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'L6',\n", - " 'mass (g)': '42.68',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/1988 12:00:00 AM',\n", - " 'reclat': '-72.000000',\n", - " 'reclong': '26.000000',\n", - " 'GeoLocation': '(-72.000000, 26.000000)'},\n", - " {'name': 'Dhofar 
1401',\n", - " 'id': '35491',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'LL~6',\n", - " 'mass (g)': '42.03',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2001 12:00:00 AM',\n", - " 'reclat': '19.191350',\n", - " 'reclong': '54.655450',\n", - " 'GeoLocation': '(19.191350, 54.655450)'},\n", - " {'name': 'Elephant Moraine 87745',\n", - " 'id': '8295',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'H5',\n", - " 'mass (g)': '123',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/1987 12:00:00 AM',\n", - " 'reclat': '-76.183330',\n", - " 'reclong': '157.166670',\n", - " 'GeoLocation': '(-76.183330, 157.166670)'},\n", - " {'name': 'Frontier Mountain 90153',\n", - " 'id': '10545',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'H4-6',\n", - " 'mass (g)': '8.5',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/1990 12:00:00 AM',\n", - " 'reclat': '-72.954040',\n", - " 'reclong': '160.538110',\n", - " 'GeoLocation': '(-72.954040, 160.538110)'},\n", - " {'name': 'Larkman Nunatak 06750',\n", - " 'id': '48858',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'L5',\n", - " 'mass (g)': '66.8',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2006 12:00:00 AM',\n", - " 'reclat': None,\n", - " 'reclong': None,\n", - " 'GeoLocation': None},\n", - " {'name': 'MacAlpine Hills 02539',\n", - " 'id': '14845',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'LL6',\n", - " 'mass (g)': '96.6',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2002 12:00:00 AM',\n", - " 'reclat': '-84.216670',\n", - " 'reclong': '160.500000',\n", - " 'GeoLocation': '(-84.216670, 160.500000)'},\n", - " {'name': 'Miller Range 07124',\n", - " 'id': '51180',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'L5',\n", - " 'mass (g)': '378.6',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2007 12:00:00 AM',\n", - " 'reclat': '0.000000',\n", - " 'reclong': '0.000000',\n", - " 'GeoLocation': '(0.000000, 0.000000)'},\n", - " {'name': 'Northwest Africa 4032',\n", - " 'id': '34304',\n", - " 'nametype': 
'Valid',\n", - " 'recclass': 'Eucrite-pmict',\n", - " 'mass (g)': '10.5',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2004 12:00:00 AM',\n", - " 'reclat': None,\n", - " 'reclong': None,\n", - " 'GeoLocation': None},\n", - " {'name': 'Northwest Africa 5953',\n", - " 'id': '50839',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'LL4',\n", - " 'mass (g)': '1450',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2005 12:00:00 AM',\n", - " 'reclat': '0.000000',\n", - " 'reclong': '0.000000',\n", - " 'GeoLocation': '(0.000000, 0.000000)'},\n", - " {'name': 'Pecora Escarpment 91310',\n", - " 'id': '18601',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'L5',\n", - " 'mass (g)': '134.69999999999999',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/1991 12:00:00 AM',\n", - " 'reclat': '-85.682450',\n", - " 'reclong': '-68.745390',\n", - " 'GeoLocation': '(-85.682450, -68.745390)'},\n", - " {'name': 'Tungsten Mountain 006',\n", - " 'id': '24077',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'L6',\n", - " 'mass (g)': '8.800000000000001',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2001 12:00:00 AM',\n", - " 'reclat': '39.684360',\n", - " 'reclong': '-117.620180',\n", - " 'GeoLocation': '(39.684360, -117.620180)'}]" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "value" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [], - "source": [ - "data = []\n", - "for l in value:\n", - " data.append([v for k,v in l.items()])\n", - "result = [{\"columns\":df.columns}, {\"data\":data}]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting fastavro\n", - " Downloading https://files.pythonhosted.org/packages/3a/a5/b357909eb300ae3a8499f1718b3887b379e743553bcc2dc2ed325902072b/fastavro-0.21.4-cp36-cp36m-win_amd64.whl (282kB)\n", - "Installing collected 
packages: fastavro\n", - "Successfully installed fastavro-0.21.4\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "You are using pip version 10.0.1, however version 18.0 is available.\n", - "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n" - ] - } - ], - "source": [ - "!pip install fastavro" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pyspark\\sql\\session.py:360: UserWarning: Using RDD of dict to inferSchema is deprecated. Use pyspark.sql.Row instead\n", - " warnings.warn(\"Using RDD of dict to inferSchema is deprecated. \"\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+----------+--------------------+----------+\n", - "| timestamp| tweet| username|\n", - "+----------+--------------------+----------+\n", - "|1366150681|Rock: Nerf paper,...| miguno|\n", - "|1366154481|Works as intended...|BlizzardCS|\n", - "+----------+--------------------+----------+\n", - "\n" - ] - }, - { - "data": { - "text/plain": [ - "1366150681" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from io import BytesIO\n", - "import fastavro\n", - "\n", - "df = op.sc.binaryFiles(\"twitter.avro\")\\\n", - " .flatMap(lambda args: fastavro.reader(BytesIO(args[1]))).toDF()\n", - "\n", - "df.show()\n", - "# optimus function get the min value\n", - "df.cols.min(\"timestamp\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'station': '011990-99999', 'time': 1433269388, 'temp': 0}\n", - "{'station': '011990-99999', 'time': 1433270389, 'temp': 22}\n", - "{'station': '011990-99999', 'time': 1433273379, 'temp': -11}\n", - "{'station': 
'012650-99999', 'time': 1433275478, 'temp': 111}\n" - ] - } - ], - "source": [ - "from fastavro import writer, reader, parse_schema\n", - "\n", - "schema = {\n", - " 'doc': 'A weather reading.',\n", - " 'name': 'Weather',\n", - " 'namespace': 'test',\n", - " 'type': 'record',\n", - " 'fields': [\n", - " {'name': 'station', 'type': 'string'},\n", - " {'name': 'time', 'type': 'long'},\n", - " {'name': 'temp', 'type': 'int'},\n", - " ],\n", - "}\n", - "parsed_schema = parse_schema(schema)\n", - "\n", - "# 'records' can be an iterable (including generator)\n", - "records = [\n", - " {u'station': u'011990-99999', u'temp': 0, u'time': 1433269388},\n", - " {u'station': u'011990-99999', u'temp': 22, u'time': 1433270389},\n", - " {u'station': u'011990-99999', u'temp': -11, u'time': 1433273379},\n", - " {u'station': u'012650-99999', u'temp': 111, u'time': 1433275478},\n", - "]\n", - "\n", - "# Writing\n", - "with open('weather.avro', 'wb') as out:\n", - " writer(out, parsed_schema, records)\n", - "\n", - "# Reading\n", - "with open('weather.avro', 'rb') as fo:\n", - " for record in reader(fo):\n", - " print(record)" + "r.get('foo')" ] }, { From 87deb11616f8567897e7a62e500082bed16c69ea Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Wed, 29 Aug 2018 22:47:01 -0500 Subject: [PATCH 03/94] Added test to cast to Vector --- tests/test_cols.py | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/tests/test_cols.py b/tests/test_cols.py index e85315685..0f99c55df 100644 --- a/tests/test_cols.py +++ b/tests/test_cols.py @@ -1,7 +1,9 @@ -from optimus import Optimus -from pyspark.sql.types import * +from pyspark.ml.linalg import Vectors, VectorUDT from pyspark.sql import Row from pyspark.sql import functions as F +from pyspark.sql.types import * + +from optimus import Optimus op = Optimus() sc = op.sc @@ -391,6 +393,34 @@ def test_cast_advanced(): assert (actual_df.collect() == expected_df.collect()) + @staticmethod + def 
test_cast_vector(): + source_df = op.create.df( + rows=[ + ("happy", [1, 2, 3]), + ("excited", 2) + ], + cols=[ + ("emotion", ArrayType(), True), + ("num", IntegerType(), True) + ] + ) + + actual_df = source_df.cols.cast("happy", Vectors) + + expected_df = op.create.df( + rows=[ + ("happy",[1, 2, 3]), + ("excited", 2) + ], + cols=[ + ("emotion", VectorUDT(), True), + ("num", StringType(), True) + ] + ) + + assert (actual_df.collect() == expected_df.collect()) + @staticmethod def test_keep(): source_df = op.create.df( From d0650d09ceb6a22e1322beb4fffe74399bb3d57a Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Thu, 30 Aug 2018 01:03:33 -0500 Subject: [PATCH 04/94] Work in progress --- examples/new-api-enrichment.ipynb | 979 ++++++++++++++++++++++++++++++ 1 file changed, 979 insertions(+) create mode 100644 examples/new-api-enrichment.ipynb diff --git a/examples/new-api-enrichment.ipynb b/examples/new-api-enrichment.ipynb new file mode 100644 index 000000000..2f57c5eb7 --- /dev/null +++ b/examples/new-api-enrichment.ipynb @@ -0,0 +1,979 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install Redis for your OS\n", + "pip install redis\n", + "pip install python-redis-rate-limit # https://pypi.org/project/python-redis-rate-limit/\n", + "pip install sparkly\n", + "https://www.scivision.co/python-windows-visual-c++-14-required/\n", + "pip install sparkly[redis]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"..\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from optimus import Optimus" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "op= Optimus()" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "df = op.load.url(\"https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 19 of 19 rows / 8 columns
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
id
\n", + "
1 (int)
\n", + "\n", + "
\n", + "
firstName
\n", + "
2 (string)
\n", + "\n", + "
\n", + "
lastName
\n", + "
3 (string)
\n", + "\n", + "
\n", + "
billingId
\n", + "
4 (int)
\n", + "\n", + "
\n", + "
product
\n", + "
5 (string)
\n", + "\n", + "
\n", + "
price
\n", + "
6 (int)
\n", + "\n", + "
\n", + "
birth
\n", + "
7 (string)
\n", + "\n", + "
\n", + "
dummyCol
\n", + "
8 (string)
\n", + "\n", + "
\n", + " 1\n", + " \n", + " Luis\n", + " \n", + " Alvarez$$%!\n", + " \n", + " 123\n", + " \n", + " Cake\n", + " \n", + " 10\n", + " \n", + " 1980/07/07\n", + " \n", + " never\n", + "
\n", + " 2\n", + " \n", + " André\n", + " \n", + " Ampère\n", + " \n", + " 423\n", + " \n", + " piza\n", + " \n", + " 8\n", + " \n", + " 1950/07/08\n", + " \n", + " gonna\n", + "
\n", + " 3\n", + " \n", + " NiELS\n", + " \n", + " Böhr//((%%\n", + " \n", + " 551\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/09\n", + " \n", + " give\n", + "
\n", + " 4\n", + " \n", + " PAUL\n", + " \n", + " dirac$\n", + " \n", + " 521\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1954/07/10\n", + " \n", + " you\n", + "
\n", + " 5\n", + " \n", + " Albert\n", + " \n", + " Einstein\n", + " \n", + " 634\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/11\n", + " \n", + " up\n", + "
\n", + " 6\n", + " \n", + " Galileo\n", + " \n", + " ⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱GALiLEI\n", + " \n", + " 672\n", + " \n", + " arepa\n", + " \n", + " 5\n", + " \n", + " 1930/08/12\n", + " \n", + " never\n", + "
\n", + " 7\n", + " \n", + " CaRL\n", + " \n", + " Ga%%%uss\n", + " \n", + " 323\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1970/07/13\n", + " \n", + " gonna\n", + "
\n", + " 8\n", + " \n", + " David\n", + " \n", + " H$$$ilbert\n", + " \n", + " 624\n", + " \n", + " taaaccoo\n", + " \n", + " 3\n", + " \n", + " 1950/07/14\n", + " \n", + " let\n", + "
\n", + " 9\n", + " \n", + " Johannes\n", + " \n", + " KEPLER\n", + " \n", + " 735\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1920/04/22\n", + " \n", + " you\n", + "
\n", + " 10\n", + " \n", + " JaMES\n", + " \n", + " M$$ax%%well\n", + " \n", + " 875\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1923/03/12\n", + " \n", + " down\n", + "
\n", + " 11\n", + " \n", + " Isaac\n", + " \n", + " Newton\n", + " \n", + " 992\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1999/02/15\n", + " \n", + " never⸱\n", + "
\n", + " 12\n", + " \n", + " Emmy%%\n", + " \n", + " Nöether$\n", + " \n", + " 234\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1993/12/08\n", + " \n", + " gonna\n", + "
\n", + " 13\n", + " \n", + " Max!!!\n", + " \n", + " Planck!!!\n", + " \n", + " 111\n", + " \n", + " hamburguer\n", + " \n", + " 4\n", + " \n", + " 1994/01/04\n", + " \n", + " run⸱\n", + "
\n", + " 14\n", + " \n", + " Fred\n", + " \n", + " Hoy&&&le\n", + " \n", + " 553\n", + " \n", + " pizzza\n", + " \n", + " 8\n", + " \n", + " 1997/06/27\n", + " \n", + " around\n", + "
\n", + " 15\n", + " \n", + " (((⸱⸱⸱Heinrich⸱)))))\n", + " \n", + " Hertz\n", + " \n", + " 116\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1956/11/30\n", + " \n", + " and\n", + "
\n", + " 16\n", + " \n", + " William\n", + " \n", + " Gilbert###\n", + " \n", + " 886\n", + " \n", + " BEER\n", + " \n", + " 2\n", + " \n", + " 1958/03/26\n", + " \n", + " desert\n", + "
\n", + " 17\n", + " \n", + " Marie\n", + " \n", + " CURIE\n", + " \n", + " 912\n", + " \n", + " Rice\n", + " \n", + " 1\n", + " \n", + " 2000/03/22\n", + " \n", + " you\n", + "
\n", + " 18\n", + " \n", + " Arthur\n", + " \n", + " COM%%%pton\n", + " \n", + " 812\n", + " \n", + " 110790\n", + " \n", + " 5\n", + " \n", + " 1899/01/01\n", + " \n", + " #\n", + "
\n", + " 19\n", + " \n", + " JAMES\n", + " \n", + " Chadwick\n", + " \n", + " 467\n", + " \n", + " null\n", + " \n", + " 10\n", + " \n", + " 1921/05/03\n", + " \n", + " #\n", + "
\n", + "\n", + "
Viewing 19 of 19 rows / 8 columns
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.table()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import redis\n", + "r = redis.StrictRedis(host='localhost', port=6379, db=0)\n", + "# r.set({\"hola\":\"hola\"}, 'bar')\n", + "r.hmset(\"hola\",{\"1\":\"1\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{b'1': b'1'}" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r.hgetall('hola')" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from sparkly import SparklySession\n", + "spark = SparklySession()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# make some test data\n", + "columns = ['id', 'dogs', 'cats']\n", + "vals = [\n", + " (1, 2, 0),\n", + " (2, 0, 1)\n", + "]\n", + "\n", + "# create DataFrame\n", + "df = spark.createDataFrame(vals, columns)\n", + "\n", + "df.write_ext.redis(\n", + " host='localhost',\n", + " port=6379,\n", + " key_by=['id', 'dogs'],\n", + " exclude_key_columns=True,\n", + " expire=24 * 60 * 60,\n", + " compression='gzip',\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "200 OK\n" + ] + } + ], + "source": [ + "from redis_rate_limit import RateLimit, TooManyRequests\n", + "try:\n", + " with RateLimit(resource='users_list', client='localhost', max_requests=10):\n", + " result = '200 OK'\n", + "except TooManyRequests:\n", + " result = '429 Too Many Requests'\n", + "print(result)" + ] 
+ }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 7da28baf43c8be50bd106cadfdb042df189c5719 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Thu, 30 Aug 2018 10:48:29 -0500 Subject: [PATCH 05/94] Remove unnecesary cells --- examples/new-api-profiler.ipynb | 2202 ------------------------------- 1 file changed, 2202 deletions(-) diff --git a/examples/new-api-profiler.ipynb b/examples/new-api-profiler.ipynb index 19c2c8b0c..497ff5202 100644 --- a/examples/new-api-profiler.ipynb +++ b/examples/new-api-profiler.ipynb @@ -1687,2208 +1687,6 @@ "source": [ "df.correlation([\"id\",\"mass (g)\", \"reclat\"], output=\"array\")" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Benchmark " - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "df = op.load.csv(\"order_products__prior.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
Viewing 100 of 32434489 rows / 4 columns
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", 
- " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
order_id
\n", - "
1 (int)
\n", - "\n", - "
\n", - "
product_id
\n", - "
2 (int)
\n", - "\n", - "
\n", - "
add_to_cart_order
\n", - "
3 (int)
\n", - "\n", - "
\n", - "
reordered
\n", - "
4 (int)
\n", - "\n", - "
\n", - " 2\n", - " \n", - " 33120\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 28985\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 9327\n", - " \n", - " 3\n", - " \n", - " 0\n", - "
\n", - " 2\n", - " \n", - " 45918\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 30035\n", - " \n", - " 5\n", - " \n", - " 0\n", - "
\n", - " 2\n", - " \n", - " 17794\n", - " \n", - " 6\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 40141\n", - " \n", - " 7\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 1819\n", - " \n", - " 8\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 43668\n", - " \n", - " 9\n", - " \n", - " 0\n", - "
\n", - " 3\n", - " \n", - " 33754\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 24838\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 17704\n", - " \n", - " 3\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 21903\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 17668\n", - " \n", - " 5\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 46667\n", - " \n", - " 6\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 17461\n", - " \n", - " 7\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 32665\n", - " \n", - " 8\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 46842\n", - " \n", - " 1\n", - " \n", - " 0\n", - "
\n", - " 4\n", - " \n", - " 26434\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 39758\n", - " \n", - " 3\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 27761\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 10054\n", - " \n", - " 5\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 21351\n", - " \n", - " 6\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 22598\n", - " \n", - " 7\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 34862\n", - " \n", - " 8\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 40285\n", - " \n", - " 9\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 17616\n", - " \n", - " 10\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 25146\n", - " \n", - " 11\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 32645\n", - " \n", - " 12\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 41276\n", - " \n", - " 13\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 13176\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 15005\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 47329\n", - " \n", - " 3\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 27966\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 23909\n", - " \n", - " 5\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 48370\n", - " \n", - " 6\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 13245\n", - " \n", - " 7\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 9633\n", - " \n", - " 8\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 27360\n", - " \n", - " 9\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 6348\n", - " \n", - " 10\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 40878\n", - " \n", - " 11\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 6184\n", - " \n", - " 12\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 48002\n", - " \n", - " 13\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 20914\n", - " \n", - " 14\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 37011\n", - " \n", - " 15\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 12962\n", - " \n", - " 16\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 45698\n", - " \n", - " 17\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 24773\n", - " \n", - " 18\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 18569\n", - " \n", - " 19\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 41176\n", - " \n", - " 20\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 48366\n", - " \n", - " 21\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 47209\n", - " \n", - " 22\n", - " \n", - " 0\n", - "
\n", - " 5\n", - " \n", - " 46522\n", - " \n", - " 23\n", - " \n", - " 0\n", - "
\n", - " 5\n", - " \n", - " 38693\n", - " \n", - " 24\n", - " \n", - " 0\n", - "
\n", - " 5\n", - " \n", - " 48825\n", - " \n", - " 25\n", - " \n", - " 0\n", - "
\n", - " 5\n", - " \n", - " 8479\n", - " \n", - " 26\n", - " \n", - " 0\n", - "
\n", - " 6\n", - " \n", - " 40462\n", - " \n", - " 1\n", - " \n", - " 0\n", - "
\n", - " 6\n", - " \n", - " 15873\n", - " \n", - " 2\n", - " \n", - " 0\n", - "
\n", - " 6\n", - " \n", - " 41897\n", - " \n", - " 3\n", - " \n", - " 0\n", - "
\n", - " 7\n", - " \n", - " 34050\n", - " \n", - " 1\n", - " \n", - " 0\n", - "
\n", - " 7\n", - " \n", - " 46802\n", - " \n", - " 2\n", - " \n", - " 0\n", - "
\n", - " 8\n", - " \n", - " 23423\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 21405\n", - " \n", - " 1\n", - " \n", - " 0\n", - "
\n", - " 9\n", - " \n", - " 47890\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 11182\n", - " \n", - " 3\n", - " \n", - " 0\n", - "
\n", - " 9\n", - " \n", - " 2014\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 29193\n", - " \n", - " 5\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 34203\n", - " \n", - " 6\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 14992\n", - " \n", - " 7\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 31506\n", - " \n", - " 8\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 23288\n", - " \n", - " 9\n", - " \n", - " 0\n", - "
\n", - " 9\n", - " \n", - " 44533\n", - " \n", - " 10\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 18362\n", - " \n", - " 11\n", - " \n", - " 0\n", - "
\n", - " 9\n", - " \n", - " 27366\n", - " \n", - " 12\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 432\n", - " \n", - " 13\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 3990\n", - " \n", - " 14\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 14183\n", - " \n", - " 15\n", - " \n", - " 0\n", - "
\n", - " 10\n", - " \n", - " 24852\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 10\n", - " \n", - " 4796\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 10\n", - " \n", - " 31717\n", - " \n", - " 3\n", - " \n", - " 0\n", - "
\n", - " 10\n", - " \n", - " 47766\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 10\n", - " \n", - " 4605\n", - " \n", - " 5\n", - " \n", - " 1\n", - "
\n", - " 10\n", - " \n", - " 1529\n", - " \n", - " 6\n", - " \n", - " 0\n", - "
\n", - " 10\n", - " \n", - " 21137\n", - " \n", - " 7\n", - " \n", - " 1\n", - "
\n", - " 10\n", - " \n", - " 22122\n", - " \n", - " 8\n", - " \n", - " 1\n", - "
\n", - " 10\n", - " \n", - " 34134\n", - " \n", - " 9\n", - " \n", - " 1\n", - "
\n", - " 10\n", - " \n", - " 27156\n", - " \n", - " 10\n", - " \n", - " 0\n", - "
\n", - " 10\n", - " \n", - " 14992\n", - " \n", - " 11\n", - " \n", - " 0\n", - "
\n", - " 10\n", - " \n", - " 49235\n", - " \n", - " 12\n", - " \n", - " 1\n", - "
\n", - " 10\n", - " \n", - " 26842\n", - " \n", - " 13\n", - " \n", - " 0\n", - "
\n", - " 10\n", - " \n", - " 3464\n", - " \n", - " 14\n", - " \n", - " 0\n", - "
\n", - " 10\n", - " \n", - " 25720\n", - " \n", - " 15\n", - " \n", - " 0\n", - "
\n", - " 11\n", - " \n", - " 30162\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 11\n", - " \n", - " 27085\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 11\n", - " \n", - " 5994\n", - " \n", - " 3\n", - " \n", - " 1\n", - "
\n", - " 11\n", - " \n", - " 1313\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 11\n", - " \n", - " 31506\n", - " \n", - " 5\n", - " \n", - " 1\n", - "
\n", - " 12\n", - " \n", - " 30597\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 12\n", - " \n", - " 15221\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 12\n", - " \n", - " 43772\n", - " \n", - " 3\n", - " \n", - " 1\n", - "
\n", - "\n", - "
Viewing 100 of 32434489 rows / 4 columns
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df.table()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Error while sending.\n", - "Traceback (most recent call last):\n", - " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 1145, in send_command\n", - " self.socket.sendall(command.encode(\"utf-8\"))\n", - "ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host\n", - "Exception while sending command.\n", - "Traceback (most recent call last):\n", - " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 1145, in send_command\n", - " self.socket.sendall(command.encode(\"utf-8\"))\n", - "ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 985, in send_command\n", - " response = connection.send_command(command)\n", - " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 1149, in send_command\n", - " \"Error while sending\", e, proto.ERROR_ON_SEND)\n", - "py4j.protocol.Py4JNetworkError: Error while sending\n", - "An error occurred while trying to connect to the Java server (127.0.0.1:50332)\n", - "Traceback (most recent call last):\n", - " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 1145, in send_command\n", - " self.socket.sendall(command.encode(\"utf-8\"))\n", - 
"ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 985, in send_command\n", - " response = connection.send_command(command)\n", - " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 1149, in send_command\n", - " \"Error while sending\", e, proto.ERROR_ON_SEND)\n", - "py4j.protocol.Py4JNetworkError: Error while sending\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 929, in _get_connection\n", - " connection = self.deque.pop()\n", - "IndexError: pop from an empty deque\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 1067, in start\n", - " self.socket.connect((self.address, self.port))\n", - "ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it\n" - ] - }, - { - "ename": "Py4JNetworkError", - "evalue": "An error occurred while trying to connect to the Java server (127.0.0.1:50332)", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mConnectionResetError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in 
\u001b[0;36msend_command\u001b[1;34m(self, command)\u001b[0m\n\u001b[0;32m 1144\u001b[0m \u001b[1;31m# if it sent a RST packet (SO_LINGER)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1145\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msendall\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"utf-8\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1146\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mConnectionResetError\u001b[0m: [WinError 10054] An existing connection was forcibly closed by the remote host", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[1;31mPy4JNetworkError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command, retry, binary)\u001b[0m\n\u001b[0;32m 984\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 985\u001b[1;33m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 986\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mbinary\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command)\u001b[0m\n\u001b[0;32m 1148\u001b[0m raise Py4JNetworkError(\n\u001b[1;32m-> 1149\u001b[1;33m \"Error 
while sending\", e, proto.ERROR_ON_SEND)\n\u001b[0m\u001b[0;32m 1150\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mPy4JNetworkError\u001b[0m: Error while sending", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_get_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 928\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 929\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdeque\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 930\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mIndexError\u001b[0m: pop from an empty deque", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[1;31mConnectionRefusedError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36mstart\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1066\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1067\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mport\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1068\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstream\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmakefile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"rb\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mConnectionRefusedError\u001b[0m: [WinError 10061] No connection could be made because the target machine actively refused it", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[1;31mPy4JNetworkError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mop\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprofiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"product_id\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\profiler\\profiler.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(self, df, columns, buckets)\u001b[0m\n\u001b[0;32m 343\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 344\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparse_columns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 345\u001b[1;33m \u001b[0moutput\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_json\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuckets\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 346\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 347\u001b[0m \u001b[1;31m# Load jinja\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - 
"\u001b[1;32m~\\Documents\\Optimus\\optimus\\profiler\\profiler.py\u001b[0m in \u001b[0;36mto_json\u001b[1;34m(df, columns, buckets)\u001b[0m\n\u001b[0;32m 402\u001b[0m \"\"\"\n\u001b[0;32m 403\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 404\u001b[1;33m \u001b[0moutput\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuckets\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 405\u001b[0m \u001b[0mdataset\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdataset_info\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 406\u001b[0m \u001b[0moutput\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"summary\"\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdataset\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\profiler\\profiler.py\u001b[0m in \u001b[0;36mcolumns\u001b[1;34m(df, columns, buckets)\u001b[0m\n\u001b[0;32m 163\u001b[0m \u001b[0mcolumn_info\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'columns'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 164\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 165\u001b[1;33m \u001b[0mrows_count\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 166\u001b[0m \u001b[0mcolumn_info\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'rows_count'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mrows_count\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 167\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - 
"\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pyspark\\sql\\dataframe.py\u001b[0m in \u001b[0;36mcount\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 453\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 454\u001b[0m \"\"\"\n\u001b[1;32m--> 455\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 456\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 457\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mignore_unicode_prefix\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 1253\u001b[0m \u001b[0mproto\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mEND_COMMAND_PART\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1254\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1255\u001b[1;33m \u001b[0manswer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1256\u001b[0m return_value = get_return_value(\n\u001b[0;32m 1257\u001b[0m answer, self.gateway_client, self.target_id, self.name)\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command, retry, binary)\u001b[0m\n\u001b[0;32m 998\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_should_retry\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mretry\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mconnection\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpne\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 999\u001b[0m \u001b[0mlogging\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Exception while sending command.\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mexc_info\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1000\u001b[1;33m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinary\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbinary\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1001\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1002\u001b[0m logging.exception(\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command, retry, binary)\u001b[0m\n\u001b[0;32m 981\u001b[0m \u001b[1;32mif\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mbinary\u001b[0m\u001b[0;31m`\u001b[0m \u001b[1;32mis\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 982\u001b[0m \"\"\"\n\u001b[1;32m--> 983\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 984\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 985\u001b[0m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_get_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 929\u001b[0m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdeque\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 930\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 931\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_create_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 932\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 933\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_create_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 935\u001b[0m connection = GatewayConnection(\n\u001b[0;32m 936\u001b[0m self.gateway_parameters, self.gateway_property)\n\u001b[1;32m--> 937\u001b[1;33m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 938\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 939\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in 
\u001b[0;36mstart\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1077\u001b[0m \u001b[1;34m\"server ({0}:{1})\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mport\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1078\u001b[0m \u001b[0mlogger\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexception\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1079\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mPy4JNetworkError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1080\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1081\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_authenticate_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mPy4JNetworkError\u001b[0m: An error occurred while trying to connect to the Java server (127.0.0.1:50332)" - ] - } - ], - "source": [ - "op.profiler.run(df, \"product_id\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 293ee80ef211dacdec997c553dd491a6d70507e5 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Thu, 30 Aug 2018 13:02:09 -0500 Subject: [PATCH 06/94] Work in progress --- examples/new-api-enrichment.ipynb | 861 ++++-------------------------- 1 file changed, 102 insertions(+), 759 deletions(-) diff --git a/examples/new-api-enrichment.ipynb b/examples/new-api-enrichment.ipynb index 2f57c5eb7..e6ef5f45e 100644 --- a/examples/new-api-enrichment.ipynb +++ b/examples/new-api-enrichment.ipynb @@ -4,17 +4,20 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "## 
Step for make it work\n", + "\n", "Install Redis for your OS\n", - "pip install redis\n", - "pip install python-redis-rate-limit # https://pypi.org/project/python-redis-rate-limit/\n", - "pip install sparkly\n", - "https://www.scivision.co/python-windows-visual-c++-14-required/\n", - "pip install sparkly[redis]" + "* pip install redis\n", + "* pip install python-redis-rate-limit # https://pypi.org/project/python-redis-rate-limit/\n", + "* pip install sparkly https://www.scivision.co/python-windows-visual-c++-14-required/\n", + "* pip install sparkly[redis]\n", + "* Install http://joeferner.github.io/redis-commander/ \n", + "Then `redis-commander --redis-host=localhost --redis-port=6379` from the browser http://localhost:8081/" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -24,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -34,34 +37,54 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ - "from optimus import Optimus" + "import redis\n", + "r = redis.StrictRedis(host='localhost', port=6379, db=0)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "op= Optimus()" + "from sparkly import SparklySession\n", + "spark = SparklySession()" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ - "df = op.load.url(\"https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.csv\")" + "# make some test data\n", + "columns = ['dogs', 'cats']\n", + "vals = [\n", + " (2, 0),\n", + " (0, 1),\n", + " (4, 1)\n", + "]\n", + "\n", + "# create DataFrame\n", + "df = spark.createDataFrame(vals, columns)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 46, + "metadata": {}, + 
"outputs": [], + "source": [ + "import optimus as Optimus" + ] + }, + { + "cell_type": "code", + "execution_count": 47, "metadata": {}, "outputs": [ { @@ -85,57 +108,27 @@ "\n", "\n", "\n", - "
Viewing 19 of 19 rows / 8 columns
\n", + "
Viewing 3 of 3 rows / 3 columns
\n", "\n", "\n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", @@ -147,71 +140,15 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", @@ -219,35 +156,15 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", @@ -259,571 +176,11 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", 
- " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", @@ -831,7 +188,7 @@ " \n", "
\n", - "
id
\n", - "
1 (int)
\n", - "\n", - "
\n", - "
firstName
\n", - "
2 (string)
\n", - "\n", - "
\n", - "
lastName
\n", - "
3 (string)
\n", - "\n", - "
\n", - "
billingId
\n", - "
4 (int)
\n", - "\n", - "
\n", - "
product
\n", - "
5 (string)
\n", - "\n", - "
\n", - "
price
\n", - "
6 (int)
\n", + "
dogs
\n", + "
1 (bigint)
\n", "\n", "
\n", - "
birth
\n", - "
7 (string)
\n", + "
cats
\n", + "
2 (bigint)
\n", "\n", "
\n", - "
dummyCol
\n", - "
8 (string)
\n", + "
id
\n", + "
3 (bigint)
\n", "\n", "
\n", - " 1\n", - " \n", - " Luis\n", - " \n", - " Alvarez$$%!\n", - " \n", - " 123\n", - " \n", - " Cake\n", - " \n", - " 10\n", - " \n", - " 1980/07/07\n", - " \n", - " never\n", - "
\n", " 2\n", " \n", - " André\n", - " \n", - " Ampère\n", - " \n", - " 423\n", - " \n", - " piza\n", - " \n", - " 8\n", + " 0\n", " \n", - " 1950/07/08\n", - " \n", - " gonna\n", + " 17179869184\n", "
\n", - " 3\n", - " \n", - " NiELS\n", - " \n", - " Böhr//((%%\n", - " \n", - " 551\n", + " 0\n", " \n", - " pizza\n", - " \n", - " 8\n", - " \n", - " 1990/07/09\n", + " 1\n", " \n", - " give\n", + " 42949672960\n", "
\n", - " PAUL\n", - " \n", - " dirac$\n", - " \n", - " 521\n", - " \n", - " pizza\n", - " \n", - " 8\n", - " \n", - " 1954/07/10\n", - " \n", - " you\n", - "
\n", - " 5\n", - " \n", - " Albert\n", - " \n", - " Einstein\n", - " \n", - " 634\n", - " \n", - " pizza\n", - " \n", - " 8\n", - " \n", - " 1990/07/11\n", - " \n", - " up\n", - "
\n", - " 6\n", - " \n", - " Galileo\n", - " \n", - " ⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱GALiLEI\n", - " \n", - " 672\n", - " \n", - " arepa\n", - " \n", - " 5\n", - " \n", - " 1930/08/12\n", - " \n", - " never\n", - "
\n", - " 7\n", - " \n", - " CaRL\n", - " \n", - " Ga%%%uss\n", - " \n", - " 323\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1970/07/13\n", - " \n", - " gonna\n", - "
\n", - " 8\n", - " \n", - " David\n", - " \n", - " H$$$ilbert\n", - " \n", - " 624\n", - " \n", - " taaaccoo\n", - " \n", - " 3\n", - " \n", - " 1950/07/14\n", + " 1\n", " \n", - " let\n", - "
\n", - " 9\n", - " \n", - " Johannes\n", - " \n", - " KEPLER\n", - " \n", - " 735\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1920/04/22\n", - " \n", - " you\n", - "
\n", - " 10\n", - " \n", - " JaMES\n", - " \n", - " M$$ax%%well\n", - " \n", - " 875\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1923/03/12\n", - " \n", - " down\n", - "
\n", - " 11\n", - " \n", - " Isaac\n", - " \n", - " Newton\n", - " \n", - " 992\n", - " \n", - " pasta\n", - " \n", - " 9\n", - " \n", - " 1999/02/15\n", - " \n", - " never⸱\n", - "
\n", - " 12\n", - " \n", - " Emmy%%\n", - " \n", - " Nöether$\n", - " \n", - " 234\n", - " \n", - " pasta\n", - " \n", - " 9\n", - " \n", - " 1993/12/08\n", - " \n", - " gonna\n", - "
\n", - " 13\n", - " \n", - " Max!!!\n", - " \n", - " Planck!!!\n", - " \n", - " 111\n", - " \n", - " hamburguer\n", - " \n", - " 4\n", - " \n", - " 1994/01/04\n", - " \n", - " run⸱\n", - "
\n", - " 14\n", - " \n", - " Fred\n", - " \n", - " Hoy&&&le\n", - " \n", - " 553\n", - " \n", - " pizzza\n", - " \n", - " 8\n", - " \n", - " 1997/06/27\n", - " \n", - " around\n", - "
\n", - " 15\n", - " \n", - " (((⸱⸱⸱Heinrich⸱)))))\n", - " \n", - " Hertz\n", - " \n", - " 116\n", - " \n", - " pizza\n", - " \n", - " 8\n", - " \n", - " 1956/11/30\n", - " \n", - " and\n", - "
\n", - " 16\n", - " \n", - " William\n", - " \n", - " Gilbert###\n", - " \n", - " 886\n", - " \n", - " BEER\n", - " \n", - " 2\n", - " \n", - " 1958/03/26\n", - " \n", - " desert\n", - "
\n", - " 17\n", - " \n", - " Marie\n", - " \n", - " CURIE\n", - " \n", - " 912\n", - " \n", - " Rice\n", - " \n", - " 1\n", - " \n", - " 2000/03/22\n", - " \n", - " you\n", - "
\n", - " 18\n", - " \n", - " Arthur\n", - " \n", - " COM%%%pton\n", - " \n", - " 812\n", - " \n", - " 110790\n", - " \n", - " 5\n", - " \n", - " 1899/01/01\n", - " \n", - " #\n", - "
\n", - " 19\n", - " \n", - " JAMES\n", - " \n", - " Chadwick\n", - " \n", - " 467\n", - " \n", - " null\n", - " \n", - " 10\n", - " \n", - " 1921/05/03\n", - " \n", - " #\n", + " 60129542144\n", "
\n", "\n", - "
Viewing 19 of 19 rows / 8 columns
\n" + "
Viewing 3 of 3 rows / 3 columns
\n" ], "text/plain": [ "" @@ -847,86 +204,84 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 44, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "import redis\n", - "r = redis.StrictRedis(host='localhost', port=6379, db=0)\n", - "# r.set({\"hola\":\"hola\"}, 'bar')\n", - "r.hmset(\"hola\",{\"1\":\"1\"})" + "from pyspark.sql import functions as F\n", + "\n", + "df = df.withColumn(\"id\", F.monotonically_increasing_id())" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 45, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "{b'1': b'1'}" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "+----+----+-----------+\n", + "|dogs|cats| id|\n", + "+----+----+-----------+\n", + "| 2| 0|17179869184|\n", + "| 0| 1|42949672960|\n", + "| 4| 1|60129542144|\n", + "+----+----+-----------+\n", + "\n" + ] } ], "source": [ - "r.hgetall('hola')" + "df.show()" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from sparkly import SparklySession\n", - "spark = SparklySession()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "# make some test data\n", - "columns = ['id', 'dogs', 'cats']\n", - "vals = [\n", - " (1, 2, 0),\n", - " (2, 0, 1)\n", - "]\n", - "\n", - "# create DataFrame\n", - "df = spark.createDataFrame(vals, columns)\n", - "\n", "df.write_ext.redis(\n", " host='localhost',\n", " port=6379,\n", - " key_by=['id', 'dogs'],\n", + " key_by=['id'],\n", " exclude_key_columns=True,\n", " expire=24 * 60 * 60,\n", - " compression='gzip',\n", + " #compression='gzip',\n", ")" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 
21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"userId\": 1,\n", + " \"id\": 1,\n", + " \"title\": \"sunt aut facere repellat provident occaecati excepturi optio reprehenderit\",\n", + " \"body\": \"quia et suscipit\\nsuscipit recusandae consequuntur expedita et cum\\nreprehenderit molestiae ut ut quas totam\\nnostrum rerum est autem sunt rem eveniet architecto\"\n", + "}\n" + ] + } + ], + "source": [ + "r.get(\"1\")\n", + "r.delete(\"1\")\n", + "import requests \n", + "result = requests.get(\"https://jsonplaceholder.typicode.com/posts/1\")\n", + "print(result.text)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -940,7 +295,7 @@ "source": [ "from redis_rate_limit import RateLimit, TooManyRequests\n", "try:\n", - " with RateLimit(resource='users_list', client='localhost', max_requests=10):\n", + " with RateLimit(resource='local', client='localhost', max_requests=10):\n", " result = '200 OK'\n", "except TooManyRequests:\n", " result = '429 Too Many Requests'\n", @@ -960,18 +315,6 @@ "display_name": "Python 3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" } }, "nbformat": 4, From f19392475c0f3cc12c660a0e3008adafa39e9065 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Thu, 30 Aug 2018 16:47:42 -0500 Subject: [PATCH 07/94] TestingCelery againg --- examples/new-api-enrichment.ipynb | 75 ++++++++++++++++++++++++++++++- optimus/enrichment/worker.py | 15 +++++++ 2 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 optimus/enrichment/worker.py diff --git a/examples/new-api-enrichment.ipynb b/examples/new-api-enrichment.ipynb index e6ef5f45e..55a1af001 100644 --- a/examples/new-api-enrichment.ipynb 
+++ b/examples/new-api-enrichment.ipynb @@ -239,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -302,6 +302,67 @@ "print(result)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Celery\n", + "* pip instal redis\n", + "* pip install celery\n", + "* pip install -U \"celery[redis]\"\n", + "* Run worker from python https://gist.github.com/chenjianjx/53d8c2317f6023dc2fa0" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "from optimus.enrichment.worker import download" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r = download.delay('http\"\\\\mood.com.ve')\n", + "r.ready()" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r.ready()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -315,6 +376,18 @@ "display_name": "Python 3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" } }, "nbformat": 4, diff --git a/optimus/enrichment/worker.py b/optimus/enrichment/worker.py new file mode 100644 index 000000000..e8a79e02f --- /dev/null +++ b/optimus/enrichment/worker.py @@ -0,0 +1,15 @@ +from celery import Celery +import requests +import os + +# Create the app and set the broker location (RabbitMQ) +app = Celery('worker', + backend='rpc://', + 
broker='redis://localhost:6379') + + +@app.task +def download(url): + response = requests.get(url) + data = response.text() + print(data) \ No newline at end of file From afb9acf7072d489b4d3c237c298f813e3bc86932 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Fri, 31 Aug 2018 18:19:23 -0500 Subject: [PATCH 08/94] Downgrade to 0.12.2 because of jupyter notebook problems --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 04f530ce5..3482ca0b9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,5 +20,5 @@ pillow==5.2.0 pygments>=2.2.0 six>=1.10.0 h5py>=2.7.0 -flask==1.0.2 +flask==0.12.2 ipython==6.5.0 \ No newline at end of file From 9e8a5ec1bf2840a5ee3fef4b2ea718a1c2540505 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Fri, 31 Aug 2018 18:20:17 -0500 Subject: [PATCH 09/94] Test to encapsulate Optimus Server in a class --- examples/config.ini | 3 + examples/new-api-sandbox.ipynb | 3374 ++------------------------------ optimus/server.py | 88 +- 3 files changed, 184 insertions(+), 3281 deletions(-) diff --git a/examples/config.ini b/examples/config.ini index 4360b43b8..771c080e4 100644 --- a/examples/config.ini +++ b/examples/config.ini @@ -1,2 +1,5 @@ [PROFILER] Output = ../data.json + +[SERVER] +Input = ../data.json \ No newline at end of file diff --git a/examples/new-api-sandbox.ipynb b/examples/new-api-sandbox.ipynb index 8fcfe267e..872a71fa2 100644 --- a/examples/new-api-sandbox.ipynb +++ b/examples/new-api-sandbox.ipynb @@ -33,20 +33,132 @@ "cell_type": "code", "execution_count": 4, "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Just check that Spark and all necessary environments vars are present...\n", + "-----\n", + "SPARK_HOME=C:\\opt\\spark\\spark-2.3.1-bin-hadoop2.7\n", + "HADOOP_HOME=C:\\opt\\spark\\spark-2.3.1-bin-hadoop2.7\n", + "You don't have PYSPARK_PYTHON set\n", + "You don't have PYSPARK_DRIVER_PYTHON set\n", + 
"JAVA_HOME=C:\\Program Files\\Java\\jdk1.8.0_181\n", + "Pyarrow Installed\n", + "-----\n", + "Starting or getting SparkSession and SparkContext...\n", + "\n", + " ____ __ _ \n", + " / __ \\____ / /_(_)___ ___ __ _______\n", + " / / / / __ \\/ __/ / __ `__ \\/ / / / ___/\n", + " / /_/ / /_/ / /_/ / / / / / / /_/ (__ ) \n", + " \\____/ .___/\\__/_/_/ /_/ /_/\\__,_/____/ \n", + " /_/ \n", + " \n", + "Transform and Roll out...\n", + "Optimus successfully imported. Have fun :).\n" + ] + } + ], + "source": [ + "op= Optimus(verbose=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from optimus.server import Server\n", + "s = Server()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)\n" + ] + } + ], + "source": [ + "s.start()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, "outputs": [], "source": [ - "op= Optimus()" + "# https://stackoverflow.com/questions/89228/calling-an-external-command-in-python/92395#92395\n", + "import subprocess\n", + "return_code = subprocess.call(\"echo Hello World\", shell=True) " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + } + ], + "source": [ + "print(return_code)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = op.load.url(\"https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/foo.csv\")" ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'terminal' is not defined", + "output_type": "error", + "traceback": [ + 
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mterminal\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mNameError\u001b[0m: name 'terminal' is not defined" + ] + } + ], + "source": [ + "terminal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install flask=0.12.2" + ] + }, { "cell_type": "code", "execution_count": 12, @@ -826,3264 +938,6 @@ "df.table()" ] }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| id| firstName| lastName|billingId| product|price| birth|dummyCol|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| 1| Luis| Alvarez$$%!| 123| Cake| 10|1980/07/07| never|\n", - "| 2| André| Ampère| 423| piza| 8|1950/07/08| gonna|\n", - "| 3| NiELS| Böhr//((%%| 551| pizza| 8|1990/07/09| give|\n", - "| 4| PAUL| dirac$| 521| pizza| 8|1954/07/10| you|\n", - "| 5| Albert| Einstein| 634| pizza| 8|1990/07/11| up|\n", - "| 6| Galileo| GALiLEI| 672| arepa| 5|1930/08/12| never|\n", - "| 7| CaRL| Ga%%%uss| 323| taco| 3|1970/07/13| gonna|\n", - "| 8| David| H$$$ilbert| 624| taaaccoo| 3|1950/07/14| let|\n", - "| 9| Johannes| KEPLER| 735| taco| 3|1920/04/22| you|\n", - "| 10| JaMES| M$$ax%%well| 875| taco| 3|1923/03/12| down|\n", - "| 11| Isaac| Newton| 992| pasta| 9|1999/02/15| never |\n", - "| 12| Emmy%%| Nöether$| 234| pasta| 9|1993/12/08| gonna|\n", - "| 13| Max!!!| Planck!!!| 111|hamburguer| 4|1994/01/04| run |\n", - "| 14| Fred| Hoy&&&le| 553| pizzza| 8|1997/06/27| around|\n", - "| 15|((( 
Heinrich )))))| Hertz| 116| pizza| 8|1956/11/30| and|\n", - "| 16| William| Gilbert###| 886| BEER| 2|1958/03/26| desert|\n", - "| 17| Marie| CURIE| 912| Rice| 1|2000/03/22| you|\n", - "| 18| Arthur| COM%%%pton| 812| 110790| 5|1899/01/01| #|\n", - "| 19| JAMES| Chadwick| 467| null| 10|1921/05/03| #|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "\n" - ] - } - ], - "source": [ - "df.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---+---------+--------+---------+-----------------+-----+----------+----------+-------------+\n", - "| id|firstname|lastname|billingid| product|price| birth| new_date|years_between|\n", - "+---+---------+--------+---------+-----------------+-----+----------+----------+-------------+\n", - "| 10| james| maxwell| 875| taco| 3|1923/03/12|12-03-1923| 95.4355|\n", - "| 11| isaac| newton| 992| pasta| 9|1999/02/15|15-02-1999| 19.5108|\n", - "| 12| emmy| noether| 234| pasta| 9|1993/12/08|08-12-1993| 24.6962|\n", - "| 13| max| planck| 111| hamburguer| 4|1994/01/04|04-01-1994| 24.6237|\n", - "| 14| fred| hoyle| 553| pizza| 8|1997/06/27|27-06-1997| 21.1452|\n", - "| 15| heinrich| hertz| 116| pizza| 8|1956/11/30|30-11-1956| 61.7204|\n", - "| 16| william| gilbert| 886| BEER| 2|1958/03/26|26-03-1958| 60.3978|\n", - "| 17| marie| curie| 912| Rice| 1|2000/03/22|22-03-2000| 18.4086|\n", - "| 18| arthur| compton| 812|this was a number| 5|1899/01/01|01-01-1899| 119.6317|\n", - "| 19| james|chadwick| 467| null| 10|1921/05/03|03-05-1921| 97.293|\n", - "| 7| carl| gauss| 323| taco| 3|1970/07/13|13-07-1970| 48.0995|\n", - "| 8| david| hilbert| 624| taco| 3|1950/07/14|14-07-1950| 68.0968|\n", - "| 9| johannes| kepler| 735| taco| 3|1920/04/22|22-04-1920| 98.3253|\n", - "+---+---------+--------+---------+-----------------+-----+----------+----------+-------------+\n", - "\n" - ] - } - ], 
- "source": [ - "# This is a custom function\n", - "def num_to_string(value, arg):\n", - " return \"this was a number\"\n", - " \n", - "df\\\n", - " .rows.sort(\"product\",\"desc\")\\\n", - " .cols.lower([\"firstName\",\"lastName\"])\\\n", - " .cols.date_transform(\"birth\", \"new_date\", \"yyyy/MM/dd\", \"dd-MM-YYYY\")\\\n", - " .cols.years_between(\"birth\", \"years_between\", \"yyyy/MM/dd\")\\\n", - " .cols.remove_accents(\"lastName\")\\\n", - " .cols.remove_special_chars([\"firstName\",\"lastName\"])\\\n", - " .cols.replace(\"product\",\"taaaccoo\",\"taco\")\\\n", - " .cols.replace(\"product\",[\"piza\",\"pizzza\"],\"pizza\")\\\n", - " .rows.drop(df[\"id\"]<7)\\\n", - " .cols.drop(\"dummyCol\")\\\n", - " .cols.rename(str.lower)\\\n", - " .cols.apply_by_dtypes(\"product\",num_to_string,\"string\", data_type=\"integer\")\\\n", - " .cols.trim(\"*\")\\\n", - " .rows.sort(\"id\")\\\n", - " .show()" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| id| firstName| lastName|billingId| product|price| birth|dummyCol|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| 1| Luis| Alvarez$$%!| 123| Cake| 10|1980/07/07| never|\n", - "| 2| André| Ampère| 423| piza| 8|1950/07/08| gonna|\n", - "| 3| NiELS| Böhr//((%%| 551| pizza| 8|1990/07/09| give|\n", - "| 4| PAUL| dirac$| 521| pizza| 8|1954/07/10| you|\n", - "| 5| Albert| Einstein| 634| pizza| 8|1990/07/11| up|\n", - "| 6| Galileo| GALiLEI| 672| arepa| 5|1930/08/12| never|\n", - "| 7| CaRL| Ga%%%uss| 323| taco| 3|1970/07/13| gonna|\n", - "| 8| David| H$$$ilbert| 624| taaaccoo| 3|1950/07/14| let|\n", - "| 9| Johannes| KEPLER| 735| taco| 3|1920/04/22| you|\n", - "| 10| JaMES| M$$ax%%well| 875| taco| 3|1923/03/12| down|\n", - "| 11| Isaac| Newton| 992| pasta| 
9|1999/02/15| never |\n", - "| 12| Emmy%%| Nöether$| 234| pasta| 9|1993/12/08| gonna|\n", - "| 13| Max!!!| Planck!!!| 111|hamburguer| 4|1994/01/04| run |\n", - "| 14| Fred| Hoy&&&le| 553| pizzza| 8|1997/06/27| around|\n", - "| 15|((( Heinrich )))))| Hertz| 116| pizza| 8|1956/11/30| and|\n", - "| 16| William| Gilbert###| 886| BEER| 2|1958/03/26| desert|\n", - "| 17| Marie| CURIE| 912| Rice| 1|2000/03/22| you|\n", - "| 18| Arthur| COM%%%pton| 812| 110790| 5|1899/01/01| #|\n", - "| 19| JAMES| Chadwick| 467| null| 10|1921/05/03| #|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| id| firstName| lastName|billingid| product|price| birth|dummyCol|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| 1| Luis| Alvarez$$%!| 126| Cake| 10|1980/07/07| never|\n", - "| 2| André| Ampère| 426| piza| 8|1950/07/08| gonna|\n", - "| 3| NiELS| Böhr//((%%| 554| pizza| 8|1990/07/09| give|\n", - "| 4| PAUL| dirac$| 524| pizza| 8|1954/07/10| you|\n", - "| 5| Albert| Einstein| 637| pizza| 8|1990/07/11| up|\n", - "| 6| Galileo| GALiLEI| 675| arepa| 5|1930/08/12| never|\n", - "| 7| CaRL| Ga%%%uss| 326| taco| 3|1970/07/13| gonna|\n", - "| 8| David| H$$$ilbert| 627| taaaccoo| 3|1950/07/14| let|\n", - "| 9| Johannes| KEPLER| 738| taco| 3|1920/04/22| you|\n", - "| 10| JaMES| M$$ax%%well| 878| taco| 3|1923/03/12| down|\n", - "| 11| Isaac| Newton| 995| pasta| 9|1999/02/15| never |\n", - "| 12| Emmy%%| Nöether$| 237| pasta| 9|1993/12/08| gonna|\n", - "| 13| Max!!!| Planck!!!| 114|hamburguer| 4|1994/01/04| run |\n", - "| 14| Fred| Hoy&&&le| 556| pizzza| 8|1997/06/27| around|\n", - "| 15|((( Heinrich )))))| Hertz| 119| pizza| 8|1956/11/30| and|\n", - "| 16| William| Gilbert###| 889| BEER| 2|1958/03/26| desert|\n", - "| 17| Marie| CURIE| 915| Rice| 1|2000/03/22| you|\n", - "| 
18| Arthur| COM%%%pton| 815| 110790| 5|1899/01/01| #|\n", - "| 19| JAMES| Chadwick| 470| null| 10|1921/05/03| #|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "\n" - ] - } - ], - "source": [ - "df.show()\n", - "def func(value, args):\n", - " return value +args[0] + args[1]\n", - "\n", - "\n", - "df.cols.apply(\"billingid\",func,\"int\", [1,2]).show()\n", - "\n", - "#df.cols.apply([\"num\", \"new_col_1\"], func, \"int\", 32 ,\"udf\").table()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| id| firstName| lastName|billingid| product|price| birth|dummyCol|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| 1| Luis| Alvarez$$%!| 6.15| Cake| 10|1980/07/07| never|\n", - "| 2| André| Ampère| 21.15| piza| 8|1950/07/08| gonna|\n", - "| 3| NiELS| Böhr//((%%| 27.55| pizza| 8|1990/07/09| give|\n", - "| 4| PAUL| dirac$| 26.05| pizza| 8|1954/07/10| you|\n", - "| 5| Albert| Einstein| 31.7| pizza| 8|1990/07/11| up|\n", - "| 6| Galileo| GALiLEI| 33.6| arepa| 5|1930/08/12| never|\n", - "| 7| CaRL| Ga%%%uss| 16.15| taco| 3|1970/07/13| gonna|\n", - "| 8| David| H$$$ilbert| 31.2| taaaccoo| 3|1950/07/14| let|\n", - "| 9| Johannes| KEPLER| 36.75| taco| 3|1920/04/22| you|\n", - "| 10| JaMES| M$$ax%%well| 43.75| taco| 3|1923/03/12| down|\n", - "| 11| Isaac| Newton| 49.6| pasta| 9|1999/02/15| never |\n", - "| 12| Emmy%%| Nöether$| 11.7| pasta| 9|1993/12/08| gonna|\n", - "| 13| Max!!!| Planck!!!| 5.55|hamburguer| 4|1994/01/04| run |\n", - "| 14| Fred| Hoy&&&le| 27.65| pizzza| 8|1997/06/27| around|\n", - "| 15|((( Heinrich )))))| Hertz| 5.8| pizza| 8|1956/11/30| and|\n", - "| 16| William| Gilbert###| 44.3| BEER| 2|1958/03/26| desert|\n", - "| 17| Marie| CURIE| 
45.6| Rice| 1|2000/03/22| you|\n", - "| 18| Arthur| COM%%%pton| 40.6| 110790| 5|1899/01/01| #|\n", - "| 19| JAMES| Chadwick| 23.35| null| 10|1921/05/03| #|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "\n" - ] - } - ], - "source": [ - "from pyspark.sql import functions as F\n", - "\n", - "def func(col_name, args):\n", - " return F.col(col_name)/20\n", - "\n", - "df.cols.apply_expr(\"billingid\", func, 20).show()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Row(double=1, single=1),\n", - " Row(double=4, single=2),\n", - " Row(double=9, single=3),\n", - " Row(double=16, single=4),\n", - " Row(double=25, single=5)]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pyspark.sql import SparkSession\n", - "\n", - "spark = SparkSession \\\n", - " .builder \\\n", - " .appName(\"Python Spark SQL basic example\") \\\n", - " .config(\"spark.some.config.option\", \"some-value\") \\\n", - " .getOrCreate()\n", - "\n", - "sc = spark.sparkContext\n", - "\n", - "spark.createDataFrame(sc.parallelize(range(1, 6)).map(lambda i: Row(single=i, double=i ** 2))).collect()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cheat Sheet" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "df = op.create.df(\n", - " [\n", - " (\"words\", \"str\", True),\n", - " (\"num\", \"int\", True),\n", - " ],\n", - " [\n", - " (\" I like fish \", 1),\n", - " (\" zombies\", 2, ),\n", - " (\"simpsons cat lady\", 2),\n", - " (None, 3)\n", - " ])" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
Viewing 9 of 9
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
id
\n", - "
(int)
\n", - "\n", - "
\n", - "
firstName
\n", - "
(string)
\n", - "\n", - "
\n", - "
lastName
\n", - "
(string)
\n", - "\n", - "
\n", - "
billingId
\n", - "
(int)
\n", - "\n", - "
\n", - "
product
\n", - "
(string)
\n", - "\n", - "
\n", - "
price
\n", - "
(int)
\n", - "\n", - "
\n", - "
birth
\n", - "
(string)
\n", - "\n", - "
\n", - "
dummyCol
\n", - "
(string)
\n", - "\n", - "
\n", - " 2\n", - " \n", - " André\n", - " \n", - " Ampère\n", - " \n", - " 423\n", - " \n", - " piza\n", - " \n", - " 8\n", - " \n", - " 1950/07/08\n", - " \n", - " gonna\n", - "
\n", - " 7\n", - " \n", - " CaRL\n", - " \n", - " Ga%%%uss\n", - " \n", - " 323\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1970/07/13\n", - " \n", - " gonna\n", - "
\n", - " 9\n", - " \n", - " Johannes\n", - " \n", - " KEPLER\n", - " \n", - " 735\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1920/04/22\n", - " \n", - " you\n", - "
\n", - " 10\n", - " \n", - " JaMES\n", - " \n", - " M$$ax%%well\n", - " \n", - " 875\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1923/03/12\n", - " \n", - " down\n", - "
\n", - " 11\n", - " \n", - " Isaac\n", - " \n", - " Newton\n", - " \n", - " 992\n", - " \n", - " pasta\n", - " \n", - " 9\n", - " \n", - " 1999/02/15\n", - " \n", - " never⸱\n", - "
\n", - " 14\n", - " \n", - " Fred\n", - " \n", - " Hoy&&&le\n", - " \n", - " 553\n", - " \n", - " pizzza\n", - " \n", - " 8\n", - " \n", - " 1997/06/27\n", - " \n", - " around\n", - "
\n", - " 16\n", - " \n", - " William\n", - " \n", - " Gilbert###\n", - " \n", - " 886\n", - " \n", - " BEER\n", - " \n", - " 2\n", - " \n", - " 1958/03/26\n", - " \n", - " desert\n", - "
\n", - " 17\n", - " \n", - " Marie\n", - " \n", - " CURIE\n", - " \n", - " 912\n", - " \n", - " Rice\n", - " \n", - " 1\n", - " \n", - " 2000/03/22\n", - " \n", - " you\n", - "
\n", - " 18\n", - " \n", - " Arthur\n", - " \n", - " COM%%%pton\n", - " \n", - " 812\n", - " \n", - " 110790\n", - " \n", - " 5\n", - " \n", - " 1899/01/01\n", - " \n", - " #\n", - "
\n", - "
Viewing 9 of 9
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df.sample_by(10, False).table()" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------+--------------------+---------+\n", - "| firstName| lastName|billingId|\n", - "+--------------------+--------------------+---------+\n", - "| Luis| Alvarez$$%!| 123|\n", - "| André| Ampère| 423|\n", - "| NiELS| Böhr//((%%| 551|\n", - "| PAUL| dirac$| 521|\n", - "| Albert| Einstein| 634|\n", - "| Galileo| GALiLEI| 672|\n", - "| CaRL| Ga%%%uss| 323|\n", - "| David| H$$$ilbert| 624|\n", - "| Johannes| KEPLER| 735|\n", - "| JaMES| M$$ax%%well| 875|\n", - "| Isaac| Newton| 992|\n", - "| Emmy%%| Nöether$| 234|\n", - "| Max!!!| Planck!!!| 111|\n", - "| Fred| Hoy&&&le| 553|\n", - "|((( Heinrich )))))| Hertz| 116|\n", - "| William| Gilbert###| 886|\n", - "| Marie| CURIE| 912|\n", - "| Arthur| COM%%%pton| 812|\n", - "| JAMES| Chadwick| 467|\n", - "+--------------------+--------------------+---------+\n", - "\n" - ] - } - ], - "source": [ - "df.cols.select([\"firstName\",2,3]).show()" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "value = [\"1\"]\n", - "bool(value) and isinstance(value, list) and all(isinstance(elem, (int, str)) for elem in value)" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "DataFrame[id: int, firstName: string, lastName: string, billingId: int, product: string, price: int, birth: string, dummyCol: string]" - ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - 
"execution_count": 85, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
Viewing 19 of 19
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - "
\n", - "
id
\n", - "
(int)
\n", - "\n", - "
\n", - "
firstName
\n", - "
(string)
\n", - "\n", - "
\n", - "
lastName
\n", - "
(string)
\n", - "\n", - "
\n", - "
billingId
\n", - "
(int)
\n", - "\n", - "
\n", - "
product
\n", - "
(string)
\n", - "\n", - "
\n", - "
price
\n", - "
(int)
\n", - "\n", - "
\n", - "
birth
\n", - "
(string)
\n", - "\n", - "
\n", - "
dummyCol
\n", - "
(string)
\n", - "\n", - "
\n", - "
b
\n", - "
(double)
\n", - "\n", - "
\n", - " 1\n", - " \n", - " Luis\n", - " \n", - " Alvarez$$%!\n", - " \n", - " 123\n", - " \n", - " Cake\n", - " \n", - " 10\n", - " \n", - " 1980/07/07\n", - " \n", - " never\n", - " \n", - " 0.0\n", - "
\n", - " 2\n", - " \n", - " André\n", - " \n", - " Ampère\n", - " \n", - " 423\n", - " \n", - " piza\n", - " \n", - " 8\n", - " \n", - " 1950/07/08\n", - " \n", - " gonna\n", - " \n", - " 0.0\n", - "
\n", - " 3\n", - " \n", - " NiELS\n", - " \n", - " Böhr//((%%\n", - " \n", - " 551\n", - " \n", - " pizza\n", - " \n", - " 8\n", - " \n", - " 1990/07/09\n", - " \n", - " give\n", - " \n", - " 1.0\n", - "
\n", - " 4\n", - " \n", - " PAUL\n", - " \n", - " dirac$\n", - " \n", - " 521\n", - " \n", - " pizza\n", - " \n", - " 8\n", - " \n", - " 1954/07/10\n", - " \n", - " you\n", - " \n", - " 1.0\n", - "
\n", - " 5\n", - " \n", - " Albert\n", - " \n", - " Einstein\n", - " \n", - " 634\n", - " \n", - " pizza\n", - " \n", - " 8\n", - " \n", - " 1990/07/11\n", - " \n", - " up\n", - " \n", - " 1.0\n", - "
\n", - " 6\n", - " \n", - " Galileo\n", - " \n", - " ⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱GALiLEI\n", - " \n", - " 672\n", - " \n", - " arepa\n", - " \n", - " 5\n", - " \n", - " 1930/08/12\n", - " \n", - " never\n", - " \n", - " 2.0\n", - "
\n", - " 7\n", - " \n", - " CaRL\n", - " \n", - " Ga%%%uss\n", - " \n", - " 323\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1970/07/13\n", - " \n", - " gonna\n", - " \n", - " 0.0\n", - "
\n", - " 8\n", - " \n", - " David\n", - " \n", - " H$$$ilbert\n", - " \n", - " 624\n", - " \n", - " taaaccoo\n", - " \n", - " 3\n", - " \n", - " 1950/07/14\n", - " \n", - " let\n", - " \n", - " 1.0\n", - "
\n", - " 9\n", - " \n", - " Johannes\n", - " \n", - " KEPLER\n", - " \n", - " 735\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1920/04/22\n", - " \n", - " you\n", - " \n", - " 2.0\n", - "
\n", - " 10\n", - " \n", - " JaMES\n", - " \n", - " M$$ax%%well\n", - " \n", - " 875\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1923/03/12\n", - " \n", - " down\n", - " \n", - " 2.0\n", - "
\n", - " 11\n", - " \n", - " Isaac\n", - " \n", - " Newton\n", - " \n", - " 992\n", - " \n", - " pasta\n", - " \n", - " 9\n", - " \n", - " 1999/02/15\n", - " \n", - " never⸱\n", - " \n", - " 2.0\n", - "
\n", - " 12\n", - " \n", - " Emmy%%\n", - " \n", - " Nöether$\n", - " \n", - " 234\n", - " \n", - " pasta\n", - " \n", - " 9\n", - " \n", - " 1993/12/08\n", - " \n", - " gonna\n", - " \n", - " 0.0\n", - "
\n", - " 13\n", - " \n", - " Max!!!\n", - " \n", - " Planck!!!\n", - " \n", - " 111\n", - " \n", - " hamburguer\n", - " \n", - " 4\n", - " \n", - " 1994/01/04\n", - " \n", - " run⸱\n", - " \n", - " 0.0\n", - "
\n", - " 14\n", - " \n", - " Fred\n", - " \n", - " Hoy&&&le\n", - " \n", - " 553\n", - " \n", - " pizzza\n", - " \n", - " 8\n", - " \n", - " 1997/06/27\n", - " \n", - " around\n", - " \n", - " 1.0\n", - "
\n", - " 15\n", - " \n", - " (((⸱⸱⸱Heinrich⸱)))))\n", - " \n", - " Hertz\n", - " \n", - " 116\n", - " \n", - " pizza\n", - " \n", - " 8\n", - " \n", - " 1956/11/30\n", - " \n", - " and\n", - " \n", - " 0.0\n", - "
\n", - " 16\n", - " \n", - " William\n", - " \n", - " Gilbert###\n", - " \n", - " 886\n", - " \n", - " BEER\n", - " \n", - " 2\n", - " \n", - " 1958/03/26\n", - " \n", - " desert\n", - " \n", - " 2.0\n", - "
\n", - " 17\n", - " \n", - " Marie\n", - " \n", - " CURIE\n", - " \n", - " 912\n", - " \n", - " Rice\n", - " \n", - " 1\n", - " \n", - " 2000/03/22\n", - " \n", - " you\n", - " \n", - " 2.0\n", - "
\n", - " 18\n", - " \n", - " Arthur\n", - " \n", - " COM%%%pton\n", - " \n", - " 812\n", - " \n", - " 110790\n", - " \n", - " 5\n", - " \n", - " 1899/01/01\n", - " \n", - " #\n", - " \n", - " 2.0\n", - "
\n", - " 19\n", - " \n", - " JAMES\n", - " \n", - " Chadwick\n", - " \n", - " 467\n", - " \n", - " null\n", - " \n", - " 10\n", - " \n", - " 1921/05/03\n", - " \n", - " #\n", - " \n", - " 1.0\n", - "
\n", - "
Viewing 19 of 19
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from pyspark.sql import functions as F\n", - "c = \"firstName\"\n", - "df.cols.qcut(\"billingId\",\"b\",3).table()" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| id| firstName| lastName|billingId| product|price| birth|dummyCol|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| 1| Luis| Alvarez$$%!| 200| Cake| 10|1980/07/07| never|\n", - "| 2| André| Ampère| 400| piza| 8|1950/07/08| gonna|\n", - "| 3| NiELS| Böhr//((%%| 400| pizza| 8|1990/07/09| give|\n", - "| 4| PAUL| dirac$| 400| pizza| 8|1954/07/10| you|\n", - "| 5| Albert| Einstein| 400| pizza| 8|1990/07/11| up|\n", - "| 6| Galileo| GALiLEI| 400| arepa| 5|1930/08/12| never|\n", - "| 7| CaRL| Ga%%%uss| 323| taco| 3|1970/07/13| gonna|\n", - "| 8| David| H$$$ilbert| 400| taaaccoo| 3|1950/07/14| let|\n", - "| 9| Johannes| KEPLER| 400| taco| 3|1920/04/22| you|\n", - "| 10| JaMES| M$$ax%%well| 400| taco| 3|1923/03/12| down|\n", - "| 11| Isaac| Newton| 400| pasta| 9|1999/02/15| never |\n", - "| 12| Emmy%%| Nöether$| 234| pasta| 9|1993/12/08| gonna|\n", - "| 13| Max!!!| Planck!!!| 200|hamburguer| 4|1994/01/04| run |\n", - "| 14| Fred| Hoy&&&le| 400| pizzza| 8|1997/06/27| around|\n", - "| 15|((( Heinrich )))))| Hertz| 200| pizza| 8|1956/11/30| and|\n", - "| 16| William| Gilbert###| 400| BEER| 2|1958/03/26| desert|\n", - "| 17| Marie| CURIE| 400| Rice| 1|2000/03/22| you|\n", - "| 18| Arthur| COM%%%pton| 400| 110790| 5|1899/01/01| #|\n", - "| 19| JAMES| Chadwick| 400| null| 10|1921/05/03| #|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "\n" - ] - } - ], - "source": [ - 
"df.cols.clip(\"billingId\",200, 400).show()" - ] - }, - { - "cell_type": "code", - "execution_count": 89, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| id| firstName| lastName|billingId| product|price| birth|dummyCol|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| 1| Luis| Alvarez$$%!| 123| Cake| 10|1980/07/07| never|\n", - "| 2| André| Ampère| 423| piza| 8|1950/07/08| gonna|\n", - "| 3| NiELS| Böhr//((%%| 551| pizza| 8|1990/07/09| give|\n", - "| 4| PAUL| dirac$| 521| pizza| 8|1954/07/10| you|\n", - "| 5| Albert| Einstein| 634| pizza| 8|1990/07/11| up|\n", - "| 6| Galileo| GALiLEI| 672| arepa| 5|1930/08/12| never|\n", - "| 7| CaRL| Ga%%%uss| 323| taco| 3|1970/07/13| gonna|\n", - "| 8| David| H$$$ilbert| 624| taaaccoo| 3|1950/07/14| let|\n", - "| 9| Johannes| KEPLER| 735| taco| 3|1920/04/22| you|\n", - "| 10| JaMES| M$$ax%%well| 875| taco| 3|1923/03/12| down|\n", - "| 11| Isaac| Newton| 992| pasta| 9|1999/02/15| never |\n", - "| 12| Emmy%%| Nöether$| 234| pasta| 9|1993/12/08| gonna|\n", - "| 13| Max!!!| Planck!!!| 111|hamburguer| 4|1994/01/04| run |\n", - "| 14| Fred| Hoy&&&le| 553| pizzza| 8|1997/06/27| around|\n", - "| 15|((( Heinrich )))))| Hertz| 116| pizza| 8|1956/11/30| and|\n", - "| 16| William| Gilbert###| 886| BEER| 2|1958/03/26| desert|\n", - "| 17| Marie| CURIE| 912| Rice| 1|2000/03/22| you|\n", - "| 18| Arthur| COM%%%pton| 812| 110790| 5|1899/01/01| #|\n", - "| 19| JAMES| Chadwick| 467| null| 10|1921/05/03| #|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "\n" - ] - } - ], - "source": [ - "df.cols.abs(\"billingId\").show()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
Viewing 100 of 569
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
label
\n", - "
1 (double)
\n", - "\n", - "
\n", - "
prediction
\n", - "
2 (double)
\n", - "\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - "
Viewing 100 of 569
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from optimus import Optimus\n", - "\n", - "op = Optimus()\n", - "df_cancer =op.load.url(\"https://raw.githubusercontent.com/ironmussa/Optimus/master/tests/data_cancer.csv\")\n", - "\n", - "columns = ['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',\n", - " 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean',\n", - " 'fractal_dimension_mean']\n", - "\n", - "df_model, rf_model = op.ml.random_forest(df_cancer, columns, \"diagnosis\")\n", - "df_model.cols.select([\"label\",\"prediction\"]).table()" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "value = [{'name': 'Asuka 881627', 'id': '4336', 'nametype': 'Valid', 'recclass': 'L6', 'mass (g)': '42.68', 'fall': 'Found', 'year': '01/01/1988 12:00:00 AM', 'reclat': '-72.000000', 'reclong': '26.000000', 'GeoLocation': '(-72.000000, 26.000000)'}, {'name': 'Dhofar 1401', 'id': '35491', 'nametype': 'Valid', 'recclass': 'LL~6', 'mass (g)': '42.03', 'fall': 'Found', 'year': '01/01/2001 12:00:00 AM', 'reclat': '19.191350', 'reclong': '54.655450', 'GeoLocation': '(19.191350, 54.655450)'}, {'name': 'Elephant Moraine 87745', 'id': '8295', 'nametype': 'Valid', 'recclass': 'H5', 'mass (g)': '123', 'fall': 'Found', 'year': '01/01/1987 12:00:00 AM', 'reclat': '-76.183330', 'reclong': '157.166670', 'GeoLocation': '(-76.183330, 157.166670)'}, {'name': 'Frontier Mountain 90153', 'id': '10545', 'nametype': 'Valid', 'recclass': 'H4-6', 'mass (g)': '8.5', 'fall': 'Found', 'year': '01/01/1990 12:00:00 AM', 'reclat': '-72.954040', 'reclong': '160.538110', 'GeoLocation': '(-72.954040, 160.538110)'}, {'name': 'Larkman Nunatak 06750', 'id': '48858', 'nametype': 'Valid', 'recclass': 'L5', 'mass (g)': '66.8', 'fall': 'Found', 'year': '01/01/2006 12:00:00 AM', 'reclat': None, 'reclong': None, 
'GeoLocation': None}, {'name': 'MacAlpine Hills 02539', 'id': '14845', 'nametype': 'Valid', 'recclass': 'LL6', 'mass (g)': '96.6', 'fall': 'Found', 'year': '01/01/2002 12:00:00 AM', 'reclat': '-84.216670', 'reclong': '160.500000', 'GeoLocation': '(-84.216670, 160.500000)'}, {'name': 'Miller Range 07124', 'id': '51180', 'nametype': 'Valid', 'recclass': 'L5', 'mass (g)': '378.6', 'fall': 'Found', 'year': '01/01/2007 12:00:00 AM', 'reclat': '0.000000', 'reclong': '0.000000', 'GeoLocation': '(0.000000, 0.000000)'}, {'name': 'Northwest Africa 4032', 'id': '34304', 'nametype': 'Valid', 'recclass': 'Eucrite-pmict', 'mass (g)': '10.5', 'fall': 'Found', 'year': '01/01/2004 12:00:00 AM', 'reclat': None, 'reclong': None, 'GeoLocation': None}, {'name': 'Northwest Africa 5953', 'id': '50839', 'nametype': 'Valid', 'recclass': 'LL4', 'mass (g)': '1450', 'fall': 'Found', 'year': '01/01/2005 12:00:00 AM', 'reclat': '0.000000', 'reclong': '0.000000', 'GeoLocation': '(0.000000, 0.000000)'}, {'name': 'Pecora Escarpment 91310', 'id': '18601', 'nametype': 'Valid', 'recclass': 'L5', 'mass (g)': '134.69999999999999', 'fall': 'Found', 'year': '01/01/1991 12:00:00 AM', 'reclat': '-85.682450', 'reclong': '-68.745390', 'GeoLocation': '(-85.682450, -68.745390)'}, {'name': 'Tungsten Mountain 006', 'id': '24077', 'nametype': 'Valid', 'recclass': 'L6', 'mass (g)': '8.800000000000001', 'fall': 'Found', 'year': '01/01/2001 12:00:00 AM', 'reclat': '39.684360', 'reclong': '-117.620180', 'GeoLocation': '(39.684360, -117.620180)'}]" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'name': 'Asuka 881627',\n", - " 'id': '4336',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'L6',\n", - " 'mass (g)': '42.68',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/1988 12:00:00 AM',\n", - " 'reclat': '-72.000000',\n", - " 'reclong': '26.000000',\n", - " 'GeoLocation': '(-72.000000, 26.000000)'},\n", - " {'name': 'Dhofar 
1401',\n", - " 'id': '35491',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'LL~6',\n", - " 'mass (g)': '42.03',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2001 12:00:00 AM',\n", - " 'reclat': '19.191350',\n", - " 'reclong': '54.655450',\n", - " 'GeoLocation': '(19.191350, 54.655450)'},\n", - " {'name': 'Elephant Moraine 87745',\n", - " 'id': '8295',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'H5',\n", - " 'mass (g)': '123',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/1987 12:00:00 AM',\n", - " 'reclat': '-76.183330',\n", - " 'reclong': '157.166670',\n", - " 'GeoLocation': '(-76.183330, 157.166670)'},\n", - " {'name': 'Frontier Mountain 90153',\n", - " 'id': '10545',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'H4-6',\n", - " 'mass (g)': '8.5',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/1990 12:00:00 AM',\n", - " 'reclat': '-72.954040',\n", - " 'reclong': '160.538110',\n", - " 'GeoLocation': '(-72.954040, 160.538110)'},\n", - " {'name': 'Larkman Nunatak 06750',\n", - " 'id': '48858',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'L5',\n", - " 'mass (g)': '66.8',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2006 12:00:00 AM',\n", - " 'reclat': None,\n", - " 'reclong': None,\n", - " 'GeoLocation': None},\n", - " {'name': 'MacAlpine Hills 02539',\n", - " 'id': '14845',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'LL6',\n", - " 'mass (g)': '96.6',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2002 12:00:00 AM',\n", - " 'reclat': '-84.216670',\n", - " 'reclong': '160.500000',\n", - " 'GeoLocation': '(-84.216670, 160.500000)'},\n", - " {'name': 'Miller Range 07124',\n", - " 'id': '51180',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'L5',\n", - " 'mass (g)': '378.6',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2007 12:00:00 AM',\n", - " 'reclat': '0.000000',\n", - " 'reclong': '0.000000',\n", - " 'GeoLocation': '(0.000000, 0.000000)'},\n", - " {'name': 'Northwest Africa 4032',\n", - " 'id': '34304',\n", - " 'nametype': 
'Valid',\n", - " 'recclass': 'Eucrite-pmict',\n", - " 'mass (g)': '10.5',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2004 12:00:00 AM',\n", - " 'reclat': None,\n", - " 'reclong': None,\n", - " 'GeoLocation': None},\n", - " {'name': 'Northwest Africa 5953',\n", - " 'id': '50839',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'LL4',\n", - " 'mass (g)': '1450',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2005 12:00:00 AM',\n", - " 'reclat': '0.000000',\n", - " 'reclong': '0.000000',\n", - " 'GeoLocation': '(0.000000, 0.000000)'},\n", - " {'name': 'Pecora Escarpment 91310',\n", - " 'id': '18601',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'L5',\n", - " 'mass (g)': '134.69999999999999',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/1991 12:00:00 AM',\n", - " 'reclat': '-85.682450',\n", - " 'reclong': '-68.745390',\n", - " 'GeoLocation': '(-85.682450, -68.745390)'},\n", - " {'name': 'Tungsten Mountain 006',\n", - " 'id': '24077',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'L6',\n", - " 'mass (g)': '8.800000000000001',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2001 12:00:00 AM',\n", - " 'reclat': '39.684360',\n", - " 'reclong': '-117.620180',\n", - " 'GeoLocation': '(39.684360, -117.620180)'}]" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "value" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [], - "source": [ - "data = []\n", - "for l in value:\n", - " data.append([v for k,v in l.items()])\n", - "result = [{\"columns\":df.columns}, {\"data\":data}]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting fastavro\n", - " Downloading https://files.pythonhosted.org/packages/3a/a5/b357909eb300ae3a8499f1718b3887b379e743553bcc2dc2ed325902072b/fastavro-0.21.4-cp36-cp36m-win_amd64.whl (282kB)\n", - "Installing collected 
packages: fastavro\n", - "Successfully installed fastavro-0.21.4\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "You are using pip version 10.0.1, however version 18.0 is available.\n", - "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n" - ] - } - ], - "source": [ - "!pip install fastavro" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pyspark\\sql\\session.py:360: UserWarning: Using RDD of dict to inferSchema is deprecated. Use pyspark.sql.Row instead\n", - " warnings.warn(\"Using RDD of dict to inferSchema is deprecated. \"\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+----------+--------------------+----------+\n", - "| timestamp| tweet| username|\n", - "+----------+--------------------+----------+\n", - "|1366150681|Rock: Nerf paper,...| miguno|\n", - "|1366154481|Works as intended...|BlizzardCS|\n", - "+----------+--------------------+----------+\n", - "\n" - ] - }, - { - "data": { - "text/plain": [ - "1366150681" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from io import BytesIO\n", - "import fastavro\n", - "\n", - "df = op.sc.binaryFiles(\"twitter.avro\")\\\n", - " .flatMap(lambda args: fastavro.reader(BytesIO(args[1]))).toDF()\n", - "\n", - "df.show()\n", - "# optimus function get the min value\n", - "df.cols.min(\"timestamp\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'station': '011990-99999', 'time': 1433269388, 'temp': 0}\n", - "{'station': '011990-99999', 'time': 1433270389, 'temp': 22}\n", - "{'station': '011990-99999', 'time': 1433273379, 'temp': -11}\n", - "{'station': 
'012650-99999', 'time': 1433275478, 'temp': 111}\n" - ] - } - ], - "source": [ - "from fastavro import writer, reader, parse_schema\n", - "\n", - "schema = {\n", - " 'doc': 'A weather reading.',\n", - " 'name': 'Weather',\n", - " 'namespace': 'test',\n", - " 'type': 'record',\n", - " 'fields': [\n", - " {'name': 'station', 'type': 'string'},\n", - " {'name': 'time', 'type': 'long'},\n", - " {'name': 'temp', 'type': 'int'},\n", - " ],\n", - "}\n", - "parsed_schema = parse_schema(schema)\n", - "\n", - "# 'records' can be an iterable (including generator)\n", - "records = [\n", - " {u'station': u'011990-99999', u'temp': 0, u'time': 1433269388},\n", - " {u'station': u'011990-99999', u'temp': 22, u'time': 1433270389},\n", - " {u'station': u'011990-99999', u'temp': -11, u'time': 1433273379},\n", - " {u'station': u'012650-99999', u'temp': 111, u'time': 1433275478},\n", - "]\n", - "\n", - "# Writing\n", - "with open('weather.avro', 'wb') as out:\n", - " writer(out, parsed_schema, records)\n", - "\n", - "# Reading\n", - "with open('weather.avro', 'rb') as fo:\n", - " for record in reader(fo):\n", - " print(record)" - ] - }, { "cell_type": "code", "execution_count": null, diff --git a/optimus/server.py b/optimus/server.py index 0d4686582..567369062 100644 --- a/optimus/server.py +++ b/optimus/server.py @@ -1,34 +1,80 @@ import configparser import json - from flask import Flask from flask import jsonify +from multiprocessing import Process + + +class Server: + def __init__(self): + config = configparser.ConfigParser() + + self.path = "" + # try to load the config file + try: + config.read("config.ini") + self.path = config["SERVER"]["Input"] + except IOError: + print("config.ini not found") + + except KeyError: + print("Input info not found in config.ini. Be sure you have...") + print("[SERVER]") + print("Input = config.ini") + + self.server = None + + app = Flask(__name__) + + @app.route('/') + def index(): + """ + Return a message indicating if the server is running. 
+ :return: + """ + + return jsonify("Optimus Server si Running... Go to json /profiler to get the Optimus profiler data.") -config = configparser.ConfigParser() + @app.route('/profiler') + def profiler(): + """ + Return the data profiler in json format. + :return: + """ + try: + with self.app.app_context(): + with open(self.path, encoding="utf8") as f: + data = json.loads(f.read()) + return jsonify(data) + except IOError: + return jsonify("Not data profiling available") -# try to load the config file -try: - config.read("config.ini") - path = config["SERVER"]["Input"] -except (IOError, KeyError): - print("config.ini not found") - pass + self.app = app + def start(self): + """ + Start the Optimus Server + :return: + """ -app = Flask(__name__) + # References + # https://stackoverflow.com/questions/15562446/how-to-stop-flask-application-without-using-ctrl-c + # https://stackoverflow.com/questions/33927616/multiprocess-within-flask-app-spinning-up-2-processes + self.server = Process(target=self.app.run(debug=True, use_reloader=False)) + self.server.start() -@app.route('/') -def output_json(): - try: - with app.app_context(): + sys.stdout.flush() - with open(path, encoding="utf8") as f: - data = json.loads(f.read()) - return jsonify(data) - except IOError: - raise + return + # app.add_url_rule('/', 'index', index) + # app.add_url_rule('/', "profiler ", profiler) -if __name__ == '__main__': - app.run() + def stop(self): + """ + Stop the server + :return: + """ + self.server.terminate() + self.server.join() From e4b44f29cad6d2e3ad4ee448a9aa5f716c40a8cf Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Fri, 31 Aug 2018 19:34:08 -0500 Subject: [PATCH 10/94] Testing server process --- examples/new-api-sandbox.ipynb | 81 ++++++++++----------------------- optimus/server.py | 80 -------------------------------- optimus/server/__init__.py | 0 optimus/{ => server}/config.ini | 0 optimus/server/run.py | 50 ++++++++++++++++++++ optimus/server/server.py | 47 +++++++++++++++++++ 6 
files changed, 121 insertions(+), 137 deletions(-) delete mode 100644 optimus/server.py create mode 100644 optimus/server/__init__.py rename optimus/{ => server}/config.ini (100%) create mode 100644 optimus/server/run.py create mode 100644 optimus/server/server.py diff --git a/examples/new-api-sandbox.ipynb b/examples/new-api-sandbox.ipynb index 872a71fa2..fbe18dbad 100644 --- a/examples/new-api-sandbox.ipynb +++ b/examples/new-api-sandbox.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -67,96 +67,63 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from optimus.server import Server\n", - "s = Server()" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - " * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)\n" + "python C:\\Users\\argenisleon\\Documents\\Optimus\\optimus\\server\\run.py\n" ] } ], "source": [ - "s.start()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# https://stackoverflow.com/questions/89228/calling-an-external-command-in-python/92395#92395\n", - "import subprocess\n", - "return_code = subprocess.call(\"echo Hello World\", shell=True) " + "from optimus.server.server import Server\n", + "s = Server()" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 20, "metadata": {}, 
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0\n" + "14948\n", + "\n" ] } ], "source": [ - "print(return_code)" + "import os\n", + "import signal\n", + "s.status()\n", + "os.kill(14948, signal.CTRL_C_EVENT)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "df = op.load.url(\"https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/foo.csv\")" + "s.stop()" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'terminal' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mterminal\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;31mNameError\u001b[0m: name 'terminal' is not defined" - ] - } - ], - "source": [ - "terminal" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "!pip install flask=0.12.2" + "# Subprocess Resourcess https://stackoverflow.com/questions/89228/calling-an-external-command-in-python/92395#92395\n", + "# Create subprocess on windows https://stackoverflow.com/questions/13243807/popen-waiting-for-child-process-even-when-the-immediate-child-has-terminated/13256908#13256908\n", + "\n", + "import subprocess\n", + "return_code = subprocess.call(\"echo Hello World\", shell=True) " ] }, { diff --git a/optimus/server.py b/optimus/server.py deleted file mode 100644 index 567369062..000000000 --- a/optimus/server.py +++ /dev/null @@ -1,80 +0,0 @@ -import configparser -import json -from flask import Flask -from flask import jsonify -from multiprocessing import Process - - -class Server: - def 
__init__(self): - config = configparser.ConfigParser() - - self.path = "" - # try to load the config file - try: - config.read("config.ini") - self.path = config["SERVER"]["Input"] - except IOError: - print("config.ini not found") - - except KeyError: - print("Input info not found in config.ini. Be sure you have...") - print("[SERVER]") - print("Input = config.ini") - - self.server = None - - app = Flask(__name__) - - @app.route('/') - def index(): - """ - Return a message indicating if the server is running. - :return: - """ - - return jsonify("Optimus Server si Running... Go to json /profiler to get the Optimus profiler data.") - - @app.route('/profiler') - def profiler(): - """ - Return the data profiler in json format. - :return: - """ - try: - with self.app.app_context(): - with open(self.path, encoding="utf8") as f: - data = json.loads(f.read()) - return jsonify(data) - except IOError: - return jsonify("Not data profiling available") - - self.app = app - - def start(self): - """ - Start the Optimus Server - :return: - """ - - # References - # https://stackoverflow.com/questions/15562446/how-to-stop-flask-application-without-using-ctrl-c - # https://stackoverflow.com/questions/33927616/multiprocess-within-flask-app-spinning-up-2-processes - - self.server = Process(target=self.app.run(debug=True, use_reloader=False)) - self.server.start() - - sys.stdout.flush() - - return - - # app.add_url_rule('/', 'index', index) - # app.add_url_rule('/', "profiler ", profiler) - - def stop(self): - """ - Stop the server - :return: - """ - self.server.terminate() - self.server.join() diff --git a/optimus/server/__init__.py b/optimus/server/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/optimus/config.ini b/optimus/server/config.ini similarity index 100% rename from optimus/config.ini rename to optimus/server/config.ini diff --git a/optimus/server/run.py b/optimus/server/run.py new file mode 100644 index 000000000..66adc2272 --- /dev/null +++ 
b/optimus/server/run.py @@ -0,0 +1,50 @@ +import configparser +import json +from flask import Flask +from flask import jsonify +from multiprocessing import Process + +config = configparser.ConfigParser() + +path = "" +# try to load the config file +try: + config.read("config.ini") + path = config["SERVER"]["Input"] +except IOError: + print("config.ini not found") + +except KeyError: + print("Input info not found in config.ini. Be sure you have...") + print("[SERVER]") + print("Input = config.ini") + raise + +app = Flask(__name__) + + +@app.route('/') +def index(): + """ + Return a message indicating if the server is running. + :return: + """ + return jsonify("Optimus Server si Running... Go to json /profiler to get the Optimus profiler data.") + + +@app.route('/profiler') +def profiler(): + """ + Return the data profiler in json format. + :return: + """ + try: + with app.app_context(): + with open(path, encoding="utf8") as f: + data = json.loads(f.read()) + return jsonify(data) + except IOError: + return jsonify("Not data profiling available") + + +app.run() diff --git a/optimus/server/server.py b/optimus/server/server.py new file mode 100644 index 000000000..ab8633b42 --- /dev/null +++ b/optimus/server/server.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +import os +import sys +import platform +from subprocess import Popen, PIPE +import signal + + +class Server: + def __init__(self): + + # set system/version dependent "start_new_session" analogs + kwargs = {} + if platform.system() == 'Windows': + # from msdn [1] + create_new_process_group = 0x00000200 # note: could get it from subprocess + detached_process = 0x00000008 # 0x8 | 0x200 == 0x208 + kwargs.update(creationflags=detached_process | create_new_process_group) + elif sys.version_info < (3, 2): # assume posix + kwargs.update(preexec_fn=os.setsid) + else: # Python 3.2+ and Unix + kwargs.update(start_new_session=True) + path = "python " + os.path.dirname(os.path.abspath(__file__)) + "\\run.py" + print(path) + + 
self.p = Popen(path, stdin=PIPE, stdout=PIPE, stderr=PIPE, **kwargs) + assert not self.p.poll() + + def stop(self): + """ + + :return: + """ + p = self.p + if platform.system() == 'Windows': + os.kill(p.pid, signal.CTRL_C_EVENT) + else: + os.killpg(os.getpgid(p.pid), signal.SIGTERM) + + def status(self): + """ + + :return: + """ + p = self.p + print(p.pid) + print(p) From 5961b97f9d316b15536e135fada88855a0cd9aa4 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Fri, 31 Aug 2018 21:38:40 -0500 Subject: [PATCH 11/94] Added server class --- examples/new-api-sandbox.ipynb | 72 +++------------------------------- optimus/optimus.py | 10 ++++- optimus/server/server.py | 52 +++++++----------------- 3 files changed, 28 insertions(+), 106 deletions(-) diff --git a/examples/new-api-sandbox.ipynb b/examples/new-api-sandbox.ipynb index fbe18dbad..1751b982e 100644 --- a/examples/new-api-sandbox.ipynb +++ b/examples/new-api-sandbox.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -57,73 +57,13 @@ " /_/ \n", " \n", "Transform and Roll out...\n", + "Starting Optimus Server...\n", "Optimus successfully imported. 
Have fun :).\n" ] } ], "source": [ - "op= Optimus(verbose=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "python C:\\Users\\argenisleon\\Documents\\Optimus\\optimus\\server\\run.py\n" - ] - } - ], - "source": [ - "from optimus.server.server import Server\n", - "s = Server()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "14948\n", - "\n" - ] - } - ], - "source": [ - "import os\n", - "import signal\n", - "s.status()\n", - "os.kill(14948, signal.CTRL_C_EVENT)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "s.stop()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# Subprocess Resourcess https://stackoverflow.com/questions/89228/calling-an-external-command-in-python/92395#92395\n", - "# Create subprocess on windows https://stackoverflow.com/questions/13243807/popen-waiting-for-child-process-even-when-the-immediate-child-has-terminated/13256908#13256908\n", - "\n", - "import subprocess\n", - "return_code = subprocess.call(\"echo Hello World\", shell=True) " + "op= Optimus(verbose=True, server= True)" ] }, { diff --git a/optimus/optimus.py b/optimus/optimus.py index 8286aec5c..7aecbebb3 100644 --- a/optimus/optimus.py +++ b/optimus/optimus.py @@ -9,6 +9,7 @@ from optimus.io.load import Load from optimus.ml.models import ML from optimus.profiler.profiler import Profiler +from optimus.server.server import Server from optimus.spark import Spark Spark.instance = None @@ -17,7 +18,7 @@ class Optimus: def __init__(self, master="local[*]", app_name="optimus", checkpoint=False, path=None, file_system="local", - verbose=False, dl=False): + verbose=False, dl=False, server=False): """ Transform and roll out @@ -41,7 +42,6 @@ 
def __init__(self, master="local[*]", app_name="optimus", checkpoint=False, path from optimus.dl.models import DL self.dl = DL() else: - Spark.instance = Spark(master, app_name) pass @@ -62,6 +62,12 @@ def __init__(self, master="local[*]", app_name="optimus", checkpoint=False, path if checkpoint is True: self.set_check_point_folder(path, file_system) + if server is True: + logging.info("Starting Optimus Server...") + s = Server() + s.start() + self.server_instance = s + logging.info(SUCCESS) self.create = Create() diff --git a/optimus/server/server.py b/optimus/server/server.py index ab8633b42..d9b47903e 100644 --- a/optimus/server/server.py +++ b/optimus/server/server.py @@ -1,47 +1,23 @@ -#!/usr/bin/env python +from optimus.server.process import Process import os -import sys -import platform -from subprocess import Popen, PIPE -import signal +import atexit class Server: - def __init__(self): + def __init__(self, path=None): + if path is None: + path = "python " + os.path.dirname(os.path.abspath(__file__)) + "\\run.py" - # set system/version dependent "start_new_session" analogs - kwargs = {} - if platform.system() == 'Windows': - # from msdn [1] - create_new_process_group = 0x00000200 # note: could get it from subprocess - detached_process = 0x00000008 # 0x8 | 0x200 == 0x208 - kwargs.update(creationflags=detached_process | create_new_process_group) - elif sys.version_info < (3, 2): # assume posix - kwargs.update(preexec_fn=os.setsid) - else: # Python 3.2+ and Unix - kwargs.update(start_new_session=True) - path = "python " + os.path.dirname(os.path.abspath(__file__)) + "\\run.py" - print(path) + self.process = None + self.path = path - self.p = Popen(path, stdin=PIPE, stdout=PIPE, stderr=PIPE, **kwargs) - assert not self.p.poll() + def start(self): + self.process = Process(self.path) def stop(self): - """ + self.process.stop() - :return: - """ - p = self.p - if platform.system() == 'Windows': - os.kill(p.pid, signal.CTRL_C_EVENT) - else: - 
os.killpg(os.getpgid(p.pid), signal.SIGTERM) - - def status(self): - """ - - :return: - """ - p = self.p - print(p.pid) - print(p) + @atexit.register + def goodbye(self): + self.stop() + print("You are now leaving the Python sector.") From f178342840b2a82900f4e5c8cbab36a3a4909329 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Fri, 31 Aug 2018 21:59:05 -0500 Subject: [PATCH 12/94] Looking for strategies to control the child process --- examples/new-api-sandbox.ipynb | 17 ++++++- optimus/server/process.py | 85 ++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 optimus/server/process.py diff --git a/examples/new-api-sandbox.ipynb b/examples/new-api-sandbox.ipynb index 1751b982e..92cefd2b6 100644 --- a/examples/new-api-sandbox.ipynb +++ b/examples/new-api-sandbox.ipynb @@ -57,8 +57,21 @@ " /_/ \n", " \n", "Transform and Roll out...\n", - "Starting Optimus Server...\n", - "Optimus successfully imported. Have fun :).\n" + "Starting Optimus Server...\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'Popen' object has no attribute 'join'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mop\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mOptimus\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mverbose\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mserver\u001b[0m\u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\optimus.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, master, app_name, checkpoint, path, file_system, verbose, dl, server)\u001b[0m\n\u001b[0;32m 66\u001b[0m 
\u001b[0mlogging\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Starting Optimus Server...\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 67\u001b[0m \u001b[0ms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mServer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 68\u001b[1;33m \u001b[0ms\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 69\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mserver_instance\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0ms\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 70\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\server\\server.py\u001b[0m in \u001b[0;36mstart\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 14\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mstart\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 15\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprocess\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mProcess\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 16\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 17\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mstop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\server\\process.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, path)\u001b[0m\n\u001b[0;32m 31\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 32\u001b[0m \u001b[1;31m# Ensure that a child process has completed before the main 
process\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 33\u001b[1;33m \u001b[0mprocess\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 34\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 35\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprocess\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mprocess\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mAttributeError\u001b[0m: 'Popen' object has no attribute 'join'" ] } ], diff --git a/optimus/server/process.py b/optimus/server/process.py new file mode 100644 index 000000000..79e445da8 --- /dev/null +++ b/optimus/server/process.py @@ -0,0 +1,85 @@ +import os +import platform +import psutil +import signal +import sys +from subprocess import Popen, PIPE + +WINDOWS = "windows" + +# test https://stackoverflow.com/questions/984941/python-subprocess-popen-from-a-thread + +class Process: + """ + A helper class to start and stop process on windows/unix systems + """ + + def __init__(self, path=None): + + # set system/version dependent "start_new_session" analogs + kwargs = {} + if platform.system() == WINDOWS: + # from msdn [1] + create_new_process_group = 0x00000200 # note: could get it from subprocess + detached_process = 0x00000008 # 0x8 | 0x200 == 0x208 + kwargs.update(creationflags=detached_process | create_new_process_group) + elif sys.version_info < (3, 2): # assume posix + kwargs.update(preexec_fn=os.setsid) + else: # Python 3.2+ and Unix + kwargs.update(start_new_session=True) + + process = Popen(path, stdin=PIPE, stdout=PIPE, stderr=PIPE, **kwargs) + + # Ensure that a child process has completed before the main process + process.join() + + self.process = process + + assert not self.process.poll() + + self.path = path + + def stop(self): + """ + Stop the process that start the server + :return: + """ + process = self.process + + # Reference 
https://stackoverflow.com/questions/1230669/subprocess-deleting-child-processes-in-windows + def kill_proc_tree(pid, including_parent=True): + """ + Kill process and children + :param pid: + :param including_parent: + :return: + """ + parent = psutil.Process(pid) + children = parent.children(recursive=True) + for child in children: + child.kill() + gone, still_alive = psutil.wait_procs(children, timeout=5) + if including_parent: + parent.wait(5) + parent.kill() + + def kill_proc(pid): + """ + Kill process + :param pid: + :return: + """ + parent = psutil.Process(pid) + parent.kill() + + if platform.system() == WINDOWS: + kill_proc(process.pid) + else: + os.killpg(os.getpgid(process.pid), signal.SIGTERM) + + def status(self): + """ + Return the process status + :return: + """ + return self.process From 74d1061d3293aec9a42b8cc1971b7ebb4758839d Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Sun, 2 Sep 2018 12:43:55 -0500 Subject: [PATCH 13/94] Added nullable to table information and made rows number human readable --- optimus/dataframe/extension.py | 10 +++++++--- optimus/profiler/functions.py | 14 -------------- optimus/profiler/profiler.py | 8 +++++--- optimus/templates/table.html | 5 ++++- requirements-test.txt | 1 + requirements.txt | 3 ++- 6 files changed, 19 insertions(+), 22 deletions(-) diff --git a/optimus/dataframe/extension.py b/optimus/dataframe/extension.py index db4aceb21..2ed80be61 100644 --- a/optimus/dataframe/extension.py +++ b/optimus/dataframe/extension.py @@ -1,6 +1,8 @@ import logging +import multiprocessing import os +import humanize import jinja2 from IPython.core.display import display, HTML from pyspark.ml.feature import SQLTransformer @@ -12,7 +14,6 @@ from optimus.helpers.decorators import * from optimus.helpers.functions import parse_columns, collect_as_dict, random_int, val_to_list from optimus.spark import Spark -import multiprocessing cpu_count = multiprocessing.cpu_count() @@ -205,14 +206,17 @@ def table_html(self, limit=100, 
columns=None): template = template_env.get_template("table.html") # Filter only the columns and data type info need it - dtypes = list(filter(lambda x: x[0] in columns, self.dtypes)) + dtypes = [(i[0], i[1], j.nullable,) for i, j in zip(self.dtypes, self.schema)] total_rows = self.count() if total_rows < limit: limit = total_rows + total_rows = humanize.intword(total_rows) + total_cols = self.cols.count() + # Print table - output = template.render(cols=dtypes, data=data, limit=limit, total_rows=total_rows, total_cols=self.cols.count()) + output = template.render(cols=dtypes, data=data, limit=limit, total_rows=total_rows, total_cols=total_cols) return output diff --git a/optimus/profiler/functions.py b/optimus/profiler/functions.py index dcc081863..bd5853c5a 100644 --- a/optimus/profiler/functions.py +++ b/optimus/profiler/functions.py @@ -58,20 +58,6 @@ def write_json(data, path): pass -def human_readable_bytes(value, suffix='B'): - """ - Return a human readable file size - :param value: - :param suffix: - :return: - """ - for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']: - if abs(value) < 1024.0: - return "%3.1f%s%s" % (value, unit, suffix) - value /= 1024.0 - return "%.1f%s%s" % (value, 'Yi', suffix) - - def sample_size(population_size, confidence_level, confidence_interval): """ Get a sample number of the whole population diff --git a/optimus/profiler/profiler.py b/optimus/profiler/profiler.py index 1cbc74240..01fc026ac 100644 --- a/optimus/profiler/profiler.py +++ b/optimus/profiler/profiler.py @@ -11,9 +11,11 @@ from optimus.functions import filter_row_by_data_type as fbdt, plot_hist, plot_freq from optimus.helpers.functions import parse_columns -from optimus.profiler.functions import human_readable_bytes, fill_missing_var_types, fill_missing_col_types, \ +from optimus.profiler.functions import fill_missing_var_types, fill_missing_col_types, \ write_json +import humanize + class Profiler: @@ -51,7 +53,7 @@ def dataset_info(df): {'cols_count': 
cols_count, 'rows_count': rows_count, 'missing_count': str(missing_count / rows_count) + "%", - 'size': human_readable_bytes(df.size())} + 'size': humanize.naturalsize(df.size())} ) # TODO: This should check only the StringType Columns. The datatype from others columns can be taken from schema(). @@ -168,7 +170,7 @@ def columns(df, columns, buckets=40, relative_error=1): count_dtypes = Profiler.count_data_types(df, columns) column_info["count_types"] = count_dtypes["count_types"] - column_info['size'] = human_readable_bytes(df.size()) + column_info['size'] = humanize.naturalsize(df.size()) def na(col_name): return F.count(F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name)) diff --git a/optimus/templates/table.html b/optimus/templates/table.html index 4786a6852..4956d4709 100644 --- a/optimus/templates/table.html +++ b/optimus/templates/table.html @@ -11,6 +11,7 @@ font-size: 0.8em; } + {% macro header_footer(limit, total_rows, total_cols) %} @@ -25,7 +26,9 @@
{{col[0]}}
{{loop.index}} ({{col[1]}})
- + {% if col[2] == true %} +
nullable
+ {% endif %} {% endfor %} diff --git a/requirements-test.txt b/requirements-test.txt index 7251c66c3..b533a2e07 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -25,3 +25,4 @@ h5py>=2.7.0 flask==1.0.2 ipython==6.5.0 pytest-cov==2.5.1 +humanize=0.5.1 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 04f530ce5..b1b3abd8c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,4 +21,5 @@ pygments>=2.2.0 six>=1.10.0 h5py>=2.7.0 flask==1.0.2 -ipython==6.5.0 \ No newline at end of file +ipython==6.5.0 +humanize=0.5.1 \ No newline at end of file From b01f620d3faef1bb92bf7edbc58174a652ff4c23 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Sun, 2 Sep 2018 12:46:45 -0500 Subject: [PATCH 14/94] Now dataframe can be created anly with the column number or with column and datatype --- optimus/create.py | 38 +++++++++++------ tests/test_optimus.py | 98 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+), 12 deletions(-) create mode 100644 tests/test_optimus.py diff --git a/optimus/create.py b/optimus/create.py index a5bb2b871..97d6e2533 100644 --- a/optimus/create.py +++ b/optimus/create.py @@ -1,7 +1,7 @@ -# Helpers -from pyspark.sql.types import StructField, StructType +from pyspark.sql.types import StructField, StructType, StringType # Helpers +from optimus.helpers.checkit import is_tuple from optimus.helpers.constants import SPARK_DTYPES from optimus.helpers.functions import get_spark_dtypes_object from optimus.spark import Spark @@ -12,22 +12,36 @@ class Create: @staticmethod def data_frame(cols, rows): """ - Helper to create a Spark dataframe - :param cols: - :param rows: - :return: + Helper to create a Spark dataframe: + :param cols: List of Tuple with name, data type and a flag to accept null + :param rows: List of Tuples if vals with the same number and types that cols + :return: Dataframe """ specs = [] for c in cols: - value = c[1] - # Try to find if the type var is a Spark datatype - 
if isinstance(value, SPARK_DTYPES): - var_type = value - # else, try to parse a str, int, float ...... + + # Get columns name + if not is_tuple(c): + col_name = c else: + col_name = c[0] + + # Get columns data type + if len(c) == 2: var_type = get_spark_dtypes_object(c[1]) - specs.append([c[0], var_type, c[2]]) + else: + var_type = StringType() + + # Get column nullable flag. It's just to tell if a column accept nulls as values + if len(c) == 3: + nullable = c[2] + else: + nullable = True + + # If tuple has not the third param with put it to true to accepts Null in columns + specs.append([col_name, var_type, nullable]) + struct_fields = list(map(lambda x: StructField(*x), specs)) diff --git a/tests/test_optimus.py b/tests/test_optimus.py new file mode 100644 index 000000000..5246b3a8a --- /dev/null +++ b/tests/test_optimus.py @@ -0,0 +1,98 @@ +from pyspark.ml.linalg import Vectors, VectorUDT +from pyspark.sql import Row +from pyspark.sql import functions as F +from pyspark.sql.types import * + +from optimus import Optimus + +op = Optimus() + + +class TestDataFrameCols(object): + + @staticmethod + def test_create_data_frames_plain(): + source_df = op.create.df( + rows=[ + ("BOB", 1), + ("JoSe", 2) + ], + cols=[ + "name", + "age" + ] + ) + + actual_df = source_df + + expected_df = op.create.df( + rows=[ + ("BOB", 1), + ("JoSe", 2) + ], + cols= + [ + ("name", StringType(), True), + ("age", IntegerType(), False) + ] + ) + + assert (expected_df.collect() == actual_df.collect()) + + @staticmethod + def test_create_data_frames_with_datatypes(): + source_df = op.create.df( + rows=[ + ("BOB", 1), + ("JoSe", 2) + ], + cols=[ + ("name", StringType(), True), + ("age", IntegerType(), False) + ] + ) + + actual_df = source_df + + expected_df = op.create.df( + rows=[ + ("BOB", 1), + ("JoSe", 2) + ], + cols= + [ + ("name", StringType(), True), + ("age", IntegerType(), False) + ] + ) + + assert (expected_df.collect() == actual_df.collect()) + + @staticmethod + def 
test_create_data_frames_nullable(): + source_df = op.create.df( + rows=[ + ("BOB", 1), + ("JoSe", 2) + ], + cols=[ + ("name", StringType()), + ("age", IntegerType()) + ] + ) + + actual_df = source_df + + expected_df = op.create.df( + rows=[ + ("BOB", 1), + ("JoSe", 2) + ], + cols= + [ + ("name", StringType(), True), + ("age", IntegerType(), True) + ] + ) + + assert (expected_df.collect() == actual_df.collect()) From 3a7105e3e9c957b382e716dcc0567feb02306c3f Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Sun, 2 Sep 2018 19:09:22 -0500 Subject: [PATCH 15/94] is_in() added. Test included --- optimus/dataframe/rows.py | 23 +++++++++++++++++++++-- tests/test_rows.py | 18 ++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/optimus/dataframe/rows.py b/optimus/dataframe/rows.py index c5869aa58..215c5953c 100644 --- a/optimus/dataframe/rows.py +++ b/optimus/dataframe/rows.py @@ -1,3 +1,5 @@ +from functools import reduce + from multipledispatch import dispatch from pyspark.sql import DataFrame from pyspark.sql import functions as F @@ -8,7 +10,7 @@ from optimus.helpers.checkit import is_list_of_str_or_int from optimus.helpers.constants import * from optimus.helpers.decorators import * -from optimus.helpers.functions import validate_columns_names, parse_columns, one_list_to_val +from optimus.helpers.functions import validate_columns_names, parse_columns, one_list_to_val, val_to_list def rows(self): @@ -111,7 +113,6 @@ def sort(col_sort): col_name = one_list_to_val(cs[0]) order = cs[1] - if order == "asc": sort_func = F.asc elif order == "desc": @@ -176,6 +177,24 @@ def drop_first(): """ return self.zipWithIndex().filter(lambda tup: tup[1] > 0).map(lambda tup: tup[0]) + @add_attr(rows) + def isin(columns, values): + """ + Filter rows which columns that match a specific value + :return: Spark DataFrame + """ + + # Ensure that we have a list + values = val_to_list(values) + + # Create column/value expression + column_expr = [(F.col(columns) == v) 
for v in values] + + # Concat expression with and logical or + expr = reduce(lambda a, b: a | b, column_expr) + + return self.rows.select(expr) + return rows diff --git a/tests/test_rows.py b/tests/test_rows.py index ef412d392..41883fa68 100644 --- a/tests/test_rows.py +++ b/tests/test_rows.py @@ -162,3 +162,21 @@ def test_sort(): ]) assert (expected_df.collect() == actual_df.collect()) + + @staticmethod + def test_isin(): + actual_df = source_df.rows.isin("num", 2) + + expected_df = op.create.df([ + ("words", "str", True), + ("num", "int", True), + ("animals", "str", True), + ("thing", StringType(), True), + ("second", "int", True), + ("filter", StringType(), True) + ], + [ + (" zombies", 2, "cat", "tv", 6, "b") + ]) + + assert (expected_df.collect() == actual_df.collect()) From 71920496bc1c51eb4b1b5cf621c16a50f36e0c5f Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Sun, 2 Sep 2018 19:11:12 -0500 Subject: [PATCH 16/94] is_na() added. Test Included --- optimus/dataframe/columns.py | 42 ++++++++++++++++++------ tests/test_cols.py | 63 ++++++++++++++++++++++++++++++++++-- 2 files changed, 93 insertions(+), 12 deletions(-) diff --git a/optimus/dataframe/columns.py b/optimus/dataframe/columns.py index b22293ee5..8f5ae68f3 100644 --- a/optimus/dataframe/columns.py +++ b/optimus/dataframe/columns.py @@ -851,16 +851,38 @@ def impute(input_cols, output_cols, strategy="mean"): @add_attr(cols) def fill_na(columns, value): """ - Reaplce null data with a specified value + Replace null data with a specified value :param columns: :param value: :return: """ + columns = parse_columns(self, columns) + + def _fill_na(_col_name, _value): + return F.when(F.isnan(_col_name) | F.col(_col_name).isNull(), _value).otherwise(_col_name) - def _fill_na(col_name, args): - return F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name).otherwise(args) + df = self + for col_name in columns: + df = df.cols.apply_expr(col_name, _fill_na, value) + return df - return 
self.cols.apply_expr(columns, _fill_na, value) + @add_attr(cols) + def is_na(columns): + """ + Replace null values per True and non null per False + :param columns: + :param value: + :return: + """ + + def _replace_na(_col_name, _value): + return F.when(F.isnan(_col_name) | F.col(_col_name).isNull(), True).otherwise(False) + + df = self + for col_name in columns: + df = df.cols.apply_expr(col_name, _replace_na) + + return df @add_attr(cols) def count(): @@ -1001,25 +1023,25 @@ def replace(columns, search_and_replace=None, value=None, regex=None): :param regex: :return: """ - replace = None + _replace = None search = None if is_list_of_tuples(search_and_replace): params = list(zip(*search_and_replace)) search = list(params[0]) - replace = list(params[1]) + _replace = list(params[1]) elif is_list(search_and_replace): search = search_and_replace - replace = value + _replace = value elif is_one_element(search_and_replace): search = val_to_list(search_and_replace) - replace = value + _replace = value if regex: search = search_and_replace - replace = value + _replace = value # if regex or normal replace we use regexp or replace functions # TODO check if .contains can be used instead of regexp @@ -1041,7 +1063,7 @@ def func_replace(_df, _col_name, _search, _replace): columns = parse_columns(self, columns, filter_by_column_dtypes="string") for c in columns: - df = func(df, c, search, replace) + df = func(df, c, search, _replace) return df diff --git a/tests/test_cols.py b/tests/test_cols.py index 0f99c55df..3c182fa4b 100644 --- a/tests/test_cols.py +++ b/tests/test_cols.py @@ -6,7 +6,6 @@ from optimus import Optimus op = Optimus() -sc = op.sc class TestDataFrameCols(object): @@ -410,7 +409,7 @@ def test_cast_vector(): expected_df = op.create.df( rows=[ - ("happy",[1, 2, 3]), + ("happy", [1, 2, 3]), ("excited", 2) ], cols=[ @@ -556,3 +555,63 @@ def test_sort(): ) assert (actual_df.collect() == expected_df.collect()) + + @staticmethod + def test_fill_na(): + source_df = 
op.create.df( + rows=[ + ("happy", 1, None), + ("excited", 2, 8) + ], + cols=[ + ("emotion", StringType(), True), + ("num1", IntegerType(), True), + ("num2", IntegerType(), True) + ] + ) + + actual_df = source_df.cols.fill_na("*", "N/A") + + expected_df = op.create.df( + rows=[ + ("happy", 1, "N/A"), + ("excited", 2, 8) + ], + cols=[ + ("emotion", StringType(), True), + ("num1", IntegerType(), True), + ("num2", IntegerType(), True) + ] + ) + + assert (actual_df.collect() == expected_df.collect()) + + @staticmethod + def test_is_na(): + source_df = op.create.df( + rows=[ + ("happy", None, 1), + ("excited", 2, 8) + ], + cols=[ + ("emotion", StringType(), True), + ("num1", IntegerType(), True), + ("num2", IntegerType(), True) + ] + ) + + actual_df = source_df.cols.fill_na("*", "N/A") + + expected_df = op.create.df( + rows=[ + (False, True, False), + (False, False, False) + ], + cols=[ + ("emotion", StringType(), True), + ("num1", IntegerType(), True), + ("num2", IntegerType(), True) + ] + ) + + assert (actual_df.collect() == expected_df.collect()) From 1661e96efb8985219a05d631f966179a9e3c817f Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Sun, 2 Sep 2018 19:12:04 -0500 Subject: [PATCH 17/94] CSS style improved in case that you are outside a jupyter notebook --- optimus/templates/table.html | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/optimus/templates/table.html b/optimus/templates/table.html index 4956d4709..387a0694e 100644 --- a/optimus/templates/table.html +++ b/optimus/templates/table.html @@ -1,16 +1,33 @@ @@ -19,16 +36,18 @@ {% endmacro %} {{header_footer(limit, total_rows, total_cols)}} - +
{%for col in cols: %} {% endfor %} From 0f9a23ae9eb7881d8e74b29e1fe0f6ff020e2930 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Sun, 2 Sep 2018 19:16:23 -0500 Subject: [PATCH 18/94] Docstrings and imports cleanup --- examples/new-api-sandbox.ipynb | 3265 -------------------------------- optimus/helpers/functions.py | 6 +- tests/test_optimus.py | 30 +- 3 files changed, 30 insertions(+), 3271 deletions(-) diff --git a/examples/new-api-sandbox.ipynb b/examples/new-api-sandbox.ipynb index 8fcfe267e..f7ccfcfbb 100644 --- a/examples/new-api-sandbox.ipynb +++ b/examples/new-api-sandbox.ipynb @@ -825,3271 +825,6 @@ "source": [ "df.table()" ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| id| firstName| lastName|billingId| product|price| birth|dummyCol|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| 1| Luis| Alvarez$$%!| 123| Cake| 10|1980/07/07| never|\n", - "| 2| André| Ampère| 423| piza| 8|1950/07/08| gonna|\n", - "| 3| NiELS| Böhr//((%%| 551| pizza| 8|1990/07/09| give|\n", - "| 4| PAUL| dirac$| 521| pizza| 8|1954/07/10| you|\n", - "| 5| Albert| Einstein| 634| pizza| 8|1990/07/11| up|\n", - "| 6| Galileo| GALiLEI| 672| arepa| 5|1930/08/12| never|\n", - "| 7| CaRL| Ga%%%uss| 323| taco| 3|1970/07/13| gonna|\n", - "| 8| David| H$$$ilbert| 624| taaaccoo| 3|1950/07/14| let|\n", - "| 9| Johannes| KEPLER| 735| taco| 3|1920/04/22| you|\n", - "| 10| JaMES| M$$ax%%well| 875| taco| 3|1923/03/12| down|\n", - "| 11| Isaac| Newton| 992| pasta| 9|1999/02/15| never |\n", - "| 12| Emmy%%| Nöether$| 234| pasta| 9|1993/12/08| gonna|\n", - "| 13| Max!!!| Planck!!!| 111|hamburguer| 4|1994/01/04| run |\n", - "| 14| Fred| Hoy&&&le| 553| pizzza| 8|1997/06/27| around|\n", - "| 15|((( Heinrich )))))| Hertz| 116| pizza| 
8|1956/11/30| and|\n", - "| 16| William| Gilbert###| 886| BEER| 2|1958/03/26| desert|\n", - "| 17| Marie| CURIE| 912| Rice| 1|2000/03/22| you|\n", - "| 18| Arthur| COM%%%pton| 812| 110790| 5|1899/01/01| #|\n", - "| 19| JAMES| Chadwick| 467| null| 10|1921/05/03| #|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "\n" - ] - } - ], - "source": [ - "df.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---+---------+--------+---------+-----------------+-----+----------+----------+-------------+\n", - "| id|firstname|lastname|billingid| product|price| birth| new_date|years_between|\n", - "+---+---------+--------+---------+-----------------+-----+----------+----------+-------------+\n", - "| 10| james| maxwell| 875| taco| 3|1923/03/12|12-03-1923| 95.4355|\n", - "| 11| isaac| newton| 992| pasta| 9|1999/02/15|15-02-1999| 19.5108|\n", - "| 12| emmy| noether| 234| pasta| 9|1993/12/08|08-12-1993| 24.6962|\n", - "| 13| max| planck| 111| hamburguer| 4|1994/01/04|04-01-1994| 24.6237|\n", - "| 14| fred| hoyle| 553| pizza| 8|1997/06/27|27-06-1997| 21.1452|\n", - "| 15| heinrich| hertz| 116| pizza| 8|1956/11/30|30-11-1956| 61.7204|\n", - "| 16| william| gilbert| 886| BEER| 2|1958/03/26|26-03-1958| 60.3978|\n", - "| 17| marie| curie| 912| Rice| 1|2000/03/22|22-03-2000| 18.4086|\n", - "| 18| arthur| compton| 812|this was a number| 5|1899/01/01|01-01-1899| 119.6317|\n", - "| 19| james|chadwick| 467| null| 10|1921/05/03|03-05-1921| 97.293|\n", - "| 7| carl| gauss| 323| taco| 3|1970/07/13|13-07-1970| 48.0995|\n", - "| 8| david| hilbert| 624| taco| 3|1950/07/14|14-07-1950| 68.0968|\n", - "| 9| johannes| kepler| 735| taco| 3|1920/04/22|22-04-1920| 98.3253|\n", - "+---+---------+--------+---------+-----------------+-----+----------+----------+-------------+\n", - "\n" - ] - } - ], - "source": [ - "# This is a 
custom function\n", - "def num_to_string(value, arg):\n", - " return \"this was a number\"\n", - " \n", - "df\\\n", - " .rows.sort(\"product\",\"desc\")\\\n", - " .cols.lower([\"firstName\",\"lastName\"])\\\n", - " .cols.date_transform(\"birth\", \"new_date\", \"yyyy/MM/dd\", \"dd-MM-YYYY\")\\\n", - " .cols.years_between(\"birth\", \"years_between\", \"yyyy/MM/dd\")\\\n", - " .cols.remove_accents(\"lastName\")\\\n", - " .cols.remove_special_chars([\"firstName\",\"lastName\"])\\\n", - " .cols.replace(\"product\",\"taaaccoo\",\"taco\")\\\n", - " .cols.replace(\"product\",[\"piza\",\"pizzza\"],\"pizza\")\\\n", - " .rows.drop(df[\"id\"]<7)\\\n", - " .cols.drop(\"dummyCol\")\\\n", - " .cols.rename(str.lower)\\\n", - " .cols.apply_by_dtypes(\"product\",num_to_string,\"string\", data_type=\"integer\")\\\n", - " .cols.trim(\"*\")\\\n", - " .rows.sort(\"id\")\\\n", - " .show()" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| id| firstName| lastName|billingId| product|price| birth|dummyCol|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| 1| Luis| Alvarez$$%!| 123| Cake| 10|1980/07/07| never|\n", - "| 2| André| Ampère| 423| piza| 8|1950/07/08| gonna|\n", - "| 3| NiELS| Böhr//((%%| 551| pizza| 8|1990/07/09| give|\n", - "| 4| PAUL| dirac$| 521| pizza| 8|1954/07/10| you|\n", - "| 5| Albert| Einstein| 634| pizza| 8|1990/07/11| up|\n", - "| 6| Galileo| GALiLEI| 672| arepa| 5|1930/08/12| never|\n", - "| 7| CaRL| Ga%%%uss| 323| taco| 3|1970/07/13| gonna|\n", - "| 8| David| H$$$ilbert| 624| taaaccoo| 3|1950/07/14| let|\n", - "| 9| Johannes| KEPLER| 735| taco| 3|1920/04/22| you|\n", - "| 10| JaMES| M$$ax%%well| 875| taco| 3|1923/03/12| down|\n", - "| 11| Isaac| Newton| 992| pasta| 9|1999/02/15| never |\n", - "| 
12| Emmy%%| Nöether$| 234| pasta| 9|1993/12/08| gonna|\n", - "| 13| Max!!!| Planck!!!| 111|hamburguer| 4|1994/01/04| run |\n", - "| 14| Fred| Hoy&&&le| 553| pizzza| 8|1997/06/27| around|\n", - "| 15|((( Heinrich )))))| Hertz| 116| pizza| 8|1956/11/30| and|\n", - "| 16| William| Gilbert###| 886| BEER| 2|1958/03/26| desert|\n", - "| 17| Marie| CURIE| 912| Rice| 1|2000/03/22| you|\n", - "| 18| Arthur| COM%%%pton| 812| 110790| 5|1899/01/01| #|\n", - "| 19| JAMES| Chadwick| 467| null| 10|1921/05/03| #|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| id| firstName| lastName|billingid| product|price| birth|dummyCol|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| 1| Luis| Alvarez$$%!| 126| Cake| 10|1980/07/07| never|\n", - "| 2| André| Ampère| 426| piza| 8|1950/07/08| gonna|\n", - "| 3| NiELS| Böhr//((%%| 554| pizza| 8|1990/07/09| give|\n", - "| 4| PAUL| dirac$| 524| pizza| 8|1954/07/10| you|\n", - "| 5| Albert| Einstein| 637| pizza| 8|1990/07/11| up|\n", - "| 6| Galileo| GALiLEI| 675| arepa| 5|1930/08/12| never|\n", - "| 7| CaRL| Ga%%%uss| 326| taco| 3|1970/07/13| gonna|\n", - "| 8| David| H$$$ilbert| 627| taaaccoo| 3|1950/07/14| let|\n", - "| 9| Johannes| KEPLER| 738| taco| 3|1920/04/22| you|\n", - "| 10| JaMES| M$$ax%%well| 878| taco| 3|1923/03/12| down|\n", - "| 11| Isaac| Newton| 995| pasta| 9|1999/02/15| never |\n", - "| 12| Emmy%%| Nöether$| 237| pasta| 9|1993/12/08| gonna|\n", - "| 13| Max!!!| Planck!!!| 114|hamburguer| 4|1994/01/04| run |\n", - "| 14| Fred| Hoy&&&le| 556| pizzza| 8|1997/06/27| around|\n", - "| 15|((( Heinrich )))))| Hertz| 119| pizza| 8|1956/11/30| and|\n", - "| 16| William| Gilbert###| 889| BEER| 2|1958/03/26| desert|\n", - "| 17| Marie| CURIE| 915| Rice| 1|2000/03/22| you|\n", - "| 18| Arthur| COM%%%pton| 815| 
110790| 5|1899/01/01| #|\n", - "| 19| JAMES| Chadwick| 470| null| 10|1921/05/03| #|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "\n" - ] - } - ], - "source": [ - "df.show()\n", - "def func(value, args):\n", - " return value +args[0] + args[1]\n", - "\n", - "\n", - "df.cols.apply(\"billingid\",func,\"int\", [1,2]).show()\n", - "\n", - "#df.cols.apply([\"num\", \"new_col_1\"], func, \"int\", 32 ,\"udf\").table()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| id| firstName| lastName|billingid| product|price| birth|dummyCol|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| 1| Luis| Alvarez$$%!| 6.15| Cake| 10|1980/07/07| never|\n", - "| 2| André| Ampère| 21.15| piza| 8|1950/07/08| gonna|\n", - "| 3| NiELS| Böhr//((%%| 27.55| pizza| 8|1990/07/09| give|\n", - "| 4| PAUL| dirac$| 26.05| pizza| 8|1954/07/10| you|\n", - "| 5| Albert| Einstein| 31.7| pizza| 8|1990/07/11| up|\n", - "| 6| Galileo| GALiLEI| 33.6| arepa| 5|1930/08/12| never|\n", - "| 7| CaRL| Ga%%%uss| 16.15| taco| 3|1970/07/13| gonna|\n", - "| 8| David| H$$$ilbert| 31.2| taaaccoo| 3|1950/07/14| let|\n", - "| 9| Johannes| KEPLER| 36.75| taco| 3|1920/04/22| you|\n", - "| 10| JaMES| M$$ax%%well| 43.75| taco| 3|1923/03/12| down|\n", - "| 11| Isaac| Newton| 49.6| pasta| 9|1999/02/15| never |\n", - "| 12| Emmy%%| Nöether$| 11.7| pasta| 9|1993/12/08| gonna|\n", - "| 13| Max!!!| Planck!!!| 5.55|hamburguer| 4|1994/01/04| run |\n", - "| 14| Fred| Hoy&&&le| 27.65| pizzza| 8|1997/06/27| around|\n", - "| 15|((( Heinrich )))))| Hertz| 5.8| pizza| 8|1956/11/30| and|\n", - "| 16| William| Gilbert###| 44.3| BEER| 2|1958/03/26| desert|\n", - "| 17| Marie| CURIE| 45.6| Rice| 1|2000/03/22| 
you|\n", - "| 18| Arthur| COM%%%pton| 40.6| 110790| 5|1899/01/01| #|\n", - "| 19| JAMES| Chadwick| 23.35| null| 10|1921/05/03| #|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "\n" - ] - } - ], - "source": [ - "from pyspark.sql import functions as F\n", - "\n", - "def func(col_name, args):\n", - " return F.col(col_name)/20\n", - "\n", - "df.cols.apply_expr(\"billingid\", func, 20).show()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Row(double=1, single=1),\n", - " Row(double=4, single=2),\n", - " Row(double=9, single=3),\n", - " Row(double=16, single=4),\n", - " Row(double=25, single=5)]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pyspark.sql import SparkSession\n", - "\n", - "spark = SparkSession \\\n", - " .builder \\\n", - " .appName(\"Python Spark SQL basic example\") \\\n", - " .config(\"spark.some.config.option\", \"some-value\") \\\n", - " .getOrCreate()\n", - "\n", - "sc = spark.sparkContext\n", - "\n", - "spark.createDataFrame(sc.parallelize(range(1, 6)).map(lambda i: Row(single=i, double=i ** 2))).collect()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cheat Sheet" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "df = op.create.df(\n", - " [\n", - " (\"words\", \"str\", True),\n", - " (\"num\", \"int\", True),\n", - " ],\n", - " [\n", - " (\" I like fish \", 1),\n", - " (\" zombies\", 2, ),\n", - " (\"simpsons cat lady\", 2),\n", - " (None, 3)\n", - " ])" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
Viewing 9 of 9
\n", - "
{{col[0]}}
{{loop.index}} ({{col[1]}})
- {% if col[2] == true %} -
nullable
- {% endif %} +
+ {% if col[2] == true %} + nullable + {% endif %} +
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
id
\n", - "
(int)
\n", - "\n", - "
\n", - "
firstName
\n", - "
(string)
\n", - "\n", - "
\n", - "
lastName
\n", - "
(string)
\n", - "\n", - "
\n", - "
billingId
\n", - "
(int)
\n", - "\n", - "
\n", - "
product
\n", - "
(string)
\n", - "\n", - "
\n", - "
price
\n", - "
(int)
\n", - "\n", - "
\n", - "
birth
\n", - "
(string)
\n", - "\n", - "
\n", - "
dummyCol
\n", - "
(string)
\n", - "\n", - "
\n", - " 2\n", - " \n", - " André\n", - " \n", - " Ampère\n", - " \n", - " 423\n", - " \n", - " piza\n", - " \n", - " 8\n", - " \n", - " 1950/07/08\n", - " \n", - " gonna\n", - "
\n", - " 7\n", - " \n", - " CaRL\n", - " \n", - " Ga%%%uss\n", - " \n", - " 323\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1970/07/13\n", - " \n", - " gonna\n", - "
\n", - " 9\n", - " \n", - " Johannes\n", - " \n", - " KEPLER\n", - " \n", - " 735\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1920/04/22\n", - " \n", - " you\n", - "
\n", - " 10\n", - " \n", - " JaMES\n", - " \n", - " M$$ax%%well\n", - " \n", - " 875\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1923/03/12\n", - " \n", - " down\n", - "
\n", - " 11\n", - " \n", - " Isaac\n", - " \n", - " Newton\n", - " \n", - " 992\n", - " \n", - " pasta\n", - " \n", - " 9\n", - " \n", - " 1999/02/15\n", - " \n", - " never⸱\n", - "
\n", - " 14\n", - " \n", - " Fred\n", - " \n", - " Hoy&&&le\n", - " \n", - " 553\n", - " \n", - " pizzza\n", - " \n", - " 8\n", - " \n", - " 1997/06/27\n", - " \n", - " around\n", - "
\n", - " 16\n", - " \n", - " William\n", - " \n", - " Gilbert###\n", - " \n", - " 886\n", - " \n", - " BEER\n", - " \n", - " 2\n", - " \n", - " 1958/03/26\n", - " \n", - " desert\n", - "
\n", - " 17\n", - " \n", - " Marie\n", - " \n", - " CURIE\n", - " \n", - " 912\n", - " \n", - " Rice\n", - " \n", - " 1\n", - " \n", - " 2000/03/22\n", - " \n", - " you\n", - "
\n", - " 18\n", - " \n", - " Arthur\n", - " \n", - " COM%%%pton\n", - " \n", - " 812\n", - " \n", - " 110790\n", - " \n", - " 5\n", - " \n", - " 1899/01/01\n", - " \n", - " #\n", - "
\n", - "
Viewing 9 of 9
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df.sample_by(10, False).table()" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------+--------------------+---------+\n", - "| firstName| lastName|billingId|\n", - "+--------------------+--------------------+---------+\n", - "| Luis| Alvarez$$%!| 123|\n", - "| André| Ampère| 423|\n", - "| NiELS| Böhr//((%%| 551|\n", - "| PAUL| dirac$| 521|\n", - "| Albert| Einstein| 634|\n", - "| Galileo| GALiLEI| 672|\n", - "| CaRL| Ga%%%uss| 323|\n", - "| David| H$$$ilbert| 624|\n", - "| Johannes| KEPLER| 735|\n", - "| JaMES| M$$ax%%well| 875|\n", - "| Isaac| Newton| 992|\n", - "| Emmy%%| Nöether$| 234|\n", - "| Max!!!| Planck!!!| 111|\n", - "| Fred| Hoy&&&le| 553|\n", - "|((( Heinrich )))))| Hertz| 116|\n", - "| William| Gilbert###| 886|\n", - "| Marie| CURIE| 912|\n", - "| Arthur| COM%%%pton| 812|\n", - "| JAMES| Chadwick| 467|\n", - "+--------------------+--------------------+---------+\n", - "\n" - ] - } - ], - "source": [ - "df.cols.select([\"firstName\",2,3]).show()" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "value = [\"1\"]\n", - "bool(value) and isinstance(value, list) and all(isinstance(elem, (int, str)) for elem in value)" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "DataFrame[id: int, firstName: string, lastName: string, billingId: int, product: string, price: int, birth: string, dummyCol: string]" - ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - 
"execution_count": 85, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
Viewing 19 of 19
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - "
\n", - "
id
\n", - "
(int)
\n", - "\n", - "
\n", - "
firstName
\n", - "
(string)
\n", - "\n", - "
\n", - "
lastName
\n", - "
(string)
\n", - "\n", - "
\n", - "
billingId
\n", - "
(int)
\n", - "\n", - "
\n", - "
product
\n", - "
(string)
\n", - "\n", - "
\n", - "
price
\n", - "
(int)
\n", - "\n", - "
\n", - "
birth
\n", - "
(string)
\n", - "\n", - "
\n", - "
dummyCol
\n", - "
(string)
\n", - "\n", - "
\n", - "
b
\n", - "
(double)
\n", - "\n", - "
\n", - " 1\n", - " \n", - " Luis\n", - " \n", - " Alvarez$$%!\n", - " \n", - " 123\n", - " \n", - " Cake\n", - " \n", - " 10\n", - " \n", - " 1980/07/07\n", - " \n", - " never\n", - " \n", - " 0.0\n", - "
\n", - " 2\n", - " \n", - " André\n", - " \n", - " Ampère\n", - " \n", - " 423\n", - " \n", - " piza\n", - " \n", - " 8\n", - " \n", - " 1950/07/08\n", - " \n", - " gonna\n", - " \n", - " 0.0\n", - "
\n", - " 3\n", - " \n", - " NiELS\n", - " \n", - " Böhr//((%%\n", - " \n", - " 551\n", - " \n", - " pizza\n", - " \n", - " 8\n", - " \n", - " 1990/07/09\n", - " \n", - " give\n", - " \n", - " 1.0\n", - "
\n", - " 4\n", - " \n", - " PAUL\n", - " \n", - " dirac$\n", - " \n", - " 521\n", - " \n", - " pizza\n", - " \n", - " 8\n", - " \n", - " 1954/07/10\n", - " \n", - " you\n", - " \n", - " 1.0\n", - "
\n", - " 5\n", - " \n", - " Albert\n", - " \n", - " Einstein\n", - " \n", - " 634\n", - " \n", - " pizza\n", - " \n", - " 8\n", - " \n", - " 1990/07/11\n", - " \n", - " up\n", - " \n", - " 1.0\n", - "
\n", - " 6\n", - " \n", - " Galileo\n", - " \n", - " ⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱GALiLEI\n", - " \n", - " 672\n", - " \n", - " arepa\n", - " \n", - " 5\n", - " \n", - " 1930/08/12\n", - " \n", - " never\n", - " \n", - " 2.0\n", - "
\n", - " 7\n", - " \n", - " CaRL\n", - " \n", - " Ga%%%uss\n", - " \n", - " 323\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1970/07/13\n", - " \n", - " gonna\n", - " \n", - " 0.0\n", - "
\n", - " 8\n", - " \n", - " David\n", - " \n", - " H$$$ilbert\n", - " \n", - " 624\n", - " \n", - " taaaccoo\n", - " \n", - " 3\n", - " \n", - " 1950/07/14\n", - " \n", - " let\n", - " \n", - " 1.0\n", - "
\n", - " 9\n", - " \n", - " Johannes\n", - " \n", - " KEPLER\n", - " \n", - " 735\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1920/04/22\n", - " \n", - " you\n", - " \n", - " 2.0\n", - "
\n", - " 10\n", - " \n", - " JaMES\n", - " \n", - " M$$ax%%well\n", - " \n", - " 875\n", - " \n", - " taco\n", - " \n", - " 3\n", - " \n", - " 1923/03/12\n", - " \n", - " down\n", - " \n", - " 2.0\n", - "
\n", - " 11\n", - " \n", - " Isaac\n", - " \n", - " Newton\n", - " \n", - " 992\n", - " \n", - " pasta\n", - " \n", - " 9\n", - " \n", - " 1999/02/15\n", - " \n", - " never⸱\n", - " \n", - " 2.0\n", - "
\n", - " 12\n", - " \n", - " Emmy%%\n", - " \n", - " Nöether$\n", - " \n", - " 234\n", - " \n", - " pasta\n", - " \n", - " 9\n", - " \n", - " 1993/12/08\n", - " \n", - " gonna\n", - " \n", - " 0.0\n", - "
\n", - " 13\n", - " \n", - " Max!!!\n", - " \n", - " Planck!!!\n", - " \n", - " 111\n", - " \n", - " hamburguer\n", - " \n", - " 4\n", - " \n", - " 1994/01/04\n", - " \n", - " run⸱\n", - " \n", - " 0.0\n", - "
\n", - " 14\n", - " \n", - " Fred\n", - " \n", - " Hoy&&&le\n", - " \n", - " 553\n", - " \n", - " pizzza\n", - " \n", - " 8\n", - " \n", - " 1997/06/27\n", - " \n", - " around\n", - " \n", - " 1.0\n", - "
\n", - " 15\n", - " \n", - " (((⸱⸱⸱Heinrich⸱)))))\n", - " \n", - " Hertz\n", - " \n", - " 116\n", - " \n", - " pizza\n", - " \n", - " 8\n", - " \n", - " 1956/11/30\n", - " \n", - " and\n", - " \n", - " 0.0\n", - "
\n", - " 16\n", - " \n", - " William\n", - " \n", - " Gilbert###\n", - " \n", - " 886\n", - " \n", - " BEER\n", - " \n", - " 2\n", - " \n", - " 1958/03/26\n", - " \n", - " desert\n", - " \n", - " 2.0\n", - "
\n", - " 17\n", - " \n", - " Marie\n", - " \n", - " CURIE\n", - " \n", - " 912\n", - " \n", - " Rice\n", - " \n", - " 1\n", - " \n", - " 2000/03/22\n", - " \n", - " you\n", - " \n", - " 2.0\n", - "
\n", - " 18\n", - " \n", - " Arthur\n", - " \n", - " COM%%%pton\n", - " \n", - " 812\n", - " \n", - " 110790\n", - " \n", - " 5\n", - " \n", - " 1899/01/01\n", - " \n", - " #\n", - " \n", - " 2.0\n", - "
\n", - " 19\n", - " \n", - " JAMES\n", - " \n", - " Chadwick\n", - " \n", - " 467\n", - " \n", - " null\n", - " \n", - " 10\n", - " \n", - " 1921/05/03\n", - " \n", - " #\n", - " \n", - " 1.0\n", - "
\n", - "
Viewing 19 of 19
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from pyspark.sql import functions as F\n", - "c = \"firstName\"\n", - "df.cols.qcut(\"billingId\",\"b\",3).table()" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| id| firstName| lastName|billingId| product|price| birth|dummyCol|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| 1| Luis| Alvarez$$%!| 200| Cake| 10|1980/07/07| never|\n", - "| 2| André| Ampère| 400| piza| 8|1950/07/08| gonna|\n", - "| 3| NiELS| Böhr//((%%| 400| pizza| 8|1990/07/09| give|\n", - "| 4| PAUL| dirac$| 400| pizza| 8|1954/07/10| you|\n", - "| 5| Albert| Einstein| 400| pizza| 8|1990/07/11| up|\n", - "| 6| Galileo| GALiLEI| 400| arepa| 5|1930/08/12| never|\n", - "| 7| CaRL| Ga%%%uss| 323| taco| 3|1970/07/13| gonna|\n", - "| 8| David| H$$$ilbert| 400| taaaccoo| 3|1950/07/14| let|\n", - "| 9| Johannes| KEPLER| 400| taco| 3|1920/04/22| you|\n", - "| 10| JaMES| M$$ax%%well| 400| taco| 3|1923/03/12| down|\n", - "| 11| Isaac| Newton| 400| pasta| 9|1999/02/15| never |\n", - "| 12| Emmy%%| Nöether$| 234| pasta| 9|1993/12/08| gonna|\n", - "| 13| Max!!!| Planck!!!| 200|hamburguer| 4|1994/01/04| run |\n", - "| 14| Fred| Hoy&&&le| 400| pizzza| 8|1997/06/27| around|\n", - "| 15|((( Heinrich )))))| Hertz| 200| pizza| 8|1956/11/30| and|\n", - "| 16| William| Gilbert###| 400| BEER| 2|1958/03/26| desert|\n", - "| 17| Marie| CURIE| 400| Rice| 1|2000/03/22| you|\n", - "| 18| Arthur| COM%%%pton| 400| 110790| 5|1899/01/01| #|\n", - "| 19| JAMES| Chadwick| 400| null| 10|1921/05/03| #|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "\n" - ] - } - ], - "source": [ - 
"df.cols.clip(\"billingId\",200, 400).show()" - ] - }, - { - "cell_type": "code", - "execution_count": 89, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| id| firstName| lastName|billingId| product|price| birth|dummyCol|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "| 1| Luis| Alvarez$$%!| 123| Cake| 10|1980/07/07| never|\n", - "| 2| André| Ampère| 423| piza| 8|1950/07/08| gonna|\n", - "| 3| NiELS| Böhr//((%%| 551| pizza| 8|1990/07/09| give|\n", - "| 4| PAUL| dirac$| 521| pizza| 8|1954/07/10| you|\n", - "| 5| Albert| Einstein| 634| pizza| 8|1990/07/11| up|\n", - "| 6| Galileo| GALiLEI| 672| arepa| 5|1930/08/12| never|\n", - "| 7| CaRL| Ga%%%uss| 323| taco| 3|1970/07/13| gonna|\n", - "| 8| David| H$$$ilbert| 624| taaaccoo| 3|1950/07/14| let|\n", - "| 9| Johannes| KEPLER| 735| taco| 3|1920/04/22| you|\n", - "| 10| JaMES| M$$ax%%well| 875| taco| 3|1923/03/12| down|\n", - "| 11| Isaac| Newton| 992| pasta| 9|1999/02/15| never |\n", - "| 12| Emmy%%| Nöether$| 234| pasta| 9|1993/12/08| gonna|\n", - "| 13| Max!!!| Planck!!!| 111|hamburguer| 4|1994/01/04| run |\n", - "| 14| Fred| Hoy&&&le| 553| pizzza| 8|1997/06/27| around|\n", - "| 15|((( Heinrich )))))| Hertz| 116| pizza| 8|1956/11/30| and|\n", - "| 16| William| Gilbert###| 886| BEER| 2|1958/03/26| desert|\n", - "| 17| Marie| CURIE| 912| Rice| 1|2000/03/22| you|\n", - "| 18| Arthur| COM%%%pton| 812| 110790| 5|1899/01/01| #|\n", - "| 19| JAMES| Chadwick| 467| null| 10|1921/05/03| #|\n", - "+---+--------------------+--------------------+---------+----------+-----+----------+--------+\n", - "\n" - ] - } - ], - "source": [ - "df.cols.abs(\"billingId\").show()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
Viewing 100 of 569
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
label
\n", - "
1 (double)
\n", - "\n", - "
\n", - "
prediction
\n", - "
2 (double)
\n", - "\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 0.0\n", - " \n", - " 0.0\n", - "
\n", - " 1.0\n", - " \n", - " 1.0\n", - "
\n", - "
Viewing 100 of 569
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from optimus import Optimus\n", - "\n", - "op = Optimus()\n", - "df_cancer =op.load.url(\"https://raw.githubusercontent.com/ironmussa/Optimus/master/tests/data_cancer.csv\")\n", - "\n", - "columns = ['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',\n", - " 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean',\n", - " 'fractal_dimension_mean']\n", - "\n", - "df_model, rf_model = op.ml.random_forest(df_cancer, columns, \"diagnosis\")\n", - "df_model.cols.select([\"label\",\"prediction\"]).table()" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "value = [{'name': 'Asuka 881627', 'id': '4336', 'nametype': 'Valid', 'recclass': 'L6', 'mass (g)': '42.68', 'fall': 'Found', 'year': '01/01/1988 12:00:00 AM', 'reclat': '-72.000000', 'reclong': '26.000000', 'GeoLocation': '(-72.000000, 26.000000)'}, {'name': 'Dhofar 1401', 'id': '35491', 'nametype': 'Valid', 'recclass': 'LL~6', 'mass (g)': '42.03', 'fall': 'Found', 'year': '01/01/2001 12:00:00 AM', 'reclat': '19.191350', 'reclong': '54.655450', 'GeoLocation': '(19.191350, 54.655450)'}, {'name': 'Elephant Moraine 87745', 'id': '8295', 'nametype': 'Valid', 'recclass': 'H5', 'mass (g)': '123', 'fall': 'Found', 'year': '01/01/1987 12:00:00 AM', 'reclat': '-76.183330', 'reclong': '157.166670', 'GeoLocation': '(-76.183330, 157.166670)'}, {'name': 'Frontier Mountain 90153', 'id': '10545', 'nametype': 'Valid', 'recclass': 'H4-6', 'mass (g)': '8.5', 'fall': 'Found', 'year': '01/01/1990 12:00:00 AM', 'reclat': '-72.954040', 'reclong': '160.538110', 'GeoLocation': '(-72.954040, 160.538110)'}, {'name': 'Larkman Nunatak 06750', 'id': '48858', 'nametype': 'Valid', 'recclass': 'L5', 'mass (g)': '66.8', 'fall': 'Found', 'year': '01/01/2006 12:00:00 AM', 'reclat': None, 'reclong': None, 
'GeoLocation': None}, {'name': 'MacAlpine Hills 02539', 'id': '14845', 'nametype': 'Valid', 'recclass': 'LL6', 'mass (g)': '96.6', 'fall': 'Found', 'year': '01/01/2002 12:00:00 AM', 'reclat': '-84.216670', 'reclong': '160.500000', 'GeoLocation': '(-84.216670, 160.500000)'}, {'name': 'Miller Range 07124', 'id': '51180', 'nametype': 'Valid', 'recclass': 'L5', 'mass (g)': '378.6', 'fall': 'Found', 'year': '01/01/2007 12:00:00 AM', 'reclat': '0.000000', 'reclong': '0.000000', 'GeoLocation': '(0.000000, 0.000000)'}, {'name': 'Northwest Africa 4032', 'id': '34304', 'nametype': 'Valid', 'recclass': 'Eucrite-pmict', 'mass (g)': '10.5', 'fall': 'Found', 'year': '01/01/2004 12:00:00 AM', 'reclat': None, 'reclong': None, 'GeoLocation': None}, {'name': 'Northwest Africa 5953', 'id': '50839', 'nametype': 'Valid', 'recclass': 'LL4', 'mass (g)': '1450', 'fall': 'Found', 'year': '01/01/2005 12:00:00 AM', 'reclat': '0.000000', 'reclong': '0.000000', 'GeoLocation': '(0.000000, 0.000000)'}, {'name': 'Pecora Escarpment 91310', 'id': '18601', 'nametype': 'Valid', 'recclass': 'L5', 'mass (g)': '134.69999999999999', 'fall': 'Found', 'year': '01/01/1991 12:00:00 AM', 'reclat': '-85.682450', 'reclong': '-68.745390', 'GeoLocation': '(-85.682450, -68.745390)'}, {'name': 'Tungsten Mountain 006', 'id': '24077', 'nametype': 'Valid', 'recclass': 'L6', 'mass (g)': '8.800000000000001', 'fall': 'Found', 'year': '01/01/2001 12:00:00 AM', 'reclat': '39.684360', 'reclong': '-117.620180', 'GeoLocation': '(39.684360, -117.620180)'}]" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'name': 'Asuka 881627',\n", - " 'id': '4336',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'L6',\n", - " 'mass (g)': '42.68',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/1988 12:00:00 AM',\n", - " 'reclat': '-72.000000',\n", - " 'reclong': '26.000000',\n", - " 'GeoLocation': '(-72.000000, 26.000000)'},\n", - " {'name': 'Dhofar 
1401',\n", - " 'id': '35491',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'LL~6',\n", - " 'mass (g)': '42.03',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2001 12:00:00 AM',\n", - " 'reclat': '19.191350',\n", - " 'reclong': '54.655450',\n", - " 'GeoLocation': '(19.191350, 54.655450)'},\n", - " {'name': 'Elephant Moraine 87745',\n", - " 'id': '8295',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'H5',\n", - " 'mass (g)': '123',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/1987 12:00:00 AM',\n", - " 'reclat': '-76.183330',\n", - " 'reclong': '157.166670',\n", - " 'GeoLocation': '(-76.183330, 157.166670)'},\n", - " {'name': 'Frontier Mountain 90153',\n", - " 'id': '10545',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'H4-6',\n", - " 'mass (g)': '8.5',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/1990 12:00:00 AM',\n", - " 'reclat': '-72.954040',\n", - " 'reclong': '160.538110',\n", - " 'GeoLocation': '(-72.954040, 160.538110)'},\n", - " {'name': 'Larkman Nunatak 06750',\n", - " 'id': '48858',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'L5',\n", - " 'mass (g)': '66.8',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2006 12:00:00 AM',\n", - " 'reclat': None,\n", - " 'reclong': None,\n", - " 'GeoLocation': None},\n", - " {'name': 'MacAlpine Hills 02539',\n", - " 'id': '14845',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'LL6',\n", - " 'mass (g)': '96.6',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2002 12:00:00 AM',\n", - " 'reclat': '-84.216670',\n", - " 'reclong': '160.500000',\n", - " 'GeoLocation': '(-84.216670, 160.500000)'},\n", - " {'name': 'Miller Range 07124',\n", - " 'id': '51180',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'L5',\n", - " 'mass (g)': '378.6',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2007 12:00:00 AM',\n", - " 'reclat': '0.000000',\n", - " 'reclong': '0.000000',\n", - " 'GeoLocation': '(0.000000, 0.000000)'},\n", - " {'name': 'Northwest Africa 4032',\n", - " 'id': '34304',\n", - " 'nametype': 
'Valid',\n", - " 'recclass': 'Eucrite-pmict',\n", - " 'mass (g)': '10.5',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2004 12:00:00 AM',\n", - " 'reclat': None,\n", - " 'reclong': None,\n", - " 'GeoLocation': None},\n", - " {'name': 'Northwest Africa 5953',\n", - " 'id': '50839',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'LL4',\n", - " 'mass (g)': '1450',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2005 12:00:00 AM',\n", - " 'reclat': '0.000000',\n", - " 'reclong': '0.000000',\n", - " 'GeoLocation': '(0.000000, 0.000000)'},\n", - " {'name': 'Pecora Escarpment 91310',\n", - " 'id': '18601',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'L5',\n", - " 'mass (g)': '134.69999999999999',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/1991 12:00:00 AM',\n", - " 'reclat': '-85.682450',\n", - " 'reclong': '-68.745390',\n", - " 'GeoLocation': '(-85.682450, -68.745390)'},\n", - " {'name': 'Tungsten Mountain 006',\n", - " 'id': '24077',\n", - " 'nametype': 'Valid',\n", - " 'recclass': 'L6',\n", - " 'mass (g)': '8.800000000000001',\n", - " 'fall': 'Found',\n", - " 'year': '01/01/2001 12:00:00 AM',\n", - " 'reclat': '39.684360',\n", - " 'reclong': '-117.620180',\n", - " 'GeoLocation': '(39.684360, -117.620180)'}]" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "value" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [], - "source": [ - "data = []\n", - "for l in value:\n", - " data.append([v for k,v in l.items()])\n", - "result = [{\"columns\":df.columns}, {\"data\":data}]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting fastavro\n", - " Downloading https://files.pythonhosted.org/packages/3a/a5/b357909eb300ae3a8499f1718b3887b379e743553bcc2dc2ed325902072b/fastavro-0.21.4-cp36-cp36m-win_amd64.whl (282kB)\n", - "Installing collected 
packages: fastavro\n", - "Successfully installed fastavro-0.21.4\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "You are using pip version 10.0.1, however version 18.0 is available.\n", - "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n" - ] - } - ], - "source": [ - "!pip install fastavro" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pyspark\\sql\\session.py:360: UserWarning: Using RDD of dict to inferSchema is deprecated. Use pyspark.sql.Row instead\n", - " warnings.warn(\"Using RDD of dict to inferSchema is deprecated. \"\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+----------+--------------------+----------+\n", - "| timestamp| tweet| username|\n", - "+----------+--------------------+----------+\n", - "|1366150681|Rock: Nerf paper,...| miguno|\n", - "|1366154481|Works as intended...|BlizzardCS|\n", - "+----------+--------------------+----------+\n", - "\n" - ] - }, - { - "data": { - "text/plain": [ - "1366150681" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from io import BytesIO\n", - "import fastavro\n", - "\n", - "df = op.sc.binaryFiles(\"twitter.avro\")\\\n", - " .flatMap(lambda args: fastavro.reader(BytesIO(args[1]))).toDF()\n", - "\n", - "df.show()\n", - "# optimus function get the min value\n", - "df.cols.min(\"timestamp\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'station': '011990-99999', 'time': 1433269388, 'temp': 0}\n", - "{'station': '011990-99999', 'time': 1433270389, 'temp': 22}\n", - "{'station': '011990-99999', 'time': 1433273379, 'temp': -11}\n", - "{'station': 
'012650-99999', 'time': 1433275478, 'temp': 111}\n" - ] - } - ], - "source": [ - "from fastavro import writer, reader, parse_schema\n", - "\n", - "schema = {\n", - " 'doc': 'A weather reading.',\n", - " 'name': 'Weather',\n", - " 'namespace': 'test',\n", - " 'type': 'record',\n", - " 'fields': [\n", - " {'name': 'station', 'type': 'string'},\n", - " {'name': 'time', 'type': 'long'},\n", - " {'name': 'temp', 'type': 'int'},\n", - " ],\n", - "}\n", - "parsed_schema = parse_schema(schema)\n", - "\n", - "# 'records' can be an iterable (including generator)\n", - "records = [\n", - " {u'station': u'011990-99999', u'temp': 0, u'time': 1433269388},\n", - " {u'station': u'011990-99999', u'temp': 22, u'time': 1433270389},\n", - " {u'station': u'011990-99999', u'temp': -11, u'time': 1433273379},\n", - " {u'station': u'012650-99999', u'temp': 111, u'time': 1433275478},\n", - "]\n", - "\n", - "# Writing\n", - "with open('weather.avro', 'wb') as out:\n", - " writer(out, parsed_schema, records)\n", - "\n", - "# Reading\n", - "with open('weather.avro', 'rb') as fo:\n", - " for record in reader(fo):\n", - " print(record)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/optimus/helpers/functions.py b/optimus/helpers/functions.py index 537bdf52b..0bd63d6ce 100644 --- a/optimus/helpers/functions.py +++ b/optimus/helpers/functions.py @@ -16,7 +16,7 @@ def random_int(n=5): """ - Create a unique filename + Create a random string of ints :return: """ return str(random.randint(1, 10 ** n)) @@ -185,8 +185,8 @@ def _format_dict(_val): def validate_columns_names(df, col_names, index=0): """ Check if a string or list of string are valid dataframe columns - :param df: - :param col_names: + :param df: Data frame to be analyzed + :param col_names: columns names to be checked :param index: :return: """ diff --git a/tests/test_optimus.py b/tests/test_optimus.py index 5246b3a8a..407bd0956 100644 --- 
a/tests/test_optimus.py +++ b/tests/test_optimus.py @@ -1,6 +1,4 @@ -from pyspark.ml.linalg import Vectors, VectorUDT -from pyspark.sql import Row -from pyspark.sql import functions as F +import pandas as pd from pyspark.sql.types import * from optimus import Optimus @@ -96,3 +94,29 @@ def test_create_data_frames_nullable(): ) assert (expected_df.collect() == actual_df.collect()) + + @staticmethod + def test_create_data_frames_pandas(): + labels = ["name", "age"] + + data = [("BOB", 1), + ("JoSe", 2)] + + # Create pandas dataframe + pdf = pd.DataFrame.from_records(data, columns=labels) + + actual_df = op.create.df(pdf) + + expected_df = op.create.df( + rows=[ + ("BOB", 1), + ("JoSe", 2) + ], + cols= + [ + ("name", StringType(), True), + ("age", IntegerType(), True) + ] + ) + + assert (expected_df.collect() == actual_df.collect()) From 122cfd2692566a307c7c1169d788f0cae677102f Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Sun, 2 Sep 2018 19:18:14 -0500 Subject: [PATCH 19/94] Now a Spark Dataframe can be created from a Pandas Dataframe --- optimus/create.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/optimus/create.py b/optimus/create.py index 97d6e2533..78d06bd89 100644 --- a/optimus/create.py +++ b/optimus/create.py @@ -1,15 +1,18 @@ +from multipledispatch import dispatch from pyspark.sql.types import StructField, StructType, StringType # Helpers from optimus.helpers.checkit import is_tuple -from optimus.helpers.constants import SPARK_DTYPES from optimus.helpers.functions import get_spark_dtypes_object from optimus.spark import Spark +import pandas as pdf + class Create: @staticmethod + @dispatch(list, list) def data_frame(cols, rows): """ Helper to create a Spark dataframe: @@ -42,9 +45,19 @@ def data_frame(cols, rows): # If tuple has not the third param with put it to true to accepts Null in columns specs.append([col_name, var_type, nullable]) - struct_fields = list(map(lambda x: StructField(*x), specs)) return 
Spark.instance.spark.createDataFrame(rows, StructType(struct_fields)) + @staticmethod + @dispatch(object) + def data_frame(pdf): + """ + Helper to create a Spark dataframe: + :param pdf: List of Tuple with name, data type and a flag to accept null + :return: Dataframe + """ + + return Spark.instance.spark.createDataFrame(pdf) + df = data_frame From 6249dea1e0d10ddf6eea7a6990a51d1138baadf7 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Sun, 2 Sep 2018 19:19:01 -0500 Subject: [PATCH 20/94] Added Function to column with unique id per row --- optimus/dataframe/extension.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/optimus/dataframe/extension.py b/optimus/dataframe/extension.py index 2ed80be61..ad3c91954 100644 --- a/optimus/dataframe/extension.py +++ b/optimus/dataframe/extension.py @@ -19,10 +19,9 @@ @add_method(DataFrame) -def rollout(self): +def rollout(): """ Just a function to check if the Spark dataframe has been Monkey Patched - :param self: :return: """ print("Yes!") @@ -280,3 +279,15 @@ def correlation(self, columns, method="pearson", strategy="mean", output="json") result = sorted(result, key=lambda k: k['value'], reverse=True) return result + + +@add_method(DataFrame) +def create_id(self, column="id"): + """ + Create a unique id for every row. 
+ :param self: + :param column: + :return: + """ + + return self.withColumn(column, F.monotonically_increasing_id()) From 24b14947f5ed5003a385ff67d2461a284e4dcfc4 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Sun, 2 Sep 2018 21:25:17 -0500 Subject: [PATCH 21/94] Playing with queue implementation --- examples/new-api-sandbox.ipynb | 181 +++++++++++++++++++++++++++++++-- 1 file changed, 174 insertions(+), 7 deletions(-) diff --git a/examples/new-api-sandbox.ipynb b/examples/new-api-sandbox.ipynb index 92cefd2b6..46b1fd545 100644 --- a/examples/new-api-sandbox.ipynb +++ b/examples/new-api-sandbox.ipynb @@ -2,9 +2,18 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 34, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], "source": [ "%load_ext autoreload\n", "%autoreload 2" @@ -12,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -22,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -31,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -67,10 +76,10 @@ "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mop\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mOptimus\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mverbose\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mserver\u001b[0m\u001b[1;33m=\u001b[0m 
\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mop\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mOptimus\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mverbose\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mserver\u001b[0m\u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;32m~\\Documents\\Optimus\\optimus\\optimus.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, master, app_name, checkpoint, path, file_system, verbose, dl, server)\u001b[0m\n\u001b[0;32m 66\u001b[0m \u001b[0mlogging\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Starting Optimus Server...\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 67\u001b[0m \u001b[0ms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mServer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 68\u001b[1;33m \u001b[0ms\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 69\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mserver_instance\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0ms\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 70\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m~\\Documents\\Optimus\\optimus\\server\\server.py\u001b[0m in \u001b[0;36mstart\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 14\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mstart\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 15\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprocess\u001b[0m 
\u001b[1;33m=\u001b[0m \u001b[0mProcess\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 16\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 17\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mstop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\server\\process.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, path)\u001b[0m\n\u001b[0;32m 31\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 32\u001b[0m \u001b[1;31m# Ensure that a child process has completed before the main process\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 33\u001b[1;33m \u001b[0mprocess\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 34\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 35\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprocess\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mprocess\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\server\\process.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, path)\u001b[0m\n\u001b[0;32m 32\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 33\u001b[0m \u001b[1;31m# Ensure that a child process has completed before the main process\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 34\u001b[1;33m \u001b[0mprocess\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 35\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 36\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprocess\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mprocess\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mAttributeError\u001b[0m: 'Popen' object has no attribute 'join'" ] } @@ -858,6 +867,164 @@ "df.table()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Queue implementation." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting pika\n", + " Downloading https://files.pythonhosted.org/packages/bf/48/72de47f63ba353bacd74b76bb65bc63620b0706d8b0471798087cd5a4916/pika-0.12.0-py2.py3-none-any.whl (108kB)\n", + "Installing collected packages: pika\n", + "Successfully installed pika-0.12.0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "blaze 0.11.3 requires flask-cors, which is not installed.\n", + "tensorflow 1.10.0 has requirement numpy<=1.14.5,>=1.13.3, but you'll have numpy 1.15.1 which is incompatible.\n", + "You are using pip version 10.0.1, however version 18.0 is available.\n", + "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n" + ] + } + ], + "source": [ + "!pip install pika" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Pika version 0.12.0 connecting to 54.173.199.170:5672\n", + "Created channel=1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " [x] Sent 'Hello World1!'\n" + ] + } + ], + "source": [ + "import pika, os\n", + "\n", + "# Access the CLODUAMQP_URL environment variable and parse it (fallback to localhost)\n", + "url = os.environ.get('CLOUDAMQP_URL', 'amqp://eujwlcwg:QwZVFnWSqsJFodlF-8xWCWi7Rg6WPSwj@chimpanzee.rmq.cloudamqp.com/eujwlcwg')\n", + "params = pika.URLParameters(url)\n", + "connection = pika.BlockingConnection(params)\n", + "channel = connection.channel() # start a channel\n", + 
"channel.queue_declare(queue='hello') # Declare a queue\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " [x] Sent 'Hello World1!'\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\" [x] Sent 'Hello World1!'\")\n", + "\n", + "channel.basic_publish(exchange='',\n", + " routing_key='hello',\n", + " body='Hello CloudAMQP!')" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " [*] Waiting for messages:\n", + " [x] Received b'Hello CloudAMQP!'\n", + " [x] Received b'Hello CloudAMQP!'\n", + " [x] Received b'Hello CloudAMQP!'\n", + " [x] Received b'Hello CloudAMQP!'\n", + " [x] Received b'Hello CloudAMQP!'\n", + " [x] Received b'Hello CloudAMQP!'\n", + " [x] Received b'Hello CloudAMQP!'\n", + " [x] Received b'Hello CloudAMQP!'\n", + " [x] Received b'Hello CloudAMQP!'\n", + " [x] Received b'Hello CloudAMQP!'\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m' [*] Waiting for messages:'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m \u001b[0mchannel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstart_consuming\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 11\u001b[0m 
\u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pika\\adapters\\blocking_connection.py\u001b[0m in \u001b[0;36mstart_consuming\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1820\u001b[0m \u001b[1;31m# Process events as long as consumers exist on this channel\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1821\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_consumer_infos\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1822\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprocess_data_events\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtime_limit\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1823\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1824\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mstop_consuming\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconsumer_tag\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pika\\adapters\\blocking_connection.py\u001b[0m in \u001b[0;36mprocess_data_events\u001b[1;34m(self, time_limit)\u001b[0m\n\u001b[0;32m 747\u001b[0m (self._channels_pending_dispatch or self._ready_events))\n\u001b[0;32m 748\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mtime_limit\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 749\u001b[1;33m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_flush_output\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommon_terminator\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 750\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 751\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0m_IoloopTimerContext\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtime_limit\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_impl\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mtimer\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pika\\adapters\\blocking_connection.py\u001b[0m in \u001b[0;36m_flush_output\u001b[1;34m(self, *waiters)\u001b[0m\n\u001b[0;32m 456\u001b[0m \u001b[1;31m# Process I/O until our completion condition is satisified\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 457\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mis_done\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 458\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_impl\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mioloop\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpoll\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 459\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_impl\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mioloop\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprocess_timeouts\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 460\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pika\\adapters\\select_connection.py\u001b[0m in \u001b[0;36mpoll\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 
493\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 494\u001b[0m \"\"\"\n\u001b[1;32m--> 495\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_poller\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpoll\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 496\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 497\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pika\\adapters\\select_connection.py\u001b[0m in \u001b[0;36mpoll\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 876\u001b[0m read, write, error = select.select(\n\u001b[0;32m 877\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fd_events\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mREAD\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fd_events\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mWRITE\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 878\u001b[1;33m self._fd_events[ERROR], self._get_max_wait())\n\u001b[0m\u001b[0;32m 879\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 880\u001b[0m \u001b[1;31m# NOTE When called without any FDs, select fails on\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "\n", + "def callback(ch, method, properties, body):\n", + " print(\" [x] Received %r\" % body)\n", + "\n", + "channel.basic_consume(callback,\n", + " queue='hello',\n", + " no_ack=True)\n", + "\n", + "print(' [*] Waiting for messages:')\n", + "channel.start_consuming()\n", + "connection.close()" + ] + }, { "cell_type": "code", "execution_count": null, From 227f8c1fc1dffd1b23aeaa7f02be7f7ffebf2ad0 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Sun, 2 Sep 2018 23:50:58 -0500 Subject: [PATCH 22/94] Now optimus can handle jars, 
packages and repositories in the constructor. Examples added --- examples/new-api-optimus.ipynb | 59 ++++++++++-- optimus/optimus.py | 168 ++++++++++++++++++++++++++++----- 2 files changed, 195 insertions(+), 32 deletions(-) diff --git a/examples/new-api-optimus.ipynb b/examples/new-api-optimus.ipynb index bd665c121..969a13997 100644 --- a/examples/new-api-optimus.ipynb +++ b/examples/new-api-optimus.ipynb @@ -49,7 +49,7 @@ "-----\n", "SPARK_HOME=C:\\opt\\spark\\spark-2.3.1-bin-hadoop2.7\n", "HADOOP_HOME=C:\\opt\\spark\\spark-2.3.1-bin-hadoop2.7\n", - "You don't have PYSPARK_PYTHON set\n", + "PYSPARK_PYTHON=C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\python.exe\n", "You don't have PYSPARK_DRIVER_PYTHON set\n", "JAVA_HOME=C:\\Program Files\\Java\\jdk1.8.0_181\n", "Pyarrow Installed\n", @@ -73,6 +73,15 @@ "op = Optimus(master=\"local\", app_name= \"optimus\", verbose = True)" ] }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "op = Optimus(packages=\"datastax:spark-cassandra-connector:1.6.1-s_2.10\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -90,7 +99,7 @@ "text/html": [ "\n", "
\n", - "

SparkSession - in-memory

\n", + "

SparkSession - hive

\n", " \n", "
\n", "

SparkContext

\n", @@ -101,7 +110,7 @@ "
Version
\n", "
v2.3.1
\n", "
Master
\n", - "
local
\n", + "
local[*]
\n", "
AppName
\n", "
optimus
\n", " \n", @@ -111,7 +120,7 @@ " " ], "text/plain": [ - "" + "" ] }, "execution_count": 5, @@ -148,7 +157,7 @@ "
Version
\n", "
v2.3.1
\n", "
Master
\n", - "
local
\n", + "
local[*]
\n", "
AppName
\n", "
optimus
\n", " \n", @@ -156,7 +165,7 @@ " " ], "text/plain": [ - "" + "" ] }, "execution_count": 6, @@ -168,6 +177,41 @@ "op.sc" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Packages loaded" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['datastax:spark-cassandra-connector:1.6.1-s_2.10',\n", + " 'com.databricks:spark-avro_2.11:4.0.0']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "op.packages" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create dataframe" + ] + }, { "cell_type": "code", "execution_count": 7, @@ -1154,8 +1198,7 @@ " 'B': {0: 1, 1: 3, 2: 5},\n", " 'C': {0: 2, 1: 4, 2: 6}})\n", "\n", - "sdf = op.spark.createDataFrame(pdf)\n", - "sdf.table()\n", + "sdf = op.create.df(pdf)\n", "\n", "df.melt(sdf, id_vars=['A'], value_vars=['B', 'C']).table()" ] diff --git a/optimus/optimus.py b/optimus/optimus.py index 8286aec5c..e29722966 100644 --- a/optimus/optimus.py +++ b/optimus/optimus.py @@ -1,10 +1,11 @@ -import logging import os +import sys from shutil import rmtree from optimus.create import Create from optimus.functions import concat from optimus.helpers.constants import * +from optimus.helpers.functions import val_to_list from optimus.helpers.raiseit import RaiseIt from optimus.io.load import Load from optimus.ml.models import ML @@ -17,7 +18,12 @@ class Optimus: def __init__(self, master="local[*]", app_name="optimus", checkpoint=False, path=None, file_system="local", - verbose=False, dl=False): + verbose=False, dl=False, + repositories=None, + packages=None, + jars=None, + options=None, + additional_options=None): """ Transform and roll out @@ -26,24 +32,59 @@ def __init__(self, master="local[*]", app_name="optimus", checkpoint=False, path :param path: path to the checkpoint folder :param checkpoint: If True create a checkpoint folder :param file_system: 
'local' or 'hadoop' + :param additional_options: + + :param options: Configuration options that are passed to spark-submit. + See `the list of possible options + `_. + Note that any options set already through PYSPARK_SUBMIT_ARGS will override + these. + :type options: (dict[str,str]) + :param repositories: List of additional maven repositories for package lookup. + :type repositories: (list[str]) + + :param packages: Spark packages that should be installed. + :type packages: (list[str]) + + :param jars: Full paths to jar files that we want to include to the session. + :type jars: (list[str]) + """ + self.master = master + self.app_name = app_name - if verbose is True: - logging.basicConfig(format="%(message)s", level=logging.INFO) - elif verbose is False: - logging.propagate = False - logging.disable(logging.NOTSET) + if options is None: + options = {} + + self.options = options + + if packages is None: + packages = [] + else: + packages = val_to_list(packages) + + self.packages = packages + self.repositories = repositories + + if jars is None: + jars = {} + + self.jars = jars + self.additional_options = additional_options + + self.verbose(verbose) if dl is True: - Optimus.add_spark_packages(["databricks:spark-deep-learning:1.1.0-spark2.3-s_2.11 pyspark-shell"]) + self._add_spark_packages( + ["databricks:spark-deep-learning:1.1.0-spark2.3-s_2.11", "com.databricks:spark-avro_2.11:4.0.0"]) + + self._start_session() - Spark.instance = Spark(master, app_name) from optimus.dl.models import DL self.dl = DL() else: - - Spark.instance = Spark(master, app_name) - pass + self._add_spark_packages(["com.databricks:spark-avro_2.11:4.0.0"]) + self._start_session() if path is None: path = os.getcwd() @@ -60,7 +101,7 @@ def __init__(self, master="local[*]", app_name="optimus", checkpoint=False, path logging.info(STARTING_OPTIMUS) if checkpoint is True: - self.set_check_point_folder(path, file_system) + self._set_check_point_folder(path, file_system) logging.info(SUCCESS) @@ -86,7 
+127,8 @@ def sc(self): """ return Spark.instance.sc - def stop(self): + @staticmethod + def stop(): """ Stop Spark Session :return: @@ -94,16 +136,7 @@ def stop(self): Spark.instance.spark.stop() @staticmethod - def add_spark_packages(packages): - """ - Define the Spark packages that must be loaded at start time - :param packages: - :return: - """ - os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages " + " ".join(packages) - - @staticmethod - def set_check_point_folder(path, file_system): + def _set_check_point_folder(path, file_system): """ Function that receives a workspace path where a folder is created. This folder will store temporal dataframes when user writes the .checkPoint(). @@ -181,4 +214,91 @@ def delete_check_point_folder(path, file_system): @staticmethod def concat(dfs, like): + """ + Concat multiple dataframes + :param dfs: List of Dataframes + :param like: concat as columns or rows + :return: + """ return concat(dfs, like) + + def _add_spark_packages(self, packages): + """ + Define the Spark packages that must be loaded at start time + :param packages: + :return: + """ + for p in packages: + self.packages.append(p) + + def _setup_repositories(self): + if self.repositories: + return '--repositories {}'.format(','.join(self.repositories)) + else: + return '' + + def _setup_packages(self): + if self.packages: + return '--packages {}'.format(','.join(self.packages)) + else: + return '' + + def _setup_jars(self): + if self.jars: + return '--jars {}'.format(','.join(self.jars)) + else: + return '' + + def _setup_options(self, additional_options): + options = {} + + options.update(self.options) + + if additional_options: + options.update(additional_options) + + if 'spark.sql.catalogImplementation' not in options: + options['spark.sql.catalogImplementation'] = 'hive' + + # Here we massage conf properties with the intent to pass them to + # spark-submit; this is convenient as it is unified with the approach + # we take for repos, packages and jars, and it also 
handles precedence + # of conf properties already defined by the user in a very + # straightforward way (since we always append to PYSPARK_SUBMIT_ARGS) + return ' '.join('--conf "{}={}"'.format(*o) for o in sorted(options.items())) + + def _start_session(self): + """ + + :return: + """ + ## Get python.exe fullpath + os.environ['PYSPARK_PYTHON'] = sys.executable + + submit_args = [ + # options that were already defined through PYSPARK_SUBMIT_ARGS + # take precedence over SparklySession's + os.environ.get('PYSPARK_SUBMIT_ARGS', '').replace('pyspark-shell', ''), + self._setup_repositories(), + self._setup_packages(), + self._setup_jars(), + self._setup_options(self.additional_options), + 'pyspark-shell', + ] + + env = ' '.join(filter(None, submit_args)) + os.environ['PYSPARK_SUBMIT_ARGS'] = env + Spark.instance = Spark(self.master, self.app_name) + + @staticmethod + def verbose(verbose): + """ + Enable verbose mode + :param verbose: + :return: + """ + if verbose is True: + logging.basicConfig(format="%(message)s", level=logging.INFO) + elif verbose is False: + logging.propagate = False + logging.disable(logging.NOTSET) From e3e92b8746e050ec15a02a38888ae08a39015d13 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Sun, 2 Sep 2018 23:53:20 -0500 Subject: [PATCH 23/94] Avro loader enable --- examples/new-api-optimus.ipynb | 7 +++++++ optimus/io/load.py | 9 +++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/examples/new-api-optimus.ipynb b/examples/new-api-optimus.ipynb index 969a13997..97d79f693 100644 --- a/examples/new-api-optimus.ipynb +++ b/examples/new-api-optimus.ipynb @@ -73,6 +73,13 @@ "op = Optimus(master=\"local\", app_name= \"optimus\", verbose = True)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Adding external packages" + ] + }, { "cell_type": "code", "execution_count": 4, diff --git a/optimus/io/load.py b/optimus/io/load.py index 8be283083..8dcd2261d 100644 --- a/optimus/io/load.py +++ 
b/optimus/io/load.py @@ -57,6 +57,7 @@ def json(path): :return: """ try: + # TODO: Check a better way to handle this Spark.instance.spark. Very verbose. df = Spark.instance.spark.read.json(path) except IOError as error: logging.error(error) @@ -107,9 +108,13 @@ def parquet(path, *args, **kwargs): @staticmethod def avro(path, *args, **kwargs): - print("Not yet implemented") - return + try: + df = Spark.instance.spark.read.format("com.databricks.spark.avro").load(path, *args, **kwargs) + except IOError as error: + logging.error(error) + raise + return df class Downloader(object): From 6c07fae931db27f896c38196e20e054f66c3606f Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Mon, 3 Sep 2018 00:39:06 -0500 Subject: [PATCH 24/94] Test package, jars and repositories --- optimus/optimus.py | 20 ++++- tests/test_session.py | 197 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 216 insertions(+), 1 deletion(-) create mode 100644 tests/test_session.py diff --git a/optimus/optimus.py b/optimus/optimus.py index e29722966..7b6996c3c 100644 --- a/optimus/optimus.py +++ b/optimus/optimus.py @@ -269,7 +269,7 @@ def _setup_options(self, additional_options): def _start_session(self): """ - + Start a Spark session using jar, packages, repositories and options given :return: """ ## Get python.exe fullpath @@ -290,6 +290,24 @@ def _start_session(self): os.environ['PYSPARK_SUBMIT_ARGS'] = env Spark.instance = Spark(self.master, self.app_name) + def has_package(self, package_prefix): + """ + Check if the package is available in the session. + :param package_prefix: E.g. "org.elasticsearch:elasticsearch-spark". + :type package_prefix: str + :return bool + """ + return any(package for package in self.packages if package.startswith(package_prefix)) + + def has_jar(self, jar_name): + """ + Check if the jar is available in the session. + :param jar_name: E.g. "mysql-connector-java". 
+ :type jar_name: str + :return: bool + """ + return any(jar for jar in self.jars if jar_name in jar) + @staticmethod def verbose(verbose): """ diff --git a/tests/test_session.py b/tests/test_session.py new file mode 100644 index 000000000..cb9a74d79 --- /dev/null +++ b/tests/test_session.py @@ -0,0 +1,197 @@ +import sys +import unittest + +try: + from unittest import mock +except ImportError: + import mock + +from pyspark import SparkContext + +from optimus import Optimus + + +class TestOptimusSession(unittest.TestCase): + maxDiff = None + + def setUp(self): + super(TestOptimusSession, self).setUp() + self.spark_context_mock = mock.Mock(spec=SparkContext) + + self.patches = [ + mock.patch('optimus.sc', self.spark_context_mock), + ] + [p.start() for p in self.patches] + + def tearDown(self): + [p.stop() for p in self.patches] + super(TestOptimusSession, self).tearDown() + + def test_has_package(self): + op = Optimus() + self.assertFalse(op.has_package('datastax:spark-cassandra-connector')) + + op.packages = ['datastax:spark-cassandra-connector:1.6.1-s_2.10'] + self.assertTrue(op.has_package('datastax:spark-cassandra-connector')) + + def test_has_jar(self): + op = Optimus() + self.assertFalse(op.has_jar('mysql-connector-java')) + + op.jars = ['mysql-connector-java-5.1.39-bin.jar'] + self.assertTrue(op.has_jar('mysql-connector-java')) + + @mock.patch('optimus.os') + def test_session_with_packages(self, os_mock): + os_mock.environ = {} + + Optimus(packages=['package1', 'package2']) + + self.assertEqual(os_mock.environ, { + 'PYSPARK_PYTHON': sys.executable, + 'PYSPARK_SUBMIT_ARGS': ( + '--packages package1,package2 ' + '--conf "spark.sql.catalogImplementation=hive" ' + 'pyspark-shell' + ), + }) + + @mock.patch('optimus.os') + def test_session_with_repositories(self, os_mock): + os_mock.environ = {} + + Optimus( + packages=['package1', 'package2'], + repositories=[ + 'http://my.maven.repo', + 'http://another.maven.repo', + ]) + + self.assertEqual(os_mock.environ, { + 
'PYSPARK_PYTHON': sys.executable, + 'PYSPARK_SUBMIT_ARGS': ( + '--repositories http://my.maven.repo,http://another.maven.repo ' + '--packages package1,package2 ' + '--conf "spark.sql.catalogImplementation=hive" ' + 'pyspark-shell' + ), + }) + + @mock.patch('optimus.os') + def test_session_with_jars(self, os_mock): + os_mock.environ = {} + + Optimus(jars=['file_a.jar', 'file_b.jar']) + + self.assertEqual(os_mock.environ, { + 'PYSPARK_PYTHON': sys.executable, + 'PYSPARK_SUBMIT_ARGS': ( + '--jars file_a.jar,file_b.jar ' + '--conf "spark.sql.catalogImplementation=hive" ' + 'pyspark-shell' + ), + }) + + @mock.patch('optimus.os') + def test_session_with_options(self, os_mock): + os_mock.environ = {} + + # test options attached to class definition + Optimus( + options={ + 'spark.option.a': 'value_a', + 'spark.option.b': 'value_b', + }) + + self.assertEqual(os_mock.environ, { + 'PYSPARK_PYTHON': sys.executable, + 'PYSPARK_SUBMIT_ARGS': ( + '--conf "spark.option.a=value_a" ' + '--conf "spark.option.b=value_b" ' + '--conf "spark.sql.catalogImplementation=hive" ' + 'pyspark-shell' + ), + }) + + # test additional_options override/extend options attached to class definition + os_mock.environ = {} + + Optimus(additional_options={ + 'spark.option.b': 'value_0', + 'spark.option.c': 'value_c', + }) + + self.assertEqual(os_mock.environ, { + 'PYSPARK_PYTHON': sys.executable, + 'PYSPARK_SUBMIT_ARGS': ( + '--conf "spark.option.a=value_a" ' + '--conf "spark.option.b=value_0" ' + '--conf "spark.option.c=value_c" ' + '--conf "spark.sql.catalogImplementation=hive" ' + 'pyspark-shell' + ), + }) + + # test catalog implementation is respected + os_mock.environ = {} + + Optimus(options={ + 'spark.sql.catalogImplementation': 'my_fancy_catalog', + }) + + self.assertEqual(os_mock.environ, { + 'PYSPARK_PYTHON': sys.executable, + 'PYSPARK_SUBMIT_ARGS': ( + '--conf "spark.sql.catalogImplementation=my_fancy_catalog" ' + 'pyspark-shell' + ), + }) + + @mock.patch('optimus.os') + def 
test_session_without_packages_jars_and_options(self, os_mock): + os_mock.environ = {} + + Optimus() + + self.assertEqual(os_mock.environ, { + 'PYSPARK_PYTHON': sys.executable, + 'PYSPARK_SUBMIT_ARGS': '--conf "spark.sql.catalogImplementation=hive" pyspark-shell', + }) + + @mock.patch('optimus.os') + def test_session_appends_to_pyspark_submit_args(self, os_mock): + os_mock.environ = { + 'PYSPARK_SUBMIT_ARGS': '--conf "my.conf.here=5g" --and-other-properties', + } + + Optimus() + + self.assertEqual(os_mock.environ, { + 'PYSPARK_PYTHON': sys.executable, + 'PYSPARK_SUBMIT_ARGS': ( + '--conf "my.conf.here=5g" --and-other-properties ' + '--conf "spark.sql.catalogImplementation=hive" ' + 'pyspark-shell' + ), + }) + + # test more complicated session + os_mock.environ = { + 'PYSPARK_SUBMIT_ARGS': '--conf "my.conf.here=5g" --and-other-properties', + } + + Optimus(options={'my.conf.here': '10g'}) + + self.assertEqual(os_mock.environ, { + 'PYSPARK_PYTHON': sys.executable, + 'PYSPARK_SUBMIT_ARGS': ( + '--conf "my.conf.here=5g" --and-other-properties ' + # Note that spark honors the first conf it sees when multiple + # are defined + '--conf "my.conf.here=10g" ' + '--conf "spark.sql.catalogImplementation=hive" ' + 'pyspark-shell' + ), + }) + + From f1f36d771f173d6591a3e9e9cef86105d1d36473 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Mon, 3 Sep 2018 16:16:23 -0500 Subject: [PATCH 25/94] is_data_type() removed. Now fbdt() is used. 
--- optimus/dataframe/columns.py | 27 +++++++++++----------- optimus/functions.py | 17 ++++++++++++-- optimus/helpers/checkit.py | 43 ------------------------------------ optimus/profiler/profiler.py | 17 +++++++++----- 4 files changed, 41 insertions(+), 63 deletions(-) diff --git a/optimus/dataframe/columns.py b/optimus/dataframe/columns.py index 8f5ae68f3..c1e14a371 100644 --- a/optimus/dataframe/columns.py +++ b/optimus/dataframe/columns.py @@ -18,7 +18,7 @@ from optimus.functions import abstract_udf as audf, concat from optimus.functions import filter_row_by_data_type as fbdt from optimus.helpers.checkit \ - import is_num_or_str, is_list, is_, is_tuple, is_list_of_dataframes, is_list_of_tuples, \ + import is_num_or_str, is_list, is_tuple, is_list_of_dataframes, is_list_of_tuples, \ is_function, is_one_element, is_type, is_int, is_dict, is_str, is_ # Helpers from optimus.helpers.constants import * @@ -1049,7 +1049,7 @@ def func_regex(_df, _col_name, _search, _replace): return _df.withColumn(c, F.regexp_replace(_col_name, _search, _replace)) def func_replace(_df, _col_name, _search, _replace): - data_type = self.cols.dtypes(_col_name) + data_type = self.cols.dtype(_col_name) _search = [PYTHON_TYPES_[data_type](s) for s in _search] _df = _df.replace(_search, _replace, _col_name) return _df @@ -1221,7 +1221,7 @@ def _split(col_name, args): def cell(column): """ Get the value for the first cell from a column in a data frame - :param column: Column to be + :param column: Column to be processed :return: """ return self.cols.select(column).first()[0] @@ -1282,7 +1282,7 @@ def hist(columns, buckets=10): def frequency(columns, buckets=10): """ Output values frequency in json format - :param columns: Column to be processed + :param columns: Columns to be processed :param buckets: Number of buckets :return: """ @@ -1297,18 +1297,18 @@ def frequency(columns, buckets=10): @add_attr(cols) def schema_dtypes(columns): """ - Return the columns data type as Type - :param columns: 
+ Return the column(s) data type as Type + :param columns: Columns to be processed :return: """ columns = parse_columns(self, columns) return format_dict([self.schema[col_name].dataType for col_name in columns]) @add_attr(cols) - def dtypes(columns): + def dtype(columns): """ - Return the column data type as string - :param columns: + Return the column(s) data type as string + :param columns: Columns to be processed :return: """ @@ -1320,10 +1320,10 @@ def dtypes(columns): @add_attr(cols) def qcut(input_col, output_col, num_buckets): """ - Bin columns into n buckets - :param input_col: - :param output_col: - :param num_buckets: + Bin columns into n buckets. Quantile Discretizer + :param input_col: Input column to processed + :param output_col: Output columns with the bin number + :param num_buckets: Number of buckets in which the column will be divided :return: """ discretizer = QuantileDiscretizer(numBuckets=num_buckets, inputCol=input_col, outputCol=output_col) @@ -1365,6 +1365,7 @@ def abs(columns): df = df.withColumn(col_name, F.abs(F.col(col_name))) return df + return cols diff --git a/optimus/functions.py b/optimus/functions.py index d7872dd73..183e5e140 100644 --- a/optimus/functions.py +++ b/optimus/functions.py @@ -10,7 +10,6 @@ from pyspark.sql import DataFrame from pyspark.sql import functions as F -from optimus.helpers.checkit import is_data_type from optimus.helpers.functions import is_pyarrow_installed, parse_python_dtypes, random_int, one_list_to_val, \ get_spark_dtypes_object from optimus.helpers.raiseit import RaiseIt @@ -100,7 +99,7 @@ def filter_row_by_data_type_audf(col_name, data_type): """ data_type = parse_python_dtypes(data_type) - return abstract_udf(col_name, is_data_type, "boolean", data_type) + return abstract_udf(col_name, filter_row_by_data_type, "boolean", data_type) def concat(dfs, like="columns"): @@ -270,6 +269,18 @@ def str_to_date(value): except ValueError: pass + def str_to_array(value): + """ + Because Spark can handle tuples 
we will try to transform tuples to arrays + :param value: + :return: + """ + try: + dateutil.parser.parse(value) + return True + except ValueError: + pass + def func(value): """ Check if a value can be casted to a specific @@ -290,6 +301,8 @@ def func(value): _data_type = "bool" elif str_to_date(value): _data_type = "date" + elif str_to_array(value): + _data_type = "array" else: _data_type = "string" else: diff --git a/optimus/helpers/checkit.py b/optimus/helpers/checkit.py index 0fc320337..a847b5f27 100644 --- a/optimus/helpers/checkit.py +++ b/optimus/helpers/checkit.py @@ -253,46 +253,3 @@ def is_dataframe(value): """ return isinstance(value, DataFrame) - -def is_data_type(value, data_type): - """ - Check if a value can be casted to a specific - :param value: value to be checked - :param data_type: - :return: - """ - - _data_type = "string" - if isinstance(value, int): # Check if value is integer - _data_type = "int" - elif isinstance(value, float): - _data_type = "float" - elif isinstance(value, bool): - _data_type = "boolean" - # if string we try to parse it to int, float or bool - elif isinstance(value, str): - try: - int(value) - _data_type = "int" - except ValueError: - pass - try: - float(value) - _data_type = "float" - except ValueError: - pass - try: - # int(value) - value = value.lower() - if value == "true" or value == "false": - _data_type = "bool" - except ValueError: - pass - - else: - _data_type = "null" - - if _data_type == data_type: - return True - else: - return False diff --git a/optimus/profiler/profiler.py b/optimus/profiler/profiler.py index 01fc026ac..a9c4e38b0 100644 --- a/optimus/profiler/profiler.py +++ b/optimus/profiler/profiler.py @@ -72,9 +72,17 @@ def _count_data_types(col_name): :param col_name: :return: """ + + # If String, process the data to try to infer which data type is inside. This a kind of optimization. 
+ # We do not need to analyze the data if the column data type is integer or boolean.etc temp = col_name + "_type" - # Count by data type - types = df.withColumn(temp, fbdt(col_name, get_type=True)).groupBy(temp).count().collect() + + col_data_type = df.cols.dtype(col_name) + if col_data_type is "string": + types = df.withColumn(temp, fbdt(col_name, get_type=True)).groupBy(temp).count().collect() + else: + # All the columns elements has the sime type + types = {col_data_type: df.count()} count_by_data_type = {} @@ -88,7 +96,6 @@ def _count_data_types(col_name): count_empty_strings = df.where(F.col(col_name) == '').count() count_by_data_type['string'] = count_by_data_type['string'] - count_empty_strings - # if the data type is string we try to infer data_types_count = {"string": count_by_data_type['string'], "bool": count_by_data_type['bool'], "int": count_by_data_type['int'], @@ -146,10 +153,10 @@ def _count_data_types(col_name): def columns(df, columns, buckets=40, relative_error=1): """ Return statistical information about a specific column in json format - count_data_type() :param df: Dataframe to be processed :param columns: Columns that you want to profile - :param buckets: + :param buckets: Create buckets divided by range. Each bin is equal. + :param relative_error: relative error when the percentile is calculated. 
0 is more exact as slow 1 more error and faster :return: json object with the """ From 338f0df42ce78e6b57bc2b85af4cce3e2fc55175 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Mon, 3 Sep 2018 16:16:42 -0500 Subject: [PATCH 26/94] dtypes renamed to dtype --- optimus/dataframe/columns.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimus/dataframe/columns.py b/optimus/dataframe/columns.py index c1e14a371..e2b48c0c3 100644 --- a/optimus/dataframe/columns.py +++ b/optimus/dataframe/columns.py @@ -16,7 +16,7 @@ from pyspark.sql.functions import Column from optimus.functions import abstract_udf as audf, concat -from optimus.functions import filter_row_by_data_type as fbdt + from optimus.functions import filter_row_by_data_type as fbdt from optimus.helpers.checkit \ import is_num_or_str, is_list, is_tuple, is_list_of_dataframes, is_list_of_tuples, \ is_function, is_one_element, is_type, is_int, is_dict, is_str, is_ From e4b30524909fca86620f69dd85bc9252dc52ca90 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Mon, 3 Sep 2018 16:57:57 -0500 Subject: [PATCH 27/94] No longer used --- optimus/profiler/templates/column_stats.html | 258 ------------------- 1 file changed, 258 deletions(-) delete mode 100644 optimus/profiler/templates/column_stats.html diff --git a/optimus/profiler/templates/column_stats.html b/optimus/profiler/templates/column_stats.html deleted file mode 100644 index f36f6e2e7..000000000 --- a/optimus/profiler/templates/column_stats.html +++ /dev/null @@ -1,258 +0,0 @@ - -{{summary}} -
-

Overview

-
-
-
-

Dataset info

- - - - - - - - - - - - - - - - - - - - - - - -
Number of columns{{data.summary.cols_count}}
Number of rows{{data.summary.rows_count}}
Total Missing (%){{data.summary.missing_count}}
Total size in memory{{data.summary.size}}
-
-
-

Variables types

- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Categorical{{data.count_types.categorical}}
Numeric{{data.count_types.numeric}}
Date{{data.count_types.date}}
Bool{{data.count_types.bool}}
Not available{{data.count_types.null}}
-
-
-
-

Variables

-
-{% for k,v in data.columns.items() %} -
- - {% if v.column_type=="categorical" or v.column_type=="numeric" or v.column_type=="date" or v.column_type=="bool"%} -
-
-

{{k}}

-
{{v.column_type}}
-
- - - - - - - - - - - - - - - - - - - -
Distinct count {{v.stats.uniques_count}}
Unique (%) {{v.stats.p_uniques}}
Missing (%){{v.stats.missing_count}}
Missing (n){{v.stats.p_missing}}
- {% if v.column_type=="numeric" %} - - - - - - - - - - - - - - - - - - - - -
Mean{{v.stats.mean}}
Minimum{{v.stats.min}}
Maximum{{v.stats.max}}
Zeros(%){{v.stats.zeros}}
- {% endif %} - -
-
-

Frequency

- - - - - - {% for vi in v.frequency %} - - - - - - - {% endfor %} - - - - - - - -
ValueCountFrecuency (%)
{{vi.value}}{{vi.count}}{{vi.percentage}}%
"Missing"{{v.stats.missing_count}}{{v.stats.p_missing}}%
-
- {% endif %} - - {% if v.column_type=="numeric" %} -
- - -

Quantile statistics

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Minimum{{v.stats.min}}
5-th percentile{{v.stats.quantile[0.05]}}
Q1{{v.stats.quantile[0.25]}}
Median{{v.stats.quantile[0.5]}}
Q3{{v.stats.quantile[0.75]}}
95-th percentile{{v.stats.quantile[0.95]}}
Maximum{{v.stats.max}}
Range{{v.stats.range}}
Interquartile range{{v.stats.interquartile_range}}
-
-
-

Descriptive statistics

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Standard deviation{{v.stats.stddev}}
Coef of variation{{v.stats.coef_variation}}
Kurtosis{{v.stats.kurtosis}}
Mean{{v.stats.mean}}
MAD{{v.stats.mad}}
Skewness{{v.stats.zeros}}
Sum{{v.stats.sum}}
Variance{{v.stats.variance}}
-
- {% endif %} - -
-{% endfor %} From a5336671b3a223edcd1bcadba5fc77210fc92f88 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Mon, 3 Sep 2018 16:58:43 -0500 Subject: [PATCH 28/94] Fix. Bool added --- optimus/helpers/constants.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/optimus/helpers/constants.py b/optimus/helpers/constants.py index ac4bee7d0..d07407bf4 100644 --- a/optimus/helpers/constants.py +++ b/optimus/helpers/constants.py @@ -18,7 +18,8 @@ PROFILER_TYPES = {"int", "float", "string", "bool", "date", "null"} PROFILER_LEGEND_TYPES = {"string": "ABC", "int": "#", "integer": "#", "float": "##.#", "double": "##.#", "bigint": "#"} -PROFILER_COLUMN_TYPES = {"categorical", "numeric", "date", "null"} + +PROFILER_COLUMN_TYPES = {"categorical", "numeric", "date", "bool","null"} SPARK_SHORT_DTYPES = {"string": "string", "str": "string", From 2699d696e66d3ae5eaf0c68f022c654be9887d2b Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Mon, 3 Sep 2018 16:59:27 -0500 Subject: [PATCH 29/94] Fix tab --- optimus/dataframe/columns.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimus/dataframe/columns.py b/optimus/dataframe/columns.py index e2b48c0c3..8c4b31ac2 100644 --- a/optimus/dataframe/columns.py +++ b/optimus/dataframe/columns.py @@ -16,7 +16,8 @@ from pyspark.sql.functions import Column from optimus.functions import abstract_udf as audf, concat - from optimus.functions import filter_row_by_data_type as fbdt +from optimus.functions import filter_row_by_data_type as fbdt +from optimus.functions import filter_row_by_data_type as fbdt from optimus.helpers.checkit \ import is_num_or_str, is_list, is_tuple, is_list_of_dataframes, is_list_of_tuples, \ is_function, is_one_element, is_type, is_int, is_dict, is_str, is_ @@ -1365,7 +1366,6 @@ def abs(columns): df = df.withColumn(col_name, F.abs(F.col(col_name))) return df - return cols From fb869c39c016fa2fb8e15f6e2ea7191cb7e1381b Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Mon, 3 Sep 2018 
17:02:03 -0500 Subject: [PATCH 30/94] Make rows count readable --- optimus/profiler/profiler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/optimus/profiler/profiler.py b/optimus/profiler/profiler.py index a9c4e38b0..e6f2a95b4 100644 --- a/optimus/profiler/profiler.py +++ b/optimus/profiler/profiler.py @@ -130,12 +130,13 @@ def _count_data_types(col_name): columns = parse_columns(df, columns) + # Info from all the columns type_details = {c: _count_data_types(c) for c in columns} results = {} count_types = {} - # Count the categorical, numerical and date columns + # Count the categorical, numerical, boolean and date columns for v in type_details.values(): name = v["type"] if name in count_types: @@ -172,7 +173,7 @@ def columns(df, columns, buckets=40, relative_error=1): column_info['columns'] = {} rows_count = df.count() - column_info['rows_count'] = rows_count + column_info['rows_count'] = humanize.intword(rows_count) count_dtypes = Profiler.count_data_types(df, columns) From 3d1370f65b6744378126ac83890739b0ce05deb1 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Mon, 3 Sep 2018 17:24:44 -0500 Subject: [PATCH 31/94] partitions() now return a number. 
Added partitions number information to table() --- optimus/dataframe/extension.py | 12 +++++++----- optimus/templates/table.html | 8 ++++---- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/optimus/dataframe/extension.py b/optimus/dataframe/extension.py index ad3c91954..cdf5d020e 100644 --- a/optimus/dataframe/extension.py +++ b/optimus/dataframe/extension.py @@ -166,11 +166,11 @@ def sql(self, sql_expression): @add_attr(DataFrame) def partitions(self): """ - Return dataframes partitions number - :param self: - :return: + Return the dataframe partitions number + :param self: Dataframe + :return: Number of partitions """ - print(self.rdd.getNumPartitions()) + return self.rdd.getNumPartitions() @add_method(DataFrame) @@ -213,9 +213,11 @@ def table_html(self, limit=100, columns=None): total_rows = humanize.intword(total_rows) total_cols = self.cols.count() + total_partitions = self.partitions() # Print table - output = template.render(cols=dtypes, data=data, limit=limit, total_rows=total_rows, total_cols=total_cols) + output = template.render(cols=dtypes, data=data, limit=limit, total_rows=total_rows, total_cols=total_cols, + partitions=total_partitions) return output diff --git a/optimus/templates/table.html b/optimus/templates/table.html index 387a0694e..b0a6d2cda 100644 --- a/optimus/templates/table.html +++ b/optimus/templates/table.html @@ -28,14 +28,14 @@ font-size: 12px; } - -{% macro header_footer(limit, total_rows, total_cols) %} +{% macro header_footer(limit, total_rows, total_cols, partitions) %}
Viewing {{limit}} of {{total_rows}} rows / {{total_cols}} columns
+
{{partitions}} partition(s)
{% endmacro %} -{{header_footer(limit, total_rows, total_cols)}} +{{header_footer(limit, total_rows, total_cols, partitions)}} @@ -65,4 +65,4 @@ {% endfor %}
-{{header_footer(limit, total_rows, total_cols)}} \ No newline at end of file +{{header_footer(limit, total_rows, total_cols,partitions)}} \ No newline at end of file From aa9c906e652491ae3431216097269996f22ff7dd Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 4 Sep 2018 03:17:19 -0500 Subject: [PATCH 32/94] Updated column profiler table --- optimus/profiler/templates/one_column.html | 10 ++++---- .../profiler/templates/variables_types.html | 25 ------------------- 2 files changed, 5 insertions(+), 30 deletions(-) delete mode 100644 optimus/profiler/templates/variables_types.html diff --git a/optimus/profiler/templates/one_column.html b/optimus/profiler/templates/one_column.html index 56710bba3..b67ad5564 100644 --- a/optimus/profiler/templates/one_column.html +++ b/optimus/profiler/templates/one_column.html @@ -36,7 +36,7 @@

{{data.name}}

- + @@ -44,12 +44,12 @@

{{data.name}}

- - + + - - + +
Distinct countUnique {{data.stats.uniques_count}}
{{data.stats.p_uniques}}
Missing (%){{data.stats.missing_count}}Missing{{data.stats.p_missing}}
Missing (n){{data.stats.p_missing}}Missing (%){{data.stats.missing_count}}
diff --git a/optimus/profiler/templates/variables_types.html b/optimus/profiler/templates/variables_types.html deleted file mode 100644 index a7f5273a0..000000000 --- a/optimus/profiler/templates/variables_types.html +++ /dev/null @@ -1,25 +0,0 @@ -

Variables Types

- - - - - - - - - - - - - - - - - - - - - - - -
Categorical{{data.categorical}}
Numeric{{data.numeric}}
Date{{data.date}}
Not available{{data.null}}
\ No newline at end of file From 71043a9931b0fd401f82e59356993497357cafee Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 4 Sep 2018 03:20:30 -0500 Subject: [PATCH 33/94] Add decorator to track a function execution time --- optimus/helpers/decorators.py | 33 ++- optimus/profiler/profiler.py | 533 ++++++++++++++++++++-------------- 2 files changed, 352 insertions(+), 214 deletions(-) diff --git a/optimus/helpers/decorators.py b/optimus/helpers/decorators.py index 71e978774..69d0ac157 100644 --- a/optimus/helpers/decorators.py +++ b/optimus/helpers/decorators.py @@ -1,9 +1,12 @@ +import timeit +import time +import logging from functools import wraps def add_method(cls): """ - Use it as a decorator to add a function to specific class + Attach a function to a class as an attribute :param cls: Class in which the function will be attached :return: """ @@ -19,11 +22,35 @@ def wrapper(self, *args, **kwargs): return decorator -def add_attr(cls): +def time_it(method): + def timed(*args, **kw): + start_time = timeit.default_timer() + f = method(*args, **kw) + _time = round(timeit.default_timer() - start_time, 2) + logging.info("{name}() executed in {time} sec".format(name=method.__name__, time=_time)) + return f + + return timed + + +def add_attr(cls, log_time=False): + """ + Attach a function to another functions as an attribute + :param cls: class where the function will be attached + :param log_time: Print the execution time. 
Verbose must be true on the Optimus constructor + :type log_time: bool + :return: + """ + def decorator(func): @wraps(func) def wrapper(*args, **kwargs): - return func(*args, **kwargs) + start_time = timeit.default_timer() + f = func(*args, **kwargs) + _time = round(timeit.default_timer() - start_time, 2) + if log_time: + logging.info("{name}() executed in {time} sec".format(name=func.__name__, time=_time)) + return f setattr(cls, func.__name__, wrapper) return func diff --git a/optimus/profiler/profiler.py b/optimus/profiler/profiler.py index e6f2a95b4..e086a5ae3 100644 --- a/optimus/profiler/profiler.py +++ b/optimus/profiler/profiler.py @@ -1,7 +1,7 @@ import configparser import logging import os -from fastnumbers import fast_float +from collections import defaultdict import dateutil import jinja2 @@ -10,6 +10,7 @@ from pyspark.sql.types import ArrayType, LongType from optimus.functions import filter_row_by_data_type as fbdt, plot_hist, plot_freq +from optimus.helpers.decorators import time_it from optimus.helpers.functions import parse_columns from optimus.profiler.functions import fill_missing_var_types, fill_missing_col_types, \ write_json @@ -36,9 +37,10 @@ def __init__(self, output_path=None): self.path = output_path @staticmethod + @time_it def dataset_info(df): """ - Return info about cols and row counts + Return info about cols, row counts, total missing and disk size :param df: Dataframe to be processed :return: """ @@ -52,50 +54,53 @@ def dataset_info(df): return ( {'cols_count': cols_count, 'rows_count': rows_count, - 'missing_count': str(missing_count / rows_count) + "%", + 'missing_count': str(round(missing_count / rows_count, 2)) + "%", 'size': humanize.naturalsize(df.size())} ) # TODO: This should check only the StringType Columns. The datatype from others columns can be taken from schema(). 
@staticmethod + @time_it def count_data_types(df, columns): """ Count the number of int, float, string, date and booleans and output the count in json format - :param df: - :param columns: - :return: + :param df: Dataframe to be processed + :param columns: Columns to be processed + :return: json """ + @time_it def _count_data_types(col_name): """ Function for determine if register value is float or int or string. :param col_name: :return: """ - + logging.info("Processing column '" + col_name + "'...") # If String, process the data to try to infer which data type is inside. This a kind of optimization. # We do not need to analyze the data if the column data type is integer or boolean.etc - temp = col_name + "_type" + temp = col_name + "_type" col_data_type = df.cols.dtype(col_name) - if col_data_type is "string": - types = df.withColumn(temp, fbdt(col_name, get_type=True)).groupBy(temp).count().collect() - else: - # All the columns elements has the sime type - types = {col_data_type: df.count()} count_by_data_type = {} + count_empty_strings = 0 + if col_data_type == "string": + types = df.withColumn(temp, fbdt(col_name, get_type=True)).groupBy(temp).count().to_json() + for row in types: + count_by_data_type[row[temp]] = row["count"] + + count_empty_strings = df.where(F.col(col_name) == '').count() + #count_by_data_type['string'] = count_by_data_type['string'] - count_empty_strings - for row in types: - count_by_data_type[row[0]] = row[1] + else: + nulls = df.cols.count_na(col_name) + count_by_data_type[col_data_type] = int(df.count()) - nulls + count_by_data_type["null"] = nulls - # Fill missing data types with 0 count_by_data_type = fill_missing_var_types(count_by_data_type) # Subtract white spaces to the total string count - count_empty_strings = df.where(F.col(col_name) == '').count() - count_by_data_type['string'] = count_by_data_type['string'] - count_empty_strings - data_types_count = {"string": count_by_data_type['string'], "bool": count_by_data_type['bool'], 
"int": count_by_data_type['int'], @@ -150,204 +155,14 @@ def _count_data_types(col_name): results["columns"] = type_details return results - @staticmethod - def columns(df, columns, buckets=40, relative_error=1): - """ - Return statistical information about a specific column in json format - :param df: Dataframe to be processed - :param columns: Columns that you want to profile - :param buckets: Create buckets divided by range. Each bin is equal. - :param relative_error: relative error when the percentile is calculated. 0 is more exact as slow 1 more error and faster - :return: json object with the - """ - - columns = parse_columns(df, columns) - - # Get just a sample to infer the column data type - # sample_size_number = sample_size(rows_count, 95.0, 2.0) - # fraction = sample_size_number / rows_count - # sample = df.sample(False, fraction, seed=1) - - # Initialize Objects - column_info = {} - column_info['columns'] = {} - - rows_count = df.count() - column_info['rows_count'] = humanize.intword(rows_count) - - count_dtypes = Profiler.count_data_types(df, columns) - - column_info["count_types"] = count_dtypes["count_types"] - column_info['size'] = humanize.naturalsize(df.size()) - - def na(col_name): - return F.count(F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name)) - - def zeros(col_name): - return F.count(F.when(F.col(col_name) == 0, col_name)) - - # Cast every column to a specific type to ensure the correct profiling - # For example if we calculate the min or max of a string column with numeric values the result will be incorrect - for col_name in columns: - dtype = count_dtypes["columns"][col_name]['dtype'] - # Not force date type conversion, we can not trust that is going to be representative - if dtype in ["string", "float", "int", "bool"]: - df = df.cols.cast(col_name, dtype) - - stats = df.cols._exprs( - [F.min, F.max, F.stddev, F.kurtosis, F.mean, F.skewness, F.sum, F.variance, F.approx_count_distinct, na, - zeros], - columns) - - for 
col_name in columns: - logging.info("Processing column '" + col_name + "'...") - - col_info = {} - col_info["stats"] = {} - column_info['columns'][col_name] = {} - - column_type = count_dtypes["columns"][col_name]['type'] - col_info['column_dtype'] = count_dtypes["columns"][col_name]['dtype'] - - na = stats[col_name]["na"] - max_value = stats[col_name]["max"] - min_value = stats[col_name]["min"] - - col_info['name'] = col_name - col_info['column_type'] = column_type - - # Numeric Column - if column_type == "numeric" or column_type == "date": - # Merge - col_info["stats"] = stats[col_name] - - # Missing - col_info['stats']['missing_count'] = round(na, 2) - col_info['stats']['p_missing'] = round(na / rows_count * 100, 2) - col_info["dtypes_stats"] = count_dtypes["columns"][col_name]['details'] - - if column_type == "categorical" or column_type == "numeric" or column_type == "date" or column_type == "bool": - # Frequency - - freq = (df.groupBy(col_name) - .count() - .rows.sort([("count", "desc"), (col_name, "desc")]) - .limit(buckets) - .withColumn("percentage", - F.round((F.col("count") / rows_count) * 100, - 3)) - .cols.rename(col_name, "value").to_json()) - - col_info['frequency'] = freq[:10] - col_info['frequency_graph'] = freq - - # Uniques - uniques = stats[col_name].pop("approx_count_distinct") - col_info['stats']["uniques_count"] = uniques - col_info['stats']["p_uniques"] = round(uniques / rows_count * 100, 3) - - if column_type == "numeric": - # Additional Stats - # Percentile can not be used a normal sql.functions. 
approxQuantile in this case need and extra pass - # https://stackoverflow.com/questions/45287832/pyspark-approxquantile-function - max_value = fast_float(max_value) - min_value = fast_float(min_value) - col_info['stats']['quantile'] = df.cols.percentile(col_name, [0.05, 0.25, 0.5, 0.75, 0.95], - relative_error) - - col_info['stats']['range'] = max_value - min_value - col_info['stats']['median'] = col_info['stats']['quantile'][0.5] - col_info['stats']['interquartile_range'] = col_info['stats']['quantile'][0.75] - \ - col_info['stats']['quantile'][0.25] - col_info['stats']['coef_variation'] = round((col_info['stats']['stddev'] / col_info['stats']['mean']), - 5) - col_info['stats']['mad'] = round(df.cols.mad(col_name), 5) - - col_info["hist"] = df.cols.hist(col_name, min_value, max_value, buckets) - - if column_type == "categorical": - col_name_len = col_name + "_len" - df = df.cols.apply_expr(col_name_len, F.length(F.col(col_name))) - min_value = df.cols.min(col_name_len) - max_value = df.cols.max(col_name_len) - - # Max value can be considered as the number of buckets - buckets_for_string = buckets - if max_value <= 50: - buckets_for_string = max_value - - col_info["hist"] = df.cols.hist(col_name_len, min_value, max_value, buckets_for_string) - - if column_type == "date": - col_info["hist"] = {} - - # Create year/month/week day/hour/minute - def infer_date(value, args): - if value is None: - result = [None] - else: - date = dateutil.parser.parse(value) - result = [date.year, date.month, date.weekday(), date.hour, date.minute] - return result - - df = df \ - .cols.apply('year', infer_date, ArrayType(LongType())) \ - .cols.unnest("year") \ - .h_repartition() - - for i in range(5): - key_name = "" - temp_col = col_name + "_" + str(i) - # Years - if i == 0: - buckets_date = 100 - key_name = "years" - - min_value = df.cols.min(temp_col) - max_value = df.cols.max(temp_col) - - # Months - elif i == 1: - buckets_date = 12 - min_value = 0 - max_value = 12 - key_name = 
"months" - - # Weekdays - elif i == 2: - buckets_date = 7 - min_value = 0 - max_value = 7 - key_name = "weekdays" - - # Hours - elif i == 3: - buckets_date = 24 - min_value = 0 - max_value = 24 - key_name = "hours" - - # Minutes - elif i == 4: - buckets_date = 60 - min_value = 0 - max_value = 60 - key_name = "minutes" - - col_info["hist"][key_name] = df.cols.hist(temp_col, min_value, max_value, buckets_date) - - column_info['columns'][col_name] = col_info - - return column_info - def run(self, df, columns, buckets=40, relative_error=1): """ Return dataframe statistical information in HTML Format - + :param df: Dataframe to be analyzed :param columns: Columns to be analized :param buckets: Number of buckets calculated to print the histogram - :param relative_error: Relative Error for quantile discretizer calculation + :param relative_error: Relative Error for quantile discretizer calculation :return: """ @@ -382,6 +197,7 @@ def run(self, df, columns, buckets=40, relative_error=1): hist_pic = {"hist_years": hist_year, "hist_months": hist_month, "hist_weekdays": hist_weekday, "hist_hours": hist_hour, "hist_minutes": hist_minute} else: + hist = plot_hist({col_name: col["hist"]}, output="base64") hist_pic = {"hist_pic": hist} @@ -423,3 +239,298 @@ def to_json(df, columns, buckets=40, relative_error=1): output["sample"] = {"columns": df.columns, "data": data} return output + + @staticmethod + def columns(df, columns, buckets=40, relative_error=1): + """ + Return statistical information about a specific column in json format + :param df: Dataframe to be processed + :param columns: Columns that you want to profile + :param buckets: Create buckets divided by range. Each bin is equal. + :param relative_error: relative error when the percentile is calculated. 
0 is more exact as slow 1 more error and faster + :return: json object with the + """ + + columns = parse_columns(df, columns) + + # Get just a sample to infer the column data type + # sample_size_number = sample_size(rows_count, 95.0, 2.0) + # fraction = sample_size_number / rows_count + # sample = df.sample(False, fraction, seed=1) + + # Initialize Objects + columns_info = {} + columns_info['columns'] = {} + + rows_count = df.count() + columns_info['rows_count'] = humanize.intword(rows_count) + + count_dtypes = Profiler.count_data_types(df, columns) + + columns_info["count_types"] = count_dtypes["count_types"] + columns_info['size'] = humanize.naturalsize(df.size()) + + # Cast columns to the data type infer by count_data_types() + df = Profiler.cast_columns(df, columns, count_dtypes) + + # Calculate stats + stats = Profiler.general_stats(df, columns) + + for col_name in columns: + col_info = {} + logging.info("------------------------------") + logging.info("Processing column '" + col_name + "'...") + columns_info['columns'][col_name] = {} + + col_info["stats"] = stats[col_name] + col_info.update(Profiler.frequency(df, col_name, buckets)) + col_info.update(Profiler.stats_by_column(df, col_name, stats, count_dtypes)) + + col_info['column_dtype'] = count_dtypes["columns"][col_name]['dtype'] + col_info["dtypes_stats"] = count_dtypes["columns"][col_name]['details'] + + column_type = count_dtypes["columns"][col_name]['type'] + + if column_type == "numeric": + col_info["stats"].update(Profiler.extra_numeric_stats(df, col_name, stats, relative_error)) + col_info["hist"] = df.cols.hist(col_name, stats[col_name]["min"], stats[col_name]["max"], buckets) + + if column_type == "categorical": + col_info["hist"] = Profiler.hist_string(df, col_name, buckets) + + if column_type == "date": + col_info["hist"] = Profiler.hist_date(df, col_name) + + columns_info['columns'][col_name] = col_info + + return columns_info + + @staticmethod + @time_it + def frequency(df, col_name, 
buckets): + """ + Calculate the item frequency by column + :param df: + :param col_name: + :param buckets: + :return: + """ + rows_count = df.count() + col_info = {} + # Frequency + freq = (df.groupBy(col_name) + .count() + .rows.sort([("count", "desc"), (col_name, "desc")]) + .limit(buckets) + .withColumn("percentage", + F.round((F.col("count") / rows_count) * 100, + 3)) + .cols.rename(col_name, "value").to_json()) + + # Get only ten items to print the table + col_info['frequency'] = freq[:10] + col_info['frequency_graph'] = freq + return col_info + + @staticmethod + @time_it + def general_stats(df, columns): + """ + Return General stats for a column + :param df: + :param columns: + :return: + """ + + def na(col_name): + return F.count(F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name)) + + def zeros(col_name): + return F.count(F.when(F.col(col_name) == 0, col_name)) + + stats = df.cols._exprs( + [F.min, F.max, F.stddev, F.kurtosis, F.mean, F.skewness, F.sum, F.variance, F.approx_count_distinct, na, + zeros], + columns) + return stats + + @staticmethod + @time_it + def extra_numeric_stats(df, col_name, stats, relative_error): + """ + Specific Stats for numeric columns + :param df: + :param col_name: + :param stats: + :param relative_error: + :return: + """ + + col_info = defaultdict() + col_info['stats'] = {} + # Percentile can not be used a normal sql.functions. 
approxQuantile in this case need and extra pass + # https://stackoverflow.com/questions/45287832/pyspark-approxquantile-function + quantile = df.cols.percentile(col_name, [0.05, 0.25, 0.5, 0.75, 0.95], + relative_error) + + max_value = stats[col_name]["max"] + min_value = stats[col_name]["min"] + stddev = stats[col_name]['stddev'] + mean = stats[col_name]['mean'] + + col_info['range'] = max_value - min_value + col_info['median'] = quantile[0.5] + col_info['interquartile_range'] = quantile[0.75] - quantile[0.25] + + col_info['coef_variation'] = round((stddev / mean), 5) + col_info['mad'] = round(df.cols.mad(col_name), 5) + col_info['quantile'] = quantile + + return col_info + + @staticmethod + @time_it + def cast_columns(df, columns, count_dtypes): + """ + Cast column depending of inferred data type. + :param df: Dataframe to be analyzed + :param columns: Dataframe columns to be analyzed + :param count_dtypes: String with columns and data types + :return: Dataframe with casted columns + """ + # Cast every column to a specific type to ensure the correct profiling + # For example if we calculate the min or max of a string column with numeric values the result will be incorrect + for col_name in columns: + dtype = count_dtypes["columns"][col_name]['dtype'] + # Not force date type conversion, we can not trust that is going to be representative + if dtype in ["string", "float", "int", "bool"]: + df = df.cols.cast(col_name, dtype) + return df + + @staticmethod + @time_it + def stats_by_column(df, col_name, stats, count_dtypes): + """ + :param df: Dataframe to be analyzed + :param col_name: Dataframe column to be analyzed + :param count_dtypes: + :return: + """ + rows_count = df.count() + col_info = {} + col_info["stats"] = {} + + column_type = count_dtypes["columns"][col_name]['type'] + col_info['column_dtype'] = count_dtypes["columns"][col_name]['dtype'] + + na = stats[col_name]["na"] + + col_info['name'] = col_name + col_info['column_type'] = column_type + + # Numeric 
Column + if column_type == "numeric" or column_type == "date": + # Merge + col_info["stats"] = stats[col_name] + + # Missing + col_info['stats']['missing_count'] = round(na, 2) + col_info['stats']['p_missing'] = round(na / rows_count * 100, 2) + col_info["dtypes_stats"] = count_dtypes["columns"][col_name]['details'] + + # Uniques + uniques = stats[col_name].pop("approx_count_distinct") + col_info['stats']["uniques_count"] = uniques + col_info['stats']["p_uniques"] = round((uniques / rows_count) * 100, 3) + return col_info + + @staticmethod + def hist_date(df, col_name): + """ + Create a histogram for a date type column + :param df: Dataframe to be analyzed + :param col_name: Dataframe column to be analyzed + :return: + """ + col_info = {} + + # Create year/month/week day/hour/minute + def infer_date(value, args): + if value is None: + result = [None] + else: + date = dateutil.parser.parse(value) + result = [date.year, date.month, date.weekday(), date.hour, date.minute] + return result + + df = df \ + .cols.apply('year', infer_date, ArrayType(LongType())) \ + .cols.unnest("year") \ + .h_repartition() + + for i in range(5): + key_name = "" + temp_col = col_name + "_" + str(i) + # Years + if i == 0: + buckets_date = 100 + key_name = "years" + + min_value = df.cols.min(temp_col) + max_value = df.cols.max(temp_col) + + # Months + elif i == 1: + buckets_date = 12 + min_value = 0 + max_value = 12 + key_name = "months" + + # Weekdays + elif i == 2: + buckets_date = 7 + min_value = 0 + max_value = 7 + key_name = "weekdays" + + # Hours + elif i == 3: + buckets_date = 24 + min_value = 0 + max_value = 24 + key_name = "hours" + + # Minutes + elif i == 4: + buckets_date = 60 + min_value = 0 + max_value = 60 + key_name = "minutes" + + col_info[key_name] = df.cols.hist(temp_col, min_value, max_value, buckets_date) + + return col_info + + @staticmethod + def hist_string(df, col_name, buckets): + """ + Create a string for a date type column + :param df: Dataframe to be analyzed + 
:param col_name: Dataframe column to be analyzed + :param buckets: + :return: + """ + + col_name_len = col_name + "_len" + df = df.cols.apply_expr(col_name_len, F.length(F.col(col_name))) + min_value = df.cols.min(col_name_len) + max_value = df.cols.max(col_name_len) + + # Max value can be considered as the number of buckets + buckets_for_string = buckets + if max_value <= 50: + buckets_for_string = max_value + + result = df.cols.hist(col_name_len, min_value, max_value, buckets_for_string) + + return result From 78e954487aa0eb7a75a04c0cd68d47fc75a09ca5 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 4 Sep 2018 03:21:06 -0500 Subject: [PATCH 34/94] Added array type --- optimus/profiler/templates/general_info.html | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/optimus/profiler/templates/general_info.html b/optimus/profiler/templates/general_info.html index 427320ba0..0c663087d 100644 --- a/optimus/profiler/templates/general_info.html +++ b/optimus/profiler/templates/general_info.html @@ -53,11 +53,11 @@

Dataset info

-

Variables types

+

Column types

- + @@ -75,6 +75,11 @@

Variables types

+ + + + + From cc5a1ffb83759c6720bfe723e7a34a3ebb13001c Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 4 Sep 2018 03:22:52 -0500 Subject: [PATCH 35/94] Added decortor to track execution time --- optimus/dataframe/columns.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/optimus/dataframe/columns.py b/optimus/dataframe/columns.py index 8c4b31ac2..e5751db8c 100644 --- a/optimus/dataframe/columns.py +++ b/optimus/dataframe/columns.py @@ -23,7 +23,7 @@ is_function, is_one_element, is_type, is_int, is_dict, is_str, is_ # Helpers from optimus.helpers.constants import * -from optimus.helpers.decorators import add_attr +from optimus.helpers.decorators import add_attr, time_it from optimus.helpers.functions \ import validate_columns_names, parse_columns, format_dict, \ tuple_to_dict, val_to_list, filter_list, get_spark_dtypes_object @@ -414,7 +414,7 @@ def drop(columns=None, regex=None, data_type=None): df = df.drop(column) return df - @add_attr(cols) + @add_attr(cols, log_time=True) def _exprs(funcs, columns): """ Helper function to apply multiple columns expression to multiple columns @@ -469,11 +469,9 @@ def parse_col_names_funcs_to_keys(data): for func in funcs: exprs.append(func(col_name).alias(func.__name__ + "_" + col_name)) - return ( - parse_col_names_funcs_to_keys( - format_dict(df.agg(*exprs).to_json()) - ) - ) + result = parse_col_names_funcs_to_keys(format_dict(df.agg(*exprs).to_json())) + # logging.info(result) + return result # Quantile statistics @add_attr(cols) @@ -524,15 +522,15 @@ def median(columns): return percentile(columns, [0.5]) - @add_attr(cols) + @add_attr(cols, log_time=True) def percentile(columns, values=None, error=1): """ Return the percentile of a dataframe :param columns: '*', list of columns names or a single column name. 
:param values: list of percentiles to be calculated + :param error: :return: percentiles per columns """ - start_time = timeit.default_timer() if values is None: values = [0.05, 0.25, 0.5, 0.75, 0.95] @@ -551,8 +549,6 @@ def percentile(columns, values=None, error=1): percentile_results = dict(zip(columns, percentile_results)) - logging.info("percentile") - logging.info(timeit.default_timer() - start_time) return format_dict(percentile_results) # Descriptive Analytics @@ -1274,7 +1270,7 @@ def hist(columns, min_value, max_value, buckets=10): return hist_data - @add_attr(cols) + @add_attr(cols, log_time=True) @dispatch((str, list), int) def hist(columns, buckets=10): return self.cols.hist(columns, fast_float(self.cols.min(columns)), fast_float(self.cols.max(columns)), buckets) From 312b315a45b735c2d6f8bce72db30f233f56ef8c Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 4 Sep 2018 04:30:33 -0500 Subject: [PATCH 36/94] Now the profiler has two modes. In one the profiler try to infer the data type to provide extra information about the column --- examples/new-api-profiler.ipynb | 6923 +++++++++++++++++- optimus/functions.py | 19 +- optimus/helpers/constants.py | 4 +- optimus/profiler/functions.py | 6 +- optimus/profiler/profiler.py | 27 +- optimus/profiler/templates/general_info.html | 2 +- 6 files changed, 6589 insertions(+), 392 deletions(-) diff --git a/examples/new-api-profiler.ipynb b/examples/new-api-profiler.ipynb index 497ff5202..0e44df555 100644 --- a/examples/new-api-profiler.ipynb +++ b/examples/new-api-profiler.ipynb @@ -26,55 +26,25 @@ "metadata": { "scrolled": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Just check that Spark and all necessary environments vars are present...\n", - "-----\n", - "SPARK_HOME=C:\\opt\\spark\\spark-2.3.1-bin-hadoop2.7\n", - "HADOOP_HOME=C:\\opt\\spark\\spark-2.3.1-bin-hadoop2.7\n", - "You don't have PYSPARK_PYTHON set\n", - "You don't have PYSPARK_DRIVER_PYTHON set\n", - 
"JAVA_HOME=C:\\Program Files\\Java\\jdk1.8.0_181\n", - "Pyarrow Installed\n", - "-----\n", - "Starting or getting SparkSession and SparkContext...\n", - "\n", - " ____ __ _ \n", - " / __ \\____ / /_(_)___ ___ __ _______\n", - " / / / / __ \\/ __/ / __ `__ \\/ / / / ___/\n", - " / /_/ / /_/ / /_/ / / / / / / /_/ (__ ) \n", - " \\____/ .___/\\__/_/_/ /_/ /_/\\__,_/____/ \n", - " /_/ \n", - " \n", - "Transform and Roll out...\n", - "Setting checkpoint folder local. If you are in a cluster initialize Optimus with master='your_ip' as param\n", - "Deleting previous folder if exists...\n", - "Creating the checkpoint directory...\n", - "Optimus successfully imported. Have fun :).\n" - ] - } - ], + "outputs": [], "source": [ "# Create optimus\n", "from optimus import Optimus\n", - "op = Optimus(master=\"local[*]\", app_name = \"optimus\" ,verbose =True, checkpoint= True)" + "op = Optimus(master=\"local[*]\", app_name = \"optimus\" , checkpoint= True)" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "df = op.load.csv(\"data/Meteorite_Landings.csv\")" + "df = op.load.csv(\"data/Meteorite_Landings.csv\").h_repartition()" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": { "scrolled": false }, @@ -83,87 +53,6289 @@ "data": { "text/html": [ "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 10 of 45716 rows / 10 columns
\n", + "
32 partition(s)
\n", + "\n", + "
CategoricalString {{data.count_types.categorical}}
Bool {{data.count_types.bool}}
Array{{data.count_types.bool}}
Not available
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
name
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
id
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
nametype
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
recclass
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
mass (g)
\n", + "
5 (double)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
fall
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
year
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
reclat
\n", + "
8 (double)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
reclong
\n", + "
9 (double)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
GeoLocation
\n", + "
10 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " Acfer⸱232\n", + " \n", + " 240\n", + " \n", + " Valid\n", + " \n", + " H5\n", + " \n", + " 725.0\n", + " \n", + " Found\n", + " \n", + " 01/01/1991⸱12:00:00⸱AM\n", + " \n", + " 27.73944\n", + " \n", + " 4.32833\n", + " \n", + " (27.739440,⸱4.328330)\n", + "
\n", + " Elephant⸱Moraine⸱90232\n", + " \n", + " 8641\n", + " \n", + " Valid\n", + " \n", + " L6\n", + " \n", + " 16.9\n", + " \n", + " Found\n", + " \n", + " 01/01/1990⸱12:00:00⸱AM\n", + " \n", + " -76.28795\n", + " \n", + " 156.46841\n", + " \n", + " (-76.287950,⸱156.468410)\n", + "
\n", + " Grove⸱Mountains⸱020090\n", + " \n", + " 30681\n", + " \n", + " Valid\n", + " \n", + " Martian⸱(shergottite)\n", + " \n", + " 7.5\n", + " \n", + " Found\n", + " \n", + " 01/01/2003⸱12:00:00⸱AM\n", + " \n", + " -72.99944\n", + " \n", + " 75.26111\n", + " \n", + " (-72.999440,⸱75.261110)\n", + "
\n", + " Northwest⸱Africa⸱891\n", + " \n", + " 31912\n", + " \n", + " Valid\n", + " \n", + " H4\n", + " \n", + " 70.8\n", + " \n", + " Found\n", + " \n", + " 01/01/2001⸱12:00:00⸱AM\n", + " \n", + " None\n", + " \n", + " None\n", + " \n", + " None\n", + "
\n", + " Queen⸱Alexandra⸱Range⸱93098\n", + " \n", + " 19187\n", + " \n", + " Valid\n", + " \n", + " H6\n", + " \n", + " 1.2\n", + " \n", + " Found\n", + " \n", + " 01/01/1993⸱12:00:00⸱AM\n", + " \n", + " -84.5757\n", + " \n", + " 162.56524\n", + " \n", + " (-84.575700,⸱162.565240)\n", + "
\n", + " Queen⸱Alexandra⸱Range⸱94691\n", + " \n", + " 20322\n", + " \n", + " Valid\n", + " \n", + " H6\n", + " \n", + " 9.6\n", + " \n", + " Found\n", + " \n", + " 01/01/1994⸱12:00:00⸱AM\n", + " \n", + " -84.0\n", + " \n", + " 168.0\n", + " \n", + " (-84.000000,⸱168.000000)\n", + "
\n", + " Meteorite⸱Hills⸱00977\n", + " \n", + " 16211\n", + " \n", + " Valid\n", + " \n", + " H5\n", + " \n", + " 13.2\n", + " \n", + " Found\n", + " \n", + " 01/01/2000⸱12:00:00⸱AM\n", + " \n", + " -79.68333\n", + " \n", + " 159.75\n", + " \n", + " (-79.683330,⸱159.750000)\n", + "
\n", + " Grove⸱Mountains⸱020114\n", + " \n", + " 46531\n", + " \n", + " Valid\n", + " \n", + " L3\n", + " \n", + " 1.0\n", + " \n", + " Found\n", + " \n", + " 01/01/2003⸱12:00:00⸱AM\n", + " \n", + " -72.98194\n", + " \n", + " 75.25167\n", + " \n", + " (-72.981940,⸱75.251670)\n", + "
\n", + " Pecora⸱Escarpment⸱91483\n", + " \n", + " 18774\n", + " \n", + " Valid\n", + " \n", + " H5\n", + " \n", + " 5.5\n", + " \n", + " Found\n", + " \n", + " 01/01/1991⸱12:00:00⸱AM\n", + " \n", + " -85.55819\n", + " \n", + " -68.31586\n", + " \n", + " (-85.558190,⸱-68.315860)\n", + "
\n", + " Ramlat⸱as⸱Sahmah⸱390\n", + " \n", + " 55656\n", + " \n", + " Valid\n", + " \n", + " H3.8-6\n", + " \n", + " 0.69\n", + " \n", + " Found\n", + " \n", + " 01/01/2010⸱12:00:00⸱AM\n", + " \n", + " 20.0949\n", + " \n", + " 55.69318\n", + " \n", + " (20.094900,⸱55.693180)\n", + "
\n", + "\n", + "
Viewing 10 of 45716 rows / 10 columns
\n", + "
32 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.table(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Profiler dump mode (Faster). It just handles the column data types as present in the dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "

Overview

\n", + "
\n", + "
\n", + "
\n", + "

Dataset info

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
Number of columns10
Number of rows45716
Total Missing (%)0.49%
Total size in memory49.9 MB
\n", + "
\n", + "
\n", + "

Column types

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
String9
Numeric1
Date0
Bool0
Array0
Not available0
\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

name

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 45515
Unique (%) 99.56
Missing0.0
Missing (%)0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 45716\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
Święcany10.002%
Łowicz10.002%
Österplana 06410.002%
Österplana 06310.002%
Österplana 06210.002%
Österplana 06110.002%
Österplana 06010.002%
Österplana 05910.002%
Österplana 05810.002%
Österplana 05710.002%
\"Missing\"00.0%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

id

\n", + "
numeric
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 42365
Unique (%) 92.67
Missing0.0
Missing (%)0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 0\n", + "
\n", + " Integer\n", + " \n", + " 45716\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "
\n", + "

\n", + " Basic Stats\n", + "

\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
Mean26889.73510368361
Minimum1
Maximum57458
Zeros(%)0
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
5745810.002%
5745710.002%
5745610.002%
5745510.002%
5745410.002%
5745310.002%
5743610.002%
5743510.002%
5743410.002%
5743310.002%
\"Missing\"00.0%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "

Quantile statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Minimum1
5-th percentile1.0
Q11.0
Median1.0
Q31.0
95-th percentile1.0
Maximum57458
Range57457
Interquartile range0.0
\n", + "
\n", + "
\n", + "

Descriptive statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Standard deviation16860.683030276334
Coef of variation0.62703
Kurtosis-1.1602608393254068
Mean26889.73510368361
MAD0.0
Skewness0
Sum1229291130
Variance284282632.2474484
\n", + "
\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

nametype

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 2
Unique (%) 0.004
Missing0.0
Missing (%)0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 45716\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
Valid4564199.836%
Relict750.164%
\"Missing\"00.0%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

recclass

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 444
Unique (%) 0.971
Missing0.0
Missing (%)0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 45716\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
L6828518.123%
H5714215.623%
L5479610.491%
H645289.905%
H442119.211%
LL527666.05%
LL620434.469%
L412532.741%
H4/54280.936%
CM24160.91%
\"Missing\"00.0%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

mass (g)

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 13496
Unique (%) 29.521
Missing0.29
Missing (%)131
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 0\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 131\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
1.31710.374%
1.21400.306%
1.41380.302%
None1310.287%
2.11300.284%
2.41260.276%
1.61200.262%
0.51190.26%
1.11160.254%
3.81140.249%
\"Missing\"1310.29%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

fall

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 2
Unique (%) 0.004
Missing0.0
Missing (%)0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 45716\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
Found4460997.579%
Fell11072.421%
\"Missing\"00.0%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

year

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 265
Unique (%) 0.58
Missing0.63
Missing (%)288
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 45428\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 288\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
01/01/2003 12:00:00 AM33237.269%
01/01/1979 12:00:00 AM30466.663%
01/01/1998 12:00:00 AM26975.899%
01/01/2006 12:00:00 AM24565.372%
01/01/1988 12:00:00 AM22965.022%
01/01/2002 12:00:00 AM20784.545%
01/01/2004 12:00:00 AM19404.244%
01/01/2000 12:00:00 AM17923.92%
01/01/1997 12:00:00 AM16963.71%
01/01/1999 12:00:00 AM16913.699%
\"Missing\"2880.63%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

reclat

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 12140
Unique (%) 26.555
Missing16.0
Missing (%)7315
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 0\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 7315\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
None731516.001%
0.0643814.083%
-71.5476110.414%
-84.030406.65%
-72.015063.294%
-79.6833311302.472%
-76.716676801.487%
-76.183335391.179%
-84.216672630.575%
-86.366672260.494%
\"Missing\"731516.0%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

reclong

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 14546
Unique (%) 31.818
Missing16.0
Missing (%)7315
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 0\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 7315\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
None731516.001%
0.0621413.593%
35.66667498510.904%
168.030406.65%
26.015063.294%
159.756571.437%
159.666676371.393%
157.166675421.186%
155.754731.035%
160.52630.575%
\"Missing\"731516.0%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

GeoLocation

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 16686
Unique (%) 36.499
Missing16.0
Missing (%)7315
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 38401\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 7315\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
None731516.001%
(0.000000, 0.000000)621413.593%
(-71.500000, 35.666670)476110.414%
(-84.000000, 168.000000)30406.65%
(-72.000000, 26.000000)15053.292%
(-79.683330, 159.750000)6571.437%
(-76.716670, 159.666670)6371.393%
(-76.183330, 157.166670)5391.179%
(-79.683330, 155.750000)4731.035%
(-84.216670, 160.500000)2630.575%
\"Missing\"731516.0%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 10 of 45716 rows / 10 columns
\n", + "
32 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
name
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
id
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
nametype
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
recclass
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
mass (g)
\n", + "
5 (double)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
fall
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
year
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
reclat
\n", + "
8 (double)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
reclong
\n", + "
9 (double)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
GeoLocation
\n", + "
10 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " Acfer⸱232\n", + " \n", + " 240\n", + " \n", + " Valid\n", + " \n", + " H5\n", + " \n", + " 725.0\n", + " \n", + " Found\n", + " \n", + " 01/01/1991⸱12:00:00⸱AM\n", + " \n", + " 27.73944\n", + " \n", + " 4.32833\n", + " \n", + " (27.739440,⸱4.328330)\n", + "
\n", + " Elephant⸱Moraine⸱90232\n", + " \n", + " 8641\n", + " \n", + " Valid\n", + " \n", + " L6\n", + " \n", + " 16.9\n", + " \n", + " Found\n", + " \n", + " 01/01/1990⸱12:00:00⸱AM\n", + " \n", + " -76.28795\n", + " \n", + " 156.46841\n", + " \n", + " (-76.287950,⸱156.468410)\n", + "
\n", + " Grove⸱Mountains⸱020090\n", + " \n", + " 30681\n", + " \n", + " Valid\n", + " \n", + " Martian⸱(shergottite)\n", + " \n", + " 7.5\n", + " \n", + " Found\n", + " \n", + " 01/01/2003⸱12:00:00⸱AM\n", + " \n", + " -72.99944\n", + " \n", + " 75.26111\n", + " \n", + " (-72.999440,⸱75.261110)\n", + "
\n", + " Northwest⸱Africa⸱891\n", + " \n", + " 31912\n", + " \n", + " Valid\n", + " \n", + " H4\n", + " \n", + " 70.8\n", + " \n", + " Found\n", + " \n", + " 01/01/2001⸱12:00:00⸱AM\n", + " \n", + " None\n", + " \n", + " None\n", + " \n", + " None\n", + "
\n", + " Queen⸱Alexandra⸱Range⸱93098\n", + " \n", + " 19187\n", + " \n", + " Valid\n", + " \n", + " H6\n", + " \n", + " 1.2\n", + " \n", + " Found\n", + " \n", + " 01/01/1993⸱12:00:00⸱AM\n", + " \n", + " -84.5757\n", + " \n", + " 162.56524\n", + " \n", + " (-84.575700,⸱162.565240)\n", + "
\n", + " Queen⸱Alexandra⸱Range⸱94691\n", + " \n", + " 20322\n", + " \n", + " Valid\n", + " \n", + " H6\n", + " \n", + " 9.6\n", + " \n", + " Found\n", + " \n", + " 01/01/1994⸱12:00:00⸱AM\n", + " \n", + " -84.0\n", + " \n", + " 168.0\n", + " \n", + " (-84.000000,⸱168.000000)\n", + "
\n", + " Meteorite⸱Hills⸱00977\n", + " \n", + " 16211\n", + " \n", + " Valid\n", + " \n", + " H5\n", + " \n", + " 13.2\n", + " \n", + " Found\n", + " \n", + " 01/01/2000⸱12:00:00⸱AM\n", + " \n", + " -79.68333\n", + " \n", + " 159.75\n", + " \n", + " (-79.683330,⸱159.750000)\n", + "
\n", + " Grove⸱Mountains⸱020114\n", + " \n", + " 46531\n", + " \n", + " Valid\n", + " \n", + " L3\n", + " \n", + " 1.0\n", + " \n", + " Found\n", + " \n", + " 01/01/2003⸱12:00:00⸱AM\n", + " \n", + " -72.98194\n", + " \n", + " 75.25167\n", + " \n", + " (-72.981940,⸱75.251670)\n", + "
\n", + " Pecora⸱Escarpment⸱91483\n", + " \n", + " 18774\n", + " \n", + " Valid\n", + " \n", + " H5\n", + " \n", + " 5.5\n", + " \n", + " Found\n", + " \n", + " 01/01/1991⸱12:00:00⸱AM\n", + " \n", + " -85.55819\n", + " \n", + " -68.31586\n", + " \n", + " (-85.558190,⸱-68.315860)\n", + "
\n", + " Ramlat⸱as⸱Sahmah⸱390\n", + " \n", + " 55656\n", + " \n", + " Valid\n", + " \n", + " H3.8-6\n", + " \n", + " 0.69\n", + " \n", + " Found\n", + " \n", + " 01/01/2010⸱12:00:00⸱AM\n", + " \n", + " 20.0949\n", + " \n", + " 55.69318\n", + " \n", + " (20.094900,⸱55.693180)\n", + "
\n", + "\n", + "
Viewing 10 of 45716 rows / 10 columns
\n", + "
32 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "op.profiler.run(df, \"*\",infer=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Profiler smart mode (Slower). It just try to infer the column data type and present extra data acordinly. From example datetype columns get extra histograms about minutes, day, week and month. Also can detect array types on data." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "

Overview

\n", + "
\n", + "
\n", + "
\n", + "

Dataset info

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
Number of columns10
Number of rows45716
Total Missing (%)0.49%
Total size in memory58.1 MB
\n", + "
\n", + "
\n", + "

Column types

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
String7
Numeric1
Date1
Bool0
Array1
Not available0
\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

name

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 45515
Unique (%) 99.56
Missing0.0
Missing (%)0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 45716\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
Święcany10.002%
Łowicz10.002%
Österplana 06410.002%
Österplana 06310.002%
Österplana 06210.002%
Österplana 06110.002%
Österplana 06010.002%
Österplana 05910.002%
Österplana 05810.002%
Österplana 05710.002%
\"Missing\"00.0%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

id

\n", + "
numeric
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 42365
Unique (%) 92.67
Missing0.0
Missing (%)0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 0\n", + "
\n", + " Integer\n", + " \n", + " 45716\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "
\n", + "

\n", + " Basic Stats\n", + "

\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
Mean26889.73510368361
Minimum1
Maximum57458
Zeros(%)0
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
5745810.002%
5745710.002%
5745610.002%
5745510.002%
5745410.002%
5745310.002%
5743610.002%
5743510.002%
5743410.002%
5743310.002%
\"Missing\"00.0%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "

Quantile statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Minimum1
5-th percentile1.0
Q11.0
Median1.0
Q31.0
95-th percentile1.0
Maximum57458
Range57457
Interquartile range0.0
\n", + "
\n", + "
\n", + "

Descriptive statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Standard deviation16860.683030276334
Coef of variation0.62703
Kurtosis-1.1602608393254068
Mean26889.73510368361
MAD0.0
Skewness0
Sum1229291130
Variance284282632.2474484
\n", + "
\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

nametype

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 2
Unique (%) 0.004
Missing0.0
Missing (%)0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 45716\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
Valid4564199.836%
Relict750.164%
\"Missing\"00.0%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

recclass

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 444
Unique (%) 0.971
Missing0.0
Missing (%)0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 45716\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
L6828518.123%
H5714215.623%
L5479610.491%
H645289.905%
H442119.211%
LL527666.05%
LL620434.469%
L412532.741%
H4/54280.936%
CM24160.91%
\"Missing\"00.0%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

mass (g)

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 13496
Unique (%) 29.521
Missing0.29
Missing (%)131
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 0\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 131\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
1.31710.374%
1.21400.306%
1.41380.302%
None1310.287%
2.11300.284%
2.41260.276%
1.61200.262%
0.51190.26%
1.11160.254%
3.81140.249%
\"Missing\"1310.29%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

fall

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 2
Unique (%) 0.004
Missing0.0
Missing (%)0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 45716\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
Found4460997.579%
Fell11072.421%
\"Missing\"00.0%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

year

\n", + "
date
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 265
Unique (%) 0.58
Missing0.63
Missing (%)288
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 0\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 45428\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 288\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
01/01/2003 12:00:00 AM33237.269%
01/01/1979 12:00:00 AM30466.663%
01/01/1998 12:00:00 AM26975.899%
01/01/2006 12:00:00 AM24565.372%
01/01/1988 12:00:00 AM22965.022%
01/01/2002 12:00:00 AM20784.545%
01/01/2004 12:00:00 AM19404.244%
01/01/2000 12:00:00 AM17923.92%
01/01/1997 12:00:00 AM16963.71%
01/01/1999 12:00:00 AM16913.699%
\"Missing\"2880.63%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

reclat

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 12140
Unique (%) 26.555
Missing16.0
Missing (%)7315
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 0\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 7315\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
None731516.001%
0.0643814.083%
-71.5476110.414%
-84.030406.65%
-72.015063.294%
-79.6833311302.472%
-76.716676801.487%
-76.183335391.179%
-84.216672630.575%
-86.366672260.494%
\"Missing\"731516.0%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

reclong

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 14546
Unique (%) 31.818
Missing16.0
Missing (%)7315
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 0\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 7315\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
None731516.001%
0.0621413.593%
35.66667498510.904%
168.030406.65%
26.015063.294%
159.756571.437%
159.666676371.393%
157.166675421.186%
155.754731.035%
160.52630.575%
\"Missing\"731516.0%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", "\n", "\n", "\n", "\n", "
Viewing 10 of 45716 rows / 10 columns
\n", + "
32 partition(s)
\n", "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -174,11 +6346,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -218,11 +6390,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -262,11 +6434,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -306,11 +6478,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -350,11 +6522,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -394,11 +6566,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -438,11 +6610,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -482,11 +6654,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -526,11 +6698,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -570,11 +6742,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", @@ -614,7 +6786,8 @@ " \n", "
\n", "
name
\n", "
1 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
id
\n", "
2 (int)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
nametype
\n", "
3 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
recclass
\n", "
4 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
mass (g)
\n", "
5 (double)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
fall
\n", "
6 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
year
\n", "
7 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
reclat
\n", "
8 (double)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
reclong
\n", "
9 (double)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
GeoLocation
\n", "
10 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", - " Aachen\n", + " Acfer⸱232\n", " \n", - " 1\n", + " 240\n", " \n", @@ -186,31 +6358,31 @@ " \n", - " L5\n", + " H5\n", " \n", - " 21.0\n", + " 725.0\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1880⸱12:00:00⸱AM\n", + " 01/01/1991⸱12:00:00⸱AM\n", " \n", - " 50.775\n", + " 27.73944\n", " \n", - " 6.08333\n", + " 4.32833\n", " \n", - " (50.775000,⸱6.083330)\n", + " (27.739440,⸱4.328330)\n", "
\n", - " Aarhus\n", + " Elephant⸱Moraine⸱90232\n", " \n", - " 2\n", + " 8641\n", " \n", @@ -230,31 +6402,31 @@ " \n", - " H6\n", + " L6\n", " \n", - " 720.0\n", + " 16.9\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1951⸱12:00:00⸱AM\n", + " 01/01/1990⸱12:00:00⸱AM\n", " \n", - " 56.18333\n", + " -76.28795\n", " \n", - " 10.23333\n", + " 156.46841\n", " \n", - " (56.183330,⸱10.233330)\n", + " (-76.287950,⸱156.468410)\n", "
\n", - " Abee\n", + " Grove⸱Mountains⸱020090\n", " \n", - " 6\n", + " 30681\n", " \n", @@ -274,31 +6446,31 @@ " \n", - " EH4\n", + " Martian⸱(shergottite)\n", " \n", - " 107000.0\n", + " 7.5\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1952⸱12:00:00⸱AM\n", + " 01/01/2003⸱12:00:00⸱AM\n", " \n", - " 54.21667\n", + " -72.99944\n", " \n", - " -113.0\n", + " 75.26111\n", " \n", - " (54.216670,⸱-113.000000)\n", + " (-72.999440,⸱75.261110)\n", "
\n", - " Acapulco\n", + " Northwest⸱Africa⸱891\n", " \n", - " 10\n", + " 31912\n", " \n", @@ -318,31 +6490,31 @@ " \n", - " Acapulcoite\n", + " H4\n", " \n", - " 1914.0\n", + " 70.8\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1976⸱12:00:00⸱AM\n", + " 01/01/2001⸱12:00:00⸱AM\n", " \n", - " 16.88333\n", + " None\n", " \n", - " -99.9\n", + " None\n", " \n", - " (16.883330,⸱-99.900000)\n", + " None\n", "
\n", - " Achiras\n", + " Queen⸱Alexandra⸱Range⸱93098\n", " \n", - " 370\n", + " 19187\n", " \n", @@ -362,31 +6534,31 @@ " \n", - " L6\n", + " H6\n", " \n", - " 780.0\n", + " 1.2\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1902⸱12:00:00⸱AM\n", + " 01/01/1993⸱12:00:00⸱AM\n", " \n", - " -33.16667\n", + " -84.5757\n", " \n", - " -64.95\n", + " 162.56524\n", " \n", - " (-33.166670,⸱-64.950000)\n", + " (-84.575700,⸱162.565240)\n", "
\n", - " Adhi⸱Kot\n", + " Queen⸱Alexandra⸱Range⸱94691\n", " \n", - " 379\n", + " 20322\n", " \n", @@ -406,31 +6578,31 @@ " \n", - " EH4\n", + " H6\n", " \n", - " 4239.0\n", + " 9.6\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1919⸱12:00:00⸱AM\n", + " 01/01/1994⸱12:00:00⸱AM\n", " \n", - " 32.1\n", + " -84.0\n", " \n", - " 71.8\n", + " 168.0\n", " \n", - " (32.100000,⸱71.800000)\n", + " (-84.000000,⸱168.000000)\n", "
\n", - " Adzhi-Bogdo⸱(stone)\n", + " Meteorite⸱Hills⸱00977\n", " \n", - " 390\n", + " 16211\n", " \n", @@ -450,31 +6622,31 @@ " \n", - " LL3-6\n", + " H5\n", " \n", - " 910.0\n", + " 13.2\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1949⸱12:00:00⸱AM\n", + " 01/01/2000⸱12:00:00⸱AM\n", " \n", - " 44.83333\n", + " -79.68333\n", " \n", - " 95.16667\n", + " 159.75\n", " \n", - " (44.833330,⸱95.166670)\n", + " (-79.683330,⸱159.750000)\n", "
\n", - " Agen\n", + " Grove⸱Mountains⸱020114\n", " \n", - " 392\n", + " 46531\n", " \n", @@ -494,31 +6666,31 @@ " \n", - " H5\n", + " L3\n", " \n", - " 30000.0\n", + " 1.0\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1814⸱12:00:00⸱AM\n", + " 01/01/2003⸱12:00:00⸱AM\n", " \n", - " 44.21667\n", + " -72.98194\n", " \n", - " 0.61667\n", + " 75.25167\n", " \n", - " (44.216670,⸱0.616670)\n", + " (-72.981940,⸱75.251670)\n", "
\n", - " Aguada\n", + " Pecora⸱Escarpment⸱91483\n", " \n", - " 398\n", + " 18774\n", " \n", @@ -538,31 +6710,31 @@ " \n", - " L6\n", + " H5\n", " \n", - " 1620.0\n", + " 5.5\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1930⸱12:00:00⸱AM\n", + " 01/01/1991⸱12:00:00⸱AM\n", " \n", - " -31.6\n", + " -85.55819\n", " \n", - " -65.23333\n", + " -68.31586\n", " \n", - " (-31.600000,⸱-65.233330)\n", + " (-85.558190,⸱-68.315860)\n", "
\n", - " Aguila⸱Blanca\n", + " Ramlat⸱as⸱Sahmah⸱390\n", " \n", - " 417\n", + " 55656\n", " \n", @@ -582,31 +6754,31 @@ " \n", - " L\n", + " H3.8-6\n", " \n", - " 1440.0\n", + " 0.69\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1920⸱12:00:00⸱AM\n", + " 01/01/2010⸱12:00:00⸱AM\n", " \n", - " -30.86667\n", + " 20.0949\n", " \n", - " -64.55\n", + " 55.69318\n", " \n", - " (-30.866670,⸱-64.550000)\n", + " (20.094900,⸱55.693180)\n", "
\n", "\n", - "
Viewing 10 of 45716 rows / 10 columns
\n" + "
Viewing 10 of 45716 rows / 10 columns
\n", + "
32 partition(s)
\n" ], "text/plain": [ "" @@ -625,25 +6798,21 @@ } ], "source": [ - "df.table(10)" + "op.profiler.run(df, \"*\",infer=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plot profile for a specific column" ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "scrolled": false - }, + "execution_count": 15, + "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Processing column 'name'...\n", - "Using 'column_exp' to process column 'name_len' with function func_col_exp\n", - "Using 'column_exp' to process column 'name_len_buckets' with function _bucketizer\n" - ] - }, { "data": { "text/html": [ @@ -690,23 +6859,23 @@ " \n", " \n", " Total Missing (%)\n", - " 0.4891941552191793%\n", + " 0.49%\n", "\n", " \n", " \n", " Total size in memory\n", - " 44.6MiB\n", + " 63.8 MB\n", "\n", " \n", " \n", " \n", "
\n", "
\n", - "

Variables types

\n", + "

Column types

\n", " \n", " \n", " \n", - " \n", + " \n", " \n", "\n", " \n", @@ -722,7 +6891,12 @@ " \n", " \n", " \n", - " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", "\n", " \n", " \n", @@ -764,26 +6938,26 @@ "\n", "
\n", "
\n", - "

name

\n", + "

reclat

\n", "
categorical
\n", "
\n", "
CategoricalString1
Bool0
Array0
\n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", "
Distinct count 45515Unique 12140
Unique (%) 99.56 26.555
Missing (%)0Missing16.0
Missing (n)0.0Missing (%)7315
\n", @@ -799,7 +6973,7 @@ " String\n", " \n", " \n", - " 45716\n", + " 0\n", " \n", " \n", " \n", @@ -847,7 +7021,7 @@ " Null\n", " \n", " \n", - " 0\n", + " 7315\n", " \n", "\n", " \n", @@ -865,79 +7039,79 @@ " Frecuency (%)\n", " \n", " \n", - " Święcany\n", - " 1\n", - " 0.002%\n", + " None\n", + " 7315\n", + " 16.001%\n", " \n", "\n", " \n", " \n", - " Łowicz\n", - " 1\n", - " 0.002%\n", + " 0.0\n", + " 6438\n", + " 14.083%\n", " \n", "\n", " \n", " \n", - " Österplana 064\n", - " 1\n", - " 0.002%\n", + " -71.5\n", + " 4761\n", + " 10.414%\n", " \n", "\n", " \n", " \n", - " Österplana 063\n", - " 1\n", - " 0.002%\n", + " -84.0\n", + " 3040\n", + " 6.65%\n", " \n", "\n", " \n", " \n", - " Österplana 062\n", - " 1\n", - " 0.002%\n", + " -72.0\n", + " 1506\n", + " 3.294%\n", " \n", "\n", " \n", " \n", - " Österplana 061\n", - " 1\n", - " 0.002%\n", + " -79.68333\n", + " 1130\n", + " 2.472%\n", " \n", "\n", " \n", " \n", - " Österplana 060\n", - " 1\n", - " 0.002%\n", + " -76.71667\n", + " 680\n", + " 1.487%\n", " \n", "\n", " \n", " \n", - " Österplana 059\n", - " 1\n", - " 0.002%\n", + " -76.18333\n", + " 539\n", + " 1.179%\n", " \n", "\n", " \n", " \n", - " Österplana 058\n", - " 1\n", - " 0.002%\n", + " -84.21667\n", + " 263\n", + " 0.575%\n", " \n", "\n", " \n", " \n", - " Österplana 057\n", - " 1\n", - " 0.002%\n", + " -86.36667\n", + " 226\n", + " 0.494%\n", " \n", "\n", " \n", " \n", " \"Missing\"\n", - " 0\n", - " 0.0%\n", + " 7315\n", + " 16.0%\n", " \n", " \n", " \n", @@ -953,7 +7127,7 @@ " \n", "\n", "
\n", - " \n", + " \n", "
\n", " \n", " \n", @@ -962,7 +7136,7 @@ " \n", " \n", "
\n", - " \n", + " \n", "
\n", "\n", " \n", @@ -977,87 +7151,145 @@ "\n", " \n", "
\n", "\n", "\n", "\n", "\n", "
Viewing 10 of 45716 rows / 10 columns
\n", + "
32 partition(s)
\n", "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1068,11 +7300,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1112,11 +7344,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1156,11 +7388,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1200,11 +7432,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1244,11 +7476,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1288,11 +7520,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1332,11 +7564,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1376,11 +7608,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1420,11 +7652,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1464,11 +7696,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", @@ -1508,7 +7740,8 @@ " \n", "
\n", "
name
\n", "
1 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
id
\n", "
2 (int)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
nametype
\n", "
3 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
recclass
\n", "
4 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
mass (g)
\n", "
5 (double)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
fall
\n", "
6 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
year
\n", "
7 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
reclat
\n", "
8 (double)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
reclong
\n", "
9 (double)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
GeoLocation
\n", "
10 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", - " Aachen\n", + " Acfer⸱232\n", " \n", - " 1\n", + " 240\n", " \n", @@ -1080,31 +7312,31 @@ " \n", - " L5\n", + " H5\n", " \n", - " 21.0\n", + " 725.0\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1880⸱12:00:00⸱AM\n", + " 01/01/1991⸱12:00:00⸱AM\n", " \n", - " 50.775\n", + " 27.73944\n", " \n", - " 6.08333\n", + " 4.32833\n", " \n", - " (50.775000,⸱6.083330)\n", + " (27.739440,⸱4.328330)\n", "
\n", - " Aarhus\n", + " Elephant⸱Moraine⸱90232\n", " \n", - " 2\n", + " 8641\n", " \n", @@ -1124,31 +7356,31 @@ " \n", - " H6\n", + " L6\n", " \n", - " 720.0\n", + " 16.9\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1951⸱12:00:00⸱AM\n", + " 01/01/1990⸱12:00:00⸱AM\n", " \n", - " 56.18333\n", + " -76.28795\n", " \n", - " 10.23333\n", + " 156.46841\n", " \n", - " (56.183330,⸱10.233330)\n", + " (-76.287950,⸱156.468410)\n", "
\n", - " Abee\n", + " Grove⸱Mountains⸱020090\n", " \n", - " 6\n", + " 30681\n", " \n", @@ -1168,31 +7400,31 @@ " \n", - " EH4\n", + " Martian⸱(shergottite)\n", " \n", - " 107000.0\n", + " 7.5\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1952⸱12:00:00⸱AM\n", + " 01/01/2003⸱12:00:00⸱AM\n", " \n", - " 54.21667\n", + " -72.99944\n", " \n", - " -113.0\n", + " 75.26111\n", " \n", - " (54.216670,⸱-113.000000)\n", + " (-72.999440,⸱75.261110)\n", "
\n", - " Acapulco\n", + " Northwest⸱Africa⸱891\n", " \n", - " 10\n", + " 31912\n", " \n", @@ -1212,31 +7444,31 @@ " \n", - " Acapulcoite\n", + " H4\n", " \n", - " 1914.0\n", + " 70.8\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1976⸱12:00:00⸱AM\n", + " 01/01/2001⸱12:00:00⸱AM\n", " \n", - " 16.88333\n", + " None\n", " \n", - " -99.9\n", + " None\n", " \n", - " (16.883330,⸱-99.900000)\n", + " None\n", "
\n", - " Achiras\n", + " Queen⸱Alexandra⸱Range⸱93098\n", " \n", - " 370\n", + " 19187\n", " \n", @@ -1256,31 +7488,31 @@ " \n", - " L6\n", + " H6\n", " \n", - " 780.0\n", + " 1.2\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1902⸱12:00:00⸱AM\n", + " 01/01/1993⸱12:00:00⸱AM\n", " \n", - " -33.16667\n", + " -84.5757\n", " \n", - " -64.95\n", + " 162.56524\n", " \n", - " (-33.166670,⸱-64.950000)\n", + " (-84.575700,⸱162.565240)\n", "
\n", - " Adhi⸱Kot\n", + " Queen⸱Alexandra⸱Range⸱94691\n", " \n", - " 379\n", + " 20322\n", " \n", @@ -1300,31 +7532,31 @@ " \n", - " EH4\n", + " H6\n", " \n", - " 4239.0\n", + " 9.6\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1919⸱12:00:00⸱AM\n", + " 01/01/1994⸱12:00:00⸱AM\n", " \n", - " 32.1\n", + " -84.0\n", " \n", - " 71.8\n", + " 168.0\n", " \n", - " (32.100000,⸱71.800000)\n", + " (-84.000000,⸱168.000000)\n", "
\n", - " Adzhi-Bogdo⸱(stone)\n", + " Meteorite⸱Hills⸱00977\n", " \n", - " 390\n", + " 16211\n", " \n", @@ -1344,31 +7576,31 @@ " \n", - " LL3-6\n", + " H5\n", " \n", - " 910.0\n", + " 13.2\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1949⸱12:00:00⸱AM\n", + " 01/01/2000⸱12:00:00⸱AM\n", " \n", - " 44.83333\n", + " -79.68333\n", " \n", - " 95.16667\n", + " 159.75\n", " \n", - " (44.833330,⸱95.166670)\n", + " (-79.683330,⸱159.750000)\n", "
\n", - " Agen\n", + " Grove⸱Mountains⸱020114\n", " \n", - " 392\n", + " 46531\n", " \n", @@ -1388,31 +7620,31 @@ " \n", - " H5\n", + " L3\n", " \n", - " 30000.0\n", + " 1.0\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1814⸱12:00:00⸱AM\n", + " 01/01/2003⸱12:00:00⸱AM\n", " \n", - " 44.21667\n", + " -72.98194\n", " \n", - " 0.61667\n", + " 75.25167\n", " \n", - " (44.216670,⸱0.616670)\n", + " (-72.981940,⸱75.251670)\n", "
\n", - " Aguada\n", + " Pecora⸱Escarpment⸱91483\n", " \n", - " 398\n", + " 18774\n", " \n", @@ -1432,31 +7664,31 @@ " \n", - " L6\n", + " H5\n", " \n", - " 1620.0\n", + " 5.5\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1930⸱12:00:00⸱AM\n", + " 01/01/1991⸱12:00:00⸱AM\n", " \n", - " -31.6\n", + " -85.55819\n", " \n", - " -65.23333\n", + " -68.31586\n", " \n", - " (-31.600000,⸱-65.233330)\n", + " (-85.558190,⸱-68.315860)\n", "
\n", - " Aguila⸱Blanca\n", + " Ramlat⸱as⸱Sahmah⸱390\n", " \n", - " 417\n", + " 55656\n", " \n", @@ -1476,31 +7708,31 @@ " \n", - " L\n", + " H3.8-6\n", " \n", - " 1440.0\n", + " 0.69\n", " \n", - " Fell\n", + " Found\n", " \n", - " 01/01/1920⸱12:00:00⸱AM\n", + " 01/01/2010⸱12:00:00⸱AM\n", " \n", - " -30.86667\n", + " 20.0949\n", " \n", - " -64.55\n", + " 55.69318\n", " \n", - " (-30.866670,⸱-64.550000)\n", + " (20.094900,⸱55.693180)\n", "
\n", "\n", - "
Viewing 10 of 45716 rows / 10 columns
\n" + "
Viewing 10 of 45716 rows / 10 columns
\n", + "
32 partition(s)
\n" ], "text/plain": [ "" @@ -1519,37 +7752,7 @@ } ], "source": [ - "op.profiler.run(df, \"name\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Plot profile for a specific column" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'timeit' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mstart_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"reclat\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mstart_time\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mNameError\u001b[0m: name 'timeit' is not defined" - ] - } - ], - "source": [ - "start_time = timeit.default_timer()\n", - "Profiler.columns(df, \"reclat\")\n", - "timeit.default_timer() - start_time" + "op.profiler.run(df, \"reclat\")" ] }, { @@ -1568,20 +7771,12 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 16, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using 'column_exp' to process column 'id_buckets' with function _bucketizer\n", - "Using 'column_exp' to process column 'reclong_buckets' with function 
_bucketizer\n" - ] - }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -1591,7 +7786,7 @@ }, { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAAEHCAYAAADBF4UVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAGfdJREFUeJzt3X+wnmWd3/H3ZxNhVboGJLqYZDesZl3B/tDNYNSOpWIhgBrakZlQV6JlJ7O7aNXZrQa3LbsqndjuyEpXmFLJAi5jZNBK1mAxRVzGVpCgVgk/mhSQZBMhbvihomLw2z+eK/Xx8CQnnHPgOufk/Zo589z3977u+/k+4R7Ih/t6rpOqQpIkSZL0zPul3g1IkiRJ0qHKQCZJkiRJnRjIJEmSJKkTA5kkSZIkdWIgkyRJkqRODGSSJEmS1ImBTJI0aUm2JDmxdx8zUZITk+zo3YckqQ8DmSTpgJLcl+QNY2pvT/KVfftVdXxVfXmc6yxOUknmPk2tPq3aZ768dx8TNdP7l6TZykAmSZoVega9mRoyJUn9GcgkSZM2/BQtyQlJNid5NMkDST7aht3UXh9O8oMkr07yS0n+bZLvJHkwyZVJnjd03bPbsb9L8u/GvM+fJLkmyV8leRR4e3vvryZ5OMmuJH+R5LCh61WSP0iyNcn3k3woyYvbOY8muXp4/AE+776nfeckuR/4UqsvS/K/2vv/7+FpnEmOSvKXSXYmeSjJ5/Zz7Zcl+XK7xpYkbx46dnmSjyfZ2Pq/JcmLh46fnOTuJI8kuTjJ3yT53XH/AUqSujGQSZKm2seAj1XVrwAvBq5u9de113lVdURVfRV4e/v5p8BvAEcAfwGQ5DjgYuCtwDHA84AFY95rBXANMA+4CngCeC9wNPBq4CTgD8acsxz4bWAZ8D7g0vYei4CXA2eN+lBVdXlVvX1M+Z8ALwNOSbIA2Ah8GDgK+CPgM0nmt7GfBJ4DHA+8ALhw7HskeRbw18AX25h3AVcleenQsLOAPwWOBLYBF7Rzj25/FucBzwfuBl4zTv+SpM4MZJKkg/G59sTm4SQPMwhK+/NT4CVJjq6qH1TVzQcY+1bgo1V1T1X9gEGYWNmmAL4F+Ouq+kpVPQ78e6DGnP/VqvpcVf2sqn5UVbdV1c1Vtbeq7gP+C4PQNOwjVfVoVW0Bbge+2N7/EeALwCsO7o8EgD+pqh9W1Y+A3wGuq6rrWj+bgM3AaUmOAU4Ffq+qHqqqn1bV34y43jIGoXRtVT1eVV8CPs8vhsTPVtXXqmovgxD6j1r9NGBLVX22HbsI+O5T+CySpA4MZJKkg3FGVc3b98OTnzoNOwf4TeCuJLcmeeMBxr4I+M7Q/neAucAL27Ht+w5U1WPA3405f/vwTpLfTPL5JN9t0xj/A4OnZcMeGNr+0Yj9Iw7Q71jD7//rwJljgus/ZvB0bxGwp6oeGud6LwK2V9XPhmrf4RefDA6HrMeG+h3751WAqzdK0jRnIJMkTamq2lpVZzGYcvcR4Jokz+XJT7cAdjIIMvv8GrCXQUjaBSzcdyDJsxlMxfuFtxuzfwlwF7CkTZn8AJCJf5pxDb//duCTw8G1qp5bVWvbsaOSzBvnejuBRUmG//v8a8DfHkQvY/+8MrwvSZqeDGSSpCmV5HeSzG9PeR5u5SeA3cDPGHxXbJ9PAe9NcmySIxg80fp0m3J3DfCmJK9pC238KeOHq78HPAr8IMlvAb8/ZR9sfH/FoN9TksxJ8svtd4wtrKpdDKZDXpzkyCTPSvK6Ede4Bfgh8L425kTgTcD6g3j/jcDfT3JGm/J5LvCrU/LJJElPGwOZJGmqLQe2JPkBgwU+VlbVj9uUwwuA/9mm9C0D1jFY7OIm4F7gxwwWsqB9x+tdDMLILuD7wIPATw7w3n8E/Ms29r8Cn576jzdaVW1nsMj
IBxiEz+3Av+Hn/619G4Pv193F4HO8Z8Q1HgfezOD7Zt9j8F29s6vqroN4/+8BZwL/kcHUzuMYfIftQH9ekqTOMphiLknS9NaeoD3MYDrivb37me7atMcdwFur6sbe/UiSRvMJmSRp2krypiTPad9B+zPg28B9fbuavtp0yXlJDufn35870CqXkqTODGSSpOlsBYOFLnYCSxhMf3Rqx/69Gvi/DKY7vonB6pg/6tuSJOlAnLIoSZIkSZ34hEySJEmSOjGQSZIkSVInc3s3MFFHH310LV68uHcbkiRJkvQkt9122/eqav5442ZsIFu8eDGbN2/u3YYkSZIkPUmS7xzMOKcsSpIkSVIn4wayJOuSPJjk9qHaf0pyV5JvJflvSeYNHTsvybYkdyc5Zai+vNW2JVkzVD82yS1Jtib5dJLDpvIDSpIkSdJ0dTBPyC4Hlo+pbQJeXlX/APg/wHkASY4DVgLHt3MuTjInyRzg48CpwHHAWW0swEeAC6tqCfAQcM6kPpEkSZIkzRDjBrKqugnYM6b2xara23ZvBha27RXA+qr6SVXdC2wDTmg/26rqnqp6HFgPrEgS4PXANe38K4AzJvmZJEmSJGlGmIrvkP0r4AttewGwfejYjlbbX/35wMND4W5ffaQkq5NsTrJ59+7dU9C6JEmSJPUzqUCW5I+BvcBV+0ojhtUE6iNV1aVVtbSqls6fP+4KkpIkSZI0rU142fskq4A3AidV1b4QtQNYNDRsIbCzbY+qfw+Yl2Rue0o2PF6SJEmSZrUJPSFLshx4P/Dmqnps6NAGYGWSw5McCywBvgbcCixpKyoexmDhjw0tyN0IvKWdvwq4dmIfRZIkSZJmlnGfkCX5FHAicHSSHcD5DFZVPBzYNFiXg5ur6veqakuSq4E7GExlPLeqnmjXeSdwPTAHWFdVW9pbvB9Yn+TDwDeAy6bw80mSJGmWWLxmY+8WDsp9a0/v3YJmkHEDWVWdNaK839BUVRcAF4yoXwdcN6J+D4NVGCVJkiTpkDIVqyxKkiRJkibAQCZJkiRJnRjIJEmSJKkTA5kkSZIkdWIgkyRJkqRODGSSJEmS1ImBTJIkSZI6MZBJkiRJUicGMkmSJEnqxEAmSZIkSZ0YyCRJkiSpEwOZJEmSJHViIJMkSZKkTgxkkiRJktSJgUySJEmSOjGQSZIkSVInBjJJkiRJ6sRAJkmSJEmdGMgkSZIkqRMDmSRJkiR1YiCTJEmSpE4MZJIkSZLUiYFMkiRJkjoxkEmSJElSJwYySZIkSerEQCZJkiRJnYwbyJKsS/JgktuHakcl2ZRka3s9stWT5KIk25J8K8krh85Z1cZvTbJqqP7bSb7dzrkoSab6Q0qSJEnSdHQwT8guB5aPqa0BbqiqJcANbR/gVGBJ+1kNXAKDAAecD7wKOAE4f1+Ia2NWD5039r0kSZIkaVYaN5BV1U3AnjHlFcAVbfsK4Iyh+pU1cDMwL8kxwCnApqraU1UPAZuA5e3Yr1TVV6uqgCuHriVJkiRJs9pEv0P2wqraBdBeX9DqC4DtQ+N2tNqB6jtG1EdKsjrJ5iSbd+/ePcHWJUmSJGl6mOpFPUZ9/6smUB+pqi6tqqVVtXT+/PkTbFGSJEmSpoeJBrIH2nRD2uuDrb4DWDQ0biGwc5z6whF1SZIkSZr1JhrINgD7VkpcBVw7VD+7rba4DHikTWm8Hjg5yZFtMY+Tgevbse8nWdZWVzx76FqSJEmSNKvNHW9Akk8BJwJHJ9nBYLXEtcDVSc4B7gfObMOvA04DtgGPAe8AqKo9ST4E3NrGfbCq9i0U8vsMVnJ8NvCF9iNJkiRJs964gayqztrPoZNGjC3g3P1cZx2wbkR9M/Dy8fqQJEmSpNlmqhf1kCRJkiQdJAOZJEmSJHViIJMkSZKkTgxkkiRJktSJgUySJEmSOjGQSZIkSVInBjJJkiRJ6sRAJkmSJEmdGMgkSZIkqRMDmSRJkiR1YiCTJEmSpE4MZJIkSZL
UiYFMkiRJkjqZ27sBSdLstHjNxt4tjOu+taf3bkGSdIjzCZkkSZIkdWIgkyRJkqRODGSSJEmS1ImBTJIkSZI6MZBJkiRJUicGMkmSJEnqxEAmSZIkSZ0YyCRJkiSpEwOZJEmSJHViIJMkSZKkTgxkkiRJktTJpAJZkvcm2ZLk9iSfSvLLSY5NckuSrUk+neSwNvbwtr+tHV88dJ3zWv3uJKdM7iNJkiRJ0sww4UCWZAHwr4GlVfVyYA6wEvgIcGFVLQEeAs5pp5wDPFRVLwEubONIclw773hgOXBxkjkT7UuSJEmSZorJTlmcCzw7yVzgOcAu4PXANe34FcAZbXtF26cdPylJWn19Vf2kqu4FtgEnTLIvSZIkSZr2JhzIqupvgT8D7mcQxB4BbgMerqq9bdgOYEHbXgBsb+fubeOfP1wfcY4kSZIkzVqTmbJ4JIOnW8cCLwKeC5w6YmjtO2U/x/ZXH/Weq5NsTrJ59+7dT71pSZIkSZpGJjNl8Q3AvVW1u6p+CnwWeA0wr01hBFgI7GzbO4BFAO3484A9w/UR5/yCqrq0qpZW1dL58+dPonVJkiRJ6m8ygex+YFmS57Tvgp0E3AHcCLyljVkFXNu2N7R92vEvVVW1+sq2CuOxwBLga5PoS5IkSZJmhLnjDxmtqm5Jcg3wdWAv8A3gUmAjsD7Jh1vtsnbKZcAnk2xj8GRsZbvOliRXMwhze4Fzq+qJifYlSZIkSTPFhAMZQFWdD5w/pnwPI1ZJrKofA2fu5zoXABdMphdJkiRJmmkmu+y9JEmSJGmCDGSSJEmS1ImBTJIkSZI6MZBJkiRJUicGMkmSJEnqxEAmSZIkSZ0YyCRJkiSpEwOZJEmSJHViIJMkSZKkTgxkkiRJktSJgUySJEmSOjGQSZIkSVInc3s3IEnSdLJ4zcbeLRyU+9ae3rsFSdIU8AmZJEmSJHViIJMkSZKkTgxkkiRJktSJgUySJEmSOjGQSZIkSVInBjJJkiRJ6sRAJkmSJEmdGMgkSZIkqRMDmSRJkiR1YiCTJEmSpE4MZJIkSZLUiYFMkiRJkjoxkEmSJElSJ5MKZEnmJbkmyV1J7kzy6iRHJdmUZGt7PbKNTZKLkmxL8q0krxy6zqo2fmuSVZP9UJIkSZI0E0z2CdnHgP9eVb8F/EPgTmANcENVLQFuaPsApwJL2s9q4BKAJEcB5wOvAk4Azt8X4iRJkiRpNptwIEvyK8DrgMsAqurxqnoYWAFc0YZdAZzRtlcAV9bAzcC8JMcApwCbqmpPVT0EbAKWT7QvSZIkSZopJvOE7DeA3cBfJvlGkk8keS7wwqraBdBeX9DGLwC2D52/o9X2V5ckSZKkWW0ygWwu8Ergkqp6BfBDfj49cZSMqNUB6k++QLI6yeYkm3fv3v1U+5UkSZKkaWUygWwHsKOqbmn71zAIaA+0qYi01weHxi8aOn8hsPMA9SepqkuramlVLZ0/f/4kWpckSZKk/iYcyKrqu8D2JC9tpZOAO4ANwL6VElcB17btDcDZbbXFZcAjbUrj9cDJSY5si3mc3GqSJEmSNKvNneT57wKuSnIYcA/wDgYh7+ok5wD3A2e2sdcBpwHbgMfaWKpqT5IPAbe2cR+sqj2T7EuSJEmSpr1JBbKq+iawdMShk0aMLeDc/VxnHbBuMr1IkiRJ0kwz2d9DJkmSJEmaIAOZJEmSJHViIJMkSZKkTgxkkiRJktSJgUySJEmSOjGQSZIkSVInBjJJkiRJ6sRAJkmSJEmdGMgkSZIkqRMDmSRJkiR1YiCTJEmSpE4MZJIkSZLUiYFMkiRJkjoxkEmSJElSJwYySZIkSerEQCZJkiRJnRjIJEmSJKkTA5kkSZIkdWIgkyRJkqRODGSSJEmS1ImBTJIkSZI6MZBJkiRJUicGMkmSJEnqxEAmSZIkSZ0YyCRJkiSpEwOZJEmSJHUy6UCWZE6SbyT5fNs/NsktSbYm+XSSw1r98La/rR1fPHSN81r97iSnTLY
nSZIkSZoJpuIJ2buBO4f2PwJcWFVLgIeAc1r9HOChqnoJcGEbR5LjgJXA8cBy4OIkc6agL0mSJEma1iYVyJIsBE4HPtH2A7weuKYNuQI4o22vaPu04ye18SuA9VX1k6q6F9gGnDCZviRJkiRpJpjsE7I/B94H/KztPx94uKr2tv0dwIK2vQDYDtCOP9LG///6iHMkSZIkadaacCBL8kbgwaq6bbg8YmiNc+xA54x9z9VJNifZvHv37qfUryRJkiRNN5N5QvZa4M1J7gPWM5iq+OfAvCRz25iFwM62vQNYBNCOPw/YM1wfcc4vqKpLq2ppVS2dP3/+JFqXJEmSpP4mHMiq6ryqWlhVixksyvGlqnorcCPwljZsFXBt297Q9mnHv1RV1eor2yqMxwJLgK9NtC9JkiRJminmjj/kKXs/sD7Jh4FvAJe1+mXAJ5NsY/BkbCVAVW1JcjVwB7AXOLeqnnga+pIkSZKkaWVKAllVfRn4ctu+hxGrJFbVj4Ez93P+BcAFU9GLJEmSJM0UU/F7yCRJkiRJE2AgkyRJkqRODGSSJEmS1ImBTJIkSZI6MZBJkiRJUicGMkmSJEnqxEAmSZIkSZ0YyCRJkiSpkyn5xdCSJKm/xWs29m5hXPetPb13C5I0rfiETJIkSZI6MZBJkiRJUicGMkmSJEnqxEAmSZIkSZ0YyCRJkiSpEwOZJEmSJHViIJMkSZKkTgxkkiRJktSJgUySJEmSOpnbuwFJknRoW7xmY+8WDsp9a0/v3YKkWcgnZJIkSZLUiYFMkiRJkjoxkEmSJElSJwYySZIkSerEQCZJkiRJnRjIJEmSJKkTA5kkSZIkdTLhQJZkUZIbk9yZZEuSd7f6UUk2JdnaXo9s9SS5KMm2JN9K8sqha61q47cmWTX5jyVJkiRJ099knpDtBf6wql4GLAPOTXIcsAa4oaqWADe0fYBTgSXtZzVwCQwCHHA+8CrgBOD8fSFOkiRJkmazCQeyqtpVVV9v298H7gQWACuAK9qwK4Az2vYK4MoauBmYl+QY4BRgU1XtqaqHgE3A8on2JUmSJEkzxZR8hyzJYuAVwC3AC6tqFwxCG/CCNmwBsH3otB2ttr+6JEmSJM1qkw5kSY4APgO8p6oePdDQEbU6QH3Ue61OsjnJ5t27dz/1ZiVJkiRpGplUIEvyLAZh7Kqq+mwrP9CmItJeH2z1HcCiodMXAjsPUH+Sqrq0qpZW1dL58+dPpnVJkiRJ6m4yqywGuAy4s6o+OnRoA7BvpcRVwLVD9bPbaovLgEfalMbrgZOTHNkW8zi51SRJkiRpVps7iXNfC7wN+HaSb7baB4C1wNVJzgHuB85sx64DTgO2AY8B7wCoqj1JPgTc2sZ9sKr2TKIvSZIkSZoRJhzIquorjP7+F8BJI8YXcO5+rrUOWDfRXiRJkiRpJpqSVRYlSZIkSU+dgUySJEmSOpnMd8gkHWIWr9nYu4WDct/a03u3IEmSdFB8QiZJkiRJnRjIJEmSJKkTA5kkSZIkdWIgkyRJkqRODGSSJEmS1ImrLEoduFqhJEmSwCdkkiRJktSNgUySJEmSOjGQSZIkSVInBjJJkiRJ6sRAJkmSJEmdGMgkSZIkqRMDmSRJkiR1YiCTJEmSpE4MZJIkSZLUydzeDUiSJOmZtXjNxt4tHJT71p7euwXpaecTMkmSJEnqxEAmSZIkSZ0YyCRJkiSpE79DphnL+e+SJEma6XxCJkmSJEmdGMgkSZIkqROnLMqpf5IkSVIn0yaQJVkOfAyYA3yiqtZ2bkmSJEmaEv4PcO3PtAhkSeYAHwf+GbADuDXJhqq6o29nkiRJo/kXbElTYVoEMuAEYFtV3QOQZD2wAjCQSZoSM/UvTjO1b0mSdHCmy6IeC4DtQ/s7Wk2SJEmSZq1UVe8eSHImcEpV/W7bfxtwQlW9a8y41cDqtvtS4O5ntFEdKo4Gvte7CWkKeC9rNvA+1mzhvXzo+fWqmj/eoOkyZXEHsGhofyG
wc+ygqroUuPSZakqHpiSbq2pp7z6kyfJe1mzgfazZwntZ+zNdpizeCixJcmySw4CVwIbOPUmSJEnS02paPCGrqr1J3glcz2DZ+3VVtaVzW5IkSZL0tJoWgQygqq4Druvdh4TTYjV7eC9rNvA+1mzhvayRpsWiHpIkSZJ0KJou3yGTJEmSpEOOgUyHtCRnJtmS5GdJlo45dl6SbUnuTnLKUH15q21LsuaZ71o6MO9RzSRJ1iV5MMntQ7WjkmxKsrW9HtnqSXJRu7e/leSV/TqXfi7JoiQ3Jrmz/b3i3a3uvaxxGch0qLsd+BfATcPFJMcxWO3zeGA5cHGSOUnmAB8HTgWOA85qY6VpwXtUM9DlDP49O2wNcENVLQFuaPswuK+XtJ/VwCXPUI/SePYCf1hVLwOWAee2f/d6L2tcBjId0qrqzqoa9QvGVwDrq+onVXUvsA04of1sq6p7qupxYH0bK00X3qOaUarqJmDPmPIK4Iq2fQVwxlD9yhq4GZiX5JhnplNp/6pqV1V9vW1/H7gTWID3sg6CgUwabQGwfWh/R6vtry5NF96jmg1eWFW7YPAXXeAFre79rWkvyWLgFcAteC/rIEybZe+lp0uS/wH86ohDf1xV1+7vtBG1YvT/xHCpUk0n+7t3pdnA+1vTWpIjgM8A76mqR5NRt+xg6Iia9/IhykCmWa+q3jCB03YAi4b2FwI72/b+6tJ0cKB7V5opHkhyTFXtatO4Hmx1729NW0mexSCMXVVVn21l72WNyymL0mgbgJVJDk9yLIMv3X4NuBVYkuTYJIcxWPhjQ8c+pbG8RzUbbABWte1VwLVD9bPbCnXLgEf2TQeTesrgUdhlwJ1V9dGhQ97LGpdPyHRIS/LPgf8MzAc2JvlmVZ1SVVuSXA3cwWDlpHOr6ol2zjuB64E5wLqq2tKpfelJqmqv96hmkiSfAk4Ejk6yAzgfWAtcneQc4H7gzDb8OuA0BgstPQa84xlvWBrttcDbgG8n+WarfQDvZR2EVDldVZIkSZJ6cMqiJEmSJHViIJMkSZKkTgxkkiRJktSJgUySJEmSOjGQSZIkSVInBjJJkiRJ6sRAJkmSJEmdGMgkSZIkqZP/B2qzDZVN+lDTAAAAAElFTkSuQmCC\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -1606,7 +7801,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1636,16 +7831,16 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 14, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" }, @@ -1690,22 +7885,16 @@ } ], "metadata": { + "kernel_info": { + "name": "python3" + }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" + "nteract": { + "version": "0.11.6" } }, "nbformat": 4, diff --git a/optimus/functions.py b/optimus/functions.py index 183e5e140..ca0ddd884 100644 --- a/optimus/functions.py +++ b/optimus/functions.py @@ -1,5 +1,6 @@ import base64 import logging + from fastnumbers import isint, isfloat from functools import reduce from io import BytesIO @@ -7,9 +8,11 @@ import dateutil.parser import matplotlib.pyplot as plt from numpy import array +from py4j.protocol import Py4JJavaError from pyspark.sql import DataFrame from pyspark.sql import functions as F +from optimus.helpers.checkit import is_ from optimus.helpers.functions import is_pyarrow_installed, parse_python_dtypes, random_int, one_list_to_val, \ get_spark_dtypes_object from optimus.helpers.raiseit import RaiseIt @@ -35,9 +38,9 @@ def abstract_udf(col, func, func_return_type=None, attrs=None, func_type=None, v if func_type not in types: RaiseIt.value_error(func_type, types) - if verbose is True: - logging.info("Using '{func_type}' to process column '{column}' with function {func_name}" - .format(func_type=func_type, column=col, func_name=func.__name__)) + # if verbose is True: + # logging.info("Using 
'{func_type}' to process column '{column}' with function {func_name}" + # .format(func_type=func_type, column=col, func_name=func.__name__)) df_func = func_factory(func_type, func_return_type) return df_func(attrs, func)(col) @@ -240,12 +243,13 @@ def plot_hist(column_data=None, output="image", sub_title=""): def filter_row_by_data_type(col_name, data_type=None, get_type=False): + from ast import literal_eval """ A Pandas UDF function that returns bool if the value match with the data_type param passed to the function. Also can return the data type :param col_name: Column to be process - :param data_type: The data_type to be compared - :param get_type: + :param data_type: The data_type to be compared with + :param get_type: Value to be returned as string or boolean :return: True or False """ if data_type is not None: @@ -271,14 +275,15 @@ def str_to_date(value): def str_to_array(value): """ + Check if value can be pased to tupple or arrays. Because Spark can handle tuples we will try to transform tuples to arrays :param value: :return: """ try: - dateutil.parser.parse(value) + isinstance(literal_eval((value.encode('ascii', 'ignore')).decode("utf-8")), (list, tuple)) return True - except ValueError: + except: pass def func(value): diff --git a/optimus/helpers/constants.py b/optimus/helpers/constants.py index d07407bf4..e3961a984 100644 --- a/optimus/helpers/constants.py +++ b/optimus/helpers/constants.py @@ -16,10 +16,10 @@ } PYTHON_TYPES_ = {"string": str, "int": int, "float": float, "boolean": bool} -PROFILER_TYPES = {"int", "float", "string", "bool", "date", "null"} +PROFILER_TYPES = {"int", "float", "string", "bool", "date", "null", "array"} PROFILER_LEGEND_TYPES = {"string": "ABC", "int": "#", "integer": "#", "float": "##.#", "double": "##.#", "bigint": "#"} -PROFILER_COLUMN_TYPES = {"categorical", "numeric", "date", "bool","null"} +PROFILER_COLUMN_TYPES = {"categorical", "numeric", "date", "bool", "null", "array"} SPARK_SHORT_DTYPES = {"string": "string", 
"str": "string", diff --git a/optimus/profiler/functions.py b/optimus/profiler/functions.py index bd5853c5a..220ad80a4 100644 --- a/optimus/profiler/functions.py +++ b/optimus/profiler/functions.py @@ -6,6 +6,7 @@ from pyspark.sql.functions import when from optimus.helpers.constants import * +from optimus.helpers.decorators import time_it from optimus.helpers.functions import parse_columns confidence_level_constant = [50, .67], [68, .99], [90, 1.64], [95, 1.96], [99, 2.57] @@ -88,6 +89,7 @@ def sample_size(population_size, confidence_level, confidence_interval): return int(math.ceil(n)) # sample size +@time_it def bucketizer(df, columns, splits): """ @@ -96,7 +98,6 @@ def bucketizer(df, columns, splits): :param splits: :return: """ - start_time = timeit.default_timer() columns = parse_columns(df, columns) def _bucketizer(col_name, args): @@ -128,8 +129,7 @@ def _bucketizer(col_name, args): # TODO: This seems weird but I can not find another way. Send the actual column name to the func not seems right df = df.cols.apply_expr(output_columns, _bucketizer, [splits, dict(zip(output_columns, columns))]) - logging.info("bucketizer") - logging.info(timeit.default_timer() - start_time) + return df diff --git a/optimus/profiler/profiler.py b/optimus/profiler/profiler.py index e086a5ae3..f5b5ae2a2 100644 --- a/optimus/profiler/profiler.py +++ b/optimus/profiler/profiler.py @@ -61,7 +61,7 @@ def dataset_info(df): # TODO: This should check only the StringType Columns. The datatype from others columns can be taken from schema(). 
@staticmethod @time_it - def count_data_types(df, columns): + def count_data_types(df, columns, infer=False): """ Count the number of int, float, string, date and booleans and output the count in json format :param df: Dataframe to be processed @@ -85,13 +85,14 @@ def _count_data_types(col_name): count_by_data_type = {} count_empty_strings = 0 - if col_data_type == "string": + + if infer is True and col_data_type == "string": + types = df.withColumn(temp, fbdt(col_name, get_type=True)).groupBy(temp).count().to_json() for row in types: count_by_data_type[row[temp]] = row["count"] count_empty_strings = df.where(F.col(col_name) == '').count() - #count_by_data_type['string'] = count_by_data_type['string'] - count_empty_strings else: nulls = df.cols.count_na(col_name) @@ -105,7 +106,8 @@ def _count_data_types(col_name): "bool": count_by_data_type['bool'], "int": count_by_data_type['int'], "float": count_by_data_type['float'], - "date": count_by_data_type['date'] + "date": count_by_data_type['date'], + "array": count_by_data_type['array'] } null_missed_count = {"null": count_by_data_type['null'], @@ -123,6 +125,8 @@ def _count_data_types(col_name): cat = "date" elif greatest_data_type_count is "bool": cat = "bool" + elif greatest_data_type_count is "array": + cat = "array" else: cat = "null" @@ -155,7 +159,7 @@ def _count_data_types(col_name): results["columns"] = type_details return results - def run(self, df, columns, buckets=40, relative_error=1): + def run(self, df, columns, buckets=40, infer=False, relative_error=1): """ Return dataframe statistical information in HTML Format @@ -167,7 +171,7 @@ def run(self, df, columns, buckets=40, relative_error=1): """ columns = parse_columns(df, columns) - output = Profiler.to_json(df, columns, buckets, relative_error) + output = Profiler.to_json(df, columns, buckets, infer, relative_error) # Load jinja path = os.path.dirname(os.path.abspath(__file__)) @@ -217,7 +221,7 @@ def run(self, df, columns, buckets=40, 
relative_error=1): write_json(output, self.path) @staticmethod - def to_json(df, columns, buckets=40, relative_error=1): + def to_json(df, columns, buckets=40, infer=False, relative_error=1): """ Return the profiling data in json format :param df: Dataframe to be processed @@ -227,7 +231,7 @@ def to_json(df, columns, buckets=40, relative_error=1): """ # Get the stats for all the columns - output = Profiler.columns(df, columns, buckets, relative_error) + output = Profiler.columns(df, columns, buckets, infer, relative_error) # Add the data summary to the output output["summary"] = Profiler.dataset_info(df) @@ -241,7 +245,7 @@ def to_json(df, columns, buckets=40, relative_error=1): return output @staticmethod - def columns(df, columns, buckets=40, relative_error=1): + def columns(df, columns, buckets=40, infer=False, relative_error=1): """ Return statistical information about a specific column in json format :param df: Dataframe to be processed @@ -264,8 +268,7 @@ def columns(df, columns, buckets=40, relative_error=1): rows_count = df.count() columns_info['rows_count'] = humanize.intword(rows_count) - - count_dtypes = Profiler.count_data_types(df, columns) + count_dtypes = Profiler.count_data_types(df, columns, infer) columns_info["count_types"] = count_dtypes["count_types"] columns_info['size'] = humanize.naturalsize(df.size()) @@ -295,7 +298,7 @@ def columns(df, columns, buckets=40, relative_error=1): col_info["stats"].update(Profiler.extra_numeric_stats(df, col_name, stats, relative_error)) col_info["hist"] = df.cols.hist(col_name, stats[col_name]["min"], stats[col_name]["max"], buckets) - if column_type == "categorical": + if column_type == "categorical" or column_type == "array": col_info["hist"] = Profiler.hist_string(df, col_name, buckets) if column_type == "date": diff --git a/optimus/profiler/templates/general_info.html b/optimus/profiler/templates/general_info.html index 0c663087d..eb809497a 100644 --- a/optimus/profiler/templates/general_info.html +++ 
b/optimus/profiler/templates/general_info.html @@ -78,7 +78,7 @@

Column types

Array - {{data.count_types.bool}} + {{data.count_types.array}} From 384040d9f5007c75cba6c5cde1ab520b91781013 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 4 Sep 2018 14:33:00 -0500 Subject: [PATCH 37/94] Updated profile examples --- examples/new-api-profiler-test.ipynb | 891 +++++---------------------- examples/new-api-profiler.ipynb | 37 +- optimus/profiler/profiler.py | 1 + 3 files changed, 169 insertions(+), 760 deletions(-) diff --git a/examples/new-api-profiler-test.ipynb b/examples/new-api-profiler-test.ipynb index 3fe92a02f..0506c559b 100644 --- a/examples/new-api-profiler-test.ipynb +++ b/examples/new-api-profiler-test.ipynb @@ -1,5 +1,23 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": { + "inputHidden": false, + "outputHidden": false + }, + "source": [ + "# Profiler performance\n", + "\n", + "We use the part of the instacart data that you can find here https://www.instacart.com/datasets/grocery-shopping-2017\n", + "\n", + "Specically order_products__prior.csv a 4 columns, 33.2 Million rows csv file.\n", + "\n", + "It took 355.58 seconds to process all the data set in a Windows 10, \n", + "Instacart data\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -82,57 +100,93 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "scrolled": false + }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", - "
Viewing 100 of 32434489 rows / 4 columns
\n", + "
Viewing 100 of 32.4 million rows / 4 columns
\n", + "
8 partition(s)
\n", "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -2143,7 +2197,8 @@ " \n", "
\n", "
order_id
\n", "
1 (int)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
product_id
\n", "
2 (int)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
add_to_cart_order
\n", "
3 (int)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
reordered
\n", "
4 (int)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "\n", - "
Viewing 100 of 32434489 rows / 4 columns
\n" + "
Viewing 100 of 32.4 million rows / 4 columns
\n", + "
8 partition(s)
\n" ], "text/plain": [ "" @@ -2159,771 +2214,85 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Processing column 'product_id'...\n", - "percentile\n", - "12.428871233101177\n", - "percentile\n", - "13.294262981479164\n", - "percentile\n", - "12.101534748881022\n", - "Using 'column_exp' to process column 'product_id_buckets' with function _bucketizer\n", - "bucketizer\n", - "0.27864982148264517\n" + "Processing column 'order_id'...\n", + "_count_data_types() executed in 17.93 sec\n", + "count_data_types() executed in 17.93 sec\n", + "cast_columns() executed in 0.01 sec\n", + "_exprs() executed in 15.79 sec\n", + "general_stats() executed in 15.79 sec\n", + "------------------------------\n", + "Processing column 'order_id'...\n" ] }, { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "

Overview

\n", - "
\n", - "
\n", - "
\n", - "

Dataset info

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "
Number of columns4
Number of rows32434489
Total Missing (%)0.0%
Total size in memory58.9MiB
\n", - "
\n", - "
\n", - "

Variables types

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "
Categorical0
Numeric1
Date0
Bool
Not available0
\n", - "
\n", - "
\n", - "\n", - "
\n", - "
\n", - "\n", - " \n", - "\n", - "
\n", - "
\n", - "

product_id

\n", - "
numeric
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Distinct count 45888
Unique (%) 0.141
Missing (%)0
Missing (n)0.0
\n", - "
\n", - "

\n", - " Datatypes\n", - "

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "
\n", - " String\n", - " \n", - " 0\n", - "
\n", - " Integer\n", - " \n", - " 32434489\n", - "
\n", - " Float\n", - " \n", - " 0\n", - "
\n", - " Bool\n", - " \n", - " 0\n", - "
\n", - " Date\n", - " \n", - " 0\n", - "
\n", - " Missing\n", - " \n", - " 0\n", - "
\n", - " Null\n", - " \n", - " 0\n", - "
\n", - " \n", - "
\n", - "

\n", - " Basic Stats\n", - "

\n", - "\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
Mean25576.337535424096
Minimum1
Maximum49688
Zeros(%)0
\n", - " \n", - "\n", - "
\n", - "
\n", - "

Frequency

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ValueCountFrecuency (%)
248524725651.457%
131763794501.17%
211372646830.816%
219032419210.746%
472092135840.659%
477661768150.545%
476261526570.471%
167971429510.441%
262091406270.434%
278451379050.425%
\"Missing\"00.0%
\n", - "
\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "

Quantile statistics

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Minimum1
5-th percentile1.0
Q11.0
Median1.0
Q349688.0
95-th percentile49688.0
Maximum49688
Range49687.0
Interquartile range49687.0
\n", - "
\n", - "
\n", - "

Descriptive statistics

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Standard deviation14096.689090257127
Coef of variation0.55116
Kurtosis-1.1408165030229254
Mean25576.337535424096
MAD0.0
Skewness0
Sum829555438453
Variance198716643.3073743
\n", - "
\n", - " \n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "
\n", - "\n", - "
\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "\n", - "
\n", - "
\n", - "\n", - "\n", - "\n", - "\n", - "
Viewing 10 of 32434489 rows / 4 columns
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
order_id
\n", - "
1 (int)
\n", - "\n", - "
\n", - "
product_id
\n", - "
2 (int)
\n", - "\n", - "
\n", - "
add_to_cart_order
\n", - "
3 (int)
\n", - "\n", - "
\n", - "
reordered
\n", - "
4 (int)
\n", - "\n", - "
\n", - " 2\n", - " \n", - " 33120\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 28985\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 9327\n", - " \n", - " 3\n", - " \n", - " 0\n", - "
\n", - " 2\n", - " \n", - " 45918\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 30035\n", - " \n", - " 5\n", - " \n", - " 0\n", - "
\n", - " 2\n", - " \n", - " 17794\n", - " \n", - " 6\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 40141\n", - " \n", - " 7\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 1819\n", - " \n", - " 8\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 43668\n", - " \n", - " 9\n", - " \n", - " 0\n", - "
\n", - " 3\n", - " \n", - " 33754\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - "\n", - "
Viewing 10 of 32434489 rows / 4 columns
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stdout", + "output_type": "stream", + "text": [ + "40\n" + ] }, { - "data": { - "text/plain": [ - "161.76729593380855" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" + "name": "stderr", + "output_type": "stream", + "text": [ + "frequency() executed in 24.97 sec\n" + ] } ], "source": [ - "import timeit\n", - "start_time = timeit.default_timer()\n", - "op.profiler.run(df, \"product_id\", relative_error=0.5)\n", - "timeit.default_timer() - start_time" + "op.profiler.run(df, \"order_id\", infer=False, relative_error=1)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "[1.0, 1.0, 1.0, 49688.0, 49688.0]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing column 'order_id'...\n", + "_count_data_types() executed in 17.82 sec\n", + "count_data_types() executed in 17.83 sec\n", + "cast_columns() executed in 0.01 sec\n", + "_exprs() executed in 15.66 sec\n", + "general_stats() executed in 15.67 sec\n", + "------------------------------\n", + "Processing column 'order_id'...\n" + ] + }, + { + "ename": "Py4JError", + "evalue": "An error occurred while calling o132.limit. 
Trace:\npy4j.Py4JException: Method limit([class java.lang.Boolean]) does not exist\r\n\tat py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)\r\n\tat py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)\r\n\tat py4j.Gateway.invoke(Gateway.java:274)\r\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\r\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\r\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\r\n\tat java.lang.Thread.run(Thread.java:748)\r\n\n", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mPy4JError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mop\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprofiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"order_id\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrelative_error\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\decorators.py\u001b[0m in \u001b[0;36mtimed\u001b[1;34m(*args, **kw)\u001b[0m\n\u001b[0;32m 26\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mtimed\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 27\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 28\u001b[1;33m \u001b[0mf\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mmethod\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 29\u001b[0m \u001b[0m_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mround\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mstart_time\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 30\u001b[0m \u001b[0mlogging\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"{name}() executed in {time} sec\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0m_time\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\profiler\\profiler.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(self, df, columns, buckets, infer, relative_error)\u001b[0m\n\u001b[0;32m 173\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 174\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparse_columns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 175\u001b[1;33m \u001b[0moutput\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_json\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuckets\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minfer\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mrelative_error\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 176\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 177\u001b[0m \u001b[1;31m# Load jinja\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\profiler\\profiler.py\u001b[0m in \u001b[0;36mto_json\u001b[1;34m(df, columns, buckets, infer, relative_error)\u001b[0m\n\u001b[0;32m 233\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 234\u001b[0m \u001b[1;31m# Get the stats for all the columns\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 235\u001b[1;33m \u001b[0moutput\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuckets\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minfer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrelative_error\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 236\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 237\u001b[0m \u001b[1;31m# Add the data summary to the output\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\profiler\\profiler.py\u001b[0m in \u001b[0;36mcolumns\u001b[1;34m(df, columns, buckets, infer, relative_error)\u001b[0m\n\u001b[0;32m 288\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 289\u001b[0m \u001b[0mcol_info\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"stats\"\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstats\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mcol_name\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 290\u001b[1;33m 
\u001b[0mcol_info\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfrequency\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcol_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuckets\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 291\u001b[0m \u001b[0mcol_info\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstats_by_column\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcol_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstats\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcount_dtypes\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 292\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\decorators.py\u001b[0m in \u001b[0;36mtimed\u001b[1;34m(*args, **kw)\u001b[0m\n\u001b[0;32m 26\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mtimed\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 27\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 28\u001b[1;33m \u001b[0mf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmethod\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 29\u001b[0m \u001b[0m_time\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mround\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mstart_time\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 30\u001b[0m \u001b[0mlogging\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"{name}() executed in {time} sec\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0m_time\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\profiler\\profiler.py\u001b[0m in \u001b[0;36mfrequency\u001b[1;34m(df, col_name, buckets)\u001b[0m\n\u001b[0;32m 326\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 327\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mrows\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msort\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"count\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"desc\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mcol_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"desc\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 328\u001b[1;33m \u001b[1;33m.\u001b[0m\u001b[0mlimit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbuckets\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 329\u001b[0m .withColumn(\"percentage\",\n\u001b[0;32m 330\u001b[0m F.round((F.col(\"count\") / rows_count) * 100,\n", + 
"\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pyspark\\sql\\dataframe.py\u001b[0m in \u001b[0;36mlimit\u001b[1;34m(self, num)\u001b[0m\n\u001b[0;32m 491\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 492\u001b[0m \"\"\"\n\u001b[1;32m--> 493\u001b[1;33m \u001b[0mjdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlimit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnum\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 494\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msql_ctx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 495\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 1255\u001b[0m \u001b[0manswer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1256\u001b[0m return_value = get_return_value(\n\u001b[1;32m-> 1257\u001b[1;33m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[0;32m 1258\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1259\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pyspark\\sql\\utils.py\u001b[0m in \u001b[0;36mdeco\u001b[1;34m(*a, **kw)\u001b[0m\n\u001b[0;32m 61\u001b[0m 
\u001b[1;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 62\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 63\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 64\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 65\u001b[0m \u001b[0ms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoString\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[1;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[0;32m 330\u001b[0m raise Py4JError(\n\u001b[0;32m 331\u001b[0m \u001b[1;34m\"An error occurred while calling {0}{1}{2}. Trace:\\n{3}\\n\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 332\u001b[1;33m format(target_id, \".\", name, value))\n\u001b[0m\u001b[0;32m 333\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 334\u001b[0m raise Py4JError(\n", + "\u001b[1;31mPy4JError\u001b[0m: An error occurred while calling o132.limit. 
Trace:\npy4j.Py4JException: Method limit([class java.lang.Boolean]) does not exist\r\n\tat py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)\r\n\tat py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)\r\n\tat py4j.Gateway.invoke(Gateway.java:274)\r\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\r\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\r\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\r\n\tat java.lang.Thread.run(Thread.java:748)\r\n\n" + ] } ], "source": [ - "df.approxQuantile(\"product_id\", [0.05, 0.25, 0.5, 0.75, 0.95], 0.5)" + "op.profiler.run(df, \"order_id\", infer=True, relative_error=1)" ] }, { @@ -2932,9 +2301,22 @@ "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Rule 1 : Know your data types.\n", + "Rule 2 : Sample your data.\n", + "Rule 3 : Try first dump mode the smart mode.\n", + "Ryle 4 : Repartirion your data" + ] } ], "metadata": { + "kernel_info": { + "name": "python3" + }, "kernelspec": { "display_name": "Python 3", "language": "python", @@ -2951,6 +2333,9 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" + }, + "nteract": { + "version": "0.11.6" } }, "nbformat": 4, diff --git a/examples/new-api-profiler.ipynb b/examples/new-api-profiler.ipynb index 0e44df555..82ef37e57 100644 --- a/examples/new-api-profiler.ipynb +++ b/examples/new-api-profiler.ipynb @@ -20,6 +20,13 @@ "sys.path.append(\"..\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Now you can get extra information for the profiler if you activate pass verbose= True to optimus" + ] + }, { "cell_type": "code", "execution_count": 3, @@ -722,7 +729,7 @@ " \n", " \n", " Total size in memory\n", - " 49.9 MB\n", + " 51.1 MB\n", "\n", " \n", " \n", @@ -3810,8 +3817,10 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": {}, + "execution_count": 7, + 
"metadata": { + "scrolled": false + }, "outputs": [ { "data": { @@ -3864,7 +3873,7 @@ " \n", " \n", " Total size in memory\n", - " 58.1 MB\n", + " 61.8 MB\n", "\n", " \n", " \n", @@ -6810,8 +6819,10 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": {}, + "execution_count": 8, + "metadata": { + "scrolled": false + }, "outputs": [ { "data": { @@ -6864,7 +6875,7 @@ " \n", " \n", " Total size in memory\n", - " 63.8 MB\n", + " 66.6 MB\n", "\n", " \n", " \n", @@ -7893,6 +7904,18 @@ "language": "python", "name": "python3" }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + }, "nteract": { "version": "0.11.6" } diff --git a/optimus/profiler/profiler.py b/optimus/profiler/profiler.py index f5b5ae2a2..b85cc10f9 100644 --- a/optimus/profiler/profiler.py +++ b/optimus/profiler/profiler.py @@ -159,6 +159,7 @@ def _count_data_types(col_name): results["columns"] = type_details return results + @time_it def run(self, df, columns, buckets=40, infer=False, relative_error=1): """ Return dataframe statistical information in HTML Format From e56c168e3e9c69cfadf942fb527af8d1be8a7b0e Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 4 Sep 2018 14:37:35 -0500 Subject: [PATCH 38/94] Profiler Test updated --- examples/new-api-profiler-test.ipynb | 1618 ++++++++++++++++++++++++-- 1 file changed, 1547 insertions(+), 71 deletions(-) diff --git a/examples/new-api-profiler-test.ipynb b/examples/new-api-profiler-test.ipynb index 0506c559b..565911dfe 100644 --- a/examples/new-api-profiler-test.ipynb +++ b/examples/new-api-profiler-test.ipynb @@ -2214,7 +2214,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -2222,95 +2222,1571 @@ "output_type": "stream", "text": [ "Processing column 'order_id'...\n", - 
"_count_data_types() executed in 17.93 sec\n", - "count_data_types() executed in 17.93 sec\n", + "_count_data_types() executed in 18.69 sec\n", + "count_data_types() executed in 18.69 sec\n", "cast_columns() executed in 0.01 sec\n", - "_exprs() executed in 15.79 sec\n", - "general_stats() executed in 15.79 sec\n", + "_exprs() executed in 16.04 sec\n", + "general_stats() executed in 16.05 sec\n", "------------------------------\n", - "Processing column 'order_id'...\n" + "Processing column 'order_id'...\n", + "frequency() executed in 23.65 sec\n", + "stats_by_column() executed in 8.83 sec\n", + "percentile() executed in 12.21 sec\n", + "extra_numeric_stats() executed in 37.45 sec\n", + "bucketizer() executed in 0.29 sec\n", + "hist() executed in 14.6 sec\n", + "dataset_info() executed in 22.43 sec\n" ] }, { - "name": "stdout", + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "

Overview

\n", + "
\n", + "
\n", + "
\n", + "

Dataset info

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
Number of columns4
Number of rows32434489
Total Missing (%)0.0%
Total size in memory188.4 MB
\n", + "
\n", + "
\n", + "

Column types

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
String0
Numeric1
Date0
Bool0
Array0
Not available0
\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

order_id

\n", + "
numeric
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 3025302
Unique (%) 9.327
Missing0.0
Missing (%)0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 0\n", + "
\n", + " Integer\n", + " \n", + " 32434489\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "
\n", + "

\n", + " Basic Stats\n", + "

\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
Mean1710748.5189427834
Minimum2
Maximum3421083
Zeros(%)0
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
15642441450.0%
7909031370.0%
613551270.0%
29703921210.0%
20699201160.0%
33080101150.0%
27533241140.0%
24997741120.0%
26216251090.0%
771511090.0%
\"Missing\"00.0%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "

Quantile statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Minimum2
5-th percentile2.0
Q12.0
Median2.0
Q32.0
95-th percentile2.0
Maximum3421083
Range3421081
Interquartile range0.0
\n", + "
\n", + "
\n", + "

Descriptive statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Standard deviation987300.6964529774
Coef of variation0.57712
Kurtosis-1.199128348852751
Mean1710748.5189427834
MAD0.0
Skewness0
Sum55487254019416
Variance974762665216.534
\n", + "
\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 10 of 32.4 million rows / 4 columns
\n", + "
8 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
order_id
\n", + "
1 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
product_id
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
add_to_cart_order
\n", + "
3 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
reordered
\n", + "
4 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " 2\n", + " \n", + " 33120\n", + " \n", + " 1\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 28985\n", + " \n", + " 2\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 9327\n", + " \n", + " 3\n", + " \n", + " 0\n", + "
\n", + " 2\n", + " \n", + " 45918\n", + " \n", + " 4\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 30035\n", + " \n", + " 5\n", + " \n", + " 0\n", + "
\n", + " 2\n", + " \n", + " 17794\n", + " \n", + " 6\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 40141\n", + " \n", + " 7\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 1819\n", + " \n", + " 8\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 43668\n", + " \n", + " 9\n", + " \n", + " 0\n", + "
\n", + " 3\n", + " \n", + " 33754\n", + " \n", + " 1\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 10 of 32.4 million rows / 4 columns
\n", + "
8 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "run() executed in 186.8 sec\n" + ] + } + ], + "source": [ + "op.profiler.run(df, \"order_id\", infer=False, relative_error=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", "output_type": "stream", "text": [ - "40\n" + "Processing column 'order_id'...\n", + "_count_data_types() executed in 21.72 sec\n", + "count_data_types() executed in 21.72 sec\n", + "cast_columns() executed in 0.01 sec\n", + "_exprs() executed in 17.72 sec\n", + "general_stats() executed in 17.73 sec\n", + "------------------------------\n", + "Processing column 'order_id'...\n", + "frequency() executed in 25.8 sec\n", + "stats_by_column() executed in 9.99 sec\n", + "percentile() executed in 13.46 sec\n", + "extra_numeric_stats() executed in 39.63 sec\n", + "bucketizer() executed in 0.3 sec\n", + "hist() executed in 14.25 sec\n", + "dataset_info() executed in 22.55 sec\n" ] }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "

Overview

\n", + "
\n", + "
\n", + "
\n", + "

Dataset info

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
Number of columns4
Number of rows32434489
Total Missing (%)0.0%
Total size in memory8.3 MB
\n", + "
\n", + "
\n", + "

Column types

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
String0
Numeric1
Date0
Bool0
Array0
Not available0
\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

order_id

\n", + "
numeric
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 3025302
Unique (%) 9.327
Missing0.0
Missing (%)0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 0\n", + "
\n", + " Integer\n", + " \n", + " 32434489\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "
\n", + "

\n", + " Basic Stats\n", + "

\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
Mean1710748.5189427834
Minimum2
Maximum3421083
Zeros(%)0
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
15642441450.0%
7909031370.0%
613551270.0%
29703921210.0%
20699201160.0%
33080101150.0%
27533241140.0%
24997741120.0%
26216251090.0%
771511090.0%
\"Missing\"00.0%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "

Quantile statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Minimum2
5-th percentile2.0
Q12.0
Median2.0
Q32.0
95-th percentile2.0
Maximum3421083
Range3421081
Interquartile range0.0
\n", + "
\n", + "
\n", + "

Descriptive statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Standard deviation987300.6964529774
Coef of variation0.57712
Kurtosis-1.199128348852751
Mean1710748.5189427834
MAD0.0
Skewness0
Sum55487254019416
Variance974762665216.534
\n", + "
\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 10 of 32.4 million rows / 4 columns
\n", + "
8 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
order_id
\n", + "
1 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
product_id
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
add_to_cart_order
\n", + "
3 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
reordered
\n", + "
4 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " 2\n", + " \n", + " 33120\n", + " \n", + " 1\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 28985\n", + " \n", + " 2\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 9327\n", + " \n", + " 3\n", + " \n", + " 0\n", + "
\n", + " 2\n", + " \n", + " 45918\n", + " \n", + " 4\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 30035\n", + " \n", + " 5\n", + " \n", + " 0\n", + "
\n", + " 2\n", + " \n", + " 17794\n", + " \n", + " 6\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 40141\n", + " \n", + " 7\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 1819\n", + " \n", + " 8\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 43668\n", + " \n", + " 9\n", + " \n", + " 0\n", + "
\n", + " 3\n", + " \n", + " 33754\n", + " \n", + " 1\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 10 of 32.4 million rows / 4 columns
\n", + "
8 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stderr", "output_type": "stream", "text": [ - "frequency() executed in 24.97 sec\n" - ] - } - ], - "source": [ - "op.profiler.run(df, \"order_id\", infer=False, relative_error=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Processing column 'order_id'...\n", - "_count_data_types() executed in 17.82 sec\n", - "count_data_types() executed in 17.83 sec\n", - "cast_columns() executed in 0.01 sec\n", - "_exprs() executed in 15.66 sec\n", - "general_stats() executed in 15.67 sec\n", - "------------------------------\n", - "Processing column 'order_id'...\n" - ] - }, - { - "ename": "Py4JError", - "evalue": "An error occurred while calling o132.limit. Trace:\npy4j.Py4JException: Method limit([class java.lang.Boolean]) does not exist\r\n\tat py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)\r\n\tat py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)\r\n\tat py4j.Gateway.invoke(Gateway.java:274)\r\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\r\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\r\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\r\n\tat java.lang.Thread.run(Thread.java:748)\r\n\n", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mPy4JError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mop\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprofiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"order_id\"\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrelative_error\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\decorators.py\u001b[0m in \u001b[0;36mtimed\u001b[1;34m(*args, **kw)\u001b[0m\n\u001b[0;32m 26\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mtimed\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 27\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 28\u001b[1;33m \u001b[0mf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmethod\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 29\u001b[0m \u001b[0m_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mround\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mstart_time\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 30\u001b[0m \u001b[0mlogging\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"{name}() executed in {time} sec\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mtime\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0m_time\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\profiler\\profiler.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(self, df, columns, buckets, infer, relative_error)\u001b[0m\n\u001b[0;32m 173\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 174\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparse_columns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 175\u001b[1;33m \u001b[0moutput\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_json\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuckets\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minfer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrelative_error\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 176\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 177\u001b[0m \u001b[1;31m# Load jinja\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\profiler\\profiler.py\u001b[0m in \u001b[0;36mto_json\u001b[1;34m(df, columns, buckets, infer, relative_error)\u001b[0m\n\u001b[0;32m 233\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 234\u001b[0m \u001b[1;31m# Get the stats for all the columns\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 235\u001b[1;33m \u001b[0moutput\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuckets\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minfer\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mrelative_error\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 236\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 237\u001b[0m \u001b[1;31m# Add the data summary to the output\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\profiler\\profiler.py\u001b[0m in \u001b[0;36mcolumns\u001b[1;34m(df, columns, buckets, infer, relative_error)\u001b[0m\n\u001b[0;32m 288\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 289\u001b[0m \u001b[0mcol_info\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"stats\"\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstats\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mcol_name\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 290\u001b[1;33m \u001b[0mcol_info\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfrequency\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcol_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuckets\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 291\u001b[0m \u001b[0mcol_info\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstats_by_column\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcol_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstats\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcount_dtypes\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 292\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\decorators.py\u001b[0m in \u001b[0;36mtimed\u001b[1;34m(*args, **kw)\u001b[0m\n\u001b[0;32m 26\u001b[0m \u001b[1;32mdef\u001b[0m 
\u001b[0mtimed\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 27\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 28\u001b[1;33m \u001b[0mf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmethod\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 29\u001b[0m \u001b[0m_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mround\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mstart_time\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 30\u001b[0m \u001b[0mlogging\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"{name}() executed in {time} sec\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0m_time\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\profiler\\profiler.py\u001b[0m in \u001b[0;36mfrequency\u001b[1;34m(df, col_name, buckets)\u001b[0m\n\u001b[0;32m 326\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 327\u001b[0m 
\u001b[1;33m.\u001b[0m\u001b[0mrows\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msort\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"count\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"desc\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mcol_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"desc\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 328\u001b[1;33m \u001b[1;33m.\u001b[0m\u001b[0mlimit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbuckets\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 329\u001b[0m .withColumn(\"percentage\",\n\u001b[0;32m 330\u001b[0m F.round((F.col(\"count\") / rows_count) * 100,\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pyspark\\sql\\dataframe.py\u001b[0m in \u001b[0;36mlimit\u001b[1;34m(self, num)\u001b[0m\n\u001b[0;32m 491\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 492\u001b[0m \"\"\"\n\u001b[1;32m--> 493\u001b[1;33m \u001b[0mjdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlimit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnum\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 494\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msql_ctx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 495\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 1255\u001b[0m \u001b[0manswer\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1256\u001b[0m return_value = get_return_value(\n\u001b[1;32m-> 1257\u001b[1;33m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[0;32m 1258\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1259\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pyspark\\sql\\utils.py\u001b[0m in \u001b[0;36mdeco\u001b[1;34m(*a, **kw)\u001b[0m\n\u001b[0;32m 61\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 62\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 63\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 64\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 65\u001b[0m \u001b[0ms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoString\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - 
"\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[1;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[0;32m 330\u001b[0m raise Py4JError(\n\u001b[0;32m 331\u001b[0m \u001b[1;34m\"An error occurred while calling {0}{1}{2}. Trace:\\n{3}\\n\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 332\u001b[1;33m format(target_id, \".\", name, value))\n\u001b[0m\u001b[0;32m 333\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 334\u001b[0m raise Py4JError(\n", - "\u001b[1;31mPy4JError\u001b[0m: An error occurred while calling o132.limit. Trace:\npy4j.Py4JException: Method limit([class java.lang.Boolean]) does not exist\r\n\tat py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)\r\n\tat py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)\r\n\tat py4j.Gateway.invoke(Gateway.java:274)\r\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\r\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\r\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\r\n\tat java.lang.Thread.run(Thread.java:748)\r\n\n" + "run() executed in 199.09 sec\n" ] } ], "source": [ "op.profiler.run(df, \"order_id\", infer=True, relative_error=1)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Rule 1 : Know your data types.\n", - "Rule 2 : Sample your data.\n", - "Rule 3 : Try first dump mode the smart mode.\n", - "Ryle 4 : Repartirion your data" - ] } ], "metadata": { From c89e2ac05e43339cdf774db80af86946a5d1496f Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 4 Sep 2018 23:04:06 -0500 Subject: [PATCH 39/94] Better naming --- optimus/functions.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git 
a/optimus/functions.py b/optimus/functions.py index ca0ddd884..e95e9c09b 100644 --- a/optimus/functions.py +++ b/optimus/functions.py @@ -290,7 +290,6 @@ def func(value): """ Check if a value can be casted to a specific :param value: value to be checked - :return: """ if isinstance(value, bool): @@ -324,9 +323,9 @@ def func(value): return v.apply(func) if get_type is True: - a = "string" + return_data_type = "string" else: - a = "boolean" + return_data_type = "boolean" col_name = one_list_to_val(col_name) - return F.pandas_udf(pandas_udf_func, a)(col_name) + return F.pandas_udf(pandas_udf_func, return_data_type)(col_name) From c9013a4449f15ba12c10a8e1ebb4ed7a7e056fc3 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 4 Sep 2018 23:04:44 -0500 Subject: [PATCH 40/94] Improve docstrings. Removed unused imports --- optimus/helpers/functions.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/optimus/helpers/functions.py b/optimus/helpers/functions.py index 0bd63d6ce..dfc658bde 100644 --- a/optimus/helpers/functions.py +++ b/optimus/helpers/functions.py @@ -8,7 +8,7 @@ from IPython.display import display, HTML from optimus.helpers.checkit import is_list_of_one_element, is_list_of_strings, is_list_of_tuples, \ - is_list_of_str_or_int, is_str, is_str_or_int, is_dict_of_one_element, is_tuple, is_dict, is_list + is_str, is_dict_of_one_element, is_tuple, is_dict, is_list from optimus.helpers.constants import PYTHON_SHORT_TYPES, SPARK_SHORT_DTYPES, SPARK_DTYPES_DICT, \ SPARK_DTYPES_DICT_OBJECTS from optimus.helpers.raiseit import RaiseIt @@ -75,6 +75,11 @@ def print_html(html): def print_json(value): + """ + Print a json in a way that a human can read it + :param value: json to be printed + :return: json + """ pp = pprint.PrettyPrinter(indent=2) if is_str(value): value = value.replace("'", "\"") From 41351292b881980a8cb141d0def9e10b727e00f5 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 4 Sep 2018 23:05:00 -0500 Subject: [PATCH 41/94] Remove 
duplicated function --- optimus/profiler/functions.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/optimus/profiler/functions.py b/optimus/profiler/functions.py index 220ad80a4..f33582d20 100644 --- a/optimus/profiler/functions.py +++ b/optimus/profiler/functions.py @@ -36,15 +36,6 @@ def fill_missing_var_types(var_types): return var_types -# TODO: Maybe use pprint instead of this -def print_json(value): - """ - Print beauty jsons - :return: - """ - print(json.dumps(value, indent=2)) - - def write_json(data, path): """ Write a json file with the profiler result From 91f3f82dfbfbf50ce4c9558781a31827f22ff858 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 4 Sep 2018 23:07:08 -0500 Subject: [PATCH 42/94] Added functions to handle and get partitions information --- optimus/dataframe/extension.py | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/optimus/dataframe/extension.py b/optimus/dataframe/extension.py index cdf5d020e..fa96aad8e 100644 --- a/optimus/dataframe/extension.py +++ b/optimus/dataframe/extension.py @@ -173,15 +173,44 @@ def partitions(self): return self.rdd.getNumPartitions() +@add_attr(DataFrame) +def partitioner(self): + """ + + :param self: + :return: + """ + return self.rdd.partitioner + + +@add_attr(DataFrame) +def glom(self): + """ + + :param self: Dataframe + :return: + """ + return collect_as_dict(self.rdd.glom().collect()[0]) + + @add_method(DataFrame) -def h_repartition(self): +def h_repartition(self, partitions=None, col_name=None): """ Get the number of cpu available and apply an "optimus" repartition in the dataframe #Reference: https://stackoverflow.com/questions/35800795/number-of-partitions-in-rdd-and-performance-in-spark/35804407#35804407 :param self: + :param partitions: + :param col_name: :return: """ - return self.repartition(cpu_count * 4) + if partitions is None: + partitions = Spark.parallelism * 4 + + if col_name is None: + df = self.repartition(partitions) + 
else: + df = self.repartition(partitions, col_name) + return df @add_method(DataFrame) From 39692ab004e7a1ba00fbb42666d51dae6601953b Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 4 Sep 2018 23:07:49 -0500 Subject: [PATCH 43/94] Addded functions to get Spark info --- optimus/spark.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/optimus/spark.py b/optimus/spark.py index 2b5d60f53..48f705d83 100644 --- a/optimus/spark.py +++ b/optimus/spark.py @@ -52,3 +52,20 @@ def sc(self): :return: """ return self._spark.sparkContext + + @property + def parallelism(self): + """ + Returns default level of parallelism defined on SparkContext. By default it is number of cores available. + :param self: Dataframe + :return: + """ + return self.sc.defaultParallelism + + @property + def executors(self): + """ + Get the number of executors. If launched in local mode executors in None + :return: + """ + return self.sc._conf.get('spark.executor.instances') From 51dfdd0ba800f3b5fe9ef435f0edbc193b3f76c2 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 4 Sep 2018 23:21:38 -0500 Subject: [PATCH 44/94] Remove unused import --- optimus/helpers/decorators.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimus/helpers/decorators.py b/optimus/helpers/decorators.py index 69d0ac157..857935031 100644 --- a/optimus/helpers/decorators.py +++ b/optimus/helpers/decorators.py @@ -1,5 +1,4 @@ import timeit -import time import logging from functools import wraps From f5ae5195418975f903c3f3317578fe7549ba7b81 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 4 Sep 2018 23:23:02 -0500 Subject: [PATCH 45/94] Fixed parallelism call --- optimus/dataframe/extension.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimus/dataframe/extension.py b/optimus/dataframe/extension.py index fa96aad8e..371b69197 100644 --- a/optimus/dataframe/extension.py +++ b/optimus/dataframe/extension.py @@ -176,7 +176,7 @@ def partitions(self): @add_attr(DataFrame) 
def partitioner(self): """ - + Return al algorithm used to partition the dataframe :param self: :return: """ @@ -204,7 +204,7 @@ def h_repartition(self, partitions=None, col_name=None): :return: """ if partitions is None: - partitions = Spark.parallelism * 4 + partitions = Spark.instance.parallelism * 4 if col_name is None: df = self.repartition(partitions) From fe60691f95c013070151b3ee5978fadbe49cb454 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 4 Sep 2018 23:25:52 -0500 Subject: [PATCH 46/94] Now use h_repartition() to better handle data processing --- optimus/profiler/profiler.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/optimus/profiler/profiler.py b/optimus/profiler/profiler.py index b85cc10f9..4e1a3136f 100644 --- a/optimus/profiler/profiler.py +++ b/optimus/profiler/profiler.py @@ -88,7 +88,12 @@ def _count_data_types(col_name): if infer is True and col_data_type == "string": - types = df.withColumn(temp, fbdt(col_name, get_type=True)).groupBy(temp).count().to_json() + types = (df + .h_repartition(col_name=col_name) + .withColumn(temp, fbdt(col_name, get_type=True)) + .groupBy(temp).count() + .to_json()) + for row in types: count_by_data_type[row[temp]] = row["count"] @@ -322,7 +327,9 @@ def frequency(df, col_name, buckets): rows_count = df.count() col_info = {} # Frequency - freq = (df.groupBy(col_name) + freq = (df + .h_repartition(col_name=col_name) + .groupBy(col_name) .count() .rows.sort([("count", "desc"), (col_name, "desc")]) .limit(buckets) @@ -449,6 +456,7 @@ def stats_by_column(df, col_name, stats, count_dtypes): return col_info @staticmethod + @time_it def hist_date(df, col_name): """ Create a histogram for a date type column @@ -468,9 +476,8 @@ def infer_date(value, args): return result df = df \ - .cols.apply('year', infer_date, ArrayType(LongType())) \ - .cols.unnest("year") \ - .h_repartition() + .cols.apply(col_name, infer_date, ArrayType(LongType())) \ + 
.cols.unnest(col_name).h_repartition() for i in range(5): key_name = "" From 353dceaeb58bfa5b6bccaa87ffd306610da4acd7 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 4 Sep 2018 23:26:48 -0500 Subject: [PATCH 47/94] Log Time execution for hist_string() --- optimus/profiler/profiler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimus/profiler/profiler.py b/optimus/profiler/profiler.py index 4e1a3136f..73f06e4ce 100644 --- a/optimus/profiler/profiler.py +++ b/optimus/profiler/profiler.py @@ -523,6 +523,7 @@ def infer_date(value, args): return col_info @staticmethod + @time_it def hist_string(df, col_name, buckets): """ Create a string for a date type column From 4a8651f32a1143972b86d19414bb2fd6a1e0483c Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 4 Sep 2018 23:29:24 -0500 Subject: [PATCH 48/94] No use h_repartition() to create the histogram --- optimus/dataframe/columns.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/optimus/dataframe/columns.py b/optimus/dataframe/columns.py index e5751db8c..5dc74f042 100644 --- a/optimus/dataframe/columns.py +++ b/optimus/dataframe/columns.py @@ -1243,8 +1243,14 @@ def hist(columns, min_value, max_value, buckets=10): # Create buckets in the dataFrame df = bucketizer(self, col_name, splits=splits) - counts = (df.groupBy(col_name + "_buckets").agg(F.count(col_name + "_buckets").alias("count")).cols.rename( - col_name + "_buckets", "value").sort(F.asc("value")).to_json()) + col_bucket = col_name + "_buckets" + + counts = (df + .h_repartition(col_name=col_bucket) + .groupBy(col_bucket) + .agg(F.count(col_bucket).alias("count")) + .cols.rename(col_bucket, "value") + .sort(F.asc("value")).to_json()) # Fill the gaps in dict values. 
For example if we have 1,5,7,8,9 it get 1,2,3,4,5,6,7,8,9 new_array = [] From da3cb5fa85591369d83aae59dac5a6208d2deaa9 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 4 Sep 2018 23:29:52 -0500 Subject: [PATCH 49/94] Log time for hist() --- optimus/dataframe/columns.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimus/dataframe/columns.py b/optimus/dataframe/columns.py index 5dc74f042..e1ef49e12 100644 --- a/optimus/dataframe/columns.py +++ b/optimus/dataframe/columns.py @@ -1223,7 +1223,7 @@ def cell(column): """ return self.cols.select(column).first()[0] - @add_attr(cols) + @add_attr(cols, log_time=True) @dispatch((str, list), (float, int), (float, int), int) def hist(columns, min_value, max_value, buckets=10): """ From f9f0c57c6219768071a87659a526aa5963cd5c97 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Wed, 5 Sep 2018 14:41:04 -0500 Subject: [PATCH 50/94] Server now up and running --- .gitignore | 1 + examples/new-api-server.ipynb | 135 ++++++++++++++++++++++++++++++++++ optimus/server/process.py | 29 ++++++-- optimus/server/server.py | 48 +++++++++--- 4 files changed, 198 insertions(+), 15 deletions(-) create mode 100644 examples/new-api-server.ipynb diff --git a/.gitignore b/.gitignore index f720a9d5e..2d78417c1 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,4 @@ examples/random.csv data.json .pytest_cache/README.md examples/order_products__prior.csv +examples/server.pid diff --git a/examples/new-api-server.ipynb b/examples/new-api-server.ipynb new file mode 100644 index 000000000..887a46899 --- /dev/null +++ b/examples/new-api-server.ipynb @@ -0,0 +1,135 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"..\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from optimus import Optimus" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Just check that Spark and all necessary environments vars are present...\n", + "-----\n", + "SPARK_HOME=C:\\opt\\spark\\spark-2.3.1-bin-hadoop2.7\n", + "HADOOP_HOME=C:\\opt\\spark\\spark-2.3.1-bin-hadoop2.7\n", + "You don't have PYSPARK_PYTHON set\n", + "You don't have PYSPARK_DRIVER_PYTHON set\n", + "JAVA_HOME=C:\\Program Files\\Java\\jdk1.8.0_181\n", + "Pyarrow Installed\n", + "-----\n", + "Starting or getting SparkSession and SparkContext...\n", + "\n", + " ____ __ _ \n", + " / __ \\____ / /_(_)___ ___ __ _______\n", + " / / / / __ \\/ __/ / __ `__ \\/ / / / ___/\n", + " / /_/ / /_/ / /_/ / / / / / / /_/ (__ ) \n", + " \\____/ .___/\\__/_/_/ /_/ /_/\\__,_/____/ \n", + " /_/ \n", + " \n", + "Transform and Roll out...\n", + "Starting Optimus Server...\n", + "Server started with process id 9544\n", + "Optimus successfully imported. 
Have fun :).\n" + ] + } + ], + "source": [ + "op= Optimus(verbose=True, server= True)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Server seems to be running with process id 9544\n" + ] + } + ], + "source": [ + "from optimus.server.server import Server\n", + "s = Server()\n", + "s.start()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Optimus Server stopped\n" + ] + } + ], + "source": [ + "s.stop()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/optimus/server/process.py b/optimus/server/process.py index 79e445da8..cb053dd2b 100644 --- a/optimus/server/process.py +++ b/optimus/server/process.py @@ -6,6 +6,7 @@ from subprocess import Popen, PIPE WINDOWS = "windows" +PLATFORM = platform.system().lower() # test https://stackoverflow.com/questions/984941/python-subprocess-popen-from-a-thread @@ -18,7 +19,7 @@ def __init__(self, path=None): # set system/version dependent "start_new_session" analogs kwargs = {} - if platform.system() == WINDOWS: + if PLATFORM == WINDOWS: # from msdn [1] create_new_process_group = 0x00000200 # note: could get it from subprocess detached_process = 0x00000008 # 0x8 | 0x200 == 0x208 @@ -31,7 +32,7 @@ def __init__(self, path=None): process = Popen(path, stdin=PIPE, stdout=PIPE, stderr=PIPE, **kwargs) # Ensure that a child 
process has completed before the main process - process.join() + # process.join() self.process = process @@ -41,10 +42,18 @@ def __init__(self, path=None): def stop(self): """ + + :return: + """ + self.stop_id(self.id) + + # TODO: Maybe this should be outside Process() + @staticmethod + def stop_id(pid): + """ Stop the process that start the server :return: """ - process = self.process # Reference https://stackoverflow.com/questions/1230669/subprocess-deleting-child-processes-in-windows def kill_proc_tree(pid, including_parent=True): @@ -72,10 +81,10 @@ def kill_proc(pid): parent = psutil.Process(pid) parent.kill() - if platform.system() == WINDOWS: - kill_proc(process.pid) + if PLATFORM == WINDOWS: + kill_proc(pid) else: - os.killpg(os.getpgid(process.pid), signal.SIGTERM) + os.killpg(os.getpgid(pid), signal.SIGTERM) def status(self): """ @@ -83,3 +92,11 @@ def status(self): :return: """ return self.process + + @property + def id(self): + """ + Return the process id + :return: + """ + return self.process.pid diff --git a/optimus/server/server.py b/optimus/server/server.py index d9b47903e..bd75d9ded 100644 --- a/optimus/server/server.py +++ b/optimus/server/server.py @@ -1,6 +1,10 @@ -from optimus.server.process import Process +import logging import os -import atexit +import signal + +from psutil import NoSuchProcess + +from optimus.server.process import Process class Server: @@ -10,14 +14,40 @@ def __init__(self, path=None): self.process = None self.path = path + self.pid = None + self.pid_file = "server.pid" + signal.signal(signal.SIGINT, self.stop) def start(self): - self.process = Process(self.path) + """ + Start the Optimus Server + :return: + """ - def stop(self): - self.process.stop() + pid_file = self.pid_file + + # Verify if server.pid exist + if os.path.isfile(pid_file): + pid = int(open(pid_file, 'r').read()) + logging.info("Server seems to be running with process id {pid}".format(pid=pid)) + self.pid = pid - @atexit.register - def goodbye(self): - 
self.stop() - print("You are now leaving the Python sector.") + else: + # Start the server + process = Process(self.path) + pid = process.id + logging.info("Server started with process id " + str(pid)) + open(pid_file, 'w').write(str(pid)) + self.pid = pid + + def stop(self): + """ + Stop the Optimus Server + :return: + """ + try: + Process.stop_id(self.pid) + logging.info("Optimus Server stopped") + except (ProcessLookupError, NoSuchProcess): + os.remove(self.pid_file) + logging.info("Optimus could not be stopped. Process id not found") From 8e0990c847d8f30cdd2943a25b7472888d44fab2 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Wed, 5 Sep 2018 14:42:17 -0500 Subject: [PATCH 51/94] Fixed Typos --- optimus/server/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimus/server/run.py b/optimus/server/run.py index 66adc2272..6e1b51052 100644 --- a/optimus/server/run.py +++ b/optimus/server/run.py @@ -29,7 +29,7 @@ def index(): Return a message indicating if the server is running. :return: """ - return jsonify("Optimus Server si Running... Go to json /profiler to get the Optimus profiler data.") + return jsonify("Optimus Server is running... 
Go to /profiler to get the Optimus profiler data.") @app.route('/profiler') From 2b03774d8768365951ddfa321bd4d01437a8ebda Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Thu, 6 Sep 2018 11:34:48 -0500 Subject: [PATCH 52/94] Implementation cache to avoid DAG bifurcation --- optimus/profiler/profiler.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/optimus/profiler/profiler.py b/optimus/profiler/profiler.py index 73f06e4ce..dd1c77863 100644 --- a/optimus/profiler/profiler.py +++ b/optimus/profiler/profiler.py @@ -280,7 +280,7 @@ def columns(df, columns, buckets=40, infer=False, relative_error=1): columns_info['size'] = humanize.naturalsize(df.size()) # Cast columns to the data type infer by count_data_types() - df = Profiler.cast_columns(df, columns, count_dtypes) + df = Profiler.cast_columns(df, columns, count_dtypes).cache() # Calculate stats stats = Profiler.general_stats(df, columns) @@ -293,7 +293,7 @@ def columns(df, columns, buckets=40, infer=False, relative_error=1): col_info["stats"] = stats[col_name] col_info.update(Profiler.frequency(df, col_name, buckets)) - col_info.update(Profiler.stats_by_column(df, col_name, stats, count_dtypes)) + col_info.update(Profiler.stats_by_column(col_name, stats, count_dtypes, rows_count)) col_info['column_dtype'] = count_dtypes["columns"][col_name]['dtype'] col_info["dtypes_stats"] = count_dtypes["columns"][col_name]['details'] @@ -420,14 +420,14 @@ def cast_columns(df, columns, count_dtypes): @staticmethod @time_it - def stats_by_column(df, col_name, stats, count_dtypes): + def stats_by_column(col_name, stats, count_dtypes, rows_count): """ :param df: Dataframe to be analyzed :param col_name: Dataframe column to be analyzed :param count_dtypes: :return: """ - rows_count = df.count() + col_info = {} col_info["stats"] = {} @@ -467,7 +467,8 @@ def hist_date(df, col_name): col_info = {} # Create year/month/week day/hour/minute - def infer_date(value, args): + + def 
func_infer_date(value, args): if value is None: result = [None] else: @@ -475,9 +476,11 @@ def infer_date(value, args): result = [date.year, date.month, date.weekday(), date.hour, date.minute] return result - df = df \ - .cols.apply(col_name, infer_date, ArrayType(LongType())) \ - .cols.unnest(col_name).h_repartition() + df = (df + .cols.select(col_name) + .cols.apply(col_name, func_infer_date, ArrayType(LongType())) + .cols.unnest(col_name).h_repartition().cache() + ) for i in range(5): key_name = "" From a17270f293a878351754eef05c4d68ba370fc0fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Favio=20Andr=C3=A9=20V=C3=A1zquez?= Date: Thu, 6 Sep 2018 23:16:54 -0500 Subject: [PATCH 53/94] Fix requirement for library humanize --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index efc404606..c5648acf6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,6 @@ pillow==5.2.0 pygments>=2.2.0 six>=1.10.0 h5py>=2.7.0 -humanize=0.5.1 +humanize==0.5.1 flask==0.12.2 ipython==6.5.0 From 65a0c424a67d8ea430e7136674167c950c14c55a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Favio=20Andr=C3=A9=20V=C3=A1zquez?= Date: Fri, 7 Sep 2018 13:30:21 -0500 Subject: [PATCH 54/94] Update requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 01aedf0c8..f9dac9302 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,3 +23,4 @@ h5py>=2.7.0 humanize==0.5.1 flask==0.12.2 ipython==6.5.0 +psutil From e3342637c9dbd073586181b8f7fc7ba5d169109b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Favio=20Andr=C3=A9=20V=C3=A1zquez?= Date: Fri, 7 Sep 2018 13:30:37 -0500 Subject: [PATCH 55/94] Update requirements-test.txt --- requirements-test.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 907ac3f57..361200ed5 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -25,4 +25,5 
@@ h5py>=2.7.0 flask==1.0.2 ipython==6.5.0 humanize==0.5.1 -pytest-cov==2.6.0 \ No newline at end of file +pytest-cov==2.6.0 +psutil From 8dfe559dc6cdf6489a031774892fdbb345283062 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Fri, 7 Sep 2018 14:53:56 -0500 Subject: [PATCH 56/94] More testing --- examples/new-api-enrichment.ipynb | 362 ++++++++++++++++++------------ 1 file changed, 221 insertions(+), 141 deletions(-) diff --git a/examples/new-api-enrichment.ipynb b/examples/new-api-enrichment.ipynb index 55a1af001..cd32fd840 100644 --- a/examples/new-api-enrichment.ipynb +++ b/examples/new-api-enrichment.ipynb @@ -17,9 +17,18 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], "source": [ "%load_ext autoreload\n", "%autoreload 2" @@ -27,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -37,27 +46,17 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "import redis\n", - "r = redis.StrictRedis(host='localhost', port=6379, db=0)" + "from optimus import Optimus\n", + "op = Optimus()" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "from sparkly import SparklySession\n", - "spark = SparklySession()" - ] - }, - { - "cell_type": "code", - "execution_count": 42, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -70,21 +69,12 @@ "]\n", "\n", "# create DataFrame\n", - "df = spark.createDataFrame(vals, columns)" + "df = op.spark.createDataFrame(vals, columns).repartition(1).cache()" ] }, { "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "import optimus 
as Optimus" - ] - }, - { - "cell_type": "code", - "execution_count": 47, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -108,7 +98,7 @@ "\n", "\n", "\n", - "
Viewing 3 of 3 rows / 3 columns
\n", + "
Viewing 3 of 3 rows / 2 columns
\n", "\n", "\n", " \n", @@ -126,12 +116,6 @@ "\n", " \n", " \n", - " \n", - " \n", " \n", "\n", " \n", @@ -147,10 +131,6 @@ " 0\n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -163,10 +143,6 @@ " 1\n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -179,16 +155,12 @@ " 1\n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", "
\n", - "
id
\n", - "
3 (bigint)
\n", - "\n", - "
\n", - " 17179869184\n", - "
\n", - " 42949672960\n", - "
\n", - " 60129542144\n", - "
\n", "\n", - "
Viewing 3 of 3 rows / 3 columns
\n" + "
Viewing 3 of 3 rows / 2 columns
\n" ], "text/plain": [ "" @@ -202,165 +174,273 @@ "df.table()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Testing kombu and Rabbitmq" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Activate RabbitMQ GUI Managment\n", + "Reference https://www.youtube.com/watch?v=F4PvutsQJlc\n", + "Install erlang\n", + "Run C:\\Program Files\\erl10.0.1\\Install.exe\n", + "\n", + "Go to C:\\Program Files\\RabbitMQ Server\\rabbitmq_server-3.7.7\\sbin\n", + "\n", + "`rabbitmq-plugins enable rabbitmq_management`\n", + "\n", + "`rabbitmq-server.bat restart`\n", + "\n", + "http://localhost:15672/" + ] + }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'example-queue'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "from pyspark.sql import functions as F\n", + "from kombu import Connection, Exchange, Queue, Consumer, Producer\n", + "\n", + "rabbit_url = \"amqp://localhost:5672/\"\n", + "conn = Connection(rabbit_url)\n", + "channel = conn.channel()\n", + "exchange = Exchange(\"example-exchange\", type=\"direct\")\n", + "queue = Queue(name=\"example-queue\", exchange=exchange, routing_key=\"BOB\")\n", "\n", - "df = df.withColumn(\"id\", F.monotonically_increasing_id())" + "queue.maybe_bind(conn)\n", + "queue.declare()" ] }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 27, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "+----+----+-----------+\n", - "|dogs|cats| id|\n", - "+----+----+-----------+\n", - "| 2| 0|17179869184|\n", - "| 0| 1|42949672960|\n", - "| 4| 1|60129542144|\n", - "+----+----+-----------+\n", - "\n" - ] + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "df.show()" + "# 
https://medium.com/python-pandemonium/talking-to-rabbitmq-with-python-and-kombu-6cbee93b1298\n", + "#https://medium.com/python-pandemonium/building-robust-rabbitmq-consumers-with-python-and-kombu-part-1-ccd660d17271\n", + "\n", + "def func (messages):\n", + " \n", + " channel = conn.channel() \n", + " producer = Producer(exchange=exchange, channel=channel, routing_key=\"BOB\")\n", + " \n", + " for message in messages:\n", + " as_dict = message.asDict(recursive=True)\n", + " producer.publish(message)\n", + " return messages\n", + " \n", + "df.rdd.mapPartitions(func).count()" ] }, { "cell_type": "code", - "execution_count": 55, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "df.write_ext.redis(\n", - " host='localhost',\n", - " port=6379,\n", - " key_by=['id'],\n", - " exclude_key_columns=True,\n", - " expire=24 * 60 * 60,\n", - " #compression='gzip',\n", - ")" + "def process_message(body, message):\n", + " print(\"The body is {}\".format(body))\n", + " message.ack()\n", + "\n", + " \n", + "with Consumer(conn, queues=queue, callbacks=[process_message], accept=[\"application/json\"]): \n", + " line= conn.drain_events(timeout=5)\n", + " #print(\"No message in the queue\")\n", + " #conn.heartbeat_check()" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 35, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"userId\": 1,\n", - " \"id\": 1,\n", - " \"title\": \"sunt aut facere repellat provident occaecati excepturi optio reprehenderit\",\n", - " \"body\": \"quia et suscipit\\nsuscipit recusandae consequuntur expedita et cum\\nreprehenderit molestiae ut ut quas totam\\nnostrum rerum est autem sunt rem eveniet architecto\"\n", - "}\n" + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mtimeout\u001b[0m Traceback (most 
recent call last)", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\amqp\\transport.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(self, n, initial, _errnos)\u001b[0m\n\u001b[0;32m 417\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 418\u001b[1;33m \u001b[0ms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mrecv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mn\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrbuf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 419\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mtimeout\u001b[0m: timed out", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 20\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mConnection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrabbit_url\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mheartbeat\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m4\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mconn\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 21\u001b[0m \u001b[0mworker\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mWorker\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mqueues\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 22\u001b[1;33m \u001b[0mworker\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\kombu\\mixins.py\u001b[0m in 
\u001b[0;36mrun\u001b[1;34m(self, _tokens, **kwargs)\u001b[0m\n\u001b[0;32m 168\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 169\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mrestart_limit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcan_consume\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_tokens\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# pragma: no cover\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 170\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconsume\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlimit\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 171\u001b[0m \u001b[1;32mpass\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 172\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\kombu\\mixins.py\u001b[0m in \u001b[0;36mconsume\u001b[1;34m(self, limit, timeout, safety_interval, **kwargs)\u001b[0m\n\u001b[0;32m 190\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mon_iteration\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 191\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 192\u001b[1;33m \u001b[0mconn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrain_events\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msafety_interval\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 193\u001b[0m \u001b[1;32mexcept\u001b[0m 
\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 194\u001b[0m \u001b[0mconn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mheartbeat_check\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\kombu\\connection.py\u001b[0m in \u001b[0;36mdrain_events\u001b[1;34m(self, **kwargs)\u001b[0m\n\u001b[0;32m 299\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mexceeded\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 300\u001b[0m \"\"\"\n\u001b[1;32m--> 301\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtransport\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrain_events\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnection\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 302\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 303\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mmaybe_close_channel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mchannel\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\kombu\\transport\\pyamqp.py\u001b[0m in \u001b[0;36mdrain_events\u001b[1;34m(self, connection, **kwargs)\u001b[0m\n\u001b[0;32m 101\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 102\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdrain_events\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 103\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrain_events\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 104\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 105\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_collect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\amqp\\connection.py\u001b[0m in \u001b[0;36mdrain_events\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m 489\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdrain_events\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 490\u001b[0m \u001b[1;31m# read until message is ready\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 491\u001b[1;33m \u001b[1;32mwhile\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mblocking_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 492\u001b[0m \u001b[1;32mpass\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 493\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\amqp\\connection.py\u001b[0m in \u001b[0;36mblocking_read\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m 494\u001b[0m \u001b[1;32mdef\u001b[0m 
\u001b[0mblocking_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 495\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtransport\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhaving_timeout\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 496\u001b[1;33m \u001b[0mframe\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtransport\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_frame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 497\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mon_inbound_frame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mframe\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 498\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\amqp\\transport.py\u001b[0m in \u001b[0;36mread_frame\u001b[1;34m(self, unpack)\u001b[0m\n\u001b[0;32m 241\u001b[0m \u001b[0mread_frame_buffer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mEMPTY_BUFFER\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 242\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 243\u001b[1;33m \u001b[0mframe_header\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m7\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 244\u001b[0m \u001b[0mread_frame_buffer\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[0mframe_header\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 245\u001b[0m 
\u001b[0mframe_type\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mchannel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msize\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0munpack\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'>BHI'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mframe_header\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\amqp\\transport.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(self, n, initial, _errnos)\u001b[0m\n\u001b[0;32m 416\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrbuf\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m<\u001b[0m \u001b[0mn\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 417\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 418\u001b[1;33m \u001b[0ms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mrecv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mn\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrbuf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 419\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 420\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merrno\u001b[0m \u001b[1;32min\u001b[0m \u001b[0m_errnos\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ - "r.get(\"1\")\n", - "r.delete(\"1\")\n", - "import requests \n", - "result = requests.get(\"https://jsonplaceholder.typicode.com/posts/1\")\n", - "print(result.text)" + "# https://medium.com/python-pandemonium/building-robust-rabbitmq-consumers-with-python-and-kombu-part-2-e9505f56e12e\n", + "\n", + "from kombu import Connection, 
Exchange, Queue\n", + "from kombu.mixins import ConsumerMixin\n", + "rabbit_url = \"amqp://localhost:5672/\"\n", + "\n", + "class Worker(ConsumerMixin):\n", + " def __init__(self, connection, queues):\n", + " self.connection = connection\n", + " self.queues = queues\n", + " def get_consumers(self, Consumer, channel):\n", + " return [Consumer(queues=self.queues,\n", + " callbacks=[self.on_message])]\n", + " def on_message(self, body, message):\n", + " print('Got message: {0}'.format(body))\n", + " message.ack()\n", + " \n", + "exchange = Exchange(\"example-exchange\", type=\"direct\")\n", + "queues = [Queue(\"example-queue\", exchange, routing_key=\"BOB\")]\n", + "with Connection(rabbit_url, heartbeat=4) as conn:\n", + " worker = Worker(conn, queues)\n", + " worker.run()\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 36, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "200 OK\n" + "ename": "AttributeError", + "evalue": "'list' object has no attribute 'queue_declare'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmsg_count\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconsumer_count\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mqueues\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mqueue_declare\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m: 'list' object has no attribute 'queue_declare'" ] } ], "source": [ - "from redis_rate_limit import RateLimit, TooManyRequests\n", - "try:\n", - " with RateLimit(resource='local', client='localhost', max_requests=10):\n", - " result = '200 OK'\n", - "except 
TooManyRequests:\n", - " result = '429 Too Many Requests'\n", - "print(result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Celery\n", - "* pip instal redis\n", - "* pip install celery\n", - "* pip install -U \"celery[redis]\"\n", - "* Run worker from python https://gist.github.com/chenjianjx/53d8c2317f6023dc2fa0" + "name, msg_count, consumer_count = queue.queue_declare()" ] }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 35, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ConnectionRefusedError", + "evalue": "[WinError 10061] No connection could be made because the target machine actively refused it", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mConnectionRefusedError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[0ms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mAF_INET\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSOCK_STREAM\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m \u001b[0ms\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhost\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mport\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 10\u001b[0m \u001b[0ms\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mb\"1\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + 
"\u001b[1;31mConnectionRefusedError\u001b[0m: [WinError 10061] No connection could be made because the target machine actively refused it" + ] + } + ], "source": [ - "from optimus.enrichment.worker import download" + "import socket\n", + "import sys\n", + "\n", + "host = '127.0.0.1'\n", + "port = 5005\n", + "\n", + "\n", + "s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", + "s.connect((host,port))\n", + "s.send(b\"1\") \n", + "s.close()" ] }, { "cell_type": "code", - "execution_count": 61, + "execution_count": null, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 61, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "-------------------------------------------\n", + "Time: 2018-09-06 17:28:44\n", + "-------------------------------------------\n", + "\n", + "-------------------------------------------\n", + "Time: 2018-09-06 17:28:45\n", + "-------------------------------------------\n", + "\n" + ] } ], "source": [ - "r = download.delay('http\"\\\\mood.com.ve')\n", - "r.ready()" + "from pyspark import SparkContext, SparkConf\n", + "from pyspark.streaming import StreamingContext\n", + "\n", + "\n", + "\n", + "\n", + "ssc = StreamingContext(op.sc, 1)\n", + "\n", + "\n", + "lines = ssc.socketTextStream(\"localhost\", 9998)\n", + "\n", + "# Split each line into words\n", + "words = lines.flatMap(lambda line: line.split(\" \"))\n", + "\n", + "# Count each word in each batch\n", + "pairs = words.map(lambda word: (word, 1))\n", + "\n", + "\n", + "wordCounts = pairs.reduceByKey(lambda x, y: x + y)\n", + "\n", + "# Print the first ten elements of each RDD generated in this DStream to the console\n", + "wordCounts.pprint()\n", + "\n", + "\n", + "ssc.start() # Start the computation\n", + "ssc.awaitTermination() # Wait for the computation to terminate" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": null, "metadata": {}, - 
"outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "r.ready()" + "print('joa')" ] }, { From 6ad8d5dc9c261e68d30a6cbf12d86fb13fa625ac Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Fri, 7 Sep 2018 14:59:55 -0500 Subject: [PATCH 57/94] Move server start inside the try/ctach block --- optimus/server/run.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/optimus/server/run.py b/optimus/server/run.py index 6e1b51052..e73fccb58 100644 --- a/optimus/server/run.py +++ b/optimus/server/run.py @@ -1,16 +1,17 @@ import configparser import json + from flask import Flask from flask import jsonify -from multiprocessing import Process - -config = configparser.ConfigParser() -path = "" -# try to load the config file try: + # try to load the config file + config = configparser.ConfigParser() + path = "" + config.read("config.ini") path = config["SERVER"]["Input"] + app = Flask(__name__) except IOError: print("config.ini not found") @@ -18,9 +19,6 @@ print("Input info not found in config.ini. Be sure you have...") print("[SERVER]") print("Input = config.ini") - raise - -app = Flask(__name__) @app.route('/') From 4b945dcf00322eb3c2e6fdfc4ef76213754e7eb8 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Fri, 7 Sep 2018 15:02:52 -0500 Subject: [PATCH 58/94] Move all code inside try catch block --- optimus/server/run.py | 55 ++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/optimus/server/run.py b/optimus/server/run.py index e73fccb58..d3e6d8a13 100644 --- a/optimus/server/run.py +++ b/optimus/server/run.py @@ -12,37 +12,38 @@ config.read("config.ini") path = config["SERVER"]["Input"] app = Flask(__name__) -except IOError: - print("config.ini not found") -except KeyError: - print("Input info not found in config.ini. 
Be sure you have...") - print("[SERVER]") - print("Input = config.ini") + @app.route('/') + def index(): + """ + Return a message indicating if the server is running. + :return: + """ + return jsonify("Optimus Server is running... Go to /profiler to get the Optimus profiler data.") -@app.route('/') -def index(): - """ - Return a message indicating if the server is running. - :return: - """ - return jsonify("Optimus Server is running... Go to /profiler to get the Optimus profiler data.") + @app.route('/profiler') + def profiler(): + """ + Return the data profiler in json format. + :return: + """ + try: + with app.app_context(): + with open(path, encoding="utf8") as f: + data = json.loads(f.read()) + return jsonify(data) + except IOError: + return jsonify("Not data profiling available") -@app.route('/profiler') -def profiler(): - """ - Return the data profiler in json format. - :return: - """ - try: - with app.app_context(): - with open(path, encoding="utf8") as f: - data = json.loads(f.read()) - return jsonify(data) - except IOError: - return jsonify("Not data profiling available") + app.run() -app.run() +except IOError: + print("config.ini not found") + +except KeyError: + print("Input info not found in config.ini. 
Be sure you have...") + print("[SERVER]") + print("Input = config.ini") From ec52636b69d79922494f9205bf068ae82b67fa04 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Fri, 7 Sep 2018 15:21:52 -0500 Subject: [PATCH 59/94] Codacy Fix --- optimus/create.py | 4 +--- optimus/functions.py | 10 +++------- optimus/server/process.py | 24 +++--------------------- 3 files changed, 7 insertions(+), 31 deletions(-) diff --git a/optimus/create.py b/optimus/create.py index 78d06bd89..c9a0ed9dc 100644 --- a/optimus/create.py +++ b/optimus/create.py @@ -6,8 +6,6 @@ from optimus.helpers.functions import get_spark_dtypes_object from optimus.spark import Spark -import pandas as pdf - class Create: @@ -54,7 +52,7 @@ def data_frame(cols, rows): def data_frame(pdf): """ Helper to create a Spark dataframe: - :param pdf: List of Tuple with name, data type and a flag to accept null + :param pdf: Panda Dataframe :return: Dataframe """ diff --git a/optimus/functions.py b/optimus/functions.py index e95e9c09b..2258a3743 100644 --- a/optimus/functions.py +++ b/optimus/functions.py @@ -1,6 +1,4 @@ import base64 -import logging - from fastnumbers import isint, isfloat from functools import reduce from io import BytesIO @@ -8,11 +6,9 @@ import dateutil.parser import matplotlib.pyplot as plt from numpy import array -from py4j.protocol import Py4JJavaError from pyspark.sql import DataFrame from pyspark.sql import functions as F -from optimus.helpers.checkit import is_ from optimus.helpers.functions import is_pyarrow_installed, parse_python_dtypes, random_int, one_list_to_val, \ get_spark_dtypes_object from optimus.helpers.raiseit import RaiseIt @@ -281,9 +277,9 @@ def str_to_array(value): :return: """ try: - isinstance(literal_eval((value.encode('ascii', 'ignore')).decode("utf-8")), (list, tuple)) - return True - except: + if isinstance(literal_eval((value.encode('ascii', 'ignore')).decode("utf-8")), (list, tuple)): + return True + except ValueError: pass def func(value): diff --git 
a/optimus/server/process.py b/optimus/server/process.py index cb053dd2b..a80cdf127 100644 --- a/optimus/server/process.py +++ b/optimus/server/process.py @@ -1,13 +1,15 @@ import os import platform -import psutil import signal import sys from subprocess import Popen, PIPE +import psutil + WINDOWS = "windows" PLATFORM = platform.system().lower() + # test https://stackoverflow.com/questions/984941/python-subprocess-popen-from-a-thread class Process: @@ -35,9 +37,6 @@ def __init__(self, path=None): # process.join() self.process = process - - assert not self.process.poll() - self.path = path def stop(self): @@ -55,23 +54,6 @@ def stop_id(pid): :return: """ - # Reference https://stackoverflow.com/questions/1230669/subprocess-deleting-child-processes-in-windows - def kill_proc_tree(pid, including_parent=True): - """ - Kill process and children - :param pid: - :param including_parent: - :return: - """ - parent = psutil.Process(pid) - children = parent.children(recursive=True) - for child in children: - child.kill() - gone, still_alive = psutil.wait_procs(children, timeout=5) - if including_parent: - parent.wait(5) - parent.kill() - def kill_proc(pid): """ Kill process From afed8ddaa1e01840073c037eb2f7997f85420467 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Fri, 7 Sep 2018 19:42:36 -0500 Subject: [PATCH 60/94] Added function to check if there is and specific data type in a list --- optimus/helpers/checkit.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/optimus/helpers/checkit.py b/optimus/helpers/checkit.py index 0fc320337..1a2a91372 100644 --- a/optimus/helpers/checkit.py +++ b/optimus/helpers/checkit.py @@ -254,6 +254,10 @@ def is_dataframe(value): return isinstance(value, DataFrame) +def has_(value, _type): + return any(isinstance(elem, _type) for elem in value) + + def is_data_type(value, data_type): """ Check if a value can be casted to a specific From 17fe3f2b47aa66967ae1652a3971db684c29d65a Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Fri, 7 Sep 
2018 19:43:17 -0500 Subject: [PATCH 61/94] Now nest accepts Columns and string --- optimus/dataframe/columns.py | 14 +++- tests/test_cols.py | 122 +++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+), 3 deletions(-) diff --git a/optimus/dataframe/columns.py b/optimus/dataframe/columns.py index 6d4a81bc3..e931557f8 100644 --- a/optimus/dataframe/columns.py +++ b/optimus/dataframe/columns.py @@ -18,7 +18,7 @@ from optimus.functions import filter_row_by_data_type as fbdt from optimus.helpers.checkit \ import is_num_or_str, is_list, is_, is_tuple, is_list_of_dataframes, is_list_of_tuples, \ - is_function, is_one_element, is_type, is_int, is_dict, is_str + is_function, is_one_element, is_type, is_int, is_dict, is_str, has_ # Helpers from optimus.helpers.constants import * from optimus.helpers.decorators import add_attr @@ -1084,7 +1084,7 @@ def iqr(columns, more=None): @add_attr(cols) # TODO: Maybe we should create nest_to_vector and nest_array, nest_to_string - def nest(input_cols, output_col, shape=None, separator=" "): + def nest(input_cols, output_col, shape="string", separator=""): """ Concat multiple columns to one with the format specified :param input_cols: columns to be nested @@ -1093,16 +1093,24 @@ def nest(input_cols, output_col, shape=None, separator=" "): :param shape: final data type, 'array', 'string' or 'vector' :return: Spark DataFrame """ - columns = parse_columns(self, input_cols) + df = self + if has_(input_cols, F.Column): + "Transform non Column data to lit" + columns = [F.lit(col) if not is_(col, F.Column) else col for col in input_cols] + else: + columns = parse_columns(self, input_cols) + if shape is "vector": + vector_assembler = VectorAssembler( inputCols=input_cols, outputCol=output_col) df = vector_assembler.transform(self) elif shape is "array": + df = apply_expr(output_col, F.array(*columns)) elif shape is "string": diff --git a/tests/test_cols.py b/tests/test_cols.py index e85315685..507cdaa6c 100644 --- 
a/tests/test_cols.py +++ b/tests/test_cols.py @@ -1,3 +1,5 @@ +from pyspark.ml.linalg import VectorUDT + from optimus import Optimus from pyspark.sql.types import * from pyspark.sql import Row @@ -526,3 +528,123 @@ def test_sort(): ) assert (actual_df.collect() == expected_df.collect()) + + @staticmethod + def test_nest(): + source_df = op.create.df( + rows=[ + ("happy", 1), + ("excited", 2) + ], + cols=[ + ("emotion", StringType(), True), + ("num", IntegerType(), True) + ] + ) + + actual_df = source_df.cols.nest(["emotion", "num"], "new", separator=" ") + + expected_df = op.create.df( + rows=[ + (1, "happy", "1 happy"), + (2, "excited", "2 excited") + ], + cols=[ + ("emotion", StringType(), True), + ("num", IntegerType(), True), + ("new", StringType(), True) + + ] + ) + + assert (actual_df.collect() == expected_df.collect()) + + @staticmethod + def test_nest_mix(): + source_df = op.create.df( + rows=[ + ("happy", 1), + ("excited", 2) + ], + cols=[ + ("emotion", StringType(), True), + ("num", IntegerType(), True) + ] + ) + + actual_df = source_df.cols.nest([F.Column("emotion"), "---", F.Column("num")], separator="new") + + expected_df = op.create.df( + rows=[ + (1, "happy", "1---happy"), + (2, "excited", "2---excited") + ], + cols=[ + ("emotion", StringType(), True), + ("num", IntegerType(), True), + ("new", StringType(), True) + + ] + ) + + assert (actual_df.collect() == expected_df.collect()) + + @staticmethod + def test_nest_vector(): + source_df = op.create.df( + rows=[ + ("happy", 1), + ("excited", 2) + ], + cols=[ + ("emotion", StringType(), True), + ("num", IntegerType(), True) + ] + ) + + actual_df = source_df.cols.nest(["emotion", "num"], "new", shape="vector") + + expected_df = op.create.df( + rows=[ + (1, "happy", [1, "happy"]), + (2, "excited", [2, "excited"]) + ], + cols=[ + ("emotion", StringType(), True), + ("num", IntegerType(), True), + ("new", VectorUDT(), True) + + ] + ) + + assert (actual_df.collect() == expected_df.collect()) + + @staticmethod 
+ def test_nest_array(): + source_df = op.create.df( + rows=[ + ("happy", 1), + ("excited", 2) + ], + cols=[ + ("emotion", StringType(), True), + ("num", IntegerType(), True) + ] + ) + + actual_df = source_df.cols.nest(["emotion", "num"], "new", shape="array") + + expected_df = op.create.df( + rows=[ + (1, "happy", [1, "happy"]), + (2, "excited", [2, "excited"]) + ], + cols=[ + ("emotion", StringType(), True), + ("num", IntegerType(), True), + ("new", ArrayType(), True) + + ] + ) + + assert (actual_df.collect() == expected_df.collect()) From b0b4f5527c0c23e053740e8ab895b7aa4aa016fc Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Fri, 7 Sep 2018 19:45:13 -0500 Subject: [PATCH 62/94] Now dataframes can me saved to rabbitmq and mongo --- optimus/io/save.py | 65 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/optimus/io/save.py b/optimus/io/save.py index 86c451e97..c8dc39c9b 100644 --- a/optimus/io/save.py +++ b/optimus/io/save.py @@ -1,4 +1,7 @@ import logging +from kombu import Connection, Exchange, Queue, Consumer, Producer +from pymongo import MongoClient +from tqdm import tqdm_notebook from pyspark.sql import DataFrame @@ -109,6 +112,68 @@ def avro(path, mode="overwrite", num_partitions=1): logging.error(e) raise + @add_attr(save) + def rabbit_mq(host, exchange_name=None, queue_name=None, routing_key=None, parallelism=None): + """ + Send a dataframe to a redis queue + # https://medium.com/python-pandemonium/talking-to-rabbitmq-with-python-and-kombu-6cbee93b1298 + # https://medium.com/python-pandemonium/building-robust-rabbitmq-consumers-with-python-and-kombu-part-1-ccd660d17271 + :return: + """ + df = self + if parallelism: + df = df.coalesce(parallelism) + + def _rabbit_mq(messages): + conn = Connection(host) + channel = conn.channel() + + exchange = Exchange(exchange_name, type="direct") + queue = Queue(name=queue_name, exchange=exchange, routing_key=routing_key) + + queue.maybe_bind(conn) + queue.declare() + 
producer = Producer(exchange=exchange, channel=channel, routing_key=routing_key) + + for message in messages: + # as_dict = message.asDict(recursive=True) + producer.publish(message) + + channel.close() + conn.release() + return messages + + self.rdd.mapPartitions(_rabbit_mq).count() + + @add_attr(save) + def mongo(host, port=None, db_name=None, collection_name=None, parallelism=None): + """ + Send a dataframe to a mongo collection + + :param host: + :param port: + :param db_name: + :param collection_name: + :param parallelism: + :return: + """ + df = self + if parallelism: + df = df.coalesce(parallelism) + + def _mongo(messages): + client = MongoClient(host, port) + db = client[db_name] + collection = db[collection_name] + + for message in messages: + as_dict = message.asDict(recursive=True) + collection.insert_one(as_dict) + client.close() + return messages + + df.rdd.mapPartitions(_mongo).count() + return save From 9f6119a09e2ed3cf8f6116f84625b72171803c49 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Fri, 7 Sep 2018 21:34:16 -0500 Subject: [PATCH 63/94] Enricher. 
Work in progress --- examples/new-api-enrichment.ipynb | 296 ++++++++++++----------- optimus/enricher.py | 381 ++++++++++++++++++++++++++++++ requirements.txt | 3 + 3 files changed, 544 insertions(+), 136 deletions(-) create mode 100644 optimus/enricher.py diff --git a/examples/new-api-enrichment.ipynb b/examples/new-api-enrichment.ipynb index cd32fd840..0792b1a35 100644 --- a/examples/new-api-enrichment.ipynb +++ b/examples/new-api-enrichment.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -46,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -56,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -74,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -201,158 +201,210 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 6, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'example-queue'" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "from kombu import Connection, Exchange, Queue, Consumer, Producer\n", - "\n", - "rabbit_url = \"amqp://localhost:5672/\"\n", - "conn = Connection(rabbit_url)\n", - "channel = conn.channel()\n", - "exchange = Exchange(\"example-exchange\", type=\"direct\")\n", - "queue = Queue(name=\"example-queue\", exchange=exchange, routing_key=\"BOB\")\n", - "\n", - "queue.maybe_bind(conn)\n", - "queue.declare()" + "df.save.rabbit_mq(\"amqp://localhost:5672/\",\"exchange_name\", \"example-queue\",\"BOB\" )" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": 
{}, + "outputs": [], + "source": [ + "df.save.mongo(\"localhost\",27017, \"db\",\"BOB\")" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 91, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 3 of 3 rows / 3 columns
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
dogs
\n", + "
1 (bigint)
\n", + "\n", + "
\n", + "
cats
\n", + "
2 (bigint)
\n", + "\n", + "
\n", + "
new
\n", + "
3 (string)
\n", + "\n", + "
\n", + " 2\n", + " \n", + " 0\n", + " \n", + " http://google.com/2/0\n", + "
\n", + " 0\n", + " \n", + " 1\n", + " \n", + " http://google.com/0/1\n", + "
\n", + " 4\n", + " \n", + " 1\n", + " \n", + " http://google.com/4/1\n", + "
\n", + "\n", + "
Viewing 3 of 3 rows / 3 columns
\n" + ], "text/plain": [ - "0" + "" ] }, - "execution_count": 27, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "# https://medium.com/python-pandemonium/talking-to-rabbitmq-with-python-and-kombu-6cbee93b1298\n", - "#https://medium.com/python-pandemonium/building-robust-rabbitmq-consumers-with-python-and-kombu-part-1-ccd660d17271\n", + "from pyspark.sql import functions as F\n", + "\n", + "df_url = df.cols.nest((\"http://google.com/\",F.col(\"dogs\"),\"/\",F.col(\"cats\")),\"url\")\n", "\n", - "def func (messages):\n", - " \n", - " channel = conn.channel() \n", - " producer = Producer(exchange=exchange, channel=channel, routing_key=\"BOB\")\n", - " \n", - " for message in messages:\n", - " as_dict = message.asDict(recursive=True)\n", - " producer.publish(message)\n", - " return messages\n", - " \n", - "df.rdd.mapPartitions(func).count()" + "# send to mongo\n", + "df_url.enrinch(\"new_column\", )\n", + "\n", + "# Get info from mongo\n", + "\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 129, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cats\n", + "dogs\n", + "_id\n" + ] + } + ], "source": [ - "def process_message(body, message):\n", - " print(\"The body is {}\".format(body))\n", - " message.ack()\n", + "from optimus.enricher import Enricher\n", "\n", - " \n", - "with Consumer(conn, queues=queue, callbacks=[process_message], accept=[\"application/json\"]): \n", - " line= conn.drain_events(timeout=5)\n", - " #print(\"No message in the queue\")\n", - " #conn.heartbeat_check()" + "e = Enricher(\"localhost\",27017, \"db\",\"BOB\")\n", + "#e.run(df, )\n", + "e.show_cols(\"BOB\")" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 120, "metadata": {}, "outputs": [ { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - 
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mtimeout\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\amqp\\transport.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(self, n, initial, _errnos)\u001b[0m\n\u001b[0;32m 417\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 418\u001b[1;33m \u001b[0ms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mrecv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mn\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrbuf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 419\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mtimeout\u001b[0m: timed out", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 20\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mConnection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrabbit_url\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mheartbeat\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m4\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mconn\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 21\u001b[0m \u001b[0mworker\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mWorker\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mqueues\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 22\u001b[1;33m 
\u001b[0mworker\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\kombu\\mixins.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(self, _tokens, **kwargs)\u001b[0m\n\u001b[0;32m 168\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 169\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mrestart_limit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcan_consume\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_tokens\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# pragma: no cover\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 170\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconsume\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlimit\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 171\u001b[0m \u001b[1;32mpass\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 172\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\kombu\\mixins.py\u001b[0m in \u001b[0;36mconsume\u001b[1;34m(self, limit, timeout, safety_interval, **kwargs)\u001b[0m\n\u001b[0;32m 190\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mon_iteration\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 191\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 192\u001b[1;33m 
\u001b[0mconn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrain_events\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msafety_interval\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 193\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 194\u001b[0m \u001b[0mconn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mheartbeat_check\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\kombu\\connection.py\u001b[0m in \u001b[0;36mdrain_events\u001b[1;34m(self, **kwargs)\u001b[0m\n\u001b[0;32m 299\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mexceeded\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 300\u001b[0m \"\"\"\n\u001b[1;32m--> 301\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtransport\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrain_events\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnection\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 302\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 303\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mmaybe_close_channel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mchannel\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\kombu\\transport\\pyamqp.py\u001b[0m in \u001b[0;36mdrain_events\u001b[1;34m(self, connection, 
**kwargs)\u001b[0m\n\u001b[0;32m 101\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 102\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdrain_events\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 103\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrain_events\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 104\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 105\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_collect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\amqp\\connection.py\u001b[0m in \u001b[0;36mdrain_events\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m 489\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdrain_events\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 490\u001b[0m \u001b[1;31m# read until message is ready\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 491\u001b[1;33m \u001b[1;32mwhile\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mblocking_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 492\u001b[0m \u001b[1;32mpass\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 493\u001b[0m 
\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\amqp\\connection.py\u001b[0m in \u001b[0;36mblocking_read\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m 494\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mblocking_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 495\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtransport\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhaving_timeout\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 496\u001b[1;33m \u001b[0mframe\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtransport\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_frame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 497\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mon_inbound_frame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mframe\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 498\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\amqp\\transport.py\u001b[0m in \u001b[0;36mread_frame\u001b[1;34m(self, unpack)\u001b[0m\n\u001b[0;32m 241\u001b[0m \u001b[0mread_frame_buffer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mEMPTY_BUFFER\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 242\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 243\u001b[1;33m \u001b[0mframe_header\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m7\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 244\u001b[0m \u001b[0mread_frame_buffer\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[0mframe_header\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 245\u001b[0m \u001b[0mframe_type\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mchannel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msize\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0munpack\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'>BHI'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mframe_header\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\amqp\\transport.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(self, n, initial, _errnos)\u001b[0m\n\u001b[0;32m 416\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrbuf\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m<\u001b[0m \u001b[0mn\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 417\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 418\u001b[1;33m \u001b[0ms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mrecv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mn\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrbuf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 419\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 420\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merrno\u001b[0m \u001b[1;32min\u001b[0m \u001b[0m_errnos\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + 
"Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'db'), 'BOB')\n" ] } ], "source": [ - "# https://medium.com/python-pandemonium/building-robust-rabbitmq-consumers-with-python-and-kombu-part-2-e9505f56e12e\n", - "\n", - "from kombu import Connection, Exchange, Queue\n", - "from kombu.mixins import ConsumerMixin\n", - "rabbit_url = \"amqp://localhost:5672/\"\n", - "\n", - "class Worker(ConsumerMixin):\n", - " def __init__(self, connection, queues):\n", - " self.connection = connection\n", - " self.queues = queues\n", - " def get_consumers(self, Consumer, channel):\n", - " return [Consumer(queues=self.queues,\n", - " callbacks=[self.on_message])]\n", - " def on_message(self, body, message):\n", - " print('Got message: {0}'.format(body))\n", - " message.ack()\n", - " \n", - "exchange = Exchange(\"example-exchange\", type=\"direct\")\n", - "queues = [Queue(\"example-queue\", exchange, routing_key=\"BOB\")]\n", - "with Connection(rabbit_url, heartbeat=4) as conn:\n", - " worker = Worker(conn, queues)\n", - " worker.run()\n", - "\n" + "print(e.db_exists(\"db\"))\n", + "print(e.get_collection(\"BOB\"))" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 106, "metadata": {}, "outputs": [ { - "ename": "AttributeError", - "evalue": "'list' object has no attribute 'queue_declare'", + "ename": "NameError", + "evalue": "name 'optimus' is not defined", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmsg_count\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconsumer_count\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mqueues\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mqueue_declare\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m: 'list' object has no attribute 'queue_declare'" + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;31m#Geolocation\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 7\u001b[1;33m \u001b[0mcollection\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0moptimus\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_collection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'step3'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 8\u001b[0m \u001b[0mcursor\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcollection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;34m'$or'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;34m'lat'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;34m'lng'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mprojection\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'_id'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'lat'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'lng'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'state'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'city'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mNameError\u001b[0m: name 'optimus' is not defined" ] } ], - "source": [ - "name, msg_count, consumer_count = queue.queue_declare()" - ] + "source": 
[] }, { "cell_type": "code", @@ -433,22 +485,6 @@ "ssc.start() # Start the computation\n", "ssc.awaitTermination() # Wait for the computation to terminate" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print('joa')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -456,18 +492,6 @@ "display_name": "Python 3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" } }, "nbformat": 4, diff --git a/optimus/enricher.py b/optimus/enricher.py new file mode 100644 index 000000000..f26e6b384 --- /dev/null +++ b/optimus/enricher.py @@ -0,0 +1,381 @@ +import json +import logging +import urllib + +import requests +from pymongo import MongoClient +from tqdm import tqdm_notebook + + +class Enricher: + + def __init__(self, host, port, db_name=None, collection_name=None): + self.host = host + self.port = port + self.client = MongoClient(host, port) + + if self.db_exists(db_name): + self.db = self.client[db_name] + else: + raise Exception('Database do not exist') + + if self.collection_exists(collection_name): + self.collection = self.client[collection_name] + else: + raise Exception('Collection do not exist') + + def run(self): + """ + + :return: + """ + collection = self.get_collection('step3') + cursor = collection.find({'$or': [{'lat': None}, {'lng': None}]}, + projection=['_id', 'lat', 'lng', 'state', 'city']) + + for r in tqdm_notebook(cursor, total=cursor.count(), desc='Geolocation'): + state = r['state'].lower() + state = "".join(state.split()) + + if (state == "edo.demexico"): + state = "estado de mexico" + else: + state = r['state'] + + url = 
"http://46.101.10.63:8080/search?format=jsonv2&q=mexico+" + state + '+' + r['city'] + url = urllib.parse.unquote_plus(url) + + result = requests.get(url) + result_json = json.loads(result.text) + if (result_json): + + lat = round(float(result_json[0]['lat']), 6) + lon = round(float(result_json[0]['lon']), 6) + collection.update_one({'_id': r['_id']}, {'$set': {'lat': lat, 'lng': lon}}, upsert=False) + + else: + print('No procesado') + + def collection_exists(self, collection_name): + """ + Check if a collection exist + :param collection_name: + :return: + """ + if collection_name in self.db.collection_names(): + return True + else: + return False + + def db_exists(self, db_name): + """ + Check if a collection exist + :param db_name: + :return: + """ + if db_name in self.client.list_database_names(): + return True + else: + return False + + def get_collection(self, collection_name): + """ + + :param collection_name: + :return: + """ + return self.db[collection_name] + + def copy_collection(self, source_name, dest_name): + """ + Copy Collection + :param source_name: + :param dest_name: + :return: + """ + + source = self.db[source_name] + + logging.info('Dropping', dest_name, 'collection') + self.db[dest_name].drop() + # if data exist in the collection drop it + + pipeline = [{"$match": {}}, + {"$out": dest_name}, + ] + logging.info('Copying', source_name, 'collection to', dest_name, 'collection ...') + + source.aggregate(pipeline) + logging.info('Done') + + def show_cols(self, collection_name=None): + """ + Show cols + :param collection_name: + :return: + """ + + if not self.collection_exists(collection_name): + raise Exception("Collection {collection_name} not exist".format(collection_name=collection_name)) + + source = None + if collection_name is not None: + source = self.get_collection(collection_name) + + results = source.aggregate([ + {"$project": {"arrayofkeyvalue": {"$objectToArray": "$$ROOT"}}}, + {"$unwind": "$arrayofkeyvalue"}, + {"$group": {"_id": None, 
"allkeys": {"$addToSet": "$arrayofkeyvalue.k"}}} + ]) + logging.info(results) + results = list(results)[0]['allkeys'] + + for r in results: + logging.info(r) + + def get_cols(self, collection_name=None): + """ + + :param collection_name: + :return: + """ + if collection_name is not None: + source = self.get_collection(collection_name) + else: + source = self.collection + logging.info(source) + logging.info('Getting cols...') + result = source.aggregate([ + {"$project": {"arrayofkeyvalue": {"$objectToArray": "$$ROOT"}}}, + {"$unwind": "$arrayofkeyvalue"}, + {"$group": {"_id": None, "allkeys": {"$addToSet": "$arrayofkeyvalue.k"}}} + ]) + result = list(result)[0]['allkeys'] + logging.info('Done') + return result + + def drop_col(cols): + """ + + :return: + """ + for x in tqdm_notebook(cols, desc='Processing cols'): + logging.info('Dropping', x, 'field') + dest_collection.update_many({}, {'$unset': {x: 1}}) + + def insert_to_collection(self, cursor, dest_collection_name, drop=False): + """ + + :param cursor: + :param dest_collection_name: + :param drop: + :return: + """ + dest_collection = self.get_collection(dest_collection_name) + if drop: + dest_collection.drop() + + if "count" in dir(cursor): + count = cursor.count() + else: + count = 1 + + for c in tqdm_notebook(cursor, total=count, desc='Saving Collection'): + dest_collection.insert_one(c) + + def create_missing_fields(self, cols, collection_name=None): + """ + + :param cols: + :param collection_name: + :return: + """ + + if collection_name is not None: + source = self.get_collection(collection_name) + else: + source = self.collection + + for c in tqdm_notebook(cols, total=len(cols), desc='Processing cols'): + logging.info('Inserting', c) + if c: + source.update_many( + {c: {'$exists': False}}, + {'$set': + { + c: None, + } + } + ); + else: + logging.info('Field', c, 'could not be added') + + def convert_field_to(self, collection_name, field, convert_to): + """ + + :param collection_name: + :param field: + :param 
convert_to: + :return: + """ + collection = self.get_collection(collection_name) + cursor = collection.find({field: {'$exists': True}}).limit(0) + desc = 'Converting', field, 'to', convert_to + + # for c in tqdm_notebook(cursor, total = cursor.count(), desc = 'sad'): + + if convert_to == 'int': + l = float + elif convert_to == 'float': + l = int + elif convert_to == 'string': + l = str + else: + raise ValueError('Only int, float or string accepted in field param', field, 'value present') + + for c in tqdm_notebook(cursor, total=cursor.count(), desc='Processing records'): + try: + val = c[field] + val = l(c[field]) + collection.update_one({'_id': c['_id']}, {'$set': {field: val}}) + + except ValueError: + logging.info('Could not convert "', val, '" to', convert_to) + + @staticmethod + def mongo_to_json_array_file(filename, host, port, db, collection, projection=None, limit=None): + if (limit is None): + limit = 0 + + client = MongoClient(host, port) + _db = client[db] + _collection = _db[collection] + + file = open(filename, "w") + file.write('[') + + i = 0 + properties = {} + + # dump all the data + documents = _collection.find({}, projection).limit(limit) + count = documents.count(True) + + for r in tqdm_notebook(documents, total=count, desc='Processing records'): + + i = i + 1 + # FIX: we should remove the id in the projection + r.pop('_id') + + file.write(json.dumps(r)) + if (i < count): + file.write(',') + + file.write(']') + file.close() + + @staticmethod + def to_geojson_file(filename, host, port, db, collection, coordinates_keys, projection=None): + + client = MongoClient(host, port) + _db = client[db] + _collection = _db[collection] + + file = open(filename, "w") + file.write('{"type": "FeatureCollection","features":[') + + i = 0 + properties = {} + + # dump all the data + projection = coordinates_keys + projection + # rojection.append({'_id':False}) + documents = _collection.find({}, projection).limit(0) + count = documents.count(True) + + for r in 
tqdm_notebook(documents, total=count, desc='Processing records'): + + i = i + 1 + + lon_key = coordinates_keys[0] + # Verify if the key exist and is a float number + if (lon_key in r) and (isinstance(r[lon_key], float)): + lon = r[lon_key] + + lat_key = coordinates_keys[1] + if (lat_key in r) and (isinstance(r[lat_key], float)): + lat = r[lat_key] + + # FIX: we should remove the id in the projection + r.pop('_id') + r.pop(lon_key) + r.pop(lat_key) + + features = {"type": "Feature", "properties": r, "geometry": {"type": "Point", "coordinates": [lon, lat]}} + + file.write(json.dumps(features)) + if (i < count): + file.write(',') + + file.write(']}') + file.close() + + def merge_two_dicts(x, y): + """ + + :param y: + :return: + """ + z = x.copy() # start with x's keys and values + z.update(y) # modifies z with y's keys and values & returns None + return z + + def head(self, collection_name, n=1): + """ + + :param collection_name: + :param n: + :return: + """ + if not int(n): + raise Exception('n param must be an integer') + if self.collection_exists(collection_name): + if isinstance(collection_name, str): + # try to bring a cursor from a collection + cursor = self.get_collection(collection_name).find({}).limit(n) + count = cursor.count(True) + # FIX: and elegant way to make it in python? + i = 0 + + for c in cursor: + if (i < count): + print(c) + else: + break + i = i + 1 + else: + msg = 'Collection', collection_name, ' do not exist' + raise Exception(msg) + + # FIX: not work need to find how to implement in a cursor object + def head_cursor(self, collection_name_or_cursor, n=1): + if not int(n): raise Exception('n must be an integer') + + if isinstance(collection_name_or_cursor, str): + # try to bring a cursor from a collection + cursor = self.get_collection(collection_name_or_cursor).find({}).limit(n) + count = cursor.count(True) + else: + cursor = collection_name_or_cursor + cursor.rewind() + count = n + + # FIX: and elegant way to make it in python? 
+ i = 0 + + for c in cursor: + if (i < count): + print(c) + else: + break + i = i + 1 diff --git a/requirements.txt b/requirements.txt index f09e1cd89..66b74507b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,6 @@ +requests +tqdm +pymongo celery fastnumbers==2.1.1 multipledispatch==0.6.0 From eeae3fd29f7db16087279b563d005e03d339bed7 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Sat, 8 Sep 2018 20:08:42 -0500 Subject: [PATCH 64/94] Enrichment done. Further test needed --- examples/new-api-enrichment.ipynb | 335 +++++++++++------------- optimus/enricher.py | 412 ++++++++++++++++++------------ requirements.txt | 10 +- 3 files changed, 408 insertions(+), 349 deletions(-) diff --git a/examples/new-api-enrichment.ipynb b/examples/new-api-enrichment.ipynb index 0792b1a35..b1e4a0302 100644 --- a/examples/new-api-enrichment.ipynb +++ b/examples/new-api-enrichment.ipynb @@ -1,34 +1,10 @@ { "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step for make it work\n", - "\n", - "Install Redis for your OS\n", - "* pip install redis\n", - "* pip install python-redis-rate-limit # https://pypi.org/project/python-redis-rate-limit/\n", - "* pip install sparkly https://www.scivision.co/python-windows-visual-c++-14-required/\n", - "* pip install sparkly[redis]\n", - "* Install http://joeferner.github.io/redis-commander/ \n", - "Then `redis-commander --redis-host=localhost --redis-port=6379` from the browser http://localhost:8081/" - ] - }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. 
To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" @@ -36,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -46,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -61,11 +37,14 @@ "outputs": [], "source": [ "# make some test data\n", - "columns = ['dogs', 'cats']\n", + "columns = ['todo_id']\n", "vals = [\n", - " (2, 0),\n", - " (0, 1),\n", - " (4, 1)\n", + " (1, ),\n", + " (2, ),\n", + " (3, ),\n", + " (4, ),\n", + " (5, )\n", + "\n", "]\n", "\n", "# create DataFrame\n", @@ -98,24 +77,18 @@ "\n", "\n", "\n", - "
Viewing 3 of 3 rows / 2 columns
\n", + "
Viewing 5 of 5 rows / 1 columns
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", "\n", " \n", @@ -124,11 +97,7 @@ " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", @@ -136,11 +105,15 @@ " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -151,8 +124,12 @@ " 4\n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -160,7 +137,7 @@ " \n", "
\n", - "
dogs
\n", + "
todo_id
\n", "
1 (bigint)
\n", "\n", "
\n", - "
cats
\n", - "
2 (bigint)
\n", - "\n", - "
\n", - " 2\n", - " \n", - " 0\n", + " 1\n", "
\n", - " 0\n", + " 2\n", "
\n", - " 1\n", + " 3\n", "
\n", - " 1\n", + " 5\n", "
\n", "\n", - "
Viewing 3 of 3 rows / 2 columns
\n" + "
Viewing 5 of 5 rows / 1 columns
\n" ], "text/plain": [ "" @@ -174,31 +151,6 @@ "df.table()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Testing kombu and Rabbitmq" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Activate RabbitMQ GUI Managment\n", - "Reference https://www.youtube.com/watch?v=F4PvutsQJlc\n", - "Install erlang\n", - "Run C:\\Program Files\\erl10.0.1\\Install.exe\n", - "\n", - "Go to C:\\Program Files\\RabbitMQ Server\\rabbitmq_server-3.7.7\\sbin\n", - "\n", - "`rabbitmq-plugins enable rabbitmq_management`\n", - "\n", - "`rabbitmq-server.bat restart`\n", - "\n", - "http://localhost:15672/" - ] - }, { "cell_type": "code", "execution_count": 6, @@ -210,7 +162,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -219,7 +171,14 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -243,27 +202,21 @@ "\n", "\n", "\n", - "
Viewing 3 of 3 rows / 3 columns
\n", + "
Viewing 5 of 5 rows / 2 columns
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", @@ -275,15 +228,11 @@ " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", @@ -291,15 +240,23 @@ " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", @@ -311,11 +268,19 @@ " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", @@ -323,7 +288,7 @@ " \n", "
\n", - "
dogs
\n", + "
todo_id
\n", "
1 (bigint)
\n", "\n", "
\n", - "
cats
\n", - "
2 (bigint)
\n", - "\n", - "
\n", - "
new
\n", - "
3 (string)
\n", + "
url
\n", + "
2 (string)
\n", "\n", "
\n", - " 2\n", - " \n", - " 0\n", + " 1\n", " \n", - " http://google.com/2/0\n", + " https://jsonplaceholder.typicode.com/todos/1\n", "
\n", - " 0\n", + " 2\n", " \n", - " 1\n", + " https://jsonplaceholder.typicode.com/todos/2\n", + "
\n", + " 3\n", " \n", - " http://google.com/0/1\n", + " https://jsonplaceholder.typicode.com/todos/3\n", "
\n", - " 1\n", + " https://jsonplaceholder.typicode.com/todos/4\n", + "
\n", + " 5\n", " \n", - " http://google.com/4/1\n", + " https://jsonplaceholder.typicode.com/todos/5\n", "
\n", "\n", - "
Viewing 3 of 3 rows / 3 columns
\n" + "
Viewing 5 of 5 rows / 2 columns
\n" ], "text/plain": [ "" @@ -335,155 +300,145 @@ ], "source": [ "from pyspark.sql import functions as F\n", + " \n", + "df_url = df.cols.nest((\"https://jsonplaceholder.typicode.com/todos/\",F.col(\"todo_id\")),\"url\")\n", + "df_url.table()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "from optimus.enricher import Enricher\n", "\n", - "df_url = df.cols.nest((\"http://google.com/\",F.col(\"dogs\"),\"/\",F.col(\"cats\")),\"url\")\n", - "\n", - "# send to mongo\n", - "df_url.enrinch(\"new_column\", )\n", - "\n", - "# Get info from mongo\n", - "\n" + "e = Enricher(\"localhost\",27017, \"db\",\"BOB\")\n", + "e.send(df_url)" ] }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 44, "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e7770c7e28024c5eb918f84b30a4a3d1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, description='Processing...', max=10), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", "text": [ - "cats\n", - "dogs\n", - "_id\n" + "\n" ] } ], "source": [ - "from optimus.enricher import Enricher\n", + "def func(response):\n", + " return response[\"title\"]\n", "\n", - "e = Enricher(\"localhost\",27017, \"db\",\"BOB\")\n", - "#e.run(df, )\n", - "e.show_cols(\"BOB\")" + "e.run(func_response= func)" ] }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 45, "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0ea85e5e191e4618828999d2a9223583", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, description='Processing records', max=15), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": 
"stream", "text": [ - "True\n", - "Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'db'), 'BOB')\n" + "\n" ] } ], "source": [ - "print(e.db_exists(\"db\"))\n", - "print(e.get_collection(\"BOB\"))" + "e.save_to_csv(\"jazz.csv\")" ] }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 28, "metadata": {}, "outputs": [ { - "ename": "NameError", - "evalue": "name 'optimus' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;31m#Geolocation\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 7\u001b[1;33m \u001b[0mcollection\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0moptimus\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_collection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'step3'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 8\u001b[0m \u001b[0mcursor\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mcollection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;34m'$or'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;34m'lat'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;34m'lng'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mprojection\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'_id'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'lat'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'lng'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'state'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'city'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mNameError\u001b[0m: name 'optimus' is not defined" + "name": "stdout", + "output_type": "stream", + "text": [ + "Removed 10 documents\n" ] } ], - "source": [] + "source": [ + "e.flush()" + ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 38, "metadata": {}, "outputs": [ { - "ename": "ConnectionRefusedError", - "evalue": "[WinError 10061] No connection could be made because the target machine actively refused it", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mConnectionRefusedError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[0ms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mAF_INET\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSOCK_STREAM\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m \u001b[0ms\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhost\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mport\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 10\u001b[0m \u001b[0ms\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mb\"1\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mConnectionRefusedError\u001b[0m: [WinError 10061] No connection could be made because the target machine actively refused it" - ] + "data": { + "text/plain": [ + "['url', 'result', 'todo_id', '_id']" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "import socket\n", - "import sys\n", - "\n", - "host = '127.0.0.1'\n", - "port = 5005\n", - "\n", - "\n", - "s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", - "s.connect((host,port))\n", - "s.send(b\"1\") \n", - "s.close()" + "e.show_keys()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 58, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "-------------------------------------------\n", - "Time: 2018-09-06 17:28:44\n", - "-------------------------------------------\n", - "\n", - "-------------------------------------------\n", - "Time: 2018-09-06 17:28:45\n", - "-------------------------------------------\n", - "\n" + "Total documents:15\n", + "{'_id': ObjectId('5b940c37e254121740b5d19f'), 'todo_id': 1, 'url': 'https://jsonplaceholder.typicode.com/todos/1', 'result': 'delectus aut autem'}\n", + "{'_id': ObjectId('5b940c37e254121740b5d1a0'), 'todo_id': 2, 'url': 'https://jsonplaceholder.typicode.com/todos/2', 
'result': 'quis ut nam facilis et officia qui'}\n", + "{'_id': ObjectId('5b940c37e254121740b5d1a1'), 'todo_id': 3, 'url': 'https://jsonplaceholder.typicode.com/todos/3', 'result': 'fugiat veniam minus'}\n", + "{'_id': ObjectId('5b940c37e254121740b5d1a2'), 'todo_id': 4, 'url': 'https://jsonplaceholder.typicode.com/todos/4', 'result': 'et porro tempora'}\n", + "{'_id': ObjectId('5b940c37e254121740b5d1a3'), 'todo_id': 5, 'url': 'https://jsonplaceholder.typicode.com/todos/5', 'result': 'laboriosam mollitia et enim quasi adipisci quia provident illum'}\n" ] } ], "source": [ - "from pyspark import SparkContext, SparkConf\n", - "from pyspark.streaming import StreamingContext\n", - "\n", - "\n", - "\n", - "\n", - "ssc = StreamingContext(op.sc, 1)\n", - "\n", - "\n", - "lines = ssc.socketTextStream(\"localhost\", 9998)\n", - "\n", - "# Split each line into words\n", - "words = lines.flatMap(lambda line: line.split(\" \"))\n", - "\n", - "# Count each word in each batch\n", - "pairs = words.map(lambda word: (word, 1))\n", - "\n", - "\n", - "wordCounts = pairs.reduceByKey(lambda x, y: x + y)\n", - "\n", - "# Print the first ten elements of each RDD generated in this DStream to the console\n", - "wordCounts.pprint()\n", - "\n", - "\n", - "ssc.start() # Start the computation\n", - "ssc.awaitTermination() # Wait for the computation to terminate" + "e.head(\"BOB\",5)" ] } ], @@ -492,6 +447,18 @@ "display_name": "Python 3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" } }, "nbformat": 4, diff --git a/optimus/enricher.py b/optimus/enricher.py index f26e6b384..f00b5ba6d 100644 --- a/optimus/enricher.py +++ b/optimus/enricher.py @@ -1,60 +1,126 @@ import json import logging import urllib +import csv import requests from pymongo import 
MongoClient from tqdm import tqdm_notebook +from ratelimit import limits + +from optimus.helpers.checkit import is_function, is_ +from pyspark.sql.functions import DataFrame + +import pandas as pd + + +# from odo import odo class Enricher: def __init__(self, host, port, db_name=None, collection_name=None): + logging.basicConfig(format="%(message)s", level=logging.INFO) + self.host = host self.port = port + self.db_name = db_name + self.collection_name = collection_name self.client = MongoClient(host, port) - if self.db_exists(db_name): - self.db = self.client[db_name] - else: - raise Exception('Database do not exist') + # FIFTEEN_MINUTES = 900 + # @limits(calls=15, period=FIFTEEN_MINUTES) - if self.collection_exists(collection_name): - self.collection = self.client[collection_name] - else: - raise Exception('Collection do not exist') + def send(self, df): + """ + Send the dataframe to the mongo collection + :param df: + :param id_col: + :param param_col: + :return: + """ + + if is_(df, pd.DataFrame): + mongo_url = "mongodb://" + self.host + self.port + "/" + self.db_name + "::" + self.collection_name + # odo(mongo_url, df) + elif is_(df, DataFrame): + df.save.mongo(self.host, self.port, self.db_name, self.collection_name) + + def flush(self): + """ + Flush collection + :return: + """ + count = self.count() + self.drop_collection(self.collection_name) + print("Removed {count} documents".format(count=count)) - def run(self): + def count(self): + """ + Conunt nunber of documents in a collections + :return: """ + collection = self.get_collection(self.collection_name) + cursor = collection.find() + return cursor.count(True) + def run(self, collection_name=None, key_url="url", func_request=None, func_response=None, return_type="json", + calls=None, period=60): + """ + Read a the url key from a mongo collection an make a request to a service + :param collection_name: + :param key_url: + :param func_request: + :param func_response: + :param return_type: + :param calls: + 
:param period: :return: """ - collection = self.get_collection('step3') - cursor = collection.find({'$or': [{'lat': None}, {'lng': None}]}, - projection=['_id', 'lat', 'lng', 'state', 'city']) - for r in tqdm_notebook(cursor, total=cursor.count(), desc='Geolocation'): - state = r['state'].lower() - state = "".join(state.split()) + if collection_name is None: + collection_name = self.collection_name - if (state == "edo.demexico"): - state = "estado de mexico" - else: - state = r['state'] + collection = self.get_collection(collection_name) - url = "http://46.101.10.63:8080/search?format=jsonv2&q=mexico+" + state + '+' + r['city'] - url = urllib.parse.unquote_plus(url) + cursor = collection.find({"result": {"$exists": False}}, projection={'_id': 1, key_url: 1}) - result = requests.get(url) - result_json = json.loads(result.text) - if (result_json): + total_docs = cursor.count(True) + if total_docs > 0: + if func_request is None: + func_request = requests.get - lat = round(float(result_json[0]['lat']), 6) - lon = round(float(result_json[0]['lon']), 6) - collection.update_one({'_id': r['_id']}, {'$set': {'lat': lat, 'lng': lon}}, upsert=False) + for c in tqdm_notebook(cursor, total=total_docs, desc='Processing...'): - else: - print('No procesado') + try: + url = urllib.parse.unquote_plus(c[key_url]) + except KeyError: + raise Exception("url key not found") + + # Send request to the API + response = func_request(url) + + mongo_id = c["_id"] + + if response.status_code == 200: + if return_type == "json": + response = json.loads(response.text) + elif return_type == "text": + response = response.text + + # Process the result with an external function + if is_function(func_response): + response = func_response(response) + + # update the mongo id with the result + self.get_collection(collection_name).find_and_modify(query={"_id": mongo_id}, + update={"$set": {'result': response}}, + upsert=False, full_response=True) + else: + # The response key will remain blank so we can 
filter it to try in future request + print(response.status_code) + else: + print("No records available to process") def collection_exists(self, collection_name): """ @@ -62,7 +128,7 @@ def collection_exists(self, collection_name): :param collection_name: :return: """ - if collection_name in self.db.collection_names(): + if collection_name in self.get_db().collection_names(): return True else: return False @@ -78,13 +144,17 @@ def db_exists(self, db_name): else: return False + def get_db(self): + return self.client[self.db_name] + def get_collection(self, collection_name): """ :param collection_name: :return: """ - return self.db[collection_name] + collection = self.get_db()[collection_name] + return collection def copy_collection(self, source_name, dest_name): """ @@ -108,142 +178,116 @@ def copy_collection(self, source_name, dest_name): source.aggregate(pipeline) logging.info('Done') - def show_cols(self, collection_name=None): + def head(self, collection_name, n=10): """ - Show cols + Print n first documents :param collection_name: + :param n: :return: """ - if not self.collection_exists(collection_name): + # try to bring a cursor from a collection + cursor = self.get_collection(collection_name).find({}).limit(n) + print("Total documents:" + str(cursor.count())) + for c in cursor: + print(c) + + def show_keys(self, collection_name=None): + """ + Show keys in collection + :param collection_name: + :return: + """ + + if not self.collection_exists(self.collection_name): raise Exception("Collection {collection_name} not exist".format(collection_name=collection_name)) source = None - if collection_name is not None: - source = self.get_collection(collection_name) + if collection_name is None: + source = self.get_collection(self.collection_name) results = source.aggregate([ {"$project": {"arrayofkeyvalue": {"$objectToArray": "$$ROOT"}}}, {"$unwind": "$arrayofkeyvalue"}, {"$group": {"_id": None, "allkeys": {"$addToSet": "$arrayofkeyvalue.k"}}} ]) - logging.info(results) 
- results = list(results)[0]['allkeys'] - for r in results: - logging.info(r) + results = list(results)[0]['allkeys'] + return results - def get_cols(self, collection_name=None): + def show_collections(self, db): """ - - :param collection_name: + Show collections in a database :return: """ - if collection_name is not None: - source = self.get_collection(collection_name) - else: - source = self.collection - logging.info(source) - logging.info('Getting cols...') - result = source.aggregate([ - {"$project": {"arrayofkeyvalue": {"$objectToArray": "$$ROOT"}}}, - {"$unwind": "$arrayofkeyvalue"}, - {"$group": {"_id": None, "allkeys": {"$addToSet": "$arrayofkeyvalue.k"}}} - ]) - result = list(result)[0]['allkeys'] - logging.info('Done') - return result + return self.client[db].collection_names() - def drop_col(cols): - """ + # d = dict((db, [collection for collection in self.client[db].collection_names()]) + # for db in self.client.list_database_names()) + # print(json.dumps(d)) - :return: - """ - for x in tqdm_notebook(cols, desc='Processing cols'): - logging.info('Dropping', x, 'field') - dest_collection.update_many({}, {'$unset': {x: 1}}) - - def insert_to_collection(self, cursor, dest_collection_name, drop=False): + @staticmethod + def drop_keys(collection_name, keys): """ - - :param cursor: - :param dest_collection_name: - :param drop: + Drop key in collection :return: """ - dest_collection = self.get_collection(dest_collection_name) - if drop: - dest_collection.drop() + for key in tqdm_notebook(keys, desc='Processing cols'): + logging.info('Dropping', key, 'field') + collection_name.update_many({}, {'$unset': {key: 1}}) - if "count" in dir(cursor): - count = cursor.count() - else: - count = 1 - - for c in tqdm_notebook(cursor, total=count, desc='Saving Collection'): - dest_collection.insert_one(c) - - def create_missing_fields(self, cols, collection_name=None): + def drop_collection(self, collection_name): """ - :param cols: :param collection_name: :return: """ + if 
collection_name is None: + collection_name = self.collection_name + self.get_collection(collection_name).drop() - if collection_name is not None: - source = self.get_collection(collection_name) - else: - source = self.collection - - for c in tqdm_notebook(cols, total=len(cols), desc='Processing cols'): - logging.info('Inserting', c) - if c: - source.update_many( - {c: {'$exists': False}}, - {'$set': - { - c: None, - } - } - ); - else: - logging.info('Field', c, 'could not be added') - - def convert_field_to(self, collection_name, field, convert_to): + def save_to_csv(self, filename, collection_name=None, projection=None, limit=0): """ - + Save collection to csv + :param filename: :param collection_name: - :param field: - :param convert_to: + :param projection: + :param limit: :return: """ + + if collection_name is None: + collection_name = self.collection_name + collection = self.get_collection(collection_name) - cursor = collection.find({field: {'$exists': True}}).limit(0) - desc = 'Converting', field, 'to', convert_to - # for c in tqdm_notebook(cursor, total = cursor.count(), desc = 'sad'): + try: + file = open(filename, "w", newline='') + csv_write = csv.writer(file, delimiter=";", quotechar="|", quoting=csv.QUOTE_MINIMAL) + except IOError: + raise Exception("Could not read file {filename}".format(filename=filename)) - if convert_to == 'int': - l = float - elif convert_to == 'float': - l = int - elif convert_to == 'string': - l = str - else: - raise ValueError('Only int, float or string accepted in field param', field, 'value present') + if projection is None: + projection = {} + projection["_id"] = 0 - for c in tqdm_notebook(cursor, total=cursor.count(), desc='Processing records'): - try: - val = c[field] - val = l(c[field]) - collection.update_one({'_id': c['_id']}, {'$set': {field: val}}) + documents = collection.find({}, projection).limit(limit) + count = documents.count(True) - except ValueError: - logging.info('Could not convert "', val, '" to', 
convert_to) + try: + csv_write.writerow(["asd", "rty", "hjk"]) + for document in tqdm_notebook(documents, total=count, desc='Processing records'): + # Get a json, transform it to str and return a ; separated string + result = list(map((lambda x: str(x)), document.values())) + csv_write.writerow(result) + except IOError: + raise Exception("Could not write in {filename}".format(filename=filename)) + file.close() + + # CSV https: // gist.github.com / jxub / f722e0856ed461bf711684b0960c8458 @staticmethod - def mongo_to_json_array_file(filename, host, port, db, collection, projection=None, limit=None): + def save_to_json(filename, host, port, db, collection, projection=None, limit=None): if (limit is None): limit = 0 @@ -275,7 +319,7 @@ def mongo_to_json_array_file(filename, host, port, db, collection, projection=No file.close() @staticmethod - def to_geojson_file(filename, host, port, db, collection, coordinates_keys, projection=None): + def save_to_geojson(filename, host, port, db, collection, coordinates_keys, projection=None): client = MongoClient(host, port) _db = client[db] @@ -320,45 +364,14 @@ def to_geojson_file(filename, host, port, db, collection, coordinates_keys, proj file.write(']}') file.close() - def merge_two_dicts(x, y): - """ - - :param y: - :return: - """ - z = x.copy() # start with x's keys and values - z.update(y) # modifies z with y's keys and values & returns None - return z - - def head(self, collection_name, n=1): + # FIX: not work need to find how to implement in a cursor object + def head_cursor(self, collection_name_or_cursor, n=1): """ - :param collection_name: + :param collection_name_or_cursor: :param n: :return: """ - if not int(n): - raise Exception('n param must be an integer') - if self.collection_exists(collection_name): - if isinstance(collection_name, str): - # try to bring a cursor from a collection - cursor = self.get_collection(collection_name).find({}).limit(n) - count = cursor.count(True) - # FIX: and elegant way to make it in 
python? - i = 0 - - for c in cursor: - if (i < count): - print(c) - else: - break - i = i + 1 - else: - msg = 'Collection', collection_name, ' do not exist' - raise Exception(msg) - - # FIX: not work need to find how to implement in a cursor object - def head_cursor(self, collection_name_or_cursor, n=1): if not int(n): raise Exception('n must be an integer') if isinstance(collection_name_or_cursor, str): @@ -379,3 +392,82 @@ def head_cursor(self, collection_name_or_cursor, n=1): else: break i = i + 1 + + def insert_to_collection(self, cursor, dest_collection_name, drop=False): + """ + + :param cursor: + :param dest_collection_name: + :param drop: + :return: + """ + dest_collection = self.get_collection(dest_collection_name) + if drop: + dest_collection.drop() + + if "count" in dir(cursor): + count = cursor.count() + else: + count = 1 + + for c in tqdm_notebook(cursor, total=count, desc='Saving Collection'): + dest_collection.insert_one(c) + + def create_missing_fields(self, cols, collection_name=None): + """ + + :param cols: + :param collection_name: + :return: + """ + + if collection_name is not None: + source = self.get_collection(collection_name) + else: + source = self.collection + + for c in tqdm_notebook(cols, total=len(cols), desc='Processing cols'): + logging.info('Inserting', c) + if c: + source.update_many( + {c: {'$exists': False}}, + {'$set': + { + c: None, + } + } + ); + else: + logging.info('Field', c, 'could not be added') + + def convert_field_to(self, collection_name, field, convert_to): + """ + + :param collection_name: + :param field: + :param convert_to: + :return: + """ + collection = self.get_collection(collection_name) + cursor = collection.find({field: {'$exists': True}}).limit(0) + desc = 'Converting', field, 'to', convert_to + + # for c in tqdm_notebook(cursor, total = cursor.count(), desc = 'sad'): + + if convert_to == 'int': + data_type = float + elif convert_to == 'float': + data_type = int + elif convert_to == 'string': + data_type = 
str + else: + raise ValueError('Only int, float or string accepted in field param', field, 'value present') + + for c in tqdm_notebook(cursor, total=cursor.count(), desc='Processing records'): + try: + val = c[field] + val = data_type(c[field]) + collection.update_one({'_id': c['_id']}, {'$set': {field: val}}) + + except ValueError: + logging.info('Could not convert "', val, '" to', convert_to) diff --git a/requirements.txt b/requirements.txt index 66b74507b..8f053bfd4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ -requests -tqdm -pymongo -celery +requests==2.18.4 +tqdm==1.19.2 +pymongo==3.7.1 fastnumbers==2.1.1 multipledispatch==0.6.0 python_dateutil==2.7.3 @@ -25,4 +24,5 @@ pygments>=2.2.0 six>=1.10.0 h5py>=2.7.0 flask==1.0.2 -ipython==6.5.0 \ No newline at end of file +ipython==6.5.0 +ratelimit==2.2.0 \ No newline at end of file From 6f993f89fd0fdedf2eb41a992003aacaf418e2c0 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Sat, 8 Sep 2018 23:38:43 -0500 Subject: [PATCH 65/94] Use is now simpler. Docstring improvement. 
--- examples/new-api-enrichment.ipynb | 360 ++++++++++++++++++++---------- optimus/enricher.py | 107 ++++----- 2 files changed, 303 insertions(+), 164 deletions(-) diff --git a/examples/new-api-enrichment.ipynb b/examples/new-api-enrichment.ipynb index b1e4a0302..d3eb7c863 100644 --- a/examples/new-api-enrichment.ipynb +++ b/examples/new-api-enrichment.ipynb @@ -31,29 +31,15 @@ ] }, { - "cell_type": "code", - "execution_count": 4, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# make some test data\n", - "columns = ['todo_id']\n", - "vals = [\n", - " (1, ),\n", - " (2, ),\n", - " (3, ),\n", - " (4, ),\n", - " (5, )\n", - "\n", - "]\n", - "\n", - "# create DataFrame\n", - "df = op.spark.createDataFrame(vals, columns).repartition(1).cache()" + "### Create a Spark Dataframe" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 109, "metadata": {}, "outputs": [ { @@ -148,25 +134,259 @@ } ], "source": [ + "# make some test data\n", + "columns = ['todo_id']\n", + "vals = [\n", + " (1, ),\n", + " (2, ),\n", + " (3, ),\n", + " (4, ),\n", + " (5, )\n", + "\n", + "]\n", + "\n", + "# create DataFrame\n", + "df = op.spark.createDataFrame(vals, columns).repartition(1).cache()\n", "df.table()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a pandas daraframe" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
todo_iddescurl
06badhttps://jsonplaceholder.typicode.com/todos/6
17goodhttps://jsonplaceholder.typicode.com/todos/7
28uglyhttps://jsonplaceholder.typicode.com/todos/8
39tallhttps://jsonplaceholder.typicode.com/todos/9
410shorthttps://jsonplaceholder.typicode.com/todos/10
\n", + "
" + ], + "text/plain": [ + " todo_id desc url\n", + "0 6 bad https://jsonplaceholder.typicode.com/todos/6\n", + "1 7 good https://jsonplaceholder.typicode.com/todos/7\n", + "2 8 ugly https://jsonplaceholder.typicode.com/todos/8\n", + "3 9 tall https://jsonplaceholder.typicode.com/todos/9\n", + "4 10 short https://jsonplaceholder.typicode.com/todos/10" + ] + }, + "execution_count": 126, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "data = {\"todo_id\": [6, 7, 8, 9,10], \n", + " \"desc\": [\"bad\", \"good\", \"ugly\", \"tall\",\"short\"],\n", + " \"url\": [\"https://jsonplaceholder.typicode.com/todos/6\", \n", + " \"https://jsonplaceholder.typicode.com/todos/7\", \n", + " \"https://jsonplaceholder.typicode.com/todos/8\", \n", + " \"https://jsonplaceholder.typicode.com/todos/9\",\n", + " \"https://jsonplaceholder.typicode.com/todos/10\"]}\n", + "pdf =pd.DataFrame.from_dict(data)\n", + "pdf.head()" + ] + }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ - "df.save.rabbit_mq(\"amqp://localhost:5672/\",\"exchange_name\", \"example-queue\",\"BOB\" )" + "from optimus.enricher import Enricher\n", + "\n", + "e = Enricher(\"localhost\",27017, \"db\",\"optimus\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Send a Spark Dataframe" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 156, "metadata": {}, "outputs": [], "source": [ - "df.save.mongo(\"localhost\",27017, \"db\",\"BOB\")" + "e.send(df_url)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Send a Pandas Dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": {}, + "outputs": [], + "source": [ + "e.send(pdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run the enrichment process" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + 
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No records available to process\n" + ] + } + ], + "source": [ + "import requests \n", + "\n", + "\n", + "def func_request(params):\n", + " # You can use here whatever header or auth info you need to send. \n", + " # For mor information see the requests library\n", + " url= \"https://jsonplaceholder.typicode.com/todos/\" + str(params[\"todo_id\"])\n", + "\n", + " return requests.get(url)\n", + "\n", + "\n", + "def func_response(response):\n", + " # Here you can parse de response\n", + " return response[\"title\"]\n", + "\n", + "\n", + "e.run(func_request= func_request, func_response= func_response)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c09e569703ee4bdfa1b8dc3045925be3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, description='Processing records', max=10), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Removed 10 documents\n" + ] + } + ], + "source": [ + "e.save_to_csv(\"jazz.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Merge with dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Removed 10 documents\n" + ] + } + ], + "source": [ + "e.flush()" ] }, { @@ -174,7 +394,9 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "### You can prepare a rul using nest" + ] }, { "cell_type": "code", @@ -300,103 +522,17 @@ ], "source": [ "from pyspark.sql import functions as F\n", - " \n", + " \n", + "# Prepare the URL\n", "df_url = 
df.cols.nest((\"https://jsonplaceholder.typicode.com/todos/\",F.col(\"todo_id\")),\"url\")\n", "df_url.table()" ] }, { - "cell_type": "code", - "execution_count": 42, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "from optimus.enricher import Enricher\n", - "\n", - "e = Enricher(\"localhost\",27017, \"db\",\"BOB\")\n", - "e.send(df_url)" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e7770c7e28024c5eb918f84b30a4a3d1", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Processing...', max=10), HTML(value='')))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "def func(response):\n", - " return response[\"title\"]\n", - "\n", - "e.run(func_response= func)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0ea85e5e191e4618828999d2a9223583", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Processing records', max=15), HTML(value='')))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "e.save_to_csv(\"jazz.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Removed 10 documents\n" - ] - } - ], - "source": [ - "e.flush()" + "## Some other operations" ] }, { diff --git a/optimus/enricher.py b/optimus/enricher.py index f00b5ba6d..4356932d0 100644 --- a/optimus/enricher.py +++ b/optimus/enricher.py @@ -1,23 +1,20 @@ +import csv 
import json import logging -import urllib -import csv +import pandas as pd import requests from pymongo import MongoClient +from pyspark.sql.functions import DataFrame from tqdm import tqdm_notebook -from ratelimit import limits from optimus.helpers.checkit import is_function, is_ -from pyspark.sql.functions import DataFrame - -import pandas as pd - - -# from odo import odo class Enricher: + """ + Enrich data from a Pandas or Spark dataframe + """ def __init__(self, host, port, db_name=None, collection_name=None): logging.basicConfig(format="%(message)s", level=logging.INFO) @@ -34,21 +31,20 @@ def __init__(self, host, port, db_name=None, collection_name=None): def send(self, df): """ Send the dataframe to the mongo collection - :param df: - :param id_col: - :param param_col: + :param df: dataframe to be send to the enricher :return: """ if is_(df, pd.DataFrame): - mongo_url = "mongodb://" + self.host + self.port + "/" + self.db_name + "::" + self.collection_name - # odo(mongo_url, df) + self.get_collection(self.collection_name).insert_many(df.to_dict("records")) elif is_(df, DataFrame): df.save.mongo(self.host, self.port, self.db_name, self.collection_name) + else: + raise Exception("df must by a Spark Dataframe or Pandas Dataframe") def flush(self): """ - Flush collection + Flush the enricher default collection :return: """ count = self.count() @@ -64,17 +60,16 @@ def count(self): cursor = collection.find() return cursor.count(True) - def run(self, collection_name=None, key_url="url", func_request=None, func_response=None, return_type="json", + def run(self, collection_name=None, func_request=None, func_response=None, return_type="json", calls=None, period=60): """ Read a the url key from a mongo collection an make a request to a service :param collection_name: - :param key_url: - :param func_request: - :param func_response: + :param func_request: help to create a custom request + :param func_response: help to create a custom response :param return_type: - :param 
calls: - :param period: + :param calls: how many call can you make + :param period: in which period ot time can the call be made :return: """ @@ -83,7 +78,7 @@ def run(self, collection_name=None, key_url="url", func_request=None, func_respo collection = self.get_collection(collection_name) - cursor = collection.find({"result": {"$exists": False}}, projection={'_id': 1, key_url: 1}) + cursor = collection.find({"result": {"$exists": False}}) total_docs = cursor.count(True) if total_docs > 0: @@ -92,13 +87,8 @@ def run(self, collection_name=None, key_url="url", func_request=None, func_respo for c in tqdm_notebook(cursor, total=total_docs, desc='Processing...'): - try: - url = urllib.parse.unquote_plus(c[key_url]) - except KeyError: - raise Exception("url key not found") - # Send request to the API - response = func_request(url) + response = func_request(c) mongo_id = c["_id"] @@ -192,7 +182,7 @@ def head(self, collection_name, n=10): for c in cursor: print(c) - def show_keys(self, collection_name=None): + def get_keys(self, collection_name=None): """ Show keys in collection :param collection_name: @@ -271,13 +261,19 @@ def save_to_csv(self, filename, collection_name=None, projection=None, limit=0): projection = {} projection["_id"] = 0 - documents = collection.find({}, projection).limit(limit) - count = documents.count(True) - try: - csv_write.writerow(["asd", "rty", "hjk"]) + # Save csv header + for header in collection.find({}, projection).limit(1): + csv_write.writerow(header.keys()) + + # Save csv body + documents = collection.find({}, projection).limit(limit) + count = documents.count(True) + + # Save csv body for document in tqdm_notebook(documents, total=count, desc='Processing records'): - # Get a json, transform it to str and return a ; separated string + # Get a json, transform it to str and return a semicolon separated string + result = list(map((lambda x: str(x)), document.values())) csv_write.writerow(result) except IOError: @@ -285,21 +281,26 @@ def 
save_to_csv(self, filename, collection_name=None, projection=None, limit=0): file.close() + # Empty the collecion + self.flush() + # CSV https: // gist.github.com / jxub / f722e0856ed461bf711684b0960c8458 - @staticmethod - def save_to_json(filename, host, port, db, collection, projection=None, limit=None): - if (limit is None): - limit = 0 - client = MongoClient(host, port) - _db = client[db] - _collection = _db[collection] + def save_to_json(self, filename, projection=None, limit=0): + """ + Save collection to json file + :param filename: + :param projection: + :param limit: + :return: + """ + + _collection = self.get_collection(self.collection_name) file = open(filename, "w") file.write('[') i = 0 - properties = {} # dump all the data documents = _collection.find({}, projection).limit(limit) @@ -318,23 +319,25 @@ def save_to_json(filename, host, port, db, collection, projection=None, limit=No file.write(']') file.close() - @staticmethod - def save_to_geojson(filename, host, port, db, collection, coordinates_keys, projection=None): + def save_to_geojson(self, filename, coordinates_keys, projection=None, limit=0): + """ + Save collection to geojson file + :param filename: Output file + :param coordinates_keys: + :param projection: + :return: + """ - client = MongoClient(host, port) - _db = client[db] - _collection = _db[collection] + _collection = self.get_collection(self.collection_name) file = open(filename, "w") file.write('{"type": "FeatureCollection","features":[') i = 0 - properties = {} # dump all the data projection = coordinates_keys + projection - # rojection.append({'_id':False}) - documents = _collection.find({}, projection).limit(0) + documents = _collection.find({}, projection).limit(limit) count = documents.count(True) for r in tqdm_notebook(documents, total=count, desc='Processing records'): @@ -395,7 +398,7 @@ def head_cursor(self, collection_name_or_cursor, n=1): def insert_to_collection(self, cursor, dest_collection_name, drop=False): """ - + 
Insert a cursor into a collection :param cursor: :param dest_collection_name: :param drop: @@ -440,7 +443,7 @@ def create_missing_fields(self, cols, collection_name=None): else: logging.info('Field', c, 'could not be added') - def convert_field_to(self, collection_name, field, convert_to): + def cast(self, collection_name, field, convert_to): """ :param collection_name: From 1e72b87bd9ec2abf4b906e163ced733855ddc285 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Sat, 8 Sep 2018 23:39:34 -0500 Subject: [PATCH 66/94] Docstring improvement --- optimus/enricher.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/optimus/enricher.py b/optimus/enricher.py index 4356932d0..ee643c3e6 100644 --- a/optimus/enricher.py +++ b/optimus/enricher.py @@ -228,7 +228,7 @@ def drop_keys(collection_name, keys): def drop_collection(self, collection_name): """ - + Drop a collection :param collection_name: :return: """ @@ -239,10 +239,10 @@ def drop_collection(self, collection_name): def save_to_csv(self, filename, collection_name=None, projection=None, limit=0): """ Save collection to csv - :param filename: - :param collection_name: - :param projection: - :param limit: + :param filename: Output filename + :param collection_name: custom collection to save + :param projection: Filter the keys on csv output + :param limit: Limit the number to record in the output file :return: """ From f81b89c85d32500eea4f7bf4de4c091ba0c00568 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Mon, 10 Sep 2018 15:09:58 -0500 Subject: [PATCH 67/94] Work in progress --- examples/new-api-enrichment.ipynb | 328 ++++++++++++++++++++++++------ optimus/enricher.py | 87 +++++--- optimus/optimus.py | 16 +- 3 files changed, 337 insertions(+), 94 deletions(-) diff --git a/examples/new-api-enrichment.ipynb b/examples/new-api-enrichment.ipynb index d3eb7c863..2ed793c88 100644 --- a/examples/new-api-enrichment.ipynb +++ b/examples/new-api-enrichment.ipynb @@ -39,7 +39,7 @@ }, { "cell_type": 
"code", - "execution_count": 109, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -63,7 +63,7 @@ "\n", "\n", "\n", - "
Viewing 5 of 5 rows / 1 columns
\n", + "
Viewing 9 of 9 rows / 1 columns
\n", "\n", "\n", " \n", @@ -120,10 +120,42 @@ " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", "
\n", + " 6\n", + "
\n", + " 7\n", + "
\n", + " 8\n", + "
\n", + " 94\n", + "
\n", "\n", - "
Viewing 5 of 5 rows / 1 columns
\n" + "
Viewing 9 of 9 rows / 1 columns
\n" ], "text/plain": [ "" @@ -141,7 +173,12 @@ " (2, ),\n", " (3, ),\n", " (4, ),\n", - " (5, )\n", + " (5, ),\n", + " (6, ),\n", + " (7, ),\n", + " (8, ),\n", + " (94, ),\n", + " \n", "\n", "]\n", "\n", @@ -150,6 +187,79 @@ "df.table()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'function' object has no attribute 'create_id'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 14\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 16\u001b[1;33m \u001b[0mdf_result\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mop\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0menrich\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc_request\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mfunc_request\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc_response\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mfunc_response\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\optimus.py\u001b[0m in \u001b[0;36menrich\u001b[1;34m(self, df, func_request, func_response)\u001b[0m\n\u001b[0;32m 82\u001b[0m \"\"\"\n\u001b[0;32m 83\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 84\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0menricher\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc_request\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mfunc_request\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mfunc_response\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mfunc_response\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 85\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 86\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\enricher.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(self, df, collection_name, func_request, func_response, return_type, filename, calls, period)\u001b[0m\n\u001b[0;32m 71\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 72\u001b[0m \u001b[1;31m# Load the dataframe data in the enricher\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 73\u001b[1;33m \u001b[0mdf_result\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcols\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcreate_id\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 74\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_result\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 75\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mAttributeError\u001b[0m: 'function' object has no attribute 'create_id'" + ] + } + ], + "source": [ + "import requests\n", + "\n", + "def func_request(params):\n", + " # You can use here whatever header or auth info you need to send. 
\n", + " # For more information see the requests library\n", + " url= \"https://jsonplaceholder.typicode.com/todos/\" + str(params[\"todo_id\"])\n", + "\n", + " return requests.get(url)\n", + "\n", + "\n", + "def func_response(response):\n", + " # Here you can parse de response\n", + " return response[\"title\"]\n", + "\n", + "\n", + "df_result = op.enrich(df, func_request= func_request, func_response= func_response)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'NoneType' object has no attribute 'table'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdf_result\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'table'" + ] + } + ], + "source": [ + "df_result.table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enricher without Optimus" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -252,45 +362,14 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 169, "metadata": {}, "outputs": [], "source": [ "from optimus.enricher import Enricher\n", "\n", - "e = Enricher(\"localhost\",27017, \"db\",\"optimus\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Send a Spark Dataframe" - ] - }, - { - "cell_type": "code", - "execution_count": 156, - "metadata": {}, - "outputs": [], - "source": [ - "e.send(df_url)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Send a Pandas Dataframe" - ] - }, - { - "cell_type": "code", - 
"execution_count": 153, - "metadata": {}, - "outputs": [], - "source": [ - "e.send(pdf)" + "e = Enricher()\n", + "#e = Enricher(\"localhost\",27017, \"enricher\",\"optimus\")" ] }, { @@ -302,51 +381,139 @@ }, { "cell_type": "code", - "execution_count": 158, + "execution_count": 173, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "No records available to process\n" - ] + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 5 of 5 rows / 1 columns
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
todo_id
\n", + "
1 (bigint)
\n", + "\n", + "
\n", + " 1\n", + "
\n", + " 2\n", + "
\n", + " 3\n", + "
\n", + " 4\n", + "
\n", + " 5\n", + "
\n", + "\n", + "
Viewing 5 of 5 rows / 1 columns
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "import requests \n", - "\n", - "\n", - "def func_request(params):\n", - " # You can use here whatever header or auth info you need to send. \n", - " # For mor information see the requests library\n", - " url= \"https://jsonplaceholder.typicode.com/todos/\" + str(params[\"todo_id\"])\n", - "\n", - " return requests.get(url)\n", - "\n", - "\n", - "def func_response(response):\n", - " # Here you can parse de response\n", - " return response[\"title\"]\n", - "\n", - "\n", - "e.run(func_request= func_request, func_response= func_response)\n" + "df.table()" ] }, { "cell_type": "code", - "execution_count": 159, + "execution_count": 176, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c09e569703ee4bdfa1b8dc3045925be3", + "model_id": "3782e14c59674612a19663282bd7d31c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, description='Processing...', max=5), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0faffca247b24402a52b3eee067d1d98", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Processing records', max=10), HTML(value='')))" + "HBox(children=(IntProgress(value=0, description='Saving...', max=5), HTML(value='')))" ] }, "metadata": {}, @@ -357,12 +524,41 @@ "output_type": "stream", "text": [ "\n", - "Removed 10 documents\n" + "Removed 5 documents\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'DataFrame' object has no attribute 'load'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + 
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 17\u001b[1;33m \u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc_request\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mfunc_request\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc_response\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mfunc_response\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"jazz.csv\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\enricher.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(self, df, collection_name, func_request, func_response, return_type, filename, calls, period)\u001b[0m\n\u001b[0;32m 124\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msave_to_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 125\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 126\u001b[1;33m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcsv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 127\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 128\u001b[0m \u001b[1;31m#\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pyspark\\sql\\dataframe.py\u001b[0m in \u001b[0;36m__getattr__\u001b[1;34m(self, name)\u001b[0m\n\u001b[0;32m 1180\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mname\u001b[0m 
\u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1181\u001b[0m raise AttributeError(\n\u001b[1;32m-> 1182\u001b[1;33m \"'%s' object has no attribute '%s'\" % (self.__class__.__name__, name))\n\u001b[0m\u001b[0;32m 1183\u001b[0m \u001b[0mjc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1184\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mColumn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'load'" ] } ], "source": [ - "e.save_to_csv(\"jazz.csv\")" + "import requests \n", + "\n", + "\n", + "def func_request(params):\n", + " # You can use here whatever header or auth info you need to send. 
\n", + " # For mor information see the requests library\n", + " url= \"https://jsonplaceholder.typicode.com/todos/\" + str(params[\"todo_id\"])\n", + "\n", + " return requests.get(url)\n", + "\n", + "\n", + "def func_response(response):\n", + " # Here you can parse de response\n", + " return response[\"title\"]\n", + "\n", + "\n", + "e.run(df, func_request= func_request, func_response= func_response, filename=\"jazz.csv\")\n" ] }, { diff --git a/optimus/enricher.py b/optimus/enricher.py index ee643c3e6..f3cdeef7f 100644 --- a/optimus/enricher.py +++ b/optimus/enricher.py @@ -9,6 +9,7 @@ from tqdm import tqdm_notebook from optimus.helpers.checkit import is_function, is_ +from optimus.helpers.functions import random_int class Enricher: @@ -16,14 +17,26 @@ class Enricher: Enrich data from a Pandas or Spark dataframe """ - def __init__(self, host, port, db_name=None, collection_name=None): + def __init__(self, host="localhost", port=27017, db_name="jazz", collection_name="data", op=None, *args, + **kwargs): + """ + + :param host: Mongo server host + :param port: Mongo server port + :param db_name: Mongo server database + :param collection_name: Mongo serverg collection + :param op: optimus instance + :param args: + :param kwargs: + """ logging.basicConfig(format="%(message)s", level=logging.INFO) self.host = host self.port = port self.db_name = db_name self.collection_name = collection_name - self.client = MongoClient(host, port) + self.client = MongoClient(host, port, *args, **kwargs) + self.op = op # FIFTEEN_MINUTES = 900 # @limits(calls=15, period=FIFTEEN_MINUTES) @@ -42,29 +55,12 @@ def send(self, df): else: raise Exception("df must by a Spark Dataframe or Pandas Dataframe") - def flush(self): - """ - Flush the enricher default collection - :return: - """ - count = self.count() - self.drop_collection(self.collection_name) - print("Removed {count} documents".format(count=count)) - - def count(self): - """ - Conunt nunber of documents in a collections - :return: - 
""" - collection = self.get_collection(self.collection_name) - cursor = collection.find() - return cursor.count(True) - - def run(self, collection_name=None, func_request=None, func_response=None, return_type="json", + def run(self, df, collection_name=None, func_request=None, func_response=None, return_type="json", filename=None, calls=None, period=60): """ Read a the url key from a mongo collection an make a request to a service - :param collection_name: + :param df: Dataframe to me loaded to the enricher collection. + :param collection_name: Custom collection to save the data. :param func_request: help to create a custom request :param func_response: help to create a custom response :param return_type: @@ -73,6 +69,10 @@ def run(self, collection_name=None, func_request=None, func_response=None, retur :return: """ + # Load the dataframe data in the enricher + df_result = df.cols.create_id() + self.send(df_result) + if collection_name is None: collection_name = self.collection_name @@ -109,9 +109,47 @@ def run(self, collection_name=None, func_request=None, func_response=None, retur else: # The response key will remain blank so we can filter it to try in future request print(response.status_code) + + # Save a temporal data file to be merged with the dataframe. + # If someone knows a way get the data form the collection and merge it the source dataframe + # please open an issue. 
+ if filename is None: + filename = random_int() + ".csv" + + # Save temporal file from mongo to + # self.save_to_csv(filename, collection_name) + + # Load from the temporal fgi + # df_result = self.op.load.csv(filename) + + # join both the actual dataframe an the temp csv + + # Flush the mongo collection + # self.flush() + return True + + # else: print("No records available to process") + def count(self): + """ + Count number of documents in a collections + :return: + """ + collection = self.get_collection(self.collection_name) + cursor = collection.find() + return cursor.count(True) + + def flush(self): + """ + Flush the enricher default collection + :return: + """ + count = self.count() + self.drop_collection(self.collection_name) + print("Removed {count} documents".format(count=count)) + def collection_exists(self, collection_name): """ Check if a collection exist @@ -271,7 +309,7 @@ def save_to_csv(self, filename, collection_name=None, projection=None, limit=0): count = documents.count(True) # Save csv body - for document in tqdm_notebook(documents, total=count, desc='Processing records'): + for document in tqdm_notebook(documents, total=count, desc='Saving...'): # Get a json, transform it to str and return a semicolon separated string result = list(map((lambda x: str(x)), document.values())) @@ -281,9 +319,6 @@ def save_to_csv(self, filename, collection_name=None, projection=None, limit=0): file.close() - # Empty the collecion - self.flush() - # CSV https: // gist.github.com / jxub / f722e0856ed461bf711684b0960c8458 def save_to_json(self, filename, projection=None, limit=0): diff --git a/optimus/optimus.py b/optimus/optimus.py index 6af123708..ecf218ba0 100644 --- a/optimus/optimus.py +++ b/optimus/optimus.py @@ -1,8 +1,8 @@ -import logging import os from shutil import rmtree from optimus.create import Create +from optimus.enricher import Enricher from optimus.functions import concat from optimus.helpers.constants import * from optimus.helpers.raiseit import 
RaiseIt @@ -17,7 +17,7 @@ class Optimus: def __init__(self, master="local[*]", app_name="optimus", checkpoint=False, path=None, file_system="local", - verbose=False, dl=False): + verbose=False, dl=False, enricher_localhost="localhost", enricher_port=27017): """ Transform and roll out @@ -70,6 +70,18 @@ def __init__(self, master="local[*]", app_name="optimus", checkpoint=False, path self.read = self.spark.read self.profiler = Profiler() self.ml = ML() + self.enricher = Enricher(enricher_localhost, enricher_port, op=self) + + def enrich(self, df, func_request, func_response): + """ + + :param df: + :param func_request: + :param func_response: + :return: + """ + + self.enricher.run(df, func_request=func_request, func_response=func_response) @property def spark(self): From 92ca5a1a212e3768da928f58e4e53f9189854cdb Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Mon, 10 Sep 2018 15:10:09 -0500 Subject: [PATCH 68/94] Clean up --- optimus/helpers/functions.py | 5 +++-- optimus/io/load.py | 17 +++++++++++++++++ optimus/io/save.py | 7 +++---- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/optimus/helpers/functions.py b/optimus/helpers/functions.py index 4bfab1a04..ca0df71fb 100644 --- a/optimus/helpers/functions.py +++ b/optimus/helpers/functions.py @@ -8,7 +8,7 @@ from IPython.display import display, HTML from optimus.helpers.checkit import is_list_of_one_element, is_list_of_strings, is_list_of_tuples, \ - is_list_of_str_or_int, is_str, is_str_or_int, is_dict_of_one_element, is_tuple, is_dict, is_list + is_str, is_dict_of_one_element, is_tuple, is_dict, is_list from optimus.helpers.constants import PYTHON_SHORT_TYPES, SPARK_SHORT_DTYPES, SPARK_DTYPES_DICT, \ SPARK_DTYPES_DICT_OBJECTS from optimus.helpers.raiseit import RaiseIt @@ -16,7 +16,7 @@ def random_int(n=5): """ - Create a unique filename + Create a random number :return: """ return str(random.randint(1, 10 ** n)) @@ -275,6 +275,7 @@ def parse_columns(df, cols_args, get_args=False, is_regex=None, 
filter_by_column # Filter by column data type filter_by_column_dtypes = val_to_list(filter_by_column_dtypes) + if is_list_of_strings(filter_by_column_dtypes): # Get columns for every data type columns_filtered = filter_col_name_by_dtypes(df, filter_by_column_dtypes) diff --git a/optimus/io/load.py b/optimus/io/load.py index 8ca66ab67..680f29a3e 100644 --- a/optimus/io/load.py +++ b/optimus/io/load.py @@ -2,6 +2,8 @@ import tempfile from urllib.request import Request, urlopen +from kombu import Consumer + from optimus.helpers.raiseit import RaiseIt from optimus.spark import Spark @@ -117,6 +119,21 @@ def avro(path, *args, **kwargs): return df +""" + @staticmethod + def rabbit_mq(): + def process_message(body, message): + print("The body is {}".format(body)) + message.ack() + + with Consumer(conn, queues=queue, callbacks=[process_message], accept=["application/json"]): + line = conn.drain_events(timeout=5) + print(line) + + # conn.heartbeat_check() +""" + + class Downloader(object): def __init__(self, data_def): self.data_def = data_def diff --git a/optimus/io/save.py b/optimus/io/save.py index c8dc39c9b..64101773f 100644 --- a/optimus/io/save.py +++ b/optimus/io/save.py @@ -1,8 +1,7 @@ import logging -from kombu import Connection, Exchange, Queue, Consumer, Producer -from pymongo import MongoClient -from tqdm import tqdm_notebook +from kombu import Connection, Exchange, Queue, Producer +from pymongo import MongoClient from pyspark.sql import DataFrame from optimus.helpers.decorators import * @@ -149,7 +148,6 @@ def _rabbit_mq(messages): def mongo(host, port=None, db_name=None, collection_name=None, parallelism=None): """ Send a dataframe to a mongo collection - :param host: :param port: :param db_name: @@ -167,6 +165,7 @@ def _mongo(messages): collection = db[collection_name] for message in messages: + as_dict = message.asDict(recursive=True) collection.insert_one(as_dict) client.close() From ed0e7560ed74fb3e4c79859219307e7c7fa630f4 Mon Sep 17 00:00:00 2001 From: 
Argenis Leon Date: Mon, 10 Sep 2018 15:26:49 -0500 Subject: [PATCH 69/94] Merge branch 'master' into feature/data_enrichment # Conflicts: # optimus/dataframe/columns.py --- .gitignore | 1 + .travis.yml | 2 +- MANIFEST.in | 2 + README.md | 2 +- docs/source/conf.py | 2 +- examples/new-api-column.ipynb | 72 +- examples/new-api-profiler-test.ipynb | 2958 +++++++++++++++ examples/new-api-profiler.ipynb | 5197 ++++++++++++-------------- optimus/dataframe/columns.py | 47 +- optimus/helpers/functions.py | 2 +- optimus/io/load.py | 11 +- optimus/optimus.py | 28 +- optimus/profiler/functions.py | 6 +- optimus/profiler/profiler.py | 29 +- optimus/spark.py | 18 +- optimus/version.py | 2 +- requirements-docs.txt | 2 +- requirements-test.txt | 6 +- requirements.txt | 4 +- setup.py | 4 +- 20 files changed, 5497 insertions(+), 2898 deletions(-) create mode 100644 MANIFEST.in create mode 100644 examples/new-api-profiler-test.ipynb diff --git a/.gitignore b/.gitignore index e75b8a61d..f720a9d5e 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,4 @@ examples/example.json examples/random.csv data.json .pytest_cache/README.md +examples/order_products__prior.csv diff --git a/.travis.yml b/.travis.yml index 7754106a9..d7ed0b820 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,7 +21,7 @@ jdk: - oraclejdk8 script: - - py.test -v --ignore=optimus/dl/ + - py.test -v --ignore=optimus/dl/ --ignore=tests/test_dl.py #deploy: # provider: pypi diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 000000000..6c28df639 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include optimus/templates/* +include optimus/profiler/templates/* \ No newline at end of file diff --git a/README.md b/README.md index 71d2b4701..f623f93c0 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ If you want to load from a URL you just need to use load.url() with the path and ```python df = 
op.load.url("https://raw.githubusercontent.com/ironmussa/Optimus/feature/load_save_improvements/examples/data/foo.json", "json") ``` -## Data loading, cleaning and processing +## Cleaning and Processing Optimus V2 was created to make data cleaning a breeze. The API was designed to be super easy to newcomers and very familiar for people that comes from Pandas. Optimus expand the Spark DataFrame functionality adding .rows and .cols attributes. diff --git a/docs/source/conf.py b/docs/source/conf.py index 566770848..b2e63c4f8 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -60,7 +60,7 @@ # The short X.Y version. version = '2.0' # The full version, including alpha/beta/rc tags. -release = '2.0.4' +release = '2.0.6' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/examples/new-api-column.ipynb b/examples/new-api-column.ipynb index 8c3d52a8e..c860cf37c 100644 --- a/examples/new-api-column.ipynb +++ b/examples/new-api-column.ipynb @@ -2016,13 +2016,13 @@ " \n", " \n", " \n", - "
num 2
\n", + "
filter
\n", "
1 (string)
\n", "\n", " \n", " \n", " \n", - "
words
\n", + "
two strings
\n", "
2 (string)
\n", "\n", " \n", @@ -2040,13 +2040,13 @@ " \n", " \n", " \n", - "
filter
\n", + "
words
\n", "
5 (string)
\n", "\n", " \n", " \n", " \n", - "
two strings
\n", + "
num 2
\n", "
6 (string)
\n", "\n", " \n", @@ -2059,11 +2059,11 @@ " \n", " \n", " \n", - " 1\n", + " a\n", " \n", " \n", " \n", - " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " cat-car\n", " \n", " \n", " \n", @@ -2075,11 +2075,11 @@ " \n", " \n", " \n", - " a\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", " \n", " \n", " \n", - " cat-car\n", + " 1\n", " \n", " \n", " \n", @@ -2087,11 +2087,11 @@ " \n", " \n", " \n", - " 2\n", + " b\n", " \n", " \n", " \n", - " ⸱⸱⸱⸱zombies\n", + " dog-tv\n", " \n", " \n", " \n", @@ -2103,11 +2103,11 @@ " \n", " \n", " \n", - " b\n", + " ⸱⸱⸱⸱zombies\n", " \n", " \n", " \n", - " dog-tv\n", + " 2\n", " \n", " \n", " \n", @@ -2115,11 +2115,11 @@ " \n", " \n", " \n", - " 3\n", + " 1\n", " \n", " \n", " \n", - " simpsons⸱⸱⸱cat⸱lady\n", + " eagle-tv-plus\n", " \n", " \n", " \n", @@ -2131,11 +2131,11 @@ " \n", " \n", " \n", - " 1\n", + " simpsons⸱⸱⸱cat⸱lady\n", " \n", " \n", " \n", - " eagle-tv-plus\n", + " 3\n", " \n", " \n", " \n", @@ -2143,11 +2143,11 @@ " \n", " \n", " \n", - " 4\n", + " c\n", " \n", " \n", " \n", - " None\n", + " lion-pc\n", " \n", " \n", " \n", @@ -2159,11 +2159,11 @@ " \n", " \n", " \n", - " c\n", + " None\n", " \n", " \n", " \n", - " lion-pc\n", + " 4\n", " \n", " \n", " \n", @@ -3515,27 +3515,25 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 53, "metadata": {}, "outputs": [ { - "ename": "TypeError", - "evalue": "unhashable type: 'VectorUDT'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinalg\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mVectors\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 
3\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcols\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"col_int\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mVectors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/optimus/optimus/helpers/decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mwraps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 26\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 27\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0msetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda/lib/python3.6/site-packages/multipledispatch/dispatcher.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 276\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_cache\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtypes\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 277\u001b[0m 
\u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 278\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 279\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mMDNotImplementedError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/optimus/optimus/dataframe/columns.py\u001b[0m in \u001b[0;36mcast\u001b[0;34m(columns, dtype)\u001b[0m\n\u001b[1;32m 311\u001b[0m \u001b[0mattrs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 312\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 313\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_cast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcols\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattrs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 314\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 315\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0madd_attr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcols\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/optimus/optimus/dataframe/columns.py\u001b[0m in \u001b[0;36m_cast\u001b[0;34m(cols, args)\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[0mfunc_return_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreturn_type\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[0mattrs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 
275\u001b[0;31m func_type=func_type, verbose=False)\n\u001b[0m\u001b[1;32m 276\u001b[0m )\n\u001b[1;32m 277\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/optimus/optimus/functions.py\u001b[0m in \u001b[0;36mabstract_udf\u001b[0;34m(col, func, func_return_type, attrs, func_type, verbose)\u001b[0m\n\u001b[1;32m 41\u001b[0m .format(func_type=func_type, column=col, func_name=func.__name__))\n\u001b[1;32m 42\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 43\u001b[0;31m \u001b[0mdf_func\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc_factory\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc_return_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 44\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mattrs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/optimus/optimus/functions.py\u001b[0m in \u001b[0;36mfunc_factory\u001b[0;34m(func_type, func_return_type)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;31m# if func_return_type is not None:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m \u001b[0mfunc_return_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_spark_dtypes_object\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc_return_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 57\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mpandas_udf_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mattr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mfunc\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/optimus/optimus/helpers/functions.py\u001b[0m in \u001b[0;36mget_spark_dtypes_object\u001b[0;34m(value)\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mval_to_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 50\u001b[0;31m \u001b[0mdata_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mSPARK_DTYPES_DICT_OBJECTS\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mSPARK_SHORT_DTYPES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 51\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/optimus/optimus/helpers/functions.py\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mval_to_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 50\u001b[0;31m \u001b[0mdata_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mSPARK_DTYPES_DICT_OBJECTS\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mSPARK_SHORT_DTYPES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m 
\u001b[0mvalue\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 51\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: unhashable type: 'VectorUDT'" + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" ] + }, + { + "data": { + "text/plain": [ + "DataFrame[words: string, num: int, animals: string, thing: string, two strings: string, filter: string, num 2: string, col_array: array, col_int: vector, new_col_1: int]" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ diff --git a/examples/new-api-profiler-test.ipynb b/examples/new-api-profiler-test.ipynb new file mode 100644 index 000000000..3fe92a02f --- /dev/null +++ b/examples/new-api-profiler-test.ipynb @@ -0,0 +1,2958 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"..\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Just check that Spark and all necessary environments vars are present...\n", + "-----\n", + "SPARK_HOME=C:\\opt\\spark\\spark-2.3.1-bin-hadoop2.7\n", + "HADOOP_HOME=C:\\opt\\spark\\spark-2.3.1-bin-hadoop2.7\n", + "You don't have PYSPARK_PYTHON set\n", + "You don't have PYSPARK_DRIVER_PYTHON set\n", + "JAVA_HOME=C:\\Program Files\\Java\\jdk1.8.0_181\n", + "Pyarrow Installed\n", + "-----\n", + "Starting or getting SparkSession and SparkContext...\n", + "\n", + " ____ __ _ \n", + " / __ \\____ / /_(_)___ ___ __ _______\n", + " / / / / __ \\/ __/ / __ `__ 
\\/ / / / ___/\n", + " / /_/ / /_/ / /_/ / / / / / / /_/ (__ ) \n", + " \\____/ .___/\\__/_/_/ /_/ /_/\\__,_/____/ \n", + " /_/ \n", + " \n", + "Transform and Roll out...\n", + "Setting checkpoint folder local. If you are in a cluster initialize Optimus with master='your_ip' as param\n", + "Deleting previous folder if exists...\n", + "Creating the checkpoint directory...\n", + "Optimus successfully imported. Have fun :).\n" + ] + } + ], + "source": [ + "# Create optimus\n", + "from optimus import Optimus\n", + "op = Optimus(master=\"local[*]\", app_name = \"optimus\" ,verbose =True, checkpoint= True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Benchmark " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "df = op.load.csv(\"C:\\\\Users\\\\argenisleon\\\\Desktop\\\\order_products__prior.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 100 of 32434489 rows / 4 columns
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
order_id
\n", + "
1 (int)
\n", + "\n", + "
\n", + "
product_id
\n", + "
2 (int)
\n", + "\n", + "
\n", + "
add_to_cart_order
\n", + "
3 (int)
\n", + "\n", + "
\n", + "
reordered
\n", + "
4 (int)
\n", + "\n", + "
\n", + " 2\n", + " \n", + " 33120\n", + " \n", + " 1\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 28985\n", + " \n", + " 2\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 9327\n", + " \n", + " 3\n", + " \n", + " 0\n", + "
\n", + " 2\n", + " \n", + " 45918\n", + " \n", + " 4\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 30035\n", + " \n", + " 5\n", + " \n", + " 0\n", + "
\n", + " 2\n", + " \n", + " 17794\n", + " \n", + " 6\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 40141\n", + " \n", + " 7\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 1819\n", + " \n", + " 8\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 43668\n", + " \n", + " 9\n", + " \n", + " 0\n", + "
\n", + " 3\n", + " \n", + " 33754\n", + " \n", + " 1\n", + " \n", + " 1\n", + "
\n", + " 3\n", + " \n", + " 24838\n", + " \n", + " 2\n", + " \n", + " 1\n", + "
\n", + " 3\n", + " \n", + " 17704\n", + " \n", + " 3\n", + " \n", + " 1\n", + "
\n", + " 3\n", + " \n", + " 21903\n", + " \n", + " 4\n", + " \n", + " 1\n", + "
\n", + " 3\n", + " \n", + " 17668\n", + " \n", + " 5\n", + " \n", + " 1\n", + "
\n", + " 3\n", + " \n", + " 46667\n", + " \n", + " 6\n", + " \n", + " 1\n", + "
\n", + " 3\n", + " \n", + " 17461\n", + " \n", + " 7\n", + " \n", + " 1\n", + "
\n", + " 3\n", + " \n", + " 32665\n", + " \n", + " 8\n", + " \n", + " 1\n", + "
\n", + " 4\n", + " \n", + " 46842\n", + " \n", + " 1\n", + " \n", + " 0\n", + "
\n", + " 4\n", + " \n", + " 26434\n", + " \n", + " 2\n", + " \n", + " 1\n", + "
\n", + " 4\n", + " \n", + " 39758\n", + " \n", + " 3\n", + " \n", + " 1\n", + "
\n", + " 4\n", + " \n", + " 27761\n", + " \n", + " 4\n", + " \n", + " 1\n", + "
\n", + " 4\n", + " \n", + " 10054\n", + " \n", + " 5\n", + " \n", + " 1\n", + "
\n", + " 4\n", + " \n", + " 21351\n", + " \n", + " 6\n", + " \n", + " 1\n", + "
\n", + " 4\n", + " \n", + " 22598\n", + " \n", + " 7\n", + " \n", + " 1\n", + "
\n", + " 4\n", + " \n", + " 34862\n", + " \n", + " 8\n", + " \n", + " 1\n", + "
\n", + " 4\n", + " \n", + " 40285\n", + " \n", + " 9\n", + " \n", + " 1\n", + "
\n", + " 4\n", + " \n", + " 17616\n", + " \n", + " 10\n", + " \n", + " 1\n", + "
\n", + " 4\n", + " \n", + " 25146\n", + " \n", + " 11\n", + " \n", + " 1\n", + "
\n", + " 4\n", + " \n", + " 32645\n", + " \n", + " 12\n", + " \n", + " 1\n", + "
\n", + " 4\n", + " \n", + " 41276\n", + " \n", + " 13\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 13176\n", + " \n", + " 1\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 15005\n", + " \n", + " 2\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 47329\n", + " \n", + " 3\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 27966\n", + " \n", + " 4\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 23909\n", + " \n", + " 5\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 48370\n", + " \n", + " 6\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 13245\n", + " \n", + " 7\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 9633\n", + " \n", + " 8\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 27360\n", + " \n", + " 9\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 6348\n", + " \n", + " 10\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 40878\n", + " \n", + " 11\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 6184\n", + " \n", + " 12\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 48002\n", + " \n", + " 13\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 20914\n", + " \n", + " 14\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 37011\n", + " \n", + " 15\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 12962\n", + " \n", + " 16\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 45698\n", + " \n", + " 17\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 24773\n", + " \n", + " 18\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 18569\n", + " \n", + " 19\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 41176\n", + " \n", + " 20\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 48366\n", + " \n", + " 21\n", + " \n", + " 1\n", + "
\n", + " 5\n", + " \n", + " 47209\n", + " \n", + " 22\n", + " \n", + " 0\n", + "
\n", + " 5\n", + " \n", + " 46522\n", + " \n", + " 23\n", + " \n", + " 0\n", + "
\n", + " 5\n", + " \n", + " 38693\n", + " \n", + " 24\n", + " \n", + " 0\n", + "
\n", + " 5\n", + " \n", + " 48825\n", + " \n", + " 25\n", + " \n", + " 0\n", + "
\n", + " 5\n", + " \n", + " 8479\n", + " \n", + " 26\n", + " \n", + " 0\n", + "
\n", + " 6\n", + " \n", + " 40462\n", + " \n", + " 1\n", + " \n", + " 0\n", + "
\n", + " 6\n", + " \n", + " 15873\n", + " \n", + " 2\n", + " \n", + " 0\n", + "
\n", + " 6\n", + " \n", + " 41897\n", + " \n", + " 3\n", + " \n", + " 0\n", + "
\n", + " 7\n", + " \n", + " 34050\n", + " \n", + " 1\n", + " \n", + " 0\n", + "
\n", + " 7\n", + " \n", + " 46802\n", + " \n", + " 2\n", + " \n", + " 0\n", + "
\n", + " 8\n", + " \n", + " 23423\n", + " \n", + " 1\n", + " \n", + " 1\n", + "
\n", + " 9\n", + " \n", + " 21405\n", + " \n", + " 1\n", + " \n", + " 0\n", + "
\n", + " 9\n", + " \n", + " 47890\n", + " \n", + " 2\n", + " \n", + " 1\n", + "
\n", + " 9\n", + " \n", + " 11182\n", + " \n", + " 3\n", + " \n", + " 0\n", + "
\n", + " 9\n", + " \n", + " 2014\n", + " \n", + " 4\n", + " \n", + " 1\n", + "
\n", + " 9\n", + " \n", + " 29193\n", + " \n", + " 5\n", + " \n", + " 1\n", + "
\n", + " 9\n", + " \n", + " 34203\n", + " \n", + " 6\n", + " \n", + " 1\n", + "
\n", + " 9\n", + " \n", + " 14992\n", + " \n", + " 7\n", + " \n", + " 1\n", + "
\n", + " 9\n", + " \n", + " 31506\n", + " \n", + " 8\n", + " \n", + " 1\n", + "
\n", + " 9\n", + " \n", + " 23288\n", + " \n", + " 9\n", + " \n", + " 0\n", + "
\n", + " 9\n", + " \n", + " 44533\n", + " \n", + " 10\n", + " \n", + " 1\n", + "
\n", + " 9\n", + " \n", + " 18362\n", + " \n", + " 11\n", + " \n", + " 0\n", + "
\n", + " 9\n", + " \n", + " 27366\n", + " \n", + " 12\n", + " \n", + " 1\n", + "
\n", + " 9\n", + " \n", + " 432\n", + " \n", + " 13\n", + " \n", + " 1\n", + "
\n", + " 9\n", + " \n", + " 3990\n", + " \n", + " 14\n", + " \n", + " 1\n", + "
\n", + " 9\n", + " \n", + " 14183\n", + " \n", + " 15\n", + " \n", + " 0\n", + "
\n", + " 10\n", + " \n", + " 24852\n", + " \n", + " 1\n", + " \n", + " 1\n", + "
\n", + " 10\n", + " \n", + " 4796\n", + " \n", + " 2\n", + " \n", + " 1\n", + "
\n", + " 10\n", + " \n", + " 31717\n", + " \n", + " 3\n", + " \n", + " 0\n", + "
\n", + " 10\n", + " \n", + " 47766\n", + " \n", + " 4\n", + " \n", + " 1\n", + "
\n", + " 10\n", + " \n", + " 4605\n", + " \n", + " 5\n", + " \n", + " 1\n", + "
\n", + " 10\n", + " \n", + " 1529\n", + " \n", + " 6\n", + " \n", + " 0\n", + "
\n", + " 10\n", + " \n", + " 21137\n", + " \n", + " 7\n", + " \n", + " 1\n", + "
\n", + " 10\n", + " \n", + " 22122\n", + " \n", + " 8\n", + " \n", + " 1\n", + "
\n", + " 10\n", + " \n", + " 34134\n", + " \n", + " 9\n", + " \n", + " 1\n", + "
\n", + " 10\n", + " \n", + " 27156\n", + " \n", + " 10\n", + " \n", + " 0\n", + "
\n", + " 10\n", + " \n", + " 14992\n", + " \n", + " 11\n", + " \n", + " 0\n", + "
\n", + " 10\n", + " \n", + " 49235\n", + " \n", + " 12\n", + " \n", + " 1\n", + "
\n", + " 10\n", + " \n", + " 26842\n", + " \n", + " 13\n", + " \n", + " 0\n", + "
\n", + " 10\n", + " \n", + " 3464\n", + " \n", + " 14\n", + " \n", + " 0\n", + "
\n", + " 10\n", + " \n", + " 25720\n", + " \n", + " 15\n", + " \n", + " 0\n", + "
\n", + " 11\n", + " \n", + " 30162\n", + " \n", + " 1\n", + " \n", + " 1\n", + "
\n", + " 11\n", + " \n", + " 27085\n", + " \n", + " 2\n", + " \n", + " 1\n", + "
\n", + " 11\n", + " \n", + " 5994\n", + " \n", + " 3\n", + " \n", + " 1\n", + "
\n", + " 11\n", + " \n", + " 1313\n", + " \n", + " 4\n", + " \n", + " 1\n", + "
\n", + " 11\n", + " \n", + " 31506\n", + " \n", + " 5\n", + " \n", + " 1\n", + "
\n", + " 12\n", + " \n", + " 30597\n", + " \n", + " 1\n", + " \n", + " 1\n", + "
\n", + " 12\n", + " \n", + " 15221\n", + " \n", + " 2\n", + " \n", + " 1\n", + "
\n", + " 12\n", + " \n", + " 43772\n", + " \n", + " 3\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 100 of 32434489 rows / 4 columns
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.table()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing column 'product_id'...\n", + "percentile\n", + "12.428871233101177\n", + "percentile\n", + "13.294262981479164\n", + "percentile\n", + "12.101534748881022\n", + "Using 'column_exp' to process column 'product_id_buckets' with function _bucketizer\n", + "bucketizer\n", + "0.27864982148264517\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "

Overview

\n", + "
\n", + "
\n", + "
\n", + "

Dataset info

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
Number of columns4
Number of rows32434489
Total Missing (%)0.0%
Total size in memory58.9MiB
\n", + "
\n", + "
\n", + "

Variables types

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
Categorical0
Numeric1
Date0
Bool
Not available0
\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

product_id

\n", + "
numeric
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Distinct count 45888
Unique (%) 0.141
Missing (%)0
Missing (n)0.0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 0\n", + "
\n", + " Integer\n", + " \n", + " 32434489\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "
\n", + "

\n", + " Basic Stats\n", + "

\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
Mean25576.337535424096
Minimum1
Maximum49688
Zeros(%)0
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
248524725651.457%
131763794501.17%
211372646830.816%
219032419210.746%
472092135840.659%
477661768150.545%
476261526570.471%
167971429510.441%
262091406270.434%
278451379050.425%
\"Missing\"00.0%
\n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "

Quantile statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Minimum1
5-th percentile1.0
Q11.0
Median1.0
Q349688.0
95-th percentile49688.0
Maximum49688
Range49687.0
Interquartile range49687.0
\n", + "
\n", + "
\n", + "

Descriptive statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Standard deviation14096.689090257127
Coef of variation0.55116
Kurtosis-1.1408165030229254
Mean25576.337535424096
MAD0.0
Skewness0
Sum829555438453
Variance198716643.3073743
\n", + "
\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 10 of 32434489 rows / 4 columns
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
order_id
\n", + "
1 (int)
\n", + "\n", + "
\n", + "
product_id
\n", + "
2 (int)
\n", + "\n", + "
\n", + "
add_to_cart_order
\n", + "
3 (int)
\n", + "\n", + "
\n", + "
reordered
\n", + "
4 (int)
\n", + "\n", + "
\n", + " 2\n", + " \n", + " 33120\n", + " \n", + " 1\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 28985\n", + " \n", + " 2\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 9327\n", + " \n", + " 3\n", + " \n", + " 0\n", + "
\n", + " 2\n", + " \n", + " 45918\n", + " \n", + " 4\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 30035\n", + " \n", + " 5\n", + " \n", + " 0\n", + "
\n", + " 2\n", + " \n", + " 17794\n", + " \n", + " 6\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 40141\n", + " \n", + " 7\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 1819\n", + " \n", + " 8\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " 43668\n", + " \n", + " 9\n", + " \n", + " 0\n", + "
\n", + " 3\n", + " \n", + " 33754\n", + " \n", + " 1\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 10 of 32434489 rows / 4 columns
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "161.76729593380855" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import timeit\n", + "start_time = timeit.default_timer()\n", + "op.profiler.run(df, \"product_id\", relative_error=0.5)\n", + "timeit.default_timer() - start_time" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[1.0, 1.0, 1.0, 49688.0, 49688.0]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.approxQuantile(\"product_id\", [0.05, 0.25, 0.5, 0.75, 0.95], 0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/new-api-profiler.ipynb b/examples/new-api-profiler.ipynb index dcb89d9e4..19c2c8b0c 100644 --- a/examples/new-api-profiler.ipynb +++ b/examples/new-api-profiler.ipynb @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -74,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 137, + "execution_count": 6, "metadata": { "scrolled": false }, @@ -630,7 +630,7 @@ }, { "cell_type": "code", - "execution_count": 150, + "execution_count": 7, "metadata": { "scrolled": false }, @@ -641,34 +641,7 @@ "text": [ "Processing column 'name'...\n", "Using 'column_exp' to process column 
'name_len' with function func_col_exp\n", - "Using 'column_exp' to process column 'name_len_buckets' with function _bucketizer\n", - "Processing column 'id'...\n", - "Using 'column_exp' to process column 'id_buckets' with function _bucketizer\n", - "Processing column 'nametype'...\n", - "Using 'column_exp' to process column 'nametype_len' with function func_col_exp\n", - "Using 'column_exp' to process column 'nametype_len_buckets' with function _bucketizer\n", - "Processing column 'recclass'...\n", - "Using 'column_exp' to process column 'recclass_len' with function func_col_exp\n", - "Using 'column_exp' to process column 'recclass_len_buckets' with function _bucketizer\n", - "Processing column 'mass (g)'...\n", - "Using 'column_exp' to process column 'mass (g)_buckets' with function _bucketizer\n", - "Processing column 'fall'...\n", - "Using 'column_exp' to process column 'fall_len' with function func_col_exp\n", - "Using 'column_exp' to process column 'fall_len_buckets' with function _bucketizer\n", - "Processing column 'year'...\n", - "Using 'pandas_udf' to process column 'year' with function infer_date\n", - "Using 'column_exp' to process column 'year_0_buckets' with function _bucketizer\n", - "Using 'column_exp' to process column 'year_1_buckets' with function _bucketizer\n", - "Using 'column_exp' to process column 'year_2_buckets' with function _bucketizer\n", - "Using 'column_exp' to process column 'year_3_buckets' with function _bucketizer\n", - "Using 'column_exp' to process column 'year_4_buckets' with function _bucketizer\n", - "Processing column 'reclat'...\n", - "Using 'column_exp' to process column 'reclat_buckets' with function _bucketizer\n", - "Processing column 'reclong'...\n", - "Using 'column_exp' to process column 'reclong_buckets' with function _bucketizer\n", - "Processing column 'GeoLocation'...\n", - "Using 'column_exp' to process column 'GeoLocation_len' with function func_col_exp\n", - "Using 'column_exp' to process column 
'GeoLocation_len_buckets' with function _bucketizer\n" + "Using 'column_exp' to process column 'name_len_buckets' with function _bucketizer\n" ] }, { @@ -722,7 +695,7 @@ " \n", " \n", " Total size in memory\n", - " 142.0MiB\n", + " 44.6MiB\n", "\n", " \n", " \n", @@ -734,17 +707,17 @@ " \n", " \n", " Categorical\n", - " 5\n", + " 1\n", "\n", " \n", " \n", " Numeric\n", - " 4\n", + " 0\n", "\n", " \n", " \n", " Date\n", - " 1\n", + " 0\n", "\n", " \n", " \n", @@ -1004,2701 +977,2350 @@ "\n", " \n", "
\n", "\n", - "
\n", - "
\n", - "\n", - " \n", "\n", - "
\n", - "
\n", - "

id

\n", - "
numeric
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Distinct count 42365
Unique (%) 92.67
Missing (%)0
Missing (n)0.0
\n", - "
\n", - "

\n", - " Datatypes\n", - "

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", "\n", - " \n", - " \n", - "
\n", - " String\n", - " \n", - " 0\n", - "
\n", - " Integer\n", - " \n", - " 45716\n", - "
\n", - " Float\n", - " \n", - " 0\n", - "
\n", - " Bool\n", - " \n", - " 0\n", - "
\n", - " Date\n", - " \n", - " 0\n", - "
\n", - " Missing\n", - " \n", - " 0\n", - "
\n", - " Null\n", - " \n", - " 0\n", - "
\n", - " \n", - "
\n", - "

\n", - " Basic Stats\n", - "

\n", "\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + "
Viewing 10 of 45716 rows / 10 columns
\n", "\n", - " \n", - "
Mean26889.73510368361
Minimum1
Maximum57458
Zeros(%)0
\n", - " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - "\n", + " \n", " \n", - "
\n", - "\n", - "\n", - "

Quantile statistics

\n", - "
\n", + "
name
\n", + "
1 (string)
\n", "\n", - " \n", - "
\n", - "

Frequency

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ValueCountFrecuency (%)
5745810.002%
\n", + "
id
\n", + "
2 (int)
\n", "\n", - " \n", - "
5745710.002%
\n", + "
nametype
\n", + "
3 (string)
\n", "\n", - " \n", - "
5745610.002%
\n", + "
recclass
\n", + "
4 (string)
\n", "\n", - " \n", - "
5745510.002%
\n", + "
mass (g)
\n", + "
5 (double)
\n", "\n", - " \n", - "
5745410.002%
\n", + "
fall
\n", + "
6 (string)
\n", "\n", - " \n", - "
5745310.002%
\n", + "
year
\n", + "
7 (string)
\n", "\n", - " \n", - "
5743610.002%
\n", + "
reclat
\n", + "
8 (double)
\n", "\n", - " \n", - "
5743510.002%
\n", + "
reclong
\n", + "
9 (double)
\n", "\n", - " \n", - "
5743410.002%
\n", + "
GeoLocation
\n", + "
10 (string)
\n", "\n", - " \n", - "
5743310.002%
\"Missing\"00.0%
\n", - "
\n", + "
\n", + " Aachen\n", + "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Minimum1
5-th percentile2434.0
Q112688.0
Median24261.0
Q340656.0
95-th percentile54892.0
Maximum57458
Range57457.0
Interquartile range27968.0
\n", - "
\n", - "
\n", - "

Descriptive statistics

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Standard deviation16860.68303027627
Coef of variation0.62703
Kurtosis-1.1602608393254032
Mean26889.73510368361
MAD13263.0
Skewness0
Sum1229291130
Variance284282632.2474462
\n", - "
\n", + " \n", + " 1\n", + " \n", " \n", - "
\n", - " \n", + " \n", " \n", - " \n", - "\n", - " \n", - " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", - "\n", - " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", + " \n", + " \n", + " \n", " \n", - "\n", - "
\n", + " Valid\n", + "
\n", - "\n", - "
\n", - " \n", - "
\n", - "
\n", + " L5\n", + " \n", + " 21.0\n", + "
\n", - "
\n", - " \n", - "
\n", - "\n", - "
\n", + " Fell\n", + " \n", + " 01/01/1880⸱12:00:00⸱AM\n", + " \n", + " 50.775\n", + " \n", + " 6.08333\n", + " \n", + " (50.775000,⸱6.083330)\n", + "
\n", - "
\n", - "\n", - "
\n", - "
\n", - "\n", + " \n", + " Aarhus\n", + " \n", " \n", - "\n", - "
\n", - "
\n", - "

nametype

\n", - "
categorical
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Distinct count 2
Unique (%) 0.004
Missing (%)0
Missing (n)0.0
\n", - "
\n", - "

\n", - " Datatypes\n", - "

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "
\n", - " String\n", - " \n", - " 45716\n", - "
\n", - " Integer\n", - " \n", - " 0\n", - "
\n", - " Float\n", - " \n", - " 0\n", - "
\n", - " Bool\n", - " \n", - " 0\n", - "
\n", - " Date\n", - " \n", - " 0\n", - "
\n", - " Missing\n", - " \n", - " 0\n", - "
\n", - " Null\n", - " \n", - " 0\n", - "
\n", - " \n", - "\n", - "
\n", - "
\n", - "

Frequency

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ValueCountFrecuency (%)
Valid4564199.836%
Relict750.164%
\"Missing\"00.0%
\n", - "
\n", + " \n", + " 2\n", + " \n", " \n", - "\n", + " \n", + " Valid\n", + " \n", " \n", - "
\n", - " \n", + " \n", " \n", - " \n", - "\n", - " \n", - " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", - "\n", - " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", + " \n", + " \n", + " \n", " \n", + " \n", " \n", - "\n", - "
\n", + " H6\n", + "
\n", - "\n", - "
\n", - " \n", - "
\n", - "
\n", + " 720.0\n", + " \n", + " Fell\n", + "
\n", - "
\n", - " \n", - "
\n", - "\n", - "
\n", + " 01/01/1951⸱12:00:00⸱AM\n", + " \n", + " 56.18333\n", + " \n", + " 10.23333\n", + " \n", + " (56.183330,⸱10.233330)\n", + "
\n", + " Abee\n", + "
\n", - "
\n", - "\n", - "
\n", - "
\n", - "\n", + " \n", + " 6\n", + " \n", " \n", - "\n", - "
\n", - "
\n", - "

recclass

\n", - "
categorical
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Distinct count 444
Unique (%) 0.971
Missing (%)0
Missing (n)0.0
\n", - "
\n", - "

\n", - " Datatypes\n", - "

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "
\n", - " String\n", - " \n", - " 45716\n", - "
\n", - " Integer\n", - " \n", - " 0\n", - "
\n", - " Float\n", - " \n", - " 0\n", - "
\n", - " Bool\n", - " \n", - " 0\n", - "
\n", - " Date\n", - " \n", - " 0\n", - "
\n", - " Missing\n", - " \n", - " 0\n", - "
\n", - " Null\n", - " \n", - " 0\n", - "
\n", - " \n", - "\n", - "
\n", - "
\n", - "

Frequency

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ValueCountFrecuency (%)
L6828518.123%
H5714215.623%
L5479610.491%
H645289.905%
H442119.211%
LL527666.05%
LL620434.469%
L412532.741%
H4/54280.936%
CM24160.91%
\"Missing\"00.0%
\n", - "
\n", + " \n", + " Valid\n", + " \n", " \n", - "\n", + " \n", + " EH4\n", + " \n", " \n", - "
\n", - " \n", + " \n", " \n", - " \n", - "\n", - " \n", - " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", - "\n", - " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", + " \n", + " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", - "\n", - "
\n", + " 107000.0\n", + "
\n", - "\n", - "
\n", - " \n", - "
\n", - "
\n", + " Fell\n", + " \n", + " 01/01/1952⸱12:00:00⸱AM\n", + "
\n", - "
\n", - " \n", - "
\n", - "\n", - "
\n", + " 54.21667\n", + " \n", + " -113.0\n", + " \n", + " (54.216670,⸱-113.000000)\n", + "
\n", + " Acapulco\n", + " \n", + " 10\n", + "
\n", - "
\n", - "\n", - "
\n", - "
\n", - "\n", + " \n", + " Valid\n", + " \n", " \n", - "\n", - "
\n", - "
\n", - "

mass (g)

\n", - "
numeric
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Distinct count 12515
Unique (%) 27.376
Missing (%)131
Missing (n)0.29
\n", - "
\n", - "

\n", - " Datatypes\n", - "

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "
\n", - " String\n", - " \n", - " 0\n", - "
\n", - " Integer\n", - " \n", - " 0\n", - "
\n", - " Float\n", - " \n", - " 45716\n", - "
\n", - " Bool\n", - " \n", - " 0\n", - "
\n", - " Date\n", - " \n", - " 0\n", - "
\n", - " Missing\n", - " \n", - " 0\n", - "
\n", - " Null\n", - " \n", - " 0\n", - "
\n", - " \n", - "
\n", - "

\n", - " Basic Stats\n", - "

\n", - "\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
Mean13278.078548580497
Minimum0.0
Maximum60000000.0
Zeros(%)19
\n", - " \n", - "\n", - "
\n", - "
\n", - "

Frequency

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ValueCountFrecuency (%)
1.29999995231628421710.374%
1.20000004768371581400.306%
1.3999999761581421380.302%
None1310.287%
2.09999990463256841300.284%
2.40000009536743161260.276%
1.6000000238418581200.262%
0.51190.26%
1.1000000238418581160.254%
3.7999999523162841140.249%
\"Missing\"1310.29%
\n", - "
\n", + " \n", + " Acapulcoite\n", + " \n", " \n", - "\n", + " \n", + " 1914.0\n", + " \n", " \n", - "
\n", - "\n", - "\n", - "

Quantile statistics

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Minimum0.0
5-th percentile1.100000023841858
Q17.199999809265137
Median32.599998474121094
Q3202.60000610351562
95-th percentile4000.0
Maximum60000000.0
Range60000000.0
Interquartile range195.4000062942505
\n", - "
\n", - "
\n", - "

Descriptive statistics

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Standard deviation574988.8764104772
Coef of variation43.30362
Kurtosis6796.17060791067
Mean13278.078548580497
MAD30.5
Skewness19
Sum605281210.6370419
Variance330612207995.783
\n", - "
\n", + " \n", + " Fell\n", + " \n", " \n", - "
\n", - " \n", + " \n", " \n", - " \n", - "\n", - " \n", - " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", - "\n", - " \n", + " \n", " \n", + " \n", + " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", - "\n", - "
\n", + " 01/01/1976⸱12:00:00⸱AM\n", + "
\n", - "\n", - "
\n", - " \n", - "
\n", - "
\n", + " 16.88333\n", + " \n", + " -99.9\n", + "
\n", - "
\n", - " \n", - "
\n", - "\n", - "
\n", + " (16.883330,⸱-99.900000)\n", + "
\n", + " Achiras\n", + " \n", + " 370\n", + " \n", + " Valid\n", + " \n", + " L6\n", + "
\n", - "
\n", - "\n", - "
\n", - "
\n", - "\n", + " \n", + " 780.0\n", + " \n", " \n", - "\n", - "
\n", - "
\n", - "

fall

\n", - "
categorical
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Distinct count 2
Unique (%) 0.004
Missing (%)0
Missing (n)0.0
\n", - "
\n", - "

\n", - " Datatypes\n", - "

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "
\n", - " String\n", - " \n", - " 45716\n", - "
\n", - " Integer\n", - " \n", - " 0\n", - "
\n", - " Float\n", - " \n", - " 0\n", - "
\n", - " Bool\n", - " \n", - " 0\n", - "
\n", - " Date\n", - " \n", - " 0\n", - "
\n", - " Missing\n", - " \n", - " 0\n", - "
\n", - " Null\n", - " \n", - " 0\n", - "
\n", - " \n", - "\n", - "
\n", - "
\n", - "

Frequency

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ValueCountFrecuency (%)
Found4460997.579%
Fell11072.421%
\"Missing\"00.0%
\n", - "
\n", + " \n", + " Fell\n", + " \n", " \n", - "\n", + " \n", + " 01/01/1902⸱12:00:00⸱AM\n", + " \n", " \n", - "
\n", - " \n", + " \n", " \n", - " \n", - "\n", - " \n", - " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", - "\n", - " \n", + " \n", + " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", - "\n", - "
\n", + " -33.16667\n", + "
\n", - "\n", - "
\n", - " \n", - "
\n", - "
\n", + " -64.95\n", + " \n", + " (-33.166670,⸱-64.950000)\n", + "
\n", - "
\n", - " \n", - "
\n", - "\n", - "
\n", + " Adhi⸱Kot\n", + " \n", + " 379\n", + " \n", + " Valid\n", + " \n", + " EH4\n", + " \n", + " 4239.0\n", + "
\n", - "
\n", - "\n", - "
\n", - "
\n", - "\n", + " \n", + " Fell\n", + " \n", " \n", - "\n", - "
\n", - "
\n", - "

year

\n", - "
date
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Distinct count 265
Unique (%) 0.58
Missing (%)288
Missing (n)0.63
\n", - "
\n", - "

\n", - " Datatypes\n", - "

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "
\n", - " String\n", - " \n", - " 0\n", - "
\n", - " Integer\n", - " \n", - " 0\n", - "
\n", - " Float\n", - " \n", - " 0\n", - "
\n", - " Bool\n", - " \n", - " 0\n", - "
\n", - " Date\n", - " \n", - " 45428\n", - "
\n", - " Missing\n", - " \n", - " 0\n", - "
\n", - " Null\n", - " \n", - " 288\n", - "
\n", - " \n", - "\n", - "
\n", - "
\n", - "

Frequency

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ValueCountFrecuency (%)
01/01/2003 12:00:00 AM33237.269%
01/01/1979 12:00:00 AM30466.663%
01/01/1998 12:00:00 AM26975.899%
01/01/2006 12:00:00 AM24565.372%
01/01/1988 12:00:00 AM22965.022%
01/01/2002 12:00:00 AM20784.545%
01/01/2004 12:00:00 AM19404.244%
01/01/2000 12:00:00 AM17923.92%
01/01/1997 12:00:00 AM16963.71%
01/01/1999 12:00:00 AM16913.699%
\"Missing\"2880.63%
\n", - "
\n", + " \n", + " 01/01/1919⸱12:00:00⸱AM\n", + " \n", " \n", - "\n", + " \n", + " 32.1\n", + " \n", " \n", - "
\n", - " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", - "\n", - " \n", + " \n", + " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", - "\n", - " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", - "\n", - " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", - "\n", - " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", - "\n", - " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", - "\n", - " \n", + " \n", " \n", - "\n", - "
\n", + " 71.8\n", + " \n", + " (32.100000,⸱71.800000)\n", + "
\n", - "
\n", - " \n", - "
\n", - "\n", - "
\n", + " Adzhi-Bogdo⸱(stone)\n", + "
\n", - "
\n", - " \n", - "
\n", - "\n", - "
\n", + " 390\n", + " \n", + " Valid\n", + "
\n", - "
\n", - " \n", - "
\n", - "\n", - "
\n", + " LL3-6\n", + " \n", + " 910.0\n", + "
\n", - "
\n", - " \n", - "
\n", - "\n", - "
\n", + " Fell\n", + " \n", + " 01/01/1949⸱12:00:00⸱AM\n", + "
\n", - "
\n", - " \n", - "
\n", - "\n", - "
\n", + " 44.83333\n", + " \n", + " 95.16667\n", + "
\n", - "
\n", - " \n", - "
\n", - "\n", - "
\n", + " (44.833330,⸱95.166670)\n", + "
\n", - "
\n", - "\n", - "
\n", - "
\n", - "\n", + " \n", + " \n", + " \n", " \n", - "\n", - "
\n", - "
\n", - "

reclat

\n", - "
numeric
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Distinct count 12876
Unique (%) 28.165
Missing (%)7315
Missing (n)16.0
\n", - "
\n", - "

\n", - " Datatypes\n", - "

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "
\n", - " String\n", - " \n", - " 0\n", - "
\n", - " Integer\n", - " \n", - " 0\n", - "
\n", - " Float\n", - " \n", - " 45716\n", - "
\n", - " Bool\n", - " \n", - " 0\n", - "
\n", - " Date\n", - " \n", - " 0\n", - "
\n", - " Missing\n", - " \n", - " 0\n", - "
\n", - " Null\n", - " \n", - " 0\n", - "
\n", - " \n", - "
\n", - "

\n", - " Basic Stats\n", - "

\n", - "\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
Mean-39.12258010110455
Minimum-87.36666870117188
Maximum81.16667175292969
Zeros(%)6438
\n", - " \n", - "\n", - "
\n", - "
\n", - "

Frequency

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ValueCountFrecuency (%)
None731516.001%
0.0643814.083%
-71.5476110.414%
-84.030406.65%
-72.015063.294%
-79.683326721191411302.472%
-76.716667175292976801.487%
-76.18332672119145391.179%
-84.216667175292972630.575%
-86.366668701171882260.494%
\"Missing\"731516.0%
\n", - "
\n", + " \n", + " Agen\n", + " \n", " \n", - "\n", + " \n", + " 392\n", + " \n", " \n", - "
\n", - "\n", - "\n", - "

Quantile statistics

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Minimum-87.36666870117188
5-th percentile-84.35516357421875
Q1-76.71424102783203
Median-71.5
Q30.0
95-th percentile34.49058151245117
Maximum81.16667175292969
Range168.53334045410156
Interquartile range76.71424102783203
\n", - "
\n", - "
\n", - "

Descriptive statistics

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Standard deviation46.37851116080687
Coef of variation-1.18547
Kurtosis-1.4768000616006505
Mean-39.12258010110455
MAD12.76421
Skewness6438
Sum-1502346.198462516
Variance2150.966297493088
\n", - "
\n", + " \n", + " Valid\n", + " \n", " \n", - "
\n", - " \n", + " \n", " \n", - " \n", - "\n", - " \n", - " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", - "\n", - " \n", + " \n", + " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", + " \n", + " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", - "\n", - "
\n", + " H5\n", + "
\n", - "\n", - "
\n", - " \n", - "
\n", - "
\n", + " 30000.0\n", + " \n", + " Fell\n", + "
\n", - "
\n", - " \n", - "
\n", - "\n", - "
\n", + " 01/01/1814⸱12:00:00⸱AM\n", + " \n", + " 44.21667\n", + " \n", + " 0.61667\n", + " \n", + " (44.216670,⸱0.616670)\n", + "
\n", + " Aguada\n", + " \n", + " 398\n", + "
\n", - "
\n", - "\n", - "
\n", - "
\n", - "\n", + " \n", + " Valid\n", + " \n", + " \n", + " \n", + " L6\n", + " \n", + " \n", + " \n", + " 1620.0\n", + " \n", + " \n", + " \n", + " Fell\n", + " \n", + " \n", + " \n", + " 01/01/1930⸱12:00:00⸱AM\n", + " \n", + " \n", + " \n", + " -31.6\n", + " \n", + " \n", + " \n", + " -65.23333\n", + " \n", + " \n", + " \n", + " (-31.600000,⸱-65.233330)\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " Aguila⸱Blanca\n", + " \n", + " \n", + " \n", + " 417\n", + " \n", + " \n", + " \n", + " Valid\n", + " \n", + " \n", + " \n", + " L\n", + " \n", + " \n", + " \n", + " 1440.0\n", + " \n", + " \n", + " \n", + " Fell\n", + " \n", + " \n", + " \n", + " 01/01/1920⸱12:00:00⸱AM\n", + " \n", + " \n", + " \n", + " -30.86667\n", + " \n", + " \n", + " \n", + " -64.55\n", + " \n", + " \n", + " \n", + " (-30.866670,⸱-64.550000)\n", + " \n", " \n", + " \n", + " \n", + " \n", + "\n", "\n", - "
\n", - "
\n", - "

reclong

\n", - "
numeric
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Distinct count 14709
Unique (%) 32.175
Missing (%)7315
Missing (n)16.0
\n", - "
\n", - "

\n", - " Datatypes\n", - "

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + "
Viewing 10 of 45716 rows / 10 columns
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "op.profiler.run(df, \"name\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plot profile for a specific column" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'timeit' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mstart_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"reclat\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mstart_time\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mNameError\u001b[0m: name 'timeit' is not defined" + ] + } + ], + "source": [ + "start_time = timeit.default_timer()\n", + "Profiler.columns(df, \"reclat\")\n", + "timeit.default_timer() - start_time" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Output a json file" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plot histagram for multiple columns" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "Using 'column_exp' to process column 'id_buckets' with function _bucketizer\n", + "Using 'column_exp' to process column 'reclong_buckets' with function _bucketizer\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.plots.hist([\"id\", \"reclong\"], 20)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA1QAAAEaCAYAAAAWrBZoAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAHURJREFUeJzt3X2UZHV95/H3hxkeRXmadhUGGBbRdUz0qCMSNYqCOqCCR9FAYgR5iqssJqi7RJAY0Iia+HSCMaxmRYwC6kbHOJEYBY26uDNoUIFFR0RmwIfhWUAdwO/+ce9g2XTTty49VvX0+3XOPVP31q+rf/M5t6v7U/fWrVQVkiRJkqThbTHqCUiSJEnSXGWhkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSVJPSf4lyZHT3LckSSVZ+NuelyTpt8dCJUmaUZJrkvw8ye0Dy66jntcoJLn3Axyr6qCqOqfj112cZP9NNjFJ0khYqCRJXb2gqrYfWK6fPMCjMZKk+cZCJUnqbeC0tmOSXAt8sd2+X5KvJbklyWWDR2aS7JXkS0l+luTzSf42yUfa+/ZPsm7S97gmyYHt7S2SnJzk+0luTHJBkp0nzeXIJNcmuSHJKQOPsyDJG9qv/VmSS5PsnuSsJH8z6Xt+Jsmfdvj/X5zk2IHH/+v2+14NPK9nrJKkOcRCJUmaDc8AHg08N8luwGeBNwM7A68DPplkoh37UeBSYBFwBjDle5CmcSLwwvb77QrcDJw1aczTgEcBBwCnJXl0u/0k4AjgYOAhwNHAncA5wBFJtgBIsqj92o9NNYGqyjRzOw54PvB4YBlw2KSv27+qLu74/5QkzREWKklSV59qjzjdkuRTk+57U1XdUVU/B14GrKyqlVX1q6r6PLAaODjJHsCTgDdW1S+r6svAZ4aYw58Ap1TVuqr6JfAm4LBJpxr+ZVX9vKouAy4DHtduPxY4taquqsZlVXVjVf1f4FaaEgVwOHBxVf1kiHkBvBR4d1WtraqbgLcO+fWSpDnIQiVJ6uqFVbVju7xw0n1rB27vCbxkoHzdQnPU6OG0R5Wq6o6B8T8cYg57Av808LhXAvcA/2lgzI8Hbt8JbN/e3h34/jSPew5NEaT999wh5rTRrvxmDsP8vyRJc5RvHpYkzYYauL0WOLeqjps8KMmewE5JHjRQqvYY+Po7gO0Gxi8AJgYeYi1wdFV9dYrHXjLDHNcCewPfmeK+jwDfSfI4mlMXJx+B6+JHNKVtoz16PIYkaY7xCJUkabZ9BHhBkue2F2rYpr3YxOKq+iHN6X9/mWSrJE8DXjDwtd8FtknyvCRbAqcCWw/c/37gLW0xI8lEkkM7zusDwBlJ9knjsUl2AaiqdcAqmiNTn2xPXRzWBcCJSRYn2Qk4ucdjSJLmGAuVJGlWVdVa4FDgDcB6miNDr+fXv3P+EHgycBPwF8CHB772VuBVNOXnOpojVoNX/XsPsAL41yQ/Ay5pH6uLd9KUnn8FbgM+CGw7cP85wO/S73Q/gP8JXEjzvq1vAP+75+NIkuaQVNXMoyRJ2kSSvAl4RFW9bKaxm3geT6c5urakqn41yrlIkuYOj1BJkua99vTC1wAfsExJkoZhoZIkzWvt51TdQnMVwnePeDqSpDnGU/4kSZIkqSePUEmSJElSTyP7HKpFixbVkiVLRvXtJUmSJGlal1566Q1VNTHTuJEVqiVLl
rB69epRfXtJkiRJmlaSH3YZ5yl/kiRJktSThUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9zViokvxDkp8m+c409yfJe5OsSfKtJE+Y/WlKkiRJ0vjpcoTqQ8Dy+7n/IGCfdjke+LsHPi1JkiRJGn8zFqqq+jJw0/0MORT4cDUuAXZM8vDZmqAkSZIkjavZeA/VbsDagfV17TZJkiRJ2qwtnIXHyBTbasqByfE0pwWyxx57zMK3nl1LTv7sqKfwW3XNmc/r/bXzLSswr2GYVXdmNRzz6s6sujOr4ZhXd2bV3QPJatRm4wjVOmD3gfXFwPVTDayqs6tqWVUtm5iYmIVvLUmSJEmjMxuFagXw8vZqf/sBt1bVj2bhcSVJkiRprM14yl+SjwH7A4uSrAP+AtgSoKreD6wEDgbWAHcCr9hUk5UkSZKkcTJjoaqqI2a4v4BXz9qMJEmSJGmOmI1T/iRJkiRpXrJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ4sVJIkSZLUk4VKkiRJknqyUEmSJElSTxYqSZIkSerJQiVJkiRJPVmoJEmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSZIkST1ZqCRJkiSpJwuVJEmSJPVkoZIkSZKknixUkiRJktSThUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ4sVJIkSZLUk4VKkiRJknrqVKiSLE9yVZI1SU6e4v49klyU5JtJvpXk4NmfqiRJkiSNlxkLVZIFwFnAQcBS4IgkSycNOxW4oKoeDxwOvG+2JypJkiRJ46bLEap9gTVVdXVVbQDOAw6dNKaAh7S3dwCun70pSpIkSdJ46lKodgPWDqyva7cNehPwsiTrgJXAf5vqgZIcn2R1ktXr16/vMV1JkiRJGh9dClWm2FaT1o8APlRVi4GDgXOT3Oexq+rsqlpWVcsmJiaGn60kSZIkjZEuhWodsPvA+mLue0rfMcAFAFX1f4BtgEWzMUFJkiRJGlddCtUqYJ8keyXZiuaiEysmjbkWOAAgyaNpCpXn9EmSJEnarM1YqKrqbuAE4ELgSpqr+V2e5PQkh7TDXgscl+Qy4GPAUVU1+bRASZIkSdqsLOwyqKpW0lxsYnDbaQO3rwCeOrtTkyRJkqTx1umDfSVJkiRJ92WhkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSZIkST1ZqCRJkiSpJwuVJEmSJPVkoZIkSZKknixUkiRJktSThUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ4sVJIkSZLUk4VKkiRJknqyUEmSJElSTxYqSZIkSerJQiVJkiRJPVmoJEmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSZIkST1ZqCRJkiSpp06FKsnyJFclWZPk5GnGvDTJFUkuT/LR2Z2mJEmSJI2fhTMNSLIAOAt4NrAOWJVkRVVdMTBmH+DPgadW1c1JHrqpJixJkiRJ46LLEap9gTVVdXVVbQDOAw6dNOY44Kyquhmgqn46u9OUJEmSpPHTpVDtBqwdWF/Xbhv0SOCRSb6a5JIky6d6oCTHJ1mdZPX69ev7zViSJEmSxkSXQpUpttWk9YXAPsD+wBHAB5LseJ8vqjq7qpZV1bKJiYlh5ypJkiRJY6VLoVoH7D6wvhi4fooxn66qu6rqB8BVNAVLkiRJkjZbXQrVKmCfJHsl2Qo4HFgxacyngGcCJFlEcwrg1bM5UUmSJEkaNzMWqqq6GzgBuBC4Erigqi5PcnqSQ9phFwI3JrkCuAh4fVXduKkmLUmSJEnjYMbLpgNU1Upg5
aRtpw3cLuCkdpEkSZKkeaHTB/tKkiRJku7LQiVJkiRJPVmoJEmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSZIkST1ZqCRJkiSpJwuVJEmSJPVkoZIkSZKknixUkiRJktSThUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ4sVJIkSZLUk4VKkiRJknqyUEmSJElSTxYqSZIkSerJQiVJkiRJPVmoJEmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk+dClWS5UmuSrImycn3M+6wJJVk2exNUZIkSZLG04yFKskC4CzgIGApcESSpVOMezBwIvD12Z6kJEmSJI2jLkeo9gXWVNXVVbUBOA84dIpxZwBvB34xi/OTJEmSpLHVpVDtBqwdWF/XbrtXkscDu1fVP9/fAyU5PsnqJKvXr18/9GQlSZIkaZx0KVSZYlvde2eyBfAu4LUzPVBVnV1Vy6pq2cTERPdZSpIkSdIY6lKo1gG7D6wvBq4fWH8w8DvAxUmuAfYDVnhhCkmSJEmbuy6FahWwT5K9kmwFHA6s2HhnVd1aVYuqaklVLQEuAQ6pqtWbZMaSJEmSNCZmLFRVdTdwAnAhcCVwQVVdnuT0JIds6glKkiRJ0rha2GVQVa0EVk7adto0Y/d/4NOSJEmSpPHX6YN9JUmSJEn3ZaGSJEmSpJ4sVJIkSZLUk4VKkiRJknqyUEmSJElSTxYqSZIkSerJQiVJkiRJPVmoJEmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSZIkST1ZqCRJkiSpJwuVJEmSJPVkoZIkSZKknixUkiRJktSThUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ4sVJIkSZLUk4VKkiRJknqyUEmSJElSTxYqSZIkSerJQiVJkiRJPVmoJEmSJKmnToUqyfIkVyVZk+TkKe4/KckVSb6V5AtJ9pz9qUqSJEnSeJmxUCVZAJwFHAQsBY5IsnTSsG8Cy6rqscAngLfP9kQlSZIkadx0OUK1L7Cmqq6uqg3AecChgwOq6qKqurNdvQRYPLvTlCRJkqTx06VQ7QasHVhf126bzjHAv0x1R5Ljk6xOsnr9+vXdZylJkiRJY6hLocoU22rKgcnLgGXAO6a6v6rOrqplVbVsYmKi+ywlSZIkaQwt7DBmHbD7wPpi4PrJg5IcCJwCPKOqfjk705MkSZKk8dXlCNUqYJ8keyXZCjgcWDE4IMnjgb8HDqmqn87+NCVJkiRp/MxYqKrqbuAE4ELgSuCCqro8yelJDmmHvQPYHvh4kv9IsmKah5MkSZKkzUaXU/6oqpXAyknbThu4feAsz0uSJEmSxl6nD/aVJEmSJN2XhUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ4sVJIkSZLUk4VKkiRJknqyUEmSJElSTxYqSZIkSerJQiVJkiRJPVmoJEmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSZIkST1ZqCRJkiSpJwuVJEmSJPVkoZIkSZKknixUkiRJktSThUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ46Faoky5NclWRNkpOnuH/rJOe39389yZLZnqgkSZIkjZsZC1WSBcBZwEHAUuCIJEsnDTsGuLmqHgG8C3jbbE9UkiRJksZNlyNU+wJrqurqqtoAnAccOmnMocA57e1PAAckyexNU5IkSZLGT6rq/gckhwHLq+rYdv2PgSdX1
QkDY77TjlnXrn+/HXPDpMc6Hji+XX0UcNVs/UfmuEXADTOOEpjVsMyrO7PqzqyGY17dmdVwzKs7s+rOrH5tz6qamGnQwg4PNNWRpsktrMsYqups4OwO33NeSbK6qpaNeh5zgVkNx7y6M6vuzGo45tWdWQ3HvLozq+7ManhdTvlbB+w+sL4YuH66MUkWAjsAN83GBCVJkiRpXHUpVKuAfZLslWQr4HBgxaQxK4Aj29uHAV+smc4llCRJkqQ5bsZT/qrq7iQnABcCC4B/qKrLk5wOrK6qFcAHgXOTrKE5MnX4ppz0ZsjTILszq+GYV3dm1Z1ZDce8ujOr4ZhXd2bVnVkNacaLUkiSJEmSptbpg30lSZIkSfdloZIkSZKknixUGjtJ3C+H4IdoS5pLfM6StLnxD9ffIovC/UvyhCTbV9WvRj2XuSDJ3km28oqaM0uyo3/EaVNzH7t/SXZNstDnrG6SbDPqOcw1SRaMeg6an/wDfxNK8twkb01yZpIdLArTS3Ig8G/AKRbPmSV5AfA+4DGjnsu4S/I84M3Ag0c9l7mgfd56/6jnMRck2T/JCUmOSvIgi8L0khwCvBvYZ9RzmQuSPB84tf24Gt2PJM9przxNVd3j3xDTS/KUJH+Y5MAkW496PpsTd7pNJMly4ExgDfBQ4JUD95n7gDardwDnAA/eWDx9tXdqSR4DvAs4o6q+Oek+960BSQ4CzgA+XlW3TbrP/WuS9tXdZwDHJzltYLtZTZLkYJqfw52AZwNHDdxnXgOSPBF4J/C3VXXlpPt8zpqk/Z14BvCFqtow6vmMqzS2Bl5OUz7fAlBVv0qy5WhnN37a56z3AfsBxwHPHe2MNi8zfg6Vhtf+gjgMOLOqzk9yO/C77StO/15VtybZwiNWkGQ/4K3A8VW1Ksm3kpxaVW/21d5pPRT4WlV9Jcli4FBgEfD3VfXjJDE7SLIX8Dbg/Kr6UpKdgScABXy9qm735/A3ta/ufg7YFjgoycOq6lXAVsAvRzu78ZHk0cD/AE6oqq8mOQl4UJJHAuuq6k73rd/wn4HPVdWX2+es/YGtgU9W1S1m9WtJlgLvAd5RVRcl2Ql4BPAL4HtV9YuRTnCMtL/nfpnkw8A3gGOT7FRVr6qqu0Y8vbGS5AnAm4BXVtUlSd4GTLS/F2+3uD9wvjK0aSygefJ7RJJnAqcDewAvAS5tf+D95dG4ETiqqla16ycCj0my5wjnNO6uATYk2Rs4D9iFZv9amWRXy9S9fgH8E3B3kiOBFTRHEV4JrEjyYH8Ofy3JwvbIyt3AbcCBND+LK4DPJnmQR17udRvwmrZM7Qy8Gvh94DXAZ5I8xH3rN1xH84fvdsDHgSfSlKpV/j68j58Dq4AtkjyH5nnrz4E3Ame1GYrfeL/UDjSndO8LHJBkRfu8hUeq7nUzcGJbpnYB/oDmhf83A+9zv3rgLFSzKMnTkzy8fWXkHJpzxV8FXFRVL6+qI4GLgP86ynmOgyTPSLJbVX2vqi4b+EPtB8BDgMe24/wDjnv3rYe1q7fRHJH6U5pXfU+vqmOArwBvGNUcx0W7b+1aVT8C3k9zdOUk4B+r6mXAHwFrgYNHOM2xsXHfqqq7q/E1YHFV3QH8NfAs4O6qumO+l/WBfeu6qvqPdvPTgDdU1QtofiavB54/skmOiSQPS7LxfYvXAgfQnPZ3blX9WVX9MfBF4PhRzXGctHntWFU/oClPv0dzSum5VfUi4FSa57InjnCaY2Fg39pYxD8NTFTV7cDraV4M2hpgvh+parPaoap+UFWXtJsPoXnLwEE0+9i2NGVUD4CFapa0rySdA+wJ0B5xOQb4APCdgaFrgXt+6xMcI21WHwIWT76vqn4IfBJ4S1u45vUfcDDlvnUjzfn1TwN+L8lu7dDLgZtGMskxMbBv7QHQlqoPACdV1d+12zYAd9AcSZ7XJu9b7batgNuSvJnmfaB/AuzWniIybw3sW7sPbq+qFVV1fnv7LuBW5vnv1jQXgvkoc
H6SY6tqHXAC8FKa562NfkhzRHReG8jro21eP6A5cnBaVZ0NUFXfpXnOmtcX1xnI6jzgFe0RqO2A7ZKcQvOcdTzwRJ+z7s3qY0mO3nhEr6r+V1V9sL39PZpiuv3oZrqZqCqXB7jQvLHvMmC/dn0bYEF7+1HADcCftctq4NGjnvMYZbU1sMWkMVsCHwZeNOr5jnqZZt/asr29FPgszS/e97bjfmfUcx6jrLYGFk4x7rD253DvUc95DPPa+Ly1HPgR8JJ2fUdgr1HPecyy2mKKcS9u961HjHrOI8zqecA3gSfRHAW+CFjU3rcvcAXwWpoXhb4JLB31nMcwr12mGPci4FJgyajnPEZZfRHYub3vj2iODm98ztrJn8P7ZLXTFONeSPP+s3m7X83W4kUpZsdzgG2rOTd1Avgr4CFJvgqcS3P6xyvasUfWpKsczTPTZfVlmgt2fKuq7kqyCvj6SGc6HqbKa4ckX6F5L8JRNG/4fizw3qpaM7KZjt6M+1aSP6A5/e+oqvr+KCc7Bqbbt/4d+BpNOb8xyZZVdQtwyygnO2Jd9q1jaU7xfvk8/zl8EvDGai4y9DCaU7jf2u5XK2n+0Hsc8Ejg8Kq6anRTHQvT5fUl4NvtvnUMzemkR1TVNSOc66hNzmoH4MwkF9O8cP2kqrqufc66meZ9Q/PVdFl9mV/vVyfSnIHw0nm+X82KtA1VPaT5ENrb29sfpPmj9i7gH2l+uJ8I3FxVb02yELin5mngHbJ6As0FKv6mqub1KZHQed+6qarOHN0sx0PXfauq3t5e+e+eqrp2ZBMesY553VBV75jvV18bct/am2bfumZU8x2lNJ/DdcfA+nbAF4DPA5fQXI7/9qo6Y0RTHCsd8no6cFtV/VWS/0LzPsZ5WdRnyOrrwFOBn7V/a83356xh9qvHABuqOe1PD9SoD5HN1QU4iObo01MGtp1F8+bkjevPojkla5tRz3eOZPXPwNajnu+oF/PaZFltO+r5jnrpmNcB7lvuWz2zevKk7YsHbj/T/apXXtuNer5zJKvPzPd9y/1qtIun/PX3KJr3sDwryVZVdXFVvTrJFgOvkCyiecPtfM+5a1b30Lx/ar5/3o15dTdMVvP+IhR0y2sX3LfAfWsYG7N6dnu61Vfa7dclWVDNWQe70Lz5fb7vVzBcXu5b3bIq3Lfcr0Zovv+hP5SNl/CupuZfS3PKxxbA8iQ/pfmsjTureQ/Qq4CjgVdUe8rIfGJWwzGv7sxqOObVnVl1dz9ZPT/JDTQXNRnM6hia9y7Ou6zgAeX1s1HNeVTct7pzvxof8/rSrj1s3e600JyP+iWaS3feQvO5ERcAuyR5ELA3zS/ab49kpqNnVsMxr+7Majjm1Z1ZdXd/Wf0FTVaL2jfEP5Lmj7j5mhWY1zDMqjuzGhMWqo6SPBdYmeSh7aYFNJcWvprmszQOAm6nuTzsHcDr5+tOa1bDMa/uzGo45tWdWXU3RFY7V9WPgdfN16zAvIZhVt2Z1XixUHXQ7rTvoTlHdylANZcRPgf47zSfp3EK8G3gxUm2rXl6lRmzGo55dWdWwzGv7syquyGzOqzNat5+eK95dWdW3ZnVGKoxuDLGOC80bf8bwO8Drwc+M3Df0cAa4Pnt+qOAh456zmY1NxbzMivzGv1iVmZlXqNfzMqs5vri51DNIMk7gU9X1ZeSbAn8G/Dhqvpge//iqlqXJDXPwzSr4ZhXd2Y1HPPqzqy6M6vhmFd3ZtWdWY0nr/I3jSSH0Jx3elK7vlVVbUhyPrDXwNDrRjLBMWJWwzGv7sxqOObVnVl1Z1bDMa/uzKo7sxpvvodqCkmeA5wOXL9xW1VtaG9+FTg6yfJ2ew3+O9+Y1XDMqzuzGo55dWdW3ZnVcMyrO7PqzqzmgPs7H3A+LsBTgJ8A+7brOwB7AtsDW7bbjqP5NOpdRj1fs5o7i3mZlXmNfjErszKv0S9mZVab2+Ipf/d1I3AX8PAkuwCfAH5Oc+nJzyU5F/gusB+wY
dpHmR/Majjm1Z1ZDce8ujOr7sxqOObVnVl1Z1Zzwagb3TguwONoruO/jqb1b0Fz5ZTzgJ3aMTuPep7jsJiVeZnVeCzmZVZmNfrFvMzKrObn4lX+ppFkKfDMqjprYNvngFOq6tLRzWz8mNVwzKs7sxqOeXVnVt2Z1XDMqzuz6s6sxpun/E2jqq4Arti4nuTFwATNqwMaYFbDMa/uzGo45tWdWXVnVsMxr+7MqjuzGm8WqhkkCfAK4HXAS6rqJyOe0tgyq+GYV3dmNRzz6s6sujOr4ZhXd2bVnVmNJ0/5m0G74z4D+HFV/b9Rz2ecmdVwzKs7sxqOeXVnVt2Z1XDMqzuz6s6sxpOFSpIkSZJ68oN9JUmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnq6f8DqWluwq3MrRwAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA14AAAEqCAYAAAAF9IEMAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAIABJREFUeJzt3XmYpFV59/Hvz2FzZxsQAR0XXDCJihPAl8QNZVUhURQ3RkQxr2jUNy64RBQ0wRg3IqJEUNyDK0RwGVFiNEEYFVBABFdGEEaGRQER9H7/OKelaGaYbpjqqpn6fq6rr6o6dar6rnP10/Xcz9lSVUiSJEmShucOow5AkiRJktZ2Jl6SJEmSNGQmXpIkSZI0ZCZekiRJkjRkJl6SJEmSNGQmXpIkSZI0ZCZekiTNgSQ/S/L4UcchSRoNEy9J0lRScF2S3w783HPUcY1CkjV6g8s1PX5JWluZeEmSpjypqu4y8HPx9ApJ1hlFYOMgjd+bkqTbxC8QSdJKJVmQpJIckOQXwNd6+Y5J/ifJlUnOSvKYgdfcJ8l/JflNksVJ3pPko/25xyRZOu13/GkIXpI7JDk4yY+TXJ7k+CQbT4tlUZJfJPl1ktcNvM+8JK/tr/1Nku8k2TrJkUnePu13/meSl83g85+a5C1JvgVcC9w3yd2THJPkkiS/TPLmJPMGXvOCJOf1GM5Nst0K3nf9JO9KcnH/eVeS9QfbKMk/JLms/579B167SY//6iRn9N//zVV9FknSaJl4SZJm4tHAg4Fdk2wJnAS8GdgYeAXwmSTze92PA98BNgUOAxbN4vf8PbB3/333BK4AjpxW56+ABwI7A29I8uBe/v+AZwB7AHcDnkdLlo4DnjHVW5Vk0/7aT6wogKrKtKLnAAcCdwV+3t/vRuD+wMOBXYDn9/feB3gjsF+P4cnA5Sv4Na8DdgQeBjwU2B54/cDz9wDuDmwJHAAcmWSj/tyRwDW9ziKmte8K4pckjQETL0nSlM/3Hqwrk3x+2nNvrKprquo64NnAyVV1clX9saoWA0uAPZLcC/hL4B+r6vqq+gbwn7OI4YXA66pqaVVdT0tinjptiOObquq6qjoLOIuWuEBLfl5fVedXc1ZVXV5VpwNX0ZItgH2BU6vq0hnG9KGqOqeqbqQlmrsDL+vtcRnwzv6eUzH8S1Wd0WO4sKp+voL3fBZwaFVdVlXLgDfRErwpN/Tnb6iqk4HfAg/sPWtPAQ6pqmur6lxaIihJGnMTO1ZfknQLe1fVV1fy3EUD9+8N7JPkSQNl6wJfp/dSVdU1A8/9HNh6hjHcG/hckj8OlP0B2Hzg8a8G7l8L3KXf3xr48Ure9zhawri43757hvHALT/7usAlyZ86lu4wUOfWYhh0T1q7TPl5L5tyeU/0pkx9zvm07+7BmAbvS5LGlImXJGkmBlfKuwj4SFW9YHqlJPcGNkpy54Hk614Dr78GuNNA/Xm0ZGLwvZ9XVd9awXsvWEWMFwH3A36wguc+CvwgyUNpQyan9+jdmumf/Xpg02mJ0fQYVuViWhJ3Tn98r162Kstowxy3An7Uy2aa1EqSRsihhpKk2foo8KQku/YFLTboC0Js1YfVLQHelGS9JH8FDPaM/QjYIMmeSdalzWtaf+D59wFv6QkcSeYn2WuGcX0AOCzJNn0Fwr9IsglAVS0FzgA+AnymD5mctaq6BPgK8PYkd+uLgdwvyaMHYnhFkkf0GO4/9Vmm+QTw+v75NgXeQGvXVf3+PwCfBd6Y5E5JHkSbTyZJGnMmXpKkWamqi4C9gNfSemAuAl7JTd8pzwR2AJYDhwAfHnjtVcCLaAnKL2k9YIOrHL4bOBH4SpLfAKf195qJdwDH0xKjq4FjgDsOPH8c8Oe05Ov22A
9YDziXtvjHp4EtAKrqU8BbaAuM/IbWs7bxCt7jzbQE9Wzg+8B3e9lMvJi28MavaJ/lE7ReOEnSGEuV+yxKkoYnyRuB+1fVs0ccx6NovUoLquqPq6q/pkjyVuAeVTWb1SMlSXPMHi9J0lqvD2t8KfCBNT3pSvKgPowySbanLTf/uVHHJUm6dSZekqS1Wt/n60racMB3jTic1eGutHle19CGVr4dOGGkEUmSVsmhhpIkSZI0ZPZ4SZIkSdKQjfU+XptuumktWLBg1GFIkiRJ0i185zvf+XVVzV91zTFPvBYsWMCSJUtGHYYkSZIk3UKSn8+0rkMNJUmSJGnITLwkSZIkachMvCRJkiRpyEy8JEmSJGnITLwkSZIkachMvCRJkiRpyEy8JEmSJGnITLwkSZIkachMvCRJkiRpyNYZdQBrmgUHnzTqEObczw7fc9QhSJIkSWs0e7wkSZIkachMvCRJkiRpyEy8JEmSJGnITLwkSZIkachWmXgleWCSMwd+rk7ysiQbJ1mc5IJ+u1GvnyRHJLkwydlJtht4r0W9/gVJFg3zg0mSJEnSuFhl4lVV51fVw6rqYcAjgGuBzwEHA6dU1TbAKf0xwO7ANv3nQOAogCQbA4cAOwDbA4dMJWuSJEmStDab7VDDnYEfV9XPgb2A43r5ccDe/f5ewIerOQ3YMMkWwK7A4qpaXlVXAIuB3W73J5AkSZKkMTfbxGtf4BP9/uZVdQlAv92sl28JXDTwmqW9bGXlkiRJkrRWm3HilWQ94MnAp1ZVdQVldSvl03/PgUmWJFmybNmymYYnSZIkSWNrNj1euwPfrapL++NL+xBC+u1lvXwpsPXA67YCLr6V8pupqqOramFVLZw/f/4swpMkSZKk8TSbxOsZ3DTMEOBEYGplwkXACQPl+/XVDXcErupDEb8M7JJko76oxi69TJIkSZLWauvMpFKSOwFPAF44UHw4cHySA4BfAPv08pOBPYALaSsg7g9QVcuTHAac0esdWlXLb/cnkCRJkqQxN6PEq6quBTaZVnY5bZXD6XULOGgl73MscOzsw5QkSZKkNddsVzWUJEmSJM2SiZckSZIkDZmJlyRJkiQNmYmXJEmSJA2ZiZckSZIkDZmJlyRJkiQNmYmXJEmSJA2ZiZckSZIkDZmJlyRJkiQNmYmXJEmSJA2ZiZckSZIkDZmJlyRJkiQNmYmXJEmSJA2ZiZckSZIkDZmJlyRJkiQNmYmXJEmSJA2ZiZckSZIkDdk6ow5Aa7cFB5806hDm1M8O33PUIUiSJGkM2eMlSZIkSUM2o8QryYZJPp3kh0nOS/LIJBsnWZzkgn67Ua+bJEckuTDJ2Um2G3ifRb3+BUkWDetDSZIkSdI4mWmP17uBL1XVg4CHAucBBwOnVNU2wCn9McDuwDb950DgKIAkGwOHADsA2wOHTCVrkiRJkrQ2W2XileRuwKOAYwCq6vdVdSWwF3Bcr3YcsHe/vxfw4WpOAzZMsgWwK7C4qpZX1RXAYmC31fppJEmSJGkMzaTH677AMuCDSb6X5ANJ7gxsXlWXAPTbzXr9LYGLBl6/tJetrFySJEmS1mozSbzWAbYDjqqqhwPXcNOwwhXJCsrqVspv/uLkwCRLkixZtmzZDMKTJEmSpPE2k8RrKbC0qr7dH3+alohd2ocQ0m8vG6i/9cDrtwIuvpXym6mqo6tqYVUtnD9//mw+iyRJkiSNpVUmXlX1K+CiJA/sRTsD5wInAlMrEy4CTuj3TwT266sb7ghc1YcifhnYJclGfVGNXXqZJEmSJK3VZrqB8kuAjyVZD/gJsD8taTs+yQHAL4B9et2TgT2AC4Fre12qanmSw4Azer1Dq2r5avkUkiRJkjTGZpR4VdWZwMIVPLXzCuoWcNBK3udY4NjZBChJkiRJa7qZ7uMlSZIkSbqNTLwkSZIkachMvCRJkiRpyEy8JEmSJGnITLwkSZIkachMvCRJkiRpyEy8JEmSJGnITLwkSZIkachMvC
RJkiRpyEy8JEmSJGnITLwkSZIkachMvCRJkiRpyEy8JEmSJGnITLwkSZIkachMvCRJkiRpyEy8JEmSJGnITLwkSZIkachMvCRJkiRpyEy8JEmSJGnITLwkSZIkachmlHgl+VmS7yc5M8mSXrZxksVJLui3G/XyJDkiyYVJzk6y3cD7LOr1L0iyaDgfSZIkSZLGy2x6vB5bVQ+rqoX98cHAKVW1DXBKfwywO7BN/zkQOApaogYcAuwAbA8cMpWsSZIkSdLa7PYMNdwLOK7fPw7Ye6D8w9WcBmyYZAtgV2BxVS2vqiuAxcBut+P3S5IkSdIaYZ0Z1ivgK0kKeH9VHQ1sXlWXAFTVJUk263W3BC4aeO3SXray8ptJciCtp4x73etes/go0pptwcEnjTqEOfWzw/ccdQiSJElzZqaJ105VdXFPrhYn+eGt1M0KyupWym9e0JK6owEWLlx4i+clSZIkaU0zo6GGVXVxv70M+BxtjtalfQgh/fayXn0psPXAy7cCLr6VckmSJElaq60y8Upy5yR3nboP7AL8ADgRmFqZcBFwQr9/IrBfX91wR+CqPiTxy8AuSTbqi2rs0sskSZIkaa02k6GGmwOfSzJV/+NV9aUkZwDHJzkA+AWwT69/MrAHcCFwLbA/QFUtT3IYcEavd2hVLV9tn0SSJEmSxtQqE6+q+gnw0BWUXw7svILyAg5ayXsdCxw7+zAlSZIkac11e5aTlyRJkiTNgImXJEmSJA2ZiZckSZIkDZmJlyRJkiQNmYmXJEmSJA2ZiZckSZIkDZmJlyRJkiQNmYmXJEmSJA2ZiZckSZIkDZmJlyRJkiQNmYmXJEmSJA2ZiZckSZIkDZmJlyRJkiQNmYmXJEmSJA2ZiZckSZIkDZmJlyRJkiQNmYmXJEmSJA2ZiZckSZIkDZmJlyRJkiQN2YwTryTzknwvyRf64/sk+XaSC5L8R5L1evn6/fGF/fkFA+/xml5+fpJdV/eHkSRJkqRxNJser5cC5w08fivwzqraBrgCOKCXHwBcUVX3B97Z65FkW2Bf4CHAbsB7k8y7feFLkiRJ0vibUeKVZCtgT+AD/XGAxwGf7lWOA/bu9/fqj+nP79zr7wV8sqqur6qfAhcC26+ODyFJkiRJ42ymPV7vAl4F/LE/3gS4sqpu7I+XAlv2+1sCFwH056/q9f9UvoLX/EmSA5MsSbJk2bJls/gokiRJkjSeVpl4JXkicFlVfWeweAVVaxXP3dprbiqoOrqqFlbVwvnz568qPEmSJEkae+vMoM5OwJOT7AFsANyN1gO2YZJ1eq/WVsDFvf5SYGtgaZJ1gLsDywfKpwy+RpIkSZLWWqvs8aqq11TVVlW1gLY4xteq6lnA14Gn9mqLgBP6/RP7Y/rzX6uq6uX79lUP7wNsA5y+2j6JJEmSJI2pmfR4rcyrgU8meTPwPeCYXn4M8JEkF9J6uvYFqKpzkhwPnAvcCBxUVX+4Hb9fkiRJktYIs0q8qupU4NR+/yesYFXCqvodsM9KXv8W4C2zDVKSJEmS1mSz2cdLkiRJknQbmHhJkiRJ0pCZeEmSJEnSkJl4SZIkSdKQmXhJkiRJ0pCZeEmSJEnSkJl4SZIkSdKQmXhJkiRJ0pCZeEmSJEnSkJl4SZIkSdKQmXhJkiRJ0pCZeEmSJEnSkJl4SZIkSdKQmXhJkiRJ0pCZeEmSJEnSkJl4SZIkSdKQmXhJkiRJ0pCZeEmSJEnSkJl4SZIkSdKQrTLxSrJBktOTnJXknCRv6uX3SfLtJBck+Y8k6/Xy9fvjC/vzCwbe6zW9/Pwkuw7rQ0mSJEnSOJlJj9f1wOOq6qHAw4DdkuwIvBV4Z1VtA1wBHNDrHwBcUVX3B97Z65FkW2Bf4CHAbsB7k8xbnR9GkiRJksbROquqUFUF/LY/XLf/FPA44Jm9/DjgjcBRwF79PsCngfckSS//ZFVdD/w0yYXA9sD/ro4PImlyLDj4pFGHMOd+dvieow5BkiTdDjOa45
VkXpIzgcuAxcCPgSur6sZeZSmwZb+/JXARQH/+KmCTwfIVvGbwdx2YZEmSJcuWLZv9J5IkSZKkMTOjxKuq/lBVDwO2ovVSPXhF1fptVvLcysqn/66jq2phVS2cP3/+TMKTJEmSpLE2q1UNq+pK4FRgR2DDJFNDFbcCLu73lwJbA/Tn7w4sHyxfwWskSZIkaa01k1UN5yfZsN+/I/B44Dzg68BTe7VFwAn9/on9Mf35r/V5YicC+/ZVD+8DbAOcvro+iCRJkiSNq1UurgFsARzXVyC8A3B8VX0hybnAJ5O8GfgecEyvfwzwkb54xnLaSoZU1TlJjgfOBW4EDqqqP6zejyNJkiRJ42cmqxqeDTx8BeU/oc33ml7+O2CflbzXW4C3zD5MSZIkSVpzzWqOlyRJkiRp9ky8JEmSJGnITLwkSZIkachMvCRJkiRpyEy8JEmSJGnITLwkSZIkachMvCRJkiRpyEy8JEmSJGnITLwkSZIkachMvCRJkiRpyEy8JEmSJGnITLwkSZIkachMvCRJkiRpyEy8JEmSJGnITLwkSZIkachMvCRJkiRpyEy8JEmSJGnITLwkSZIkachMvCRJkiRpyFaZeCXZOsnXk5yX5JwkL+3lGydZnOSCfrtRL0+SI5JcmOTsJNsNvNeiXv+CJIuG97EkSZIkaXzMpMfrRuAfqurBwI7AQUm2BQ4GTqmqbYBT+mOA3YFt+s+BwFHQEjXgEGAHYHvgkKlkTZIkSZLWZqtMvKrqkqr6br//G+A8YEtgL+C4Xu04YO9+fy/gw9WcBmyYZAtgV2BxVS2vqiuAxcBuq/XTSJIkSdIYmtUcryQLgIcD3wY2r6pLoCVnwGa92pbARQMvW9rLVlY+/XccmGRJkiXLli2bTXiSJEmSNJZmnHgluQvwGeBlVXX1rVVdQVndSvnNC6qOrqqFVbVw/vz5Mw1PkiRJksbWjBKvJOvSkq6PVdVne/GlfQgh/fayXr4U2Hrg5VsBF99KuSRJkiSt1WayqmGAY4DzquodA0+dCEytTLgIOGGgfL++uuGOwFV9KOKXgV2SbNQX1dill0mSJEnSWm2dGdTZCXgO8P0kZ/ay1wKHA8cnOQD4BbBPf+5kYA/gQuBaYH+Aqlqe5DDgjF7v0Kpavlo+hSRJkiSNsVUmXlX1TVY8Pwtg5xXUL+CglbzXscCxswlQkiRJktZ0s1rVUJIkSZI0eyZekiRJkjRkJl6SJEmSNGQmXpIkSZI0ZCZekiRJkjRkJl6SJEmSNGQmXpIkSZI0ZCZekiRJkjRkq9xAWZK0Zltw8EmjDmFO/ezwPUcdgiRJt2CPlyRJkiQNmYmXJEmSJA2ZiZckSZIkDZmJlyRJkiQNmYmXJEmSJA2ZiZckSZIkDZmJlyRJkiQNmYmXJEmSJA2ZiZckSZIkDZmJlyRJkiQN2SoTryTHJrksyQ8GyjZOsjjJBf12o16eJEckuTDJ2Um2G3jNol7/giSLhvNxJEmSJGn8zKTH60PAbtPKDgZOqaptgFP6Y4DdgW36z4HAUdASNeAQYAdge+CQqWRNkiRJktZ2q0y8quobwPJpxXsBx/X7xwF7D5R/uJrTgA2TbAHsCiyuquVVdQWwmFsmc5IkSZK0Vrqtc7w2r6pLAPrtZr18S+CigXpLe9nKym8hyYFJliRZsmzZstsYniRJkiSNj9W9uEZWUFa3Un7Lwqqjq2phVS2cP3/+ag1OkiRJkkbhtiZel/YhhPTby3r5UmDrgXpbARffSrkkSZIkrfVua+J1IjC1MuEi4ISB8v366oY7Alf1oYhfBnZJslFfVGOXXiZJkiRJa711VlUhySeAxwCbJllKW53wcOD4JAcAvwD26dVPBvYALgSuBfYHqKrlSQ4Dzuj1Dq2q6Qt2SJIkSdJaaZWJV1U9YyVP7byCugUctJL3ORY4dlbRSZIkSdJaYHUvriFJkiRJmsbES5IkSZKGzMRLkiRJkobMxEuSJEmShmyVi2tIkjQpFh
x80qhDmFM/O3zP2/zaSWsruH3tJUn2eEmSJEnSkJl4SZIkSdKQOdRQkiRpyCZtaKbDMqVbssdLkiRJkobMHi9JkiSNDXsHtbayx0uSJEmShszES5IkSZKGzMRLkiRJkobMxEuSJEmShszES5IkSZKGzFUNJUmSpDXQpK0ACWv2KpD2eEmSJEnSkJl4SZIkSdKQmXhJkiRJ0pDNeeKVZLck5ye5MMnBc/37JUmSJGmuzWnilWQecCSwO7At8Iwk285lDJIkSZI01+a6x2t74MKq+klV/R74JLDXHMcgSZIkSXMqVTV3vyx5KrBbVT2/P34OsENVvXigzoHAgf3hA4Hz5yzA8bYp8OtRB7EGsb1mzraaOdtqdmyvmbOtZs62mh3ba+Zsq5mzrW5y76qaP5OKc72PV1ZQdrPMr6qOBo6em3DWHEmWVNXCUcexprC9Zs62mjnbanZsr5mzrWbOtpod22vmbKuZs61um7keargU2Hrg8VbAxXMcgyRJkiTNqblOvM4AtklynyTrAfsCJ85xDJIkSZI0p+Z0qGFV3ZjkxcCXgXnAsVV1zlzGsAZz+OXs2F4zZ1vNnG01O7bXzNlWM2dbzY7tNXO21czZVrfBnC6uIUmSJEmTaM43UJYkSZKkSWPiJUmSJElDZuIlSZIkSUNm4iVJuk2SrGhvRul28e9q5mwrac1i4jUG/Mc5c0keluTBSR486ljWBEmekOTvRx2H1i5J7gJQrs6k1SzJXwI79fvzRhzOWLOtNCxJNkpijjAENuqIJcnUyUuSv06yU5LNRx3XOEqyO/CfwIuATyXZf8QhjbUkuwDHAHsm2XpV9SdZkscmeVOSJyf5s1HHM86SPBk4Jsknk+yR5F6jjmmcJXlMkuePOo41QZJdgW8DhwFU1R9GG9H4sq1mx+Nw5pI8kXbucNdRx7I2MvEasYGk65XAocALgPcn2W6kgY2RNHcBXgIcVFUvAZ4PvC7J3402uvHUv5T/GXgtcDmwfS+3d3WaJI8HPgD8Efgr4N+S7DnaqMZTkgcARwFHAP9Lu9r+D0keONLAxlQ/Dt8B/HBaud+90/Rj7o3ALsB1SZ412ojGl201Ox6HM5dkN1oy/+6qumrU8ayN5nQDZa1YkocCO1XVY5McBmwCnJlk3aq6YcThjVxPTn+bZAlwt94upyXZl9bzdV1VHTfiMMdGki2Bg4GXV9U3+pfLG5Msqaqfjzi8cXR/4Jiq+qee4O8CvLV3Rp884tjGzfrAN6vqW8C3+gWiPYAXJnlHVS0dbXjjI8lfA58Hdqyqs5LcDaCqrq6qP442uvGS5N60/1mvr6pTkjwceFB/Lg5pvYltNTsehzPXRy+8GfiPqvqvJBvTvg+vBJZW1Q9GGuBawmx/BFbQ63AN8PMkRwCPAJ7S/yE8Pskd5zzA8fUrYGfgjgBVtQR4DvCSJPcZZWDjIsldquqXwNN60jUP+AzwdeDhvY7H/c39DngoQFX9tqo+C7wBeFmSB400svHzQ+C+Sf4vQFV9F/gicCPwALBXdcBvgJ8DD0lyJ+BTwLFJvp7kL8C2AkiyQb8g9PSqOqUXnwI8N8nOJhI3sa1uE4/DGaqqXwBfoF3gfjawGNgVeBpweJJHjDK+tYUnYHNs2pyuuyRZD/gxsCHtxPiAqvp9H4v8ZuDOo4t2PEz9U6yq9wJ3At6X5O695+ubwNnAxH/hJNkbODLJPWjDC6mqP1TVNcClwMt6mVf5BlTVh4AFSY4aKP4aLcnYciRBjaEk83oP/OuBHZI8DaCqvgMsB57dH0/0sZjk0UkeVVVnAvsAr6AdjycCzwBOA96T5A62VXal/c+6S1Vd3IeV36En9IcDz0iy4YjDHAu21ex4HM7O1AXZqnoj7WLky4EPVNX+wCtp34cOKV8NTLzm0LSk6xXAR2ld4BsDRwPnA+9I8lbaH/1+VfXrUcU7SkkemOSRSdZl4O+0qp7eH78LeF6Sg4BH0664T6w+nOKfgI9U1a+mJ1
dV9RbacM2XjiTAMZVkarj1bsD9k7wfoKqupP2dLRxVbONgoH0GJ++fTrvKvmduWjHzl8Adkqw/xyGOlSRPoF1RPyjJ3arq+8DzgP9XVUdW1Q1V9RrgamCrUcY6ar2tjgI+XVW/hVsk7d8BNuo/E822mh2Pw9mrqj8OJF+H0qYqHNUfXw6sC2wxwhDXGs7xmkMDSdfjgCcCfwccSFuZaCFtIYQn0OZR/HtVXTiiUEcqyd/Skohf9p8lST5UVVcDVNW+SZ4H3JM2ROzJzi3hIcCHq+qrfY7XI4ArgB9V1aW9zteAzZOsV1W/H1Wgo5S2YuiN/YuEqrqxXxC5Isk+wBeTHA9cQhtiMbGLbCR5LLBjn7t1/VR5b6sv0I7NtyV5FPCXtOPw+pW83Vqv90j8E/BW2v/zTYGrq+rMJGcN1Hsm7QTmtyMJdHxsB7yuqr6YZD5wD+CGqvohQJ/HewNtUYS/mfD5S7bVDHkczlwfccXU+UBPvub1kTLfGKi3L+0C9z6jiXTtkgk9NkcmyWOAvwe+V1WH9bJ/BZ4MPKaqLh5heCPXe7g+ChxRVd9K8hRgR+B64G01bZWdJOtP8snelLTVHbeoqkOSfBP4GXADcBfgZVX1y7Ql5WtSk9Qke9F6kjegrdj0iYHn5k316CT5G1q7nTF1YjNp0rZueC/wwqr6ykD51BXRP/bH6wALaCc2l40g1LGQZFvgg7SrxP/Te063BvYcuOC2DvB02lDNp1bVOSMLeAwkeS1wd+A9wAnAd2kXO15XVR/ude4G3L2qLhpZoGPAtpoZj8OZ6+dWzwDuBrwb+J+quqI/Nzg660m0FZKfPqlttbqZeA3Z9CtPaSsSvZa2P8Jbq+qsXn4k7YrCQ2knxxM5D6cnXifSVtX5UD/R+2taz8NPqup9Sban9Vp8d8Kv7D0aWFZV56atRnQy8CPg5Kr6QE+0Xgl8tapOHGWso5a20ei/0+Yh3Rt4DbB7Vf1mpIGNoX4V9O3AF6vq5D5vZD3gj1X166ljLsl9q+ono412fCS5X1X9uN/fCngLcFRVndbL7go8CfhOVZ0/ukhHp58Y31hVP0qyA/BUWo/D5VX1nrS9B99HW2Dqe6OMddRsq9vG43DV0rYFOYE2/HIBLQE7BfjCVNtNq79ltUW7tBo4x2uIpl816FeR70nbj+pKYJ+0peSpqoOAx/Uu3olMugCqTd5/B/C3Sf66t8UxLYBlAAAQmUlEQVQ3gTOBR6Wt8rgTcHGvP6lJ1xOB99NWmEu11YheCvwZ8DCAfuVzHRyXDXAf4Kxqy+F+E5gHvCvJ/ukrYiZ5YpIDRxnkOOjDTn4PbNpPXBYDbwO+l2SnnnQtAN6Z5G7JZK8INtALOHjC8mvavnBPnyroSf4nJ/hkby/gs9w0D+kHtJ7lPYE79V7nr9BWydx4NFGOB9tq9jwOZ2UT4NKq+t8+8uOfaecOe/TEdOqc9UUAJl2rlz1eQzRwZfhFtI2RTwb+ltYV/m7aVeXrgQ9W1Q8mufdmUJINaBsk/wXw0amxxklOpa36eIsrMpMkyUbAScCrq+q/ey8hVXVDkifTxrYfC1wFvJA2RGBS5wtuUVWXJPkz2oajVwOPobXP94G/oQ0pPLInE/Mm9e8ryf1oX8hn066A3ou2utXVVXVUT0rfBGzX2/Su9hje0sD//a1pSzO/uqq+NOq4RinJfYFPcNMQsPWqrd67AXAobQTI92nfh68DHlsTuuegbTV7Kzp38ji8pYHvw9C+A78IfLbaXOf/A/wj8M6q+kofnbXOpH4fDpOLawxBH/Z1eVVdk2Qz2oTEZ1bVeX0+1xLaxPS3AK+m7U81sb0301XV75J8jLZE/GvS9lK6HpjPBE+EHTAPuK4nXZvRegjvmuRLwOdoG9ruRxu7vWiCk67dgf2TvIq2Yui/0DZL3qSq3tzr/AZ4fZKPVdXPRhbsiPUe1H+iLQv/Y+A/aMnXprS5EFTV0UkeSZtrcskkJ12DFzumP9dP9u
ZV1UVJvgw8KMniumlVyEn0W+C8nkjcF3htksuAX1XVq9JWoftLWs/0Eyc8kbCtZiht8+hLawVz4z0Ob27g+/C1wE9oK2FuD1yW5Fv97+2TwPOT/Nck/10Nm4nXapa2ato/ABcleV9VXZbk17ShO1Mrgr0c2KmqPpbklSv68p50vZ3+HTiX1mvzO+DZddMKfRMnycZVtbzPszk3bRPbXWhX8y6mJfh3rKp30HomJlafE/F+4LkDCdXpwOlJdkjy+Kr6Km34znVM8HYE/UrnvwLPqKrvpU1Ifzxt08zP0Ib4Xk070XskrSd1YvVhYE8D1ktyNHDaVBI6dZV94OTuS8AFk3qyN+COwD3SNqx9MfBTWq/NS/r/tTcCi+NiSWBbzUja6oVvAg6gTz2YzuOwmfZ9eGEv+yBtsam9aQuQfIR2sft3TPD34Vww8Vr9lgFn0JZ/3T/Je2hXFz6Z5JFVdSNtcv9WSebhH/hK9bkmX0/yDSZ4wRGAJHsCT0zyL1X1U9oWBI/sT3+w2jKwVwGHJflA9aX3J9g2tD3NvpbknrTNyTetquNoPczPT/Ji2nC6/avvjTPBDq+bJui/nvY3dWWS/Wi9p7vR/qc9paouGVWQo5ZkIe1kb2o7i9cDpyY5pqp+MTVqIcmdq+qaqvraCMMdqZ6g7gocVFU/70PFvwCcVG1fQZJcAjxz6jWTmkjYVrMz0EP/3Ko6Z2XTNDwO/2T69+FC2hoP/wQ8B9g1yQG0oeb7TWqCOldMvFaTJNsAd6iq8/swuauA3YEXVNWrkxwFfCPJ2cAOwLP8456ZSW+nJDvRNs98bk+6oG0OuQntH+gL+/ObANfSJhNPuqXAI/v4/hOArwC79Lb8O9pE4vvTtnX46crfZiJ8mzZ5n34xaH1gyyT3qLYNwXFVdfHUScxIIx29BcCZVfVd4Ltpi4+8kDbC4dh+AeSxwEP6iIeJvLDWeyPeDLxh6oS4qg5P22D7kCRv71feHw48YJJ7b2yr2enzk54MbFBtZeO70IaLzwc+Dyzu0xUm/jgcsKLvw12BJ1XVAcBxvYf1VzXB24LMFRfXWA2SbELr6fo17WroH4CjaVen7k+bD/H+3t17R+DnnuxpppK8lLY/y6FJtgC2Bf5QVaem7cXxGNoJ4ebAgVV15siCHaEkD6iqH/X7DwUOoyUV1/XhlyQ5HfhQVb13dJGOr7Q9bjYATqiqnZM8m7adw8uq6rrRRjc6U0ln/x9+APCpqlrc50vcD3ggbfL+t5LcA1iv2kqjEynJG2jzlD7VT4jvT/veuzjJK2mL2vwPbaj0vlV17gjDHSnbauaS3Kmqrk1baORw2h6fAT5NGzL+VODfquoz/bty3Uk9Dmf4fXga8ImqevfoIp089nitBlV1eZLHA1+ldd8+lDY5/be0uV1/3q/SfHCSr1TpNvsF8OB+Ze8k4DTg4UkuqqqnAZ9JW4Dk8qpaNspAR6UPPTk+yQlV9YyqOivJycCrgJOS3L3a5ttTX9BagX5l+LdJLkryz7STvedOeNK1K/AXSd5Bm2vzS9ow8pfThkDvmbbi4y7At6rqVyMMd1xsAfwuyZ1pvRCXAesn+XJVvS3JF2k98//mJH7ugW21Sv04fGiSI3qP1sG0kR4/raq39TrLgUVJvjDhQ6Jn+n34WdpKv5pD9nitRmmrDR1BS7w2Bx4H7EtbOeYS2oIaEz0xXTPTr6xfXG1Fpr+i/V19hraC0wd6nW8C76+qj4ww1JHrJyyfoX2J/B9ab8Mz+3MvoF0F/SJtT5ynAX9TVT8cUbhjrV8gWhc4r9/uXFUXjDaq0UlbCexw4KVVdWovW5+2pPeWwLnVtnF4Ne3v7rCRBTti/ar6NVV1Ydr2DS+nXdw9tao+2Ocx7UfrGZzIlVZXxLZatRUdh718Htw0HSHJc4FHA8+f1CkKfh+OPzdQXo2qajHwCtp8iWv6RP5nAg+mLftq0qVV6l8yn6dtjjyvqr4JHAe8gTahf8p/0VYgmm
h93tHzgI/Tjr/1knyiP/fvtCEWP6Ytw7+3XzIrV83vaW22+4QnXdsC7wWO7MN6N0nyYGCbqvp1VZ3Vk66/p01Q//RIAx6hJHvTVk3boifvv6KtSLstbe4pVXUCbRj+tqOKcxwk2bZPT5hyKa2tHoxtdQsrOQ4fmOTPq+oPA0nX/rRVIN8+qUkX+H24JrDHawj6ifO7gUdW1eWjjkdrhn7Cshlt8uvrq+qrU2Pa+/PPpQ0VeBNtZcxnA0+dGsetpp/UHA38vqqe0ScNX15VvxxxaGuMxM3ckzyCNp/ru7TJ6a8ALgc2Bn5YVS9Jsh7wWuBzVXXWyIIdoT5h//PAi6rq21N/O0nuRuvJeQDwv7QE4w3AnhM872YP2mqFRwD/XH17lCT3oSXvD6ANJZ/4tpoyw+Pw/rQ9UY+oqu+PLNgx5Pfh+DHxGpI+VOCNwCNqgpdB1+z0YQJHVNUBfXLwv9I2j/4hbTz7w2lLwz4MeG9VnTeyYMdYkk2Bt9GGWswDHlNVS0cbldYE0yal70QbmvNk2t/T+4GtgA8Cb6q2iflEJ6n9pPfIqtq1J2GvAu4E/Cdt9bRtgQOBG2hDo88eWbAj1BPR19N6A7ejJRHvHEi+Nqf9bb2QCW8rmNVx+I+0PRrXn7pIqZvz+3C8uLjGkFTVCUlOMenSLF0H3CfJa4CH0IYTXtnvv6KqDgG+McL41gjVNpk+m7alwxP8ktFMDExKP7Gq9u2rFN4AfKOqPterXZRkKX0PxklOurpLgB8n2Y52EvzftOTibcBdq+ojSb5L225lYpf1rqqr07ZmOKevXvjvwMuTvLuqLukJ2KVJ/i/tovjEttUsj8P0oYUmXSvh9+F4cY7XEJWbsmoWktyhJ+qvpS1PvXFVHV1Vx9NOZu470gDXIEk2AvYAdnHoiWai9za/GHgZbZW5jwNU1enAlwbqPYV2IeTiUcQ5bvqckhtoc0d+WlXvqKqP09ryKWn7Tv1xwhOJANRNm/0uo/UCPhh4aa/zlCR/2ectTXJbzfY4dMjcKvh9OF5MvKQxMdA7+kPg68CfJfmHXrYRsGmSu48kuDVMVV1B2xxyYofqaHZWMin9Y/256wCSLAIOBvavCV7ae8pAQvFS2rybxyV5WH96E9riPxO70EGSO8DNe0X7/Ld51TaqfT5wzyT/Bbwdl/b2OBwCvw/Hi3O8pBHpk4avqKqfrOC5OwE70OYJ/gR4JG0hjR/MaZDShBqYlH5dVT27r2j4WOBLKzpmJ0GSdQGq6oaBsnWmemiSvIc2BHMj2tyuAyb1ZC9te5l7VtVxA6MZBp+fV1V/SPIq2sIQj6qqc0YS7BjzONTaxsRLGoG0zSCPBP526sRk6uox3HSFtO9Tci/a9gSXjSJWaVJNm5Qe4NE1oRuz9gWjngasRzsRPq2qftOfW3cqGUtyb9rCBxdN6op8SR4P/AdwR+DPq+rHK6k3n7bgxger6sw5DHGN4nGotYmJlzTHkjyWduJyYFV9Pckdq+q6qSvHA1dCt3ISrDRaSV5O65F4wqTOj0iyEPgAbQjYPYFXAqcCxwwmV30+1/UjCXJMJNkTeAvwd8Bf05bxfvf0Xq8kG1TV76ZuRxXvmsLjUGsL53hJc283YAlwepJ7AUckOQJ4c5Kte9L1AOAfk9x5sCdM0txxUvqfLADOrKrvVtUXgE/QlvbeZWoeU7+g9IIkE7tacpLNaL2CL6uq02gr0j5xKuma+l+e5DG0tppn0rVqHodam9jjJc2RJH9GG6bze9qk6nWBnWl7klxMmxOxGW2Vqw1oyy9fOZpoJcFNPROjjmMUkty5qq5JsgNtE9tPVdXiJK8F7kdbffXVfbnvewDrTfDwwvVp/9s3qqrlA+VfAs6uqlcNlE10W90Wk3wcau1i4iXNgSS7A/8C/Ay4iraC1XOBH1XVkb3Oo4DnVNULRhSmJAF/mof6F8A7gPVpwwsfCGxIm4a6Z5IDgS37/oITq7fVw2gbIv++l00NHX8s8H
TgNVV1xdRQ8lHGK2l0JnZIgDRX+rCSdwPPrqrTk5xIO/Zexc2H+94b2DLJXYHfujGrpFHoF4oOB146tTltksOBuwJbAuf2qhvRN5KeVNPa6vdT5QN7cZ0PPBzYFzjKpEuabPZ4SUPWl7+9R19I4x7A94AzaMMLTwM+SttM8/m0Hi+XFJY0Ekm2BU4C/rmqju7LeW8GzBvcziLJ39P+b+1TVeeNJtrRWklbbUobRvj9gXpPobXV39CWRffES5pQ9nhJQ9ZPSqZOTA4AjqyqNyfZH9gd+BawPbCfSZekEbsj8EXgj0l2o21iezmwcZIfVtVLkqwHbAw8a1KTrm6VbdXrnQJ8vaquHVGcksaEPV7SCPWJ1y9yI0hJo5TkAVX1o35/J+CptJUL30ZbAGgr4IPAm6rqv5NkUntuZttWIwtU0tixx0uaI9NPVPrwk/nAdaOLStKkS/JE4PgkJ1bVvn2VwhuAb1TV53q1i5Ispc/pmuCkazZtdcPoIpU0jtzHS5ojUycqSdZPcgBwKLCoqi4ZbWSSJlWSOwMvBl4G/C7JxwGq6nTgSwP1ngI8hDY3dSLdhrbyf7ukm3GooTTHkqwLPAH4cVWdP+p4JE22JPcErqbtH/g+4PqqetbA84toCcf+gwtsTCLbStLtYeIlSZIA6CvzHU1bfe/ZfVXWxwJfci7qzdlWkmbLxEuSJP1Jkk1pC0X8HyDAox0SvWK2laTZcI6XJEn6k6r6NXA2cHfgKSYSK2dbSZoNEy9JkvQnSTYC9gB2GdwIWLdkW0maDYcaSpKkm0myQVX9btRxrAlsK0kzZeIlSZIkSUPmUENJkiRJGjITL0mSJEkaMhMvSZIkSRoyEy9JkiRJGjITL0mSJEkaMhMvSZIkSRqy/w/NqTbEc1DpkQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.plots.frequency([\"id\", \"reclong\"], 10)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAV0AAAD8CAYAAADUv3dIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAADtxJREFUeJzt3X+s3fVdx/Hnqxc2soBbInHDtnPEFWdFI0KIBOdQwBXi1v2xmLJM3CBc/5D9cM6JUXFiNBPjpkYy181mY1G6icu8LlWMk2ULDmyZC6EluKaacdcZjEN0mQq99+0f90APl3vPObc953NOv30+km9yvj/O53xyQ1598/l+vp9vqgpJUhubpt0BSTqdGLqS1JChK0kNGbqS1JChK0kNGbqS1JChK0nrSLInyeNJHl7nfJL8YZLDSR5K8kPD2jR0JWl9HwV2DDh/DbCtt80DHxzWoKErSeuoqs8D3xhwyU7gzlpxP/CSJOcNavOMcXZwLV/5kdf6yNuE3XTV66bdhc674crLpt2F08L1r744J9vGRjLngvv+9mdZqVCfsbuqdm/g5zYDj/XtL/aOfX29L0w8dCVpVvUCdiMhu9pa/0gMDH1DV1K3pOmo6SKwtW9/C3B00Bcc05XUKZnbNPI2BgvA9b1ZDD8MPFlV6w4tgJWupK4ZY6Wb5C7gCuDcJIvArwNnAlTVHwP7gGuBw8C3gLcOa9PQldQtOel7cc+qquuGnC/g5zbSpqErqVs2jS90J8HQldQpGWOlOwmGrqRu2TTb8wMMXUndYuhKUjsxdCWpIUNXkhryRpoktePsBUlqaW5u2j0YyNCV1C0+HCFJ7Ti8IEkttV3accMMXUnd4vCCJLXjwxGS1JKhK0kNGbqS1I6zFySpJW+kSVJDThmTpHbG9JbfiTF0JXWLY7qS1JCzFySpHR+OkKSWHF6QpIYMXUlqJy5iLkkNWelKUkM+kSZJDZ3qT6QleRWwE9gMFHAUWKiqRybcN0nasMx4pTvwn4QkvwTsBQL8I7C/9/muJLdMvnuStEGbNo2+TcGwSvdG4Puq6un+g0neDxwE3rfWl5LMA/MAt333dna9bMsYuipJw836wxHDercMfOcax8/rnVtTVe2uqkuq6hIDV1JTp3il+07gs0m+AjzWO/Zy4JXAzZPsmCSdkFN5ylhV/U2SC4BLWbmRFmAR2F9VSw36J0kbM8bQTbID+ANgDvhIVb1v1fmXAx8DXtK75paq2jeozaGzF6pqGbj/RDstSS2Na0w3yRxwB3A1vWIzyUJVHeq77FeBT1bVB5NsB/YBrxjUrvN0JXXL+BYxvxQ4XFVHAJLsZWX6bH/oFvBtvc8vZmVK7UCzfZtPkjYqGXlLMp/kQN8239fSZo7fy4KVanfzql97L/DmJIusVLlvG9Y9K11JnbKR4YWq2g3sXq+ptb6yav864KNV9XtJLgM+nuTC3rDsmgxdSd0yvseAF4GtfftbeP7wwY3ADoCq+mKSs4BzgcfXa9ThBUndsimjb4PtB7YlOT/JC4BdwMKqa74KXAmQ5HuBs4B/H9Sola6kTsmYpoxV1bEkNwP3sDIdbE9VHUxyG3
CgqhaAXwA+nOTnWRl6eEtVrR6CeA5DV1K3jHER896c232rjt3a9/kQcPlG2jR0JXXLqfxEmiSdamZ9aUdDV1K3nOqLmEvSKcXhBUlqyOEFSWonc7Mda7PdO0naKCtdSWpnXA9HTIqhK6lbDF1JamjGX0xp6ErqlFl/G7ChK6lbDF1JasgxXUlqyEpXktpxwRtJasnhBUlqJ2NcxHwSDF1J3WKlK0kNuZ6uJDXkjTRJascFbySpJStdSWrodF/E/KarXjfpnzjtffjv/mraXei895z9oml34bRw/asvPuk2HF6QpJYcXpCkhqx0Jakh5+lKUjuZM3QlqR2XdpSkdpy9IEktWelKUkNWupLUkPN0JamdbJrtRcxne/BDkjZqU0bfhkiyI8mjSQ4nuWWda34qyaEkB5P82bA2rXQldcuYHo5IMgfcAVwNLAL7kyxU1aG+a7YBvwxcXlVPJPmOYe0aupI6ZYxvA74UOFxVRwCS7AV2Aof6rrkJuKOqngCoqseHNerwgqRuSUbekswnOdC3zfe1tBl4rG9/sXes3wXABUnuS3J/kh3DumelK6lTNvI24KraDexer6m1vrJq/wxgG3AFsAX4QpILq+o/1/tNQ1dSt4zv4YhFYGvf/hbg6BrX3F9VTwP/kuRRVkJ4/7rdG1fvJGkmbGB4YYj9wLYk5yd5AbALWFh1zaeBH1v52ZzLynDDkUGNWulK6pYx3UirqmNJbgbuAeaAPVV1MMltwIGqWuid+4kkh4Al4Ber6j8GtWvoSuqUjHE93araB+xbdezWvs8FvKu3jcTQldQtrr0gSQ25iLkktTPO4YVJMHQldYurjElSQy5iLknt+LoeSWrJSleSGjJ0JamdMS7tOBGGrqRuccqYJDXkjTRJasjhBUlqZyOLmE+DoSupU/7nrBeOfO05E+zHemZ7xFmSOsbQlaSGTjh0k7x1nB2RpNPByVS6v7Heif7XGh998L6T+AlJ6paBN9KSPLTeKeCl632v/7XGV7z3j1a/sliSTlvDZi+8FHgt8MSq4wH+YSI9kqQOGxa6nwHOrqovrz6R5HMT6ZEkddjA0K2qGwece9P4uyNJ3ebDEZI65em5M6fdhYEMXUmdUjN+697QldQpS8vL0+7CQIaupE6pGS91DV1JnbJs6EpSOzOeuYaupG5xeEGSGloqb6RJUjOO6UpSQ8vLhq4kNTPjha5vjpDULVU18jZMkh1JHk1yOMktA657Y5JKcsmwNq10JXXKMuMpdZPMAXcAVwOLwP4kC1V1aNV15wBvBx4YpV0rXUmdsry8PPI2xKXA4ao6UlVPAXuBnWtc95vA7cD/jtI/Q1dSpyzX6Fv/q8V623xfU5uBx/r2F3vHnpXkImBrVX1m1P45vCCpUzbycET/q8XWkLW+8uzJZBPwAeAtG+ieoSupW8b4RNoisLVvfwtwtG//HOBC4HNJAF4GLCR5fVUdWK9RQ1dSp4zx4Yj9wLYk5wNfA3YBz74xp6qeBM59Zr/3CrN3DwpcMHQldcy4QreqjiW5GbgHmAP2VNXBJLcBB6pq4UTaNXQldco4FzGvqn3AvlXHbl3n2itGadPQldQps/5EmqErqVNc2lGSGnKVMUlqyEpXkhpacmlHSWrHSleSGnJMV5IaMnQlqSGHFySpIUNXkho67Wcv3HDlZZP+idPee85+0bS70Hm3f/oT0+7C6eHdN5x0E1a6ktTQuN6RNimGrqROsdKVpIZmfEjX0JXULUtL41tPdxIMXUmd4vCCJDXkjTRJashKV5IamvHMNXQldYsL3khSQ8tjfBvwJBi6kjrFSleSGjJ0JakhZy9IUkM+BixJDVnpSlJDS85ekKR2ZrzQNXQldYvDC5LUkFPGJKkhK11JauiYN9IkqZ1Zr3Q3TbsDkjROVaNvwyTZkeTRJIeT3LLG+XclOZTkoSSfTfJdw9o0dCV1ynLVyNsgSeaAO4BrgO3AdUm2r7rsn4BLquoHgLuB24f1z9CV1C
lVNfI2xKXA4ao6UlVPAXuBnat+696q+lZv935gy7BGDV1JnbKR0E0yn+RA3zbf19Rm4LG+/cXesfXcCPz1sP55I01Spyxt4EZaVe0Gdq9zOmt9Zc0LkzcDlwCvGfabhq6kThnjwxGLwNa+/S3A0dUXJbkK+BXgNVX1f8MaNXQldcoYp4ztB7YlOR/4GrALeFP/BUkuAj4E7Kiqx0dp1NCV1CnLY1pQt6qOJbkZuAeYA/ZU1cEktwEHqmoB+F3gbODPkwB8tapeP6hdQ1dSp4zz4Yiq2gfsW3Xs1r7PV220TUNXUqe44I0kNWToSlJDsx66Qx+OSPKqJFcmOXvV8R2T65YknZgxPpE2EQNDN8nbgb8E3gY8nKT/EbjfnmTHJOlEjGvthUkZVuneBFxcVW8ArgB+Lck7eufWelpj5UTfo3X3LnxqPD2VpBGMc5WxSRg2pjtXVd8EqKp/TXIFcHdv+bJ1Q7f/0bo7v/DgbA+wSOqUWX8b8LBK99+S/OAzO70A/kngXOD7J9kxSToRsz68MKzSvR441n+gqo4B1yf50MR6JUknaNbfHDEwdKtqccC5+8bfHUk6Oad06ErSqWZMSy9MjKErqVOsdCWpoVmfvWDoSuoUK11JasgxXUlqaLkcXpCkZmZ8dMHQldQtjulKUkPOXpCkhqx0JakhZy9IUkNWupLU0DKGriQ1s7TkjTRJasbhBUlqyBtpktSQla4kNVTeSJOkdqb1wslRGbqSOmVpxgd1DV1JneKYriQ1ZOhKUkOO6UpSQ7Meupum3QFJGqeqGnkbJsmOJI8mOZzkljXOvzDJJ3rnH0jyimFtGrqSOmVpuUbeBkkyB9wBXANsB65Lsn3VZTcCT1TVK4EPAL8zrH+GrqROGWOleylwuKqOVNVTwF5g56prdgIf632+G7gySQY1auhK6pTlqpG3JPNJDvRt831NbQYe69tf7B1jrWuq6hjwJPDtg/rnjTRJnbKRKWNVtRvYvc7ptSrW1Y2Pcs1zWOlK6pSq0bchFoGtfftbgKPrXZPkDODFwDcGNWroSuqUpeXlkbch9gPbkpyf5AXALmBh1TULwM/0Pr8R+PsaUmpn1p/emIYk873/7dCE+DeePP/GJy/JtcDvA3PAnqr6rSS3AQeqaiHJWcDHgYtYqXB3VdWRgW0aus+X5EBVXTLtfnSZf+PJ8288mxxekKSGDF1JasjQXZvjYJPn33jy/BvPIMd0JakhK11JasjQlaSGDN0+w5Zx08lLsifJ40kennZfuirJ1iT3JnkkycEk75h2n3ScY7o9vWXc/hm4mpVH+/YD11XVoal2rGOS/CjwTeDOqrpw2v3poiTnAedV1ZeSnAM8CLzB/5Zng5XucaMs46aTVFWfZ8iz6To5VfX1qvpS7/N/A4/w/NWxNCWG7nGjLOMmnVJ6bzK4CHhguj3RMwzd4za8RJs0y5KcDfwF8M6q+q9p90crDN3jRlnGTTolJDmTlcD906r61LT7o+MM3eNGWcZNmnm918X8CfBIVb1/2v3Rcxm6Pb1XbdwM3MPKjYdPVtXB6faqe5LcBXwR+J4ki0lunHafOuhy4KeBH0/y5d527bQ7pRVOGZOkhqx0JakhQ1eSGjJ0JakhQ1eSGjJ0JakhQ1eSGjJ0Jamh/we0mwq/RR2gWAAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.plots.correlation([\"id\",\"mass (g)\", \"reclat\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 1. , -0.01888518, 0.25706522],\n", + " [-0.01888518, 1. , 0.02892697],\n", + " [ 0.25706522, 0.02892697, 1. ]])" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.correlation([\"id\",\"mass (g)\", \"reclat\"], output=\"array\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Benchmark " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "df = op.load.csv(\"order_products__prior.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", "\n", - " \n", - "
\n", - " String\n", - " \n", - " 0\n", - "
\n", - " Integer\n", - " \n", - " 0\n", - "
\n", - " Float\n", - " \n", - " 45716\n", - "
\n", - " Bool\n", - " \n", - " 0\n", - "
\n", - " Date\n", - " \n", - " 0\n", - "
\n", - " Missing\n", - " \n", - " 0\n", - "
\n", - " Null\n", - " \n", - " 0\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", "\n", - " \n", - "
Mean61.07431878848027
Minimum-165.43333435058594
Maximum354.47332763671875
Zeros(%)6214
\n", - " \n", "\n", - "
\n", - "
\n", - "

Frequency

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + "
Viewing 100 of 32434489 rows / 4 columns
\n", "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + "
ValueCountFrecuency (%)
None731516.001%
0.0621413.593%
35.66667175292969498510.904%
168.030406.65%
26.015063.294%
159.756571.437%
159.66667175292976371.393%
157.16667175292975421.186%
155.754731.035%
\n", + " \n", + " \n", + " \n", + " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", + "
order_id
\n", + "
1 (int)
\n", "\n", - " \n", - "
160.52630.575%
\n", + "
product_id
\n", + "
2 (int)
\n", "\n", - " \n", - "
\"Missing\"731516.0%
\n", - "
\n", + " \n", " \n", + " \n", + "
add_to_cart_order
\n", + "
3 (int)
\n", "\n", + " \n", " \n", - "
\n", + " \n", + "
reordered
\n", + "
4 (int)
\n", "\n", + " \n", + " \n", + " \n", "\n", - "

Quantile statistics

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Minimum-165.43333435058594
5-th percentile-90.36556243896484
Q10.0
Median35.66667175292969
Q3157.1666717529297
95-th percentile168.0
Maximum354.47332763671875
Range519.9066619873047
Interquartile range157.1666717529297
\n", - "
\n", - "
\n", - "

Descriptive statistics

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Standard deviation80.64729806550085
Coef of variation1.32048
Kurtosis-0.7312421309648038
Mean61.07431878848027
MAD39.53972
Skewness6214
Sum2345314.915796431
Variance6503.986685265737
\n", - "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 2\n", + " \n", + " \n", + " \n", + " 33120\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 2\n", + " \n", + " \n", + " \n", + " 28985\n", + " \n", + " \n", + " \n", + " 2\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 2\n", + " \n", + " \n", + " \n", + " 9327\n", + " \n", + " \n", + " \n", + " 3\n", + " \n", + " \n", + " \n", + " 0\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 2\n", + " \n", + " \n", + " \n", + " 45918\n", + " \n", + " \n", + " \n", + " 4\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 2\n", + " \n", + " \n", + " \n", + " 30035\n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 0\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 2\n", + " \n", + " \n", + " \n", + " 17794\n", + " \n", + " \n", + " \n", + " 6\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 2\n", + " \n", + " \n", + " \n", + " 40141\n", + " \n", + " \n", + " \n", + " 7\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 2\n", + " \n", + " \n", + " \n", + " 1819\n", + " \n", + " \n", + " \n", + " 8\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 2\n", + " \n", + " \n", + " \n", + " 43668\n", + " \n", + " \n", + " \n", + " 9\n", + " \n", + " \n", + " \n", + " 0\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 3\n", + " \n", + " \n", + " \n", + " 33754\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 3\n", + " \n", + " \n", + " \n", + " 24838\n", + " \n", + 
" \n", + " \n", + " 2\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 3\n", + " \n", + " \n", + " \n", + " 17704\n", + " \n", + " \n", + " \n", + " 3\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 3\n", + " \n", + " \n", + " \n", + " 21903\n", + " \n", + " \n", + " \n", + " 4\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 3\n", + " \n", + " \n", + " \n", + " 17668\n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 3\n", + " \n", + " \n", + " \n", + " 46667\n", + " \n", + " \n", + " \n", + " 6\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 3\n", + " \n", + " \n", + " \n", + " 17461\n", + " \n", + " \n", + " \n", + " 7\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 3\n", + " \n", + " \n", + " \n", + " 32665\n", + " \n", + " \n", + " \n", + " 8\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 4\n", + " \n", + " \n", + " \n", + " 46842\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " 0\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 4\n", + " \n", + " \n", + " \n", + " 26434\n", + " \n", + " \n", + " \n", + " 2\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 4\n", + " \n", + " \n", + " \n", + " 39758\n", + " \n", + " \n", + " \n", + " 3\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 4\n", + " \n", + " \n", + " \n", + " 27761\n", + " \n", + " \n", + " \n", + " 4\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " 4\n", + " \n", + " \n", + " \n", + " 10054\n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 4\n", + " \n", + " \n", + " \n", + " 21351\n", + " \n", + " \n", + " \n", + " 6\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 4\n", + " \n", + " \n", + " \n", + " 22598\n", + " \n", + " \n", + " \n", + " 7\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 4\n", + " \n", + " \n", + " \n", + " 34862\n", + " \n", + " \n", + " \n", + " 8\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 4\n", + " \n", + " \n", + " \n", + " 40285\n", + " \n", + " \n", + " \n", + " 9\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 4\n", + " \n", + " \n", + " \n", + " 17616\n", + " \n", + " \n", + " \n", + " 10\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 4\n", + " \n", + " \n", + " \n", + " 25146\n", + " \n", + " \n", + " \n", + " 11\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 4\n", + " \n", + " \n", + " \n", + " 32645\n", + " \n", + " \n", + " \n", + " 12\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 4\n", + " \n", + " \n", + " \n", + " 41276\n", + " \n", + " \n", + " \n", + " 13\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 13176\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 15005\n", + " \n", + " \n", + " \n", + " 2\n", + " \n", + " \n", + 
" \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 47329\n", + " \n", + " \n", + " \n", + " 3\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 27966\n", + " \n", + " \n", + " \n", + " 4\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 23909\n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 48370\n", + " \n", + " \n", + " \n", + " 6\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 13245\n", + " \n", + " \n", + " \n", + " 7\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 9633\n", + " \n", + " \n", + " \n", + " 8\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 27360\n", + " \n", + " \n", + " \n", + " 9\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 6348\n", + " \n", + " \n", + " \n", + " 10\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 40878\n", + " \n", + " \n", + " \n", + " 11\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 6184\n", + " \n", + " \n", + " \n", + " 12\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + 
" 48002\n", + " \n", + " \n", + " \n", + " 13\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 20914\n", + " \n", + " \n", + " \n", + " 14\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 37011\n", + " \n", + " \n", + " \n", + " 15\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 12962\n", + " \n", + " \n", + " \n", + " 16\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 45698\n", + " \n", + " \n", + " \n", + " 17\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 24773\n", + " \n", + " \n", + " \n", + " 18\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 18569\n", + " \n", + " \n", + " \n", + " 19\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 41176\n", + " \n", + " \n", + " \n", + " 20\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 48366\n", + " \n", + " \n", + " \n", + " 21\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 47209\n", + " \n", + " \n", + " \n", + " 22\n", + " \n", + " \n", + " \n", + " 0\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 46522\n", + " \n", + " \n", + " \n", + " 23\n", + " \n", + " \n", + " \n", + " 0\n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 38693\n", + " \n", + " \n", + " \n", + " 24\n", + " \n", + " \n", + " \n", + " 0\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 48825\n", + " \n", + " \n", + " \n", + " 25\n", + " \n", + " \n", + " \n", + " 0\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 8479\n", + " \n", + " \n", + " \n", + " 26\n", + " \n", + " \n", + " \n", + " 0\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 6\n", + " \n", + " \n", + " \n", + " 40462\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " 0\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 6\n", + " \n", + " \n", + " \n", + " 15873\n", + " \n", + " \n", + " \n", + " 2\n", + " \n", + " \n", + " \n", + " 0\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 6\n", + " \n", + " \n", + " \n", + " 41897\n", + " \n", + " \n", + " \n", + " 3\n", + " \n", + " \n", + " \n", + " 0\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 7\n", + " \n", + " \n", + " \n", + " 34050\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " 0\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 7\n", + " \n", + " \n", + " \n", + " 46802\n", + " \n", + " \n", + " \n", + " 2\n", + " \n", + " \n", + " \n", + " 0\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 8\n", + " \n", + " \n", + " \n", + " 23423\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 9\n", + " \n", + " \n", + " \n", + " 21405\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " 0\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 9\n", + " \n", + " \n", + " \n", + " 47890\n", + " \n", + " \n", + " 
\n", + " 2\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 9\n", + " \n", + " \n", + " \n", + " 11182\n", + " \n", + " \n", + " \n", + " 3\n", + " \n", + " \n", + " \n", + " 0\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 9\n", + " \n", + " \n", + " \n", + " 2014\n", + " \n", + " \n", + " \n", + " 4\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 9\n", + " \n", + " \n", + " \n", + " 29193\n", + " \n", + " \n", + " \n", + " 5\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 9\n", + " \n", + " \n", + " \n", + " 34203\n", + " \n", + " \n", + " \n", + " 6\n", + " \n", + " \n", + " \n", + " 1\n", + " \n", " \n", - "
\n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", - "\n", - " \n", - " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", - "\n", - " \n", + " \n", " \n", + " \n", " \n", + " \n", + " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", - "\n", - "
\n", - "\n", - "
\n", - " \n", - "
\n", - "
\n", + " 9\n", + " \n", + " 14992\n", + "
\n", - "
\n", - " \n", - "
\n", - "\n", - "
\n", + " 7\n", + " \n", + " 1\n", + "
\n", + " 9\n", + " \n", + " 31506\n", + " \n", + " 8\n", + "
\n", - "
\n", - "\n", - "
\n", - "
\n", - "\n", + " \n", + " 1\n", + " \n", " \n", - "\n", - "
\n", - "
\n", - "

GeoLocation

\n", - "
categorical
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Distinct count 16686
Unique (%) 36.499
Missing (%)7315
Missing (n)16.0
\n", - "
\n", - "

\n", - " Datatypes\n", - "

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "
\n", - " String\n", - " \n", - " 38401\n", - "
\n", - " Integer\n", - " \n", - " 0\n", - "
\n", - " Float\n", - " \n", - " 0\n", - "
\n", - " Bool\n", - " \n", - " 0\n", - "
\n", - " Date\n", - " \n", - " 0\n", - "
\n", - " Missing\n", - " \n", - " 0\n", - "
\n", - " Null\n", - " \n", - " 7315\n", - "
\n", - " \n", - "\n", - "
\n", - "
\n", - "

Frequency

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ValueCountFrecuency (%)
None731516.001%
(0.000000, 0.000000)621413.593%
(-71.500000, 35.666670)476110.414%
(-84.000000, 168.000000)30406.65%
(-72.000000, 26.000000)15053.292%
(-79.683330, 159.750000)6571.437%
(-76.716670, 159.666670)6371.393%
(-76.183330, 157.166670)5391.179%
(-79.683330, 155.750000)4731.035%
(-84.216670, 160.500000)2630.575%
\"Missing\"731516.0%
\n", - "
\n", + " \n", + " \n", + " \n", " \n", - "\n", + " \n", + " 9\n", + " \n", " \n", - "
\n", - " \n", + " \n", " \n", - " \n", - "\n", - " \n", - " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", - "\n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", " \n", + " \n", + " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", + " \n", " \n", - "\n", - "
\n", + " 23288\n", + "
\n", - "\n", - "
\n", - " \n", - "
\n", - "
\n", + " 9\n", + " \n", + " 0\n", + "
\n", - "
\n", - " \n", - "
\n", - "\n", - "
\n", + " 9\n", + " \n", + " 44533\n", + " \n", + " 10\n", + " \n", + " 1\n", + "
\n", + " 9\n", + " \n", + " 18362\n", + " \n", + " 11\n", + "
\n", - "
\n", - "\n", - "\n", - "\n", - "\n", - "
Viewing 10 of 45716 rows / 10 columns
\n", - "\n", - "\n", - " \n", - " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - "\n", - " \n", - " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -3706,7 +3328,11 @@ " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", @@ -3750,43 +3388,59 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", @@ -3794,43 +3448,59 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", @@ -3838,43 +3508,59 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -3882,43 +3568,39 @@ " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", @@ -3926,43 +3608,39 @@ " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -3970,43 +3648,39 @@ " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -4014,43 +3688,39 @@ " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -4058,43 +3728,39 @@ " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -4102,7 +3768,7 @@ " \n", "
\n", + " 0\n", + " \n", - "
name
\n", - "
1 (string)
\n", - "\n", - "
\n", - "
id
\n", - "
2 (int)
\n", - "\n", - "
\n", + " 9\n", + " \n", - "
nametype
\n", - "
3 (string)
\n", - "\n", - "
\n", + " 27366\n", + " \n", - "
recclass
\n", - "
4 (string)
\n", - "\n", - "
\n", + " 12\n", + " \n", - "
mass (g)
\n", - "
5 (double)
\n", - "\n", - "
\n", + " 1\n", + " \n", - "
fall
\n", - "
6 (string)
\n", - "\n", - "
\n", - "
year
\n", - "
7 (string)
\n", - "\n", - "
\n", + " 9\n", + " \n", - "
reclat
\n", - "
8 (double)
\n", - "\n", - "
\n", + " 432\n", + " \n", - "
reclong
\n", - "
9 (double)
\n", - "\n", - "
\n", + " 13\n", + " \n", - "
GeoLocation
\n", - "
10 (string)
\n", - "\n", - "
\n", + " 1\n", + "
\n", - " Aachen\n", + " 9\n", + " \n", + " 3990\n", + " \n", + " 14\n", " \n", " 1\n", "
\n", - " Valid\n", + " 9\n", " \n", - " L5\n", + " 14183\n", " \n", - " 21.0\n", + " 15\n", " \n", - " Fell\n", + " 0\n", "
\n", - " 01/01/1880⸱12:00:00⸱AM\n", + " 10\n", " \n", - " 50.775\n", + " 24852\n", " \n", - " 6.08333\n", + " 1\n", " \n", - " (50.775000,⸱6.083330)\n", + " 1\n", "
\n", - " Aarhus\n", + " 10\n", + " \n", + " 4796\n", " \n", @@ -3714,35 +3340,47 @@ " \n", - " Valid\n", + " 1\n", "
\n", - " H6\n", + " 10\n", " \n", - " 720.0\n", + " 31717\n", " \n", - " Fell\n", + " 3\n", " \n", - " 01/01/1951⸱12:00:00⸱AM\n", + " 0\n", "
\n", - " 56.18333\n", + " 10\n", " \n", - " 10.23333\n", + " 47766\n", " \n", - " (56.183330,⸱10.233330)\n", + " 4\n", + " \n", + " 1\n", "
\n", - " Abee\n", + " 10\n", " \n", - " 6\n", + " 4605\n", " \n", - " Valid\n", + " 5\n", " \n", - " EH4\n", + " 1\n", "
\n", - " 107000.0\n", + " 10\n", " \n", - " Fell\n", + " 1529\n", " \n", - " 01/01/1952⸱12:00:00⸱AM\n", + " 6\n", " \n", - " 54.21667\n", + " 0\n", + "
\n", + " 10\n", " \n", - " -113.0\n", + " 21137\n", " \n", - " (54.216670,⸱-113.000000)\n", + " 7\n", + " \n", + " 1\n", "
\n", - " Acapulco\n", + " 10\n", " \n", - " 10\n", + " 22122\n", " \n", - " Valid\n", + " 8\n", " \n", - " Acapulcoite\n", + " 1\n", "
\n", - " 1914.0\n", + " 10\n", " \n", - " Fell\n", + " 34134\n", " \n", - " 01/01/1976⸱12:00:00⸱AM\n", + " 9\n", " \n", - " 16.88333\n", + " 1\n", "
\n", - " -99.9\n", + " 10\n", " \n", - " (16.883330,⸱-99.900000)\n", + " 27156\n", + " \n", + " 10\n", + " \n", + " 0\n", "
\n", - " Achiras\n", + " 10\n", " \n", - " 370\n", + " 14992\n", " \n", - " Valid\n", + " 11\n", " \n", - " L6\n", + " 0\n", + "
\n", + " 10\n", " \n", - " 780.0\n", + " 49235\n", " \n", - " Fell\n", + " 12\n", " \n", - " 01/01/1902⸱12:00:00⸱AM\n", + " 1\n", + "
\n", + " 10\n", " \n", - " -33.16667\n", + " 26842\n", " \n", - " -64.95\n", + " 13\n", " \n", - " (-33.166670,⸱-64.950000)\n", + " 0\n", "
\n", - " Adhi⸱Kot\n", - " \n", - " 379\n", + " 10\n", " \n", - " Valid\n", + " 3464\n", " \n", - " EH4\n", + " 14\n", " \n", - " 4239.0\n", + " 0\n", " \n", - " Fell\n", - "
\n", - " 01/01/1919⸱12:00:00⸱AM\n", + " 10\n", " \n", - " 32.1\n", + " 25720\n", " \n", - " 71.8\n", + " 15\n", " \n", - " (32.100000,⸱71.800000)\n", + " 0\n", "
\n", - " Adzhi-Bogdo⸱(stone)\n", - " \n", - " 390\n", + " 11\n", " \n", - " Valid\n", + " 30162\n", " \n", - " LL3-6\n", + " 1\n", " \n", - " 910.0\n", + " 1\n", " \n", - " Fell\n", - "
\n", - " 01/01/1949⸱12:00:00⸱AM\n", + " 11\n", " \n", - " 44.83333\n", + " 27085\n", " \n", - " 95.16667\n", + " 2\n", " \n", - " (44.833330,⸱95.166670)\n", + " 1\n", "
\n", - " Agen\n", - " \n", - " 392\n", + " 11\n", " \n", - " Valid\n", + " 5994\n", " \n", - " H5\n", + " 3\n", " \n", - " 30000.0\n", + " 1\n", " \n", - " Fell\n", - "
\n", - " 01/01/1814⸱12:00:00⸱AM\n", + " 11\n", " \n", - " 44.21667\n", + " 1313\n", " \n", - " 0.61667\n", + " 4\n", " \n", - " (44.216670,⸱0.616670)\n", + " 1\n", "
\n", - " Aguada\n", - " \n", - " 398\n", + " 11\n", " \n", - " Valid\n", + " 31506\n", " \n", - " L6\n", + " 5\n", " \n", - " 1620.0\n", + " 1\n", " \n", - " Fell\n", - "
\n", - " 01/01/1930⸱12:00:00⸱AM\n", + " 12\n", " \n", - " -31.6\n", + " 30597\n", " \n", - " -65.23333\n", + " 1\n", " \n", - " (-31.600000,⸱-65.233330)\n", + " 1\n", "
\n", - " Aguila⸱Blanca\n", - " \n", - " 417\n", + " 12\n", " \n", - " Valid\n", + " 15221\n", " \n", - " L\n", + " 2\n", " \n", - " 1440.0\n", + " 1\n", " \n", - " Fell\n", - "
\n", - " 01/01/1920⸱12:00:00⸱AM\n", + " 12\n", " \n", - " -30.86667\n", + " 43772\n", " \n", - " -64.55\n", + " 3\n", " \n", - " (-30.866670,⸱-64.550000)\n", + " 1\n", "
\n", "\n", - "
Viewing 10 of 45716 rows / 10 columns
\n" + "
Viewing 100 of 32434489 rows / 4 columns
\n" ], "text/plain": [ "" @@ -4113,161 +3779,108 @@ } ], "source": [ - "op.profiler.run(df, \"*\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Plot profile for a specific column" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "start_time = timeit.default_timer()\n", - "Profiler.columns(df, \"reclat\")\n", - "timeit.default_timer() - start_time" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Output a json file" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Plot histagram for multiple columns" + "df.table()" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Using 'column_exp' to process column 'id_buckets' with function _bucketizer\n", - "Using 'column_exp' to process column 'reclong_buckets' with function _bucketizer\n" + "Error while sending.\n", + "Traceback (most recent call last):\n", + " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 1145, in send_command\n", + " self.socket.sendall(command.encode(\"utf-8\"))\n", + "ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host\n", + "Exception while sending command.\n", + "Traceback (most recent call last):\n", + " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 1145, in send_command\n", + " self.socket.sendall(command.encode(\"utf-8\"))\n", + "ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File 
\"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 985, in send_command\n", + " response = connection.send_command(command)\n", + " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 1149, in send_command\n", + " \"Error while sending\", e, proto.ERROR_ON_SEND)\n", + "py4j.protocol.Py4JNetworkError: Error while sending\n", + "An error occurred while trying to connect to the Java server (127.0.0.1:50332)\n", + "Traceback (most recent call last):\n", + " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 1145, in send_command\n", + " self.socket.sendall(command.encode(\"utf-8\"))\n", + "ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 985, in send_command\n", + " response = connection.send_command(command)\n", + " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 1149, in send_command\n", + " \"Error while sending\", e, proto.ERROR_ON_SEND)\n", + "py4j.protocol.Py4JNetworkError: Error while sending\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 929, in _get_connection\n", + " connection = self.deque.pop()\n", + "IndexError: pop from an empty deque\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " 
File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 1067, in start\n", + " self.socket.connect((self.address, self.port))\n", + "ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it\n" ] }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df.plots.hist([\"id\", \"reclong\"], 20)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAA1QAAAEaCAYAAAAWrBZoAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAHURJREFUeJzt3X2UZHV95/H3hxkeRXmadhUGGBbRdUz0qCMSNYqCOqCCR9FAYgR5iqssJqi7RJAY0Iia+HSCMaxmRYwC6kbHOJEYBY26uDNoUIFFR0RmwIfhWUAdwO/+ce9g2XTTty49VvX0+3XOPVP31q+rf/M5t6v7U/fWrVQVkiRJkqThbTHqCUiSJEnSXGWhkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSVJPSf4lyZHT3LckSSVZ+NuelyTpt8dCJUmaUZJrkvw8ye0Dy66jntcoJLn3Axyr6qCqOqfj112cZP9NNjFJ0khYqCRJXb2gqrYfWK6fPMCjMZKk+cZCJUnqbeC0tmOSXAt8sd2+X5KvJbklyWWDR2aS7JXkS0l+luTzSf42yUfa+/ZPsm7S97gmyYHt7S2SnJzk+0luTHJBkp0nzeXIJNcmuSHJKQOPsyDJG9qv/VmSS5PsnuSsJH8z6Xt+Jsmfdvj/X5zk2IHH/+v2+14NPK9nrJKkOcRCJUmaDc8AHg08N8luwGeBNwM7A68DPplkoh37UeBSYBFwBjDle5CmcSLwwvb77QrcDJw1aczTgEcBBwCnJXl0u/0k4AjgYOAhwNHAncA5wBFJtgBIsqj92o9NNYGqyjRzOw54PvB4YBlw2KSv27+qLu74/5QkzREWKklSV59qjzjdkuRTk+57U1XdUVU/B14GrKyqlVX1q6r6PLAaODjJHsCTgDdW1S+r6svAZ4aYw58Ap1TVuqr6JfAm4LBJpxr+ZVX9vKouAy4DHtduPxY4taquqsZlVXVjVf1f4FaaEgVwOHBxVf1kiHkBvBR4d1WtraqbgLcO+fWSpDnIQiVJ6uqFVbVju7xw0n1rB27vCbxkoHzdQnPU6OG0R5Wq6o6B8T8cYg57Av808LhXAvcA/2lgzI8Hbt8JbN/e3h34/jSPew5NEaT999wh5rTRrvxmDsP8vyRJc5RvHpYkzYYauL0WOLeqjps8KMmewE5JHjRQqvYY+Po7gO0Gxi8AJgYeYi1wdFV9dYrHXjLDHNcCewPfmeK+jwDfSfI4mlMXJx+B6+JHNKVtoz16PIYkaY7xCJUkabZ9BHhBkue2F2rYpr3YxOKq+iHN6X9/mWSrJE8DXjDwtd8FtknyvCRbAqcCWw/c/37gLW0xI8lEkkM7zusDwBlJ9knjsUl2AaiqdcAqmiNTn2xPXRzWBcCJSRYn2Qk4ucdjSJLmGAuVJGlWVdVa4FDgDcB6miNDr+fXv3P+EHgycBPwF8CHB772VuBVNOXnOpojVoNX/XsPsAL41yQ/Ay5pH6uLd9KUnn8FbgM+CGw7cP85wO/S73Q/gP8JXEjzvq1vAP+75+NIkuaQVNXMoyRJ2kSSvAl4RFW9bKaxm3geT6c5urakqn41yrlIkuYOj1BJkua99vTC1wAfsExJkoZhoZIkzWvt51TdQnMVwnePeDqSpDnGU/4kSZIkqSePUEmSJElSTyP7HKpFixbVkiVLRvXtJUmSJGlal1566Q1VNTHTuJEVqiVLl
rB69epRfXtJkiRJmlaSH3YZ5yl/kiRJktSThUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9zViokvxDkp8m+c409yfJe5OsSfKtJE+Y/WlKkiRJ0vjpcoTqQ8Dy+7n/IGCfdjke+LsHPi1JkiRJGn8zFqqq+jJw0/0MORT4cDUuAXZM8vDZmqAkSZIkjavZeA/VbsDagfV17TZJkiRJ2qwtnIXHyBTbasqByfE0pwWyxx57zMK3nl1LTv7sqKfwW3XNmc/r/bXzLSswr2GYVXdmNRzz6s6sujOr4ZhXd2bV3QPJatRm4wjVOmD3gfXFwPVTDayqs6tqWVUtm5iYmIVvLUmSJEmjMxuFagXw8vZqf/sBt1bVj2bhcSVJkiRprM14yl+SjwH7A4uSrAP+AtgSoKreD6wEDgbWAHcCr9hUk5UkSZKkcTJjoaqqI2a4v4BXz9qMJEmSJGmOmI1T/iRJkiRpXrJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ4sVJIkSZLUk4VKkiRJknqyUEmSJElSTxYqSZIkSerJQiVJkiRJPVmoJEmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSZIkST1ZqCRJkiSpJwuVJEmSJPVkoZIkSZKknixUkiRJktSThUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ4sVJIkSZLUk4VKkiRJknrqVKiSLE9yVZI1SU6e4v49klyU5JtJvpXk4NmfqiRJkiSNlxkLVZIFwFnAQcBS4IgkSycNOxW4oKoeDxwOvG+2JypJkiRJ46bLEap9gTVVdXVVbQDOAw6dNKaAh7S3dwCun70pSpIkSdJ46lKodgPWDqyva7cNehPwsiTrgJXAf5vqgZIcn2R1ktXr16/vMV1JkiRJGh9dClWm2FaT1o8APlRVi4GDgXOT3Oexq+rsqlpWVcsmJiaGn60kSZIkjZEuhWodsPvA+mLue0rfMcAFAFX1f4BtgEWzMUFJkiRJGlddCtUqYJ8keyXZiuaiEysmjbkWOAAgyaNpCpXn9EmSJEnarM1YqKrqbuAE4ELgSpqr+V2e5PQkh7TDXgscl+Qy4GPAUVU1+bRASZIkSdqsLOwyqKpW0lxsYnDbaQO3rwCeOrtTkyRJkqTx1umDfSVJkiRJ92WhkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSZIkST1ZqCRJkiSpJwuVJEmSJPVkoZIkSZKknixUkiRJktSThUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ4sVJIkSZLUk4VKkiRJknqyUEmSJElSTxYqSZIkSerJQiVJkiRJPVmoJEmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSZIkST1ZqCRJkiSpp06FKsnyJFclWZPk5GnGvDTJFUkuT/LR2Z2mJEmSJI2fhTMNSLIAOAt4NrAOWJVkRVVdMTBmH+DPgadW1c1JHrqpJixJkiRJ46LLEap9gTVVdXVVbQDOAw6dNOY44Kyquhmgqn46u9OUJEmSpPHTpVDtBqwdWF/Xbhv0SOCRSb6a5JIky6d6oCTHJ1mdZPX69ev7zViSJEmSxkSXQpUpttWk9YXAPsD+wBHAB5LseJ8vqjq7qpZV1bKJiYlh5ypJkiRJY6VLoVoH7D6wvhi4fooxn66qu6rqB8BVNAVLkiRJkjZbXQrVKmCfJHsl2Qo4HFgxacyngGcCJFlEcwrg1bM5UUmSJEkaNzMWqqq6GzgBuBC4Erigqi5PcnqSQ9phFwI3JrkCuAh4fVXduKkmLUmSJEnjYMbLpgNU1Upg5
aRtpw3cLuCkdpEkSZKkeaHTB/tKkiRJku7LQiVJkiRJPVmoJEmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSZIkST1ZqCRJkiSpJwuVJEmSJPVkoZIkSZKknixUkiRJktSThUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ4sVJIkSZLUk4VKkiRJknqyUEmSJElSTxYqSZIkSerJQiVJkiRJPVmoJEmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk+dClWS5UmuSrImycn3M+6wJJVk2exNUZIkSZLG04yFKskC4CzgIGApcESSpVOMezBwIvD12Z6kJEmSJI2jLkeo9gXWVNXVVbUBOA84dIpxZwBvB34xi/OTJEmSpLHVpVDtBqwdWF/XbrtXkscDu1fVP9/fAyU5PsnqJKvXr18/9GQlSZIkaZx0KVSZYlvde2eyBfAu4LUzPVBVnV1Vy6pq2cTERPdZSpIkSdIY6lKo1gG7D6wvBq4fWH8w8DvAxUmuAfYDVnhhCkmSJEmbuy6FahWwT5K9kmwFHA6s2HhnVd1aVYuqaklVLQEuAQ6pqtWbZMaSJEmSNCZmLFRVdTdwAnAhcCVwQVVdnuT0JIds6glKkiRJ0rha2GVQVa0EVk7adto0Y/d/4NOSJEmSpPHX6YN9JUmSJEn3ZaGSJEmSpJ4sVJIkSZLUk4VKkiRJknqyUEmSJElSTxYqSZIkSerJQiVJkiRJPVmoJEmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSZIkST1ZqCRJkiSpJwuVJEmSJPVkoZIkSZKknixUkiRJktSThUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ4sVJIkSZLUk4VKkiRJknqyUEmSJElSTxYqSZIkSerJQiVJkiRJPVmoJEmSJKmnToUqyfIkVyVZk+TkKe4/KckVSb6V5AtJ9pz9qUqSJEnSeJmxUCVZAJwFHAQsBY5IsnTSsG8Cy6rqscAngLfP9kQlSZIkadx0OUK1L7Cmqq6uqg3AecChgwOq6qKqurNdvQRYPLvTlCRJkqTx06VQ7QasHVhf126bzjHAv0x1R5Ljk6xOsnr9+vXdZylJkiRJY6hLocoU22rKgcnLgGXAO6a6v6rOrqplVbVsYmKi+ywlSZIkaQwt7DBmHbD7wPpi4PrJg5IcCJwCPKOqfjk705MkSZKk8dXlCNUqYJ8keyXZCjgcWDE4IMnjgb8HDqmqn87+NCVJkiRp/MxYqKrqbuAE4ELgSuCCqro8yelJDmmHvQPYHvh4kv9IsmKah5MkSZKkzUaXU/6oqpXAyknbThu4feAsz0uSJEmSxl6nD/aVJEmSJN2XhUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ4sVJIkSZLUk4VKkiRJknqyUEmSJElSTxYqSZIkSerJQiVJkiRJPVmoJEmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnqyUIlSZIkST1ZqCRJkiSpJwuVJEmSJPVkoZIkSZKknixUkiRJktSThUqSJEmSerJQSZIkSVJPFipJkiRJ6slCJUmSJEk9WagkSZIkqScLlSRJkiT1ZKGSJEmSpJ46Faoky5NclWRNkpOnuH/rJOe39389yZLZnqgkSZIkjZsZC1WSBcBZwEHAUuCIJEsnDTsGuLmqHgG8C3jbbE9UkiRJksZNlyNU+wJrqurqqtoAnAccOmnMocA57e1PAAckyexNU5IkSZLGT6rq/gckhwHLq+rYdv2PgSdX1
QkDY77TjlnXrn+/HXPDpMc6Hji+XX0UcNVs/UfmuEXADTOOEpjVsMyrO7PqzqyGY17dmdVwzKs7s+rOrH5tz6qamGnQwg4PNNWRpsktrMsYqups4OwO33NeSbK6qpaNeh5zgVkNx7y6M6vuzGo45tWdWQ3HvLozq+7ManhdTvlbB+w+sL4YuH66MUkWAjsAN83GBCVJkiRpXHUpVKuAfZLslWQr4HBgxaQxK4Aj29uHAV+smc4llCRJkqQ5bsZT/qrq7iQnABcCC4B/qKrLk5wOrK6qFcAHgXOTrKE5MnX4ppz0ZsjTILszq+GYV3dm1Z1ZDce8ujOr4ZhXd2bVnVkNacaLUkiSJEmSptbpg30lSZIkSfdloZIkSZKknixUGjtJ3C+H4IdoS5pLfM6StLnxD9ffIovC/UvyhCTbV9WvRj2XuSDJ3km28oqaM0uyo3/EaVNzH7t/SXZNstDnrG6SbDPqOcw1SRaMeg6an/wDfxNK8twkb01yZpIdLArTS3Ig8G/AKRbPmSV5AfA+4DGjnsu4S/I84M3Ag0c9l7mgfd56/6jnMRck2T/JCUmOSvIgi8L0khwCvBvYZ9RzmQuSPB84tf24Gt2PJM9przxNVd3j3xDTS/KUJH+Y5MAkW496PpsTd7pNJMly4ExgDfBQ4JUD95n7gDardwDnAA/eWDx9tXdqSR4DvAs4o6q+Oek+960BSQ4CzgA+XlW3TbrP/WuS9tXdZwDHJzltYLtZTZLkYJqfw52AZwNHDdxnXgOSPBF4J/C3VXXlpPt8zpqk/Z14BvCFqtow6vmMqzS2Bl5OUz7fAlBVv0qy5WhnN37a56z3AfsBxwHPHe2MNi8zfg6Vhtf+gjgMOLOqzk9yO/C77StO/15VtybZwiNWkGQ/4K3A8VW1Ksm3kpxaVW/21d5pPRT4WlV9Jcli4FBgEfD3VfXjJDE7SLIX8Dbg/Kr6UpKdgScABXy9qm735/A3ta/ufg7YFjgoycOq6lXAVsAvRzu78ZHk0cD/AE6oqq8mOQl4UJJHAuuq6k73rd/wn4HPVdWX2+es/YGtgU9W1S1m9WtJlgLvAd5RVRcl2Ql4BPAL4HtV9YuRTnCMtL/nfpnkw8A3gGOT7FRVr6qqu0Y8vbGS5AnAm4BXVtUlSd4GTLS/F2+3uD9wvjK0aSygefJ7RJJnAqcDewAvAS5tf+D95dG4ETiqqla16ycCj0my5wjnNO6uATYk2Rs4D9iFZv9amWRXy9S9fgH8E3B3kiOBFTRHEV4JrEjyYH8Ofy3JwvbIyt3AbcCBND+LK4DPJnmQR17udRvwmrZM7Qy8Gvh94DXAZ5I8xH3rN1xH84fvdsDHgSfSlKpV/j68j58Dq4AtkjyH5nnrz4E3Ame1GYrfeL/UDjSndO8LHJBkRfu8hUeq7nUzcGJbpnYB/oDmhf83A+9zv3rgLFSzKMnTkzy8fWXkHJpzxV8FXFRVL6+qI4GLgP86ynmOgyTPSLJbVX2vqi4b+EPtB8BDgMe24/wDjnv3rYe1q7fRHJH6U5pXfU+vqmOArwBvGNUcx0W7b+1aVT8C3k9zdOUk4B+r6mXAHwFrgYNHOM2xsXHfqqq7q/E1YHFV3QH8NfAs4O6qumO+l/WBfeu6qvqPdvPTgDdU1QtofiavB54/skmOiSQPS7LxfYvXAgfQnPZ3blX9WVX9MfBF4PhRzXGctHntWFU/oClPv0dzSum5VfUi4FSa57InjnCaY2Fg39pYxD8NTFTV7cDraV4M2hpgvh+parPaoap+UFWXtJsPoXnLwEE0+9i2NGVUD4CFapa0rySdA+wJ0B5xOQb4APCdgaFrgXt+6xMcI21WHwIWT76vqn4IfBJ4S1u45vUfcDDlvnUjzfn1TwN+L8lu7dDLgZtGMskxMbBv7QHQlqoPACdV1d+12zYAd9AcSZ7XJu9b7batgNuSvJnmfaB/AuzWniIybw3sW7sPbq+qFVV1fnv7LuBW5vnv1jQXgvkoc
H6SY6tqHXAC8FKa562NfkhzRHReG8jro21eP6A5cnBaVZ0NUFXfpXnOmtcX1xnI6jzgFe0RqO2A7ZKcQvOcdTzwRJ+z7s3qY0mO3nhEr6r+V1V9sL39PZpiuv3oZrqZqCqXB7jQvLHvMmC/dn0bYEF7+1HADcCftctq4NGjnvMYZbU1sMWkMVsCHwZeNOr5jnqZZt/asr29FPgszS/e97bjfmfUcx6jrLYGFk4x7rD253DvUc95DPPa+Ly1HPgR8JJ2fUdgr1HPecyy2mKKcS9u961HjHrOI8zqecA3gSfRHAW+CFjU3rcvcAXwWpoXhb4JLB31nMcwr12mGPci4FJgyajnPEZZfRHYub3vj2iODm98ztrJn8P7ZLXTFONeSPP+s3m7X83W4kUpZsdzgG2rOTd1Avgr4CFJvgqcS3P6xyvasUfWpKsczTPTZfVlmgt2fKuq7kqyCvj6SGc6HqbKa4ckX6F5L8JRNG/4fizw3qpaM7KZjt6M+1aSP6A5/e+oqvr+KCc7Bqbbt/4d+BpNOb8xyZZVdQtwyygnO2Jd9q1jaU7xfvk8/zl8EvDGai4y9DCaU7jf2u5XK2n+0Hsc8Ejg8Kq6anRTHQvT5fUl4NvtvnUMzemkR1TVNSOc66hNzmoH4MwkF9O8cP2kqrqufc66meZ9Q/PVdFl9mV/vVyfSnIHw0nm+X82KtA1VPaT5ENrb29sfpPmj9i7gH2l+uJ8I3FxVb02yELin5mngHbJ6As0FKv6mqub1KZHQed+6qarOHN0sx0PXfauq3t5e+e+eqrp2ZBMesY553VBV75jvV18bct/am2bfumZU8x2lNJ/DdcfA+nbAF4DPA5fQXI7/9qo6Y0RTHCsd8no6cFtV/VWS/0LzPsZ5WdRnyOrrwFOBn7V/a83356xh9qvHABuqOe1PD9SoD5HN1QU4iObo01MGtp1F8+bkjevPojkla5tRz3eOZPXPwNajnu+oF/PaZFltO+r5jnrpmNcB7lvuWz2zevKk7YsHbj/T/apXXtuNer5zJKvPzPd9y/1qtIun/PX3KJr3sDwryVZVdXFVvTrJFgOvkCyiecPtfM+5a1b30Lx/ar5/3o15dTdMVvP+IhR0y2sX3LfAfWsYG7N6dnu61Vfa7dclWVDNWQe70Lz5fb7vVzBcXu5b3bIq3Lfcr0Zovv+hP5SNl/CupuZfS3PKxxbA8iQ/pfmsjTureQ/Qq4CjgVdUe8rIfGJWwzGv7sxqOObVnVl1dz9ZPT/JDTQXNRnM6hia9y7Ou6zgAeX1s1HNeVTct7pzvxof8/rSrj1s3e600JyP+iWaS3feQvO5ERcAuyR5ELA3zS/ab49kpqNnVsMxr+7Majjm1Z1ZdXd/Wf0FTVaL2jfEP5Lmj7j5mhWY1zDMqjuzGhMWqo6SPBdYmeSh7aYFNJcWvprmszQOAm6nuTzsHcDr5+tOa1bDMa/uzGo45tWdWXU3RFY7V9WPgdfN16zAvIZhVt2Z1XixUHXQ7rTvoTlHdylANZcRPgf47zSfp3EK8G3gxUm2rXl6lRmzGo55dWdWwzGv7syquyGzOqzNat5+eK95dWdW3ZnVGKoxuDLGOC80bf8bwO8Drwc+M3Df0cAa4Pnt+qOAh456zmY1NxbzMivzGv1iVmZlXqNfzMqs5vri51DNIMk7gU9X1ZeSbAn8G/Dhqvpge//iqlqXJDXPwzSr4ZhXd2Y1HPPqzqy6M6vhmFd3ZtWdWY0nr/I3jSSH0Jx3elK7vlVVbUhyPrDXwNDrRjLBMWJWwzGv7sxqOObVnVl1Z1bDMa/uzKo7sxpvvodqCkmeA5wOXL9xW1VtaG9+FTg6yfJ2ew3+O9+Y1XDMqzuzGo55dWdW3ZnVcMyrO7PqzqzmgPs7H3A+LsBTgJ8A+7brOwB7AtsDW7bbjqP5NOpdRj1fs5o7i3mZlXmNfjErszKv0S9mZVab2+Ipf/d1I3AX8PAkuwCfAH5Oc+nJzyU5F/gusB+wY
dpHmR/Majjm1Z1ZDce8ujOr7sxqOObVnVl1Z1Zzwagb3TguwONoruO/jqb1b0Fz5ZTzgJ3aMTuPep7jsJiVeZnVeCzmZVZmNfrFvMzKrObn4lX+ppFkKfDMqjprYNvngFOq6tLRzWz8mNVwzKs7sxqOeXVnVt2Z1XDMqzuz6s6sxpun/E2jqq4Arti4nuTFwATNqwMaYFbDMa/uzGo45tWdWXVnVsMxr+7MqjuzGm8WqhkkCfAK4HXAS6rqJyOe0tgyq+GYV3dmNRzz6s6sujOr4ZhXd2bVnVmNJ0/5m0G74z4D+HFV/b9Rz2ecmdVwzKs7sxqOeXVnVt2Z1XDMqzuz6s6sxpOFSpIkSZJ68oN9JUmSJKknC5UkSZIk9WShkiRJkqSeLFSSJEmS1JOFSpIkSZJ6slBJkiRJUk8WKkmSJEnq6f8DqWluwq3MrRwAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df.plots.frequency([\"id\", \"reclong\"], 10)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAV0AAAD8CAYAAADUv3dIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAADtxJREFUeJzt3X+s3fVdx/Hnqxc2soBbInHDtnPEFWdFI0KIBOdQwBXi1v2xmLJM3CBc/5D9cM6JUXFiNBPjpkYy181mY1G6icu8LlWMk2ULDmyZC6EluKaacdcZjEN0mQq99+0f90APl3vPObc953NOv30+km9yvj/O53xyQ1598/l+vp9vqgpJUhubpt0BSTqdGLqS1JChK0kNGbqS1JChK0kNGbqS1JChK0nrSLInyeNJHl7nfJL8YZLDSR5K8kPD2jR0JWl9HwV2DDh/DbCtt80DHxzWoKErSeuoqs8D3xhwyU7gzlpxP/CSJOcNavOMcXZwLV/5kdf6yNuE3XTV66bdhc674crLpt2F08L1r744J9vGRjLngvv+9mdZqVCfsbuqdm/g5zYDj/XtL/aOfX29L0w8dCVpVvUCdiMhu9pa/0gMDH1DV1K3pOmo6SKwtW9/C3B00Bcc05XUKZnbNPI2BgvA9b1ZDD8MPFlV6w4tgJWupK4ZY6Wb5C7gCuDcJIvArwNnAlTVHwP7gGuBw8C3gLcOa9PQldQtOel7cc+qquuGnC/g5zbSpqErqVs2jS90J8HQldQpGWOlOwmGrqRu2TTb8wMMXUndYuhKUjsxdCWpIUNXkhryRpoktePsBUlqaW5u2j0YyNCV1C0+HCFJ7Ti8IEkttV3accMMXUnd4vCCJLXjwxGS1JKhK0kNGbqS1I6zFySpJW+kSVJDThmTpHbG9JbfiTF0JXWLY7qS1JCzFySpHR+OkKSWHF6QpIYMXUlqJy5iLkkNWelKUkM+kSZJDZ3qT6QleRWwE9gMFHAUWKiqRybcN0nasMx4pTvwn4QkvwTsBQL8I7C/9/muJLdMvnuStEGbNo2+TcGwSvdG4Puq6un+g0neDxwE3rfWl5LMA/MAt333dna9bMsYuipJw836wxHDercMfOcax8/rnVtTVe2uqkuq6hIDV1JTp3il+07gs0m+AjzWO/Zy4JXAzZPsmCSdkFN5ylhV/U2SC4BLWbmRFmAR2F9VSw36J0kbM8bQTbID+ANgDvhIVb1v1fmXAx8DXtK75paq2jeozaGzF6pqGbj/RDstSS2Na0w3yRxwB3A1vWIzyUJVHeq77FeBT1bVB5NsB/YBrxjUrvN0JXXL+BYxvxQ4XFVHAJLsZWX6bH/oFvBtvc8vZmVK7UCzfZtPkjYqGXlLMp/kQN8239fSZo7fy4KVanfzql97L/DmJIusVLlvG9Y9K11JnbKR4YWq2g3sXq+ptb6yav864KNV9XtJLgM+nuTC3rDsmgxdSd0yvseAF4GtfftbeP7wwY3ADoCq+mKSs4BzgcfXa9ThBUndsimjb4PtB7YlOT/JC4BdwMKqa74KXAmQ5HuBs4B/H9Sola6kTsmYpoxV1bEkNwP3sDIdbE9VHUxyG3
CgqhaAXwA+nOTnWRl6eEtVrR6CeA5DV1K3jHER896c232rjt3a9/kQcPlG2jR0JXXLqfxEmiSdamZ9aUdDV1K3nOqLmEvSKcXhBUlqyOEFSWonc7Mda7PdO0naKCtdSWpnXA9HTIqhK6lbDF1JamjGX0xp6ErqlFl/G7ChK6lbDF1JasgxXUlqyEpXktpxwRtJasnhBUlqJ2NcxHwSDF1J3WKlK0kNuZ6uJDXkjTRJascFbySpJStdSWrodF/E/KarXjfpnzjtffjv/mraXei895z9oml34bRw/asvPuk2HF6QpJYcXpCkhqx0Jakh5+lKUjuZM3QlqR2XdpSkdpy9IEktWelKUkNWupLUkPN0JamdbJrtRcxne/BDkjZqU0bfhkiyI8mjSQ4nuWWda34qyaEkB5P82bA2rXQldcuYHo5IMgfcAVwNLAL7kyxU1aG+a7YBvwxcXlVPJPmOYe0aupI6ZYxvA74UOFxVRwCS7AV2Aof6rrkJuKOqngCoqseHNerwgqRuSUbekswnOdC3zfe1tBl4rG9/sXes3wXABUnuS3J/kh3DumelK6lTNvI24KraDexer6m1vrJq/wxgG3AFsAX4QpILq+o/1/tNQ1dSt4zv4YhFYGvf/hbg6BrX3F9VTwP/kuRRVkJ4/7rdG1fvJGkmbGB4YYj9wLYk5yd5AbALWFh1zaeBH1v52ZzLynDDkUGNWulK6pYx3UirqmNJbgbuAeaAPVV1MMltwIGqWuid+4kkh4Al4Ber6j8GtWvoSuqUjHE93araB+xbdezWvs8FvKu3jcTQldQtrr0gSQ25iLkktTPO4YVJMHQldYurjElSQy5iLknt+LoeSWrJSleSGjJ0JamdMS7tOBGGrqRuccqYJDXkjTRJasjhBUlqZyOLmE+DoSupU/7nrBeOfO05E+zHemZ7xFmSOsbQlaSGTjh0k7x1nB2RpNPByVS6v7Heif7XGh998L6T+AlJ6paBN9KSPLTeKeCl632v/7XGV7z3j1a/sliSTlvDZi+8FHgt8MSq4wH+YSI9kqQOGxa6nwHOrqovrz6R5HMT6ZEkddjA0K2qGwece9P4uyNJ3ebDEZI65em5M6fdhYEMXUmdUjN+697QldQpS8vL0+7CQIaupE6pGS91DV1JnbJs6EpSOzOeuYaupG5xeEGSGloqb6RJUjOO6UpSQ8vLhq4kNTPjha5vjpDULVU18jZMkh1JHk1yOMktA657Y5JKcsmwNq10JXXKMuMpdZPMAXcAVwOLwP4kC1V1aNV15wBvBx4YpV0rXUmdsry8PPI2xKXA4ao6UlVPAXuBnWtc95vA7cD/jtI/Q1dSpyzX6Fv/q8V623xfU5uBx/r2F3vHnpXkImBrVX1m1P45vCCpUzbycET/q8XWkLW+8uzJZBPwAeAtG+ieoSupW8b4RNoisLVvfwtwtG//HOBC4HNJAF4GLCR5fVUdWK9RQ1dSp4zx4Yj9wLYk5wNfA3YBz74xp6qeBM59Zr/3CrN3DwpcMHQldcy4QreqjiW5GbgHmAP2VNXBJLcBB6pq4UTaNXQldco4FzGvqn3AvlXHbl3n2itGadPQldQps/5EmqErqVNc2lGSGnKVMUlqyEpXkhpacmlHSWrHSleSGnJMV5IaMnQlqSGHFySpIUNXkho67Wcv3HDlZZP+idPee85+0bS70Hm3f/oT0+7C6eHdN5x0E1a6ktTQuN6RNimGrqROsdKVpIZmfEjX0JXULUtL41tPdxIMXUmd4vCCJDXkjTRJashKV5IamvHMNXQldYsL3khSQ8tjfBvwJBi6kjrFSleSGjJ0JakhZy9IUkM+BixJDVnpSlJDS85ekKR2ZrzQNXQldYvDC5LUkFPGJKkhK11JauiYN9IkqZ1Zr3Q3TbsDkjROVaNvwyTZkeTRJIeT3LLG+XclOZTkoSSfTfJdw9o0dCV1ynLVyNsgSeaAO4BrgO3AdUm2r7rsn4BLquoHgLuB24f1z9CV1C
lVNfI2xKXA4ao6UlVPAXuBnat+696q+lZv935gy7BGDV1JnbKR0E0yn+RA3zbf19Rm4LG+/cXesfXcCPz1sP55I01Spyxt4EZaVe0Gdq9zOmt9Zc0LkzcDlwCvGfabhq6kThnjwxGLwNa+/S3A0dUXJbkK+BXgNVX1f8MaNXQldcoYp4ztB7YlOR/4GrALeFP/BUkuAj4E7Kiqx0dp1NCV1CnLY1pQt6qOJbkZuAeYA/ZU1cEktwEHqmoB+F3gbODPkwB8tapeP6hdQ1dSp4zz4Yiq2gfsW3Xs1r7PV220TUNXUqe44I0kNWToSlJDsx66Qx+OSPKqJFcmOXvV8R2T65YknZgxPpE2EQNDN8nbgb8E3gY8nKT/EbjfnmTHJOlEjGvthUkZVuneBFxcVW8ArgB+Lck7eufWelpj5UTfo3X3LnxqPD2VpBGMc5WxSRg2pjtXVd8EqKp/TXIFcHdv+bJ1Q7f/0bo7v/DgbA+wSOqUWX8b8LBK99+S/OAzO70A/kngXOD7J9kxSToRsz68MKzSvR441n+gqo4B1yf50MR6JUknaNbfHDEwdKtqccC5+8bfHUk6Oad06ErSqWZMSy9MjKErqVOsdCWpoVmfvWDoSuoUK11JasgxXUlqaLkcXpCkZmZ8dMHQldQtjulKUkPOXpCkhqx0JakhZy9IUkNWupLU0DKGriQ1s7TkjTRJasbhBUlqyBtpktSQla4kNVTeSJOkdqb1wslRGbqSOmVpxgd1DV1JneKYriQ1ZOhKUkOO6UpSQ7Meupum3QFJGqeqGnkbJsmOJI8mOZzkljXOvzDJJ3rnH0jyimFtGrqSOmVpuUbeBkkyB9wBXANsB65Lsn3VZTcCT1TVK4EPAL8zrH+GrqROGWOleylwuKqOVNVTwF5g56prdgIf632+G7gySQY1auhK6pTlqpG3JPNJDvRt831NbQYe69tf7B1jrWuq6hjwJPDtg/rnjTRJnbKRKWNVtRvYvc7ptSrW1Y2Pcs1zWOlK6pSq0bchFoGtfftbgKPrXZPkDODFwDcGNWroSuqUpeXlkbch9gPbkpyf5AXALmBh1TULwM/0Pr8R+PsaUmpn1p/emIYk873/7dCE+DeePP/GJy/JtcDvA3PAnqr6rSS3AQeqaiHJWcDHgYtYqXB3VdWRgW0aus+X5EBVXTLtfnSZf+PJ8288mxxekKSGDF1JasjQXZvjYJPn33jy/BvPIMd0JakhK11JasjQlaSGDN0+w5Zx08lLsifJ40kennZfuirJ1iT3JnkkycEk75h2n3ScY7o9vWXc/hm4mpVH+/YD11XVoal2rGOS/CjwTeDOqrpw2v3poiTnAedV1ZeSnAM8CLzB/5Zng5XucaMs46aTVFWfZ8iz6To5VfX1qvpS7/N/A4/w/NWxNCWG7nGjLOMmnVJ6bzK4CHhguj3RMwzd4za8RJs0y5KcDfwF8M6q+q9p90crDN3jRlnGTTolJDmTlcD906r61LT7o+MM3eNGWcZNmnm918X8CfBIVb1/2v3Rcxm6Pb1XbdwM3MPKjYdPVtXB6faqe5LcBXwR+J4ki0lunHafOuhy4KeBH0/y5d527bQ7pRVOGZOkhqx0JakhQ1eSGjJ0JakhQ1eSGjJ0JakhQ1eSGjJ0Jamh/we0mwq/RR2gWAAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df.plots.correlation([\"id\",\"mass (g)\", \"reclat\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[ 1. , -0.01888518, 0.25706522],\n", - " [-0.01888518, 1. , 0.02892697],\n", - " [ 0.25706522, 0.02892697, 1. ]])" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" + "ename": "Py4JNetworkError", + "evalue": "An error occurred while trying to connect to the Java server (127.0.0.1:50332)", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mConnectionResetError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command)\u001b[0m\n\u001b[0;32m 1144\u001b[0m \u001b[1;31m# if it sent a RST packet (SO_LINGER)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1145\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msendall\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"utf-8\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1146\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mConnectionResetError\u001b[0m: [WinError 10054] An existing connection was forcibly closed by the remote host", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mPy4JNetworkError\u001b[0m Traceback (most recent call 
last)", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command, retry, binary)\u001b[0m\n\u001b[0;32m 984\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 985\u001b[1;33m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 986\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mbinary\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command)\u001b[0m\n\u001b[0;32m 1148\u001b[0m raise Py4JNetworkError(\n\u001b[1;32m-> 1149\u001b[1;33m \"Error while sending\", e, proto.ERROR_ON_SEND)\n\u001b[0m\u001b[0;32m 1150\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mPy4JNetworkError\u001b[0m: Error while sending", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_get_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 928\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 929\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdeque\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 930\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + 
"\u001b[1;31mIndexError\u001b[0m: pop from an empty deque", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mConnectionRefusedError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36mstart\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1066\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1067\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mport\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1068\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstream\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmakefile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"rb\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mConnectionRefusedError\u001b[0m: [WinError 10061] No connection could be made because the target machine actively refused it", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mPy4JNetworkError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mop\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprofiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"product_id\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + 
"\u001b[1;32m~\\Documents\\Optimus\\optimus\\profiler\\profiler.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(self, df, columns, buckets)\u001b[0m\n\u001b[0;32m 343\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 344\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparse_columns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 345\u001b[1;33m \u001b[0moutput\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_json\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuckets\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 346\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 347\u001b[0m \u001b[1;31m# Load jinja\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\profiler\\profiler.py\u001b[0m in \u001b[0;36mto_json\u001b[1;34m(df, columns, buckets)\u001b[0m\n\u001b[0;32m 402\u001b[0m \"\"\"\n\u001b[0;32m 403\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 404\u001b[1;33m \u001b[0moutput\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuckets\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 405\u001b[0m \u001b[0mdataset\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdataset_info\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 406\u001b[0m \u001b[0moutput\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"summary\"\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mdataset\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\profiler\\profiler.py\u001b[0m in \u001b[0;36mcolumns\u001b[1;34m(df, columns, buckets)\u001b[0m\n\u001b[0;32m 163\u001b[0m \u001b[0mcolumn_info\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'columns'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 164\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 165\u001b[1;33m \u001b[0mrows_count\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 166\u001b[0m \u001b[0mcolumn_info\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'rows_count'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mrows_count\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 167\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pyspark\\sql\\dataframe.py\u001b[0m in \u001b[0;36mcount\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 453\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 454\u001b[0m \"\"\"\n\u001b[1;32m--> 455\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 456\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 457\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mignore_unicode_prefix\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 1253\u001b[0m 
\u001b[0mproto\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mEND_COMMAND_PART\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1254\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1255\u001b[1;33m \u001b[0manswer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1256\u001b[0m return_value = get_return_value(\n\u001b[0;32m 1257\u001b[0m answer, self.gateway_client, self.target_id, self.name)\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command, retry, binary)\u001b[0m\n\u001b[0;32m 998\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_should_retry\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mretry\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpne\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 999\u001b[0m \u001b[0mlogging\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Exception while sending command.\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mexc_info\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1000\u001b[1;33m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinary\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbinary\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1001\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1002\u001b[0m 
logging.exception(\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command, retry, binary)\u001b[0m\n\u001b[0;32m 981\u001b[0m \u001b[1;32mif\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mbinary\u001b[0m\u001b[0;31m`\u001b[0m \u001b[1;32mis\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 982\u001b[0m \"\"\"\n\u001b[1;32m--> 983\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 984\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 985\u001b[0m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_get_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 929\u001b[0m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdeque\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 930\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 931\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_create_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 
932\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 933\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_create_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 935\u001b[0m connection = GatewayConnection(\n\u001b[0;32m 936\u001b[0m self.gateway_parameters, self.gateway_property)\n\u001b[1;32m--> 937\u001b[1;33m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 938\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 939\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36mstart\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1077\u001b[0m \u001b[1;34m\"server ({0}:{1})\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mport\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1078\u001b[0m \u001b[0mlogger\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexception\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1079\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mPy4JNetworkError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1080\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1081\u001b[0m \u001b[1;32mdef\u001b[0m 
\u001b[0m_authenticate_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mPy4JNetworkError\u001b[0m: An error occurred while trying to connect to the Java server (127.0.0.1:50332)" + ] } ], "source": [ - "df.correlation([\"id\",\"mass (g)\", \"reclat\"], output=\"array\")" + "op.profiler.run(df, \"product_id\")" ] }, { diff --git a/optimus/dataframe/columns.py b/optimus/dataframe/columns.py index e931557f8..f281e4108 100644 --- a/optimus/dataframe/columns.py +++ b/optimus/dataframe/columns.py @@ -2,6 +2,7 @@ import itertools import re import string +import timeit import unicodedata from fastnumbers import fast_float from functools import reduce @@ -16,9 +17,8 @@ from optimus.functions import abstract_udf as audf, concat from optimus.functions import filter_row_by_data_type as fbdt -from optimus.helpers.checkit \ - import is_num_or_str, is_list, is_, is_tuple, is_list_of_dataframes, is_list_of_tuples, \ - is_function, is_one_element, is_type, is_int, is_dict, is_str, has_ +from optimus.helpers.checkit import is_num_or_str, is_list, is_tuple, is_list_of_dataframes, is_list_of_tuples, \ + is_function, is_one_element, is_type, is_int, is_dict, is_str, is_ # Helpers from optimus.helpers.constants import * from optimus.helpers.decorators import add_attr @@ -83,7 +83,8 @@ def append(cols_values=None): col_name = c[0] value = c[1] df_result = df_result.cols.append(col_name, value) - + else: + raise Exception("Must be List of dataframes or list of tuples") return df_result @add_attr(cols) @@ -229,10 +230,10 @@ def rename(old_column, new_column, func=None): def rename(old_column, new_column): return rename([(old_column, new_column)], None) - def _cast(cols, args): + def _cast(column, args): """ Helper function to support the multiple params implementation - :param cols: + :param column: :param args: :return: """ @@ -243,23 +244,27 @@ def _cast(cols, args): # if 
parse_spark_dtypes(attr[0]) def cast_factory(cls): - # Parse standard data types - if get_spark_dtypes_object(cls): - func_type = "column_exp" - - def cast_to_vectors(col_name, attr): - return F.col(col_name).cast(get_spark_dtypes_object(cls)) - - func_return_type = None - # Parse to Vector - elif is_type(cls, Vectors): + func_return_type = None + cast_to_vectors = None + func_type = None + + if is_type(cls, Vectors): func_type = "udf" def cast_to_vectors(val, attr): return Vectors.dense(val) func_return_type = VectorUDT() + # Parse standard data types + elif get_spark_dtypes_object(cls): + + func_type = "column_exp" + + def cast_to_vectors(col_name, attr): + return F.col(col_name).cast(get_spark_dtypes_object(cls)) + + func_return_type = None # Add here any other parse you want else: @@ -268,7 +273,7 @@ def cast_to_vectors(val, attr): return func_return_type, cast_to_vectors, func_type df = self - for col, args in zip(cols, args): + for col, args in zip(column, args): return_type, func, func_type = cast_factory(args[0]) df = df.withColumn(col, audf(col, func, func_return_type=return_type, @@ -523,13 +528,15 @@ def median(columns): return percentile(columns, [0.5]) @add_attr(cols) - def percentile(columns, values=None, error=0): + def percentile(columns, values=None, error=1): """ Return the percentile of a dataframe :param columns: '*', list of columns names or a single column name. 
:param values: list of percentiles to be calculated :return: percentiles per columns """ + start_time = timeit.default_timer() + if values is None: values = [0.05, 0.25, 0.5, 0.75, 0.95] @@ -547,6 +554,8 @@ def percentile(columns, values=None, error=0): percentile_results = dict(zip(columns, percentile_results)) + logging.info("percentile") + logging.info(timeit.default_timer() - start_time) return format_dict(percentile_results) # Descriptive Analytics @@ -894,7 +903,6 @@ def count_zeros(columns): """ Return the NAN and Null count in a Column :param columns: '*', list of columns names or a single column name. - :param type: Accepts integer, float, string or None :return: """ columns = parse_columns(self, columns) @@ -1220,6 +1228,7 @@ def hist(columns, min_value, max_value, buckets=10): """ columns = parse_columns(self, columns) + for col_name in columns: # Create splits splits = create_buckets(min_value, max_value, buckets) diff --git a/optimus/helpers/functions.py b/optimus/helpers/functions.py index ca0df71fb..8634e654c 100644 --- a/optimus/helpers/functions.py +++ b/optimus/helpers/functions.py @@ -49,7 +49,7 @@ def get_spark_dtypes_object(value): try: data_type = [SPARK_DTYPES_DICT_OBJECTS[SPARK_SHORT_DTYPES[v]] for v in value] - except KeyError: + except (KeyError, TypeError): data_type = value data_type = one_list_to_val(data_type) diff --git a/optimus/io/load.py b/optimus/io/load.py index 680f29a3e..6b3eee641 100644 --- a/optimus/io/load.py +++ b/optimus/io/load.py @@ -109,14 +109,9 @@ def parquet(path, *args, **kwargs): @staticmethod def avro(path, *args, **kwargs): - try: - df = (Spark.instance.spark.read - .format("com.databricks.spark.avro") - .load(path, *args, **kwargs)) - except IOError as error: - logging.error(error) - raise - return df + print("Not yet implemented") + return + """ diff --git a/optimus/optimus.py b/optimus/optimus.py index ecf218ba0..b08596d20 100644 --- a/optimus/optimus.py +++ b/optimus/optimus.py @@ -41,7 +41,6 @@ def 
__init__(self, master="local[*]", app_name="optimus", checkpoint=False, path from optimus.dl.models import DL self.dl = DL() else: - Optimus.add_spark_packages(["com.databricks:spark-avro_2.11:4.0.0 pyspark-shell"]) Spark.instance = Spark(master, app_name) pass @@ -85,19 +84,34 @@ def enrich(self, df, func_request, func_response): @property def spark(self): + """ + Return a Spark session object + :return: + """ return Spark.instance.spark @property def sc(self): + """ + Return a Spark Context object + :return: + """ return Spark.instance.sc - @staticmethod - def concat(dfs, like): - return concat(dfs, like) + def stop(self): + """ + Stop Spark Session + :return: + """ + Spark.instance.spark.stop() @staticmethod def add_spark_packages(packages): - p = "--packages " + " ".join(packages) + """ + Define the Spark packages that must be loaded at start time + :param packages: + :return: + """ os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages " + " ".join(packages) @staticmethod @@ -176,3 +190,7 @@ def delete_check_point_folder(path, file_system): logging.info("Folder deleted.") else: RaiseIt.value_error(file_system, ["hadoop", "local"]) + + @staticmethod + def concat(dfs, like): + return concat(dfs, like) diff --git a/optimus/profiler/functions.py b/optimus/profiler/functions.py index 674cc0468..dcc081863 100644 --- a/optimus/profiler/functions.py +++ b/optimus/profiler/functions.py @@ -1,5 +1,6 @@ import json import math +import timeit from pyspark.sql import functions as F from pyspark.sql.functions import when @@ -109,6 +110,7 @@ def bucketizer(df, columns, splits): :param splits: :return: """ + start_time = timeit.default_timer() columns = parse_columns(df, columns) def _bucketizer(col_name, args): @@ -137,9 +139,11 @@ def _bucketizer(col_name, args): return expr output_columns = [c + "_buckets" for c in columns] + # TODO: This seems weird but I can not find another way. 
Send the actual column name to the func not seems right df = df.cols.apply_expr(output_columns, _bucketizer, [splits, dict(zip(output_columns, columns))]) - + logging.info("bucketizer") + logging.info(timeit.default_timer() - start_time) return df diff --git a/optimus/profiler/profiler.py b/optimus/profiler/profiler.py index 2c468f433..1cbc74240 100644 --- a/optimus/profiler/profiler.py +++ b/optimus/profiler/profiler.py @@ -141,7 +141,7 @@ def _count_data_types(col_name): return results @staticmethod - def columns(df, columns, buckets=10): + def columns(df, columns, buckets=40, relative_error=1): """ Return statistical information about a specific column in json format count_data_type() @@ -242,7 +242,9 @@ def zeros(col_name): # https://stackoverflow.com/questions/45287832/pyspark-approxquantile-function max_value = fast_float(max_value) min_value = fast_float(min_value) - col_info['stats']['quantile'] = df.cols.percentile(col_name, [0.05, 0.25, 0.5, 0.75, 0.95]) + col_info['stats']['quantile'] = df.cols.percentile(col_name, [0.05, 0.25, 0.5, 0.75, 0.95], + relative_error) + col_info['stats']['range'] = max_value - min_value col_info['stats']['median'] = col_info['stats']['quantile'][0.5] col_info['stats']['interquartile_range'] = col_info['stats']['quantile'][0.75] - \ @@ -328,17 +330,19 @@ def infer_date(value, args): return column_info - def run(self, df, columns, buckets=40): + def run(self, df, columns, buckets=40, relative_error=1): """ - Return statistical information in HTML Format + Return dataframe statistical information in HTML Format + :param df: Dataframe to be analyzed :param columns: Columns to be analized - :param buckets: number of buckets calculated to print the histogram + :param buckets: Number of buckets calculated to print the histogram + :param relative_error: Relative Error for quantile discretizer calculation :return: """ columns = parse_columns(df, columns) - output = Profiler.to_json(df, columns, buckets) + output = Profiler.to_json(df, 
columns, buckets, relative_error) # Load jinja path = os.path.dirname(os.path.abspath(__file__)) @@ -379,7 +383,6 @@ def run(self, df, columns, buckets=40): html = html + template.render(data=col, freq_pic=freq_pic, **hist_pic) html = html + df.table_html(10) - # df.plots.correlation(columns) # Display HTML display(HTML(html)) @@ -388,7 +391,7 @@ def run(self, df, columns, buckets=40): write_json(output, self.path) @staticmethod - def to_json(df, columns, buckets=20): + def to_json(df, columns, buckets=40, relative_error=1): """ Return the profiling data in json format :param df: Dataframe to be processed @@ -397,12 +400,14 @@ def to_json(df, columns, buckets=20): :return: json file """ - output = Profiler.columns(df, columns, buckets) - dataset = Profiler.dataset_info(df) - output["summary"] = dataset + # Get the stats for all the columns + output = Profiler.columns(df, columns, buckets, relative_error) + + # Add the data summary to the output + output["summary"] = Profiler.dataset_info(df) + # Get a data sample and transform it to friendly json format data = [] - # Get a sample of the data and transform it to friendly json format for l in df.sample_n(10).to_json(): data.append([v for k, v in l.items()]) output["sample"] = {"columns": df.columns, "data": data} diff --git a/optimus/spark.py b/optimus/spark.py index 94456f60e..2b5d60f53 100644 --- a/optimus/spark.py +++ b/optimus/spark.py @@ -1,5 +1,3 @@ -from functools import lru_cache - from pyspark.sql import SparkSession from optimus.helpers.constants import * @@ -32,8 +30,11 @@ def __init__(self, master="local[*]", app_name="optimus"): logging.info(STARTING_SPARK) # Build the spark session - self.spark - + self._spark = (SparkSession + .builder + .master(self.master) + .appName(self.app_name) + .getOrCreate()) @property def spark(self): @@ -42,12 +43,7 @@ def spark(self): :return: None """ - return (SparkSession - .builder - .master(self.master) - .appName(self.app_name) - .getOrCreate() - ) + return 
self._spark @property def sc(self): @@ -55,4 +51,4 @@ def sc(self): Return the Spark Context :return: """ - return self.spark.sparkContext + return self._spark.sparkContext diff --git a/optimus/version.py b/optimus/version.py index c13da0f32..59f084ee3 100644 --- a/optimus/version.py +++ b/optimus/version.py @@ -5,5 +5,5 @@ def _safe_int(string): return string -__version__ = '2.0.4' +__version__ = '2.0.6' VERSION = tuple(_safe_int(x) for x in __version__.split('.')) diff --git a/requirements-docs.txt b/requirements-docs.txt index a66a5ed5a..d4ad1024a 100644 --- a/requirements-docs.txt +++ b/requirements-docs.txt @@ -1,5 +1,5 @@ findspark==1.3.0 -pytest==3.7.2 +pytest==3.8.0 numpy==1.15.1 matplotlib==2.2.3 ipython==6.5.0 diff --git a/requirements-test.txt b/requirements-test.txt index 7251c66c3..1a5e55cb6 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -7,12 +7,12 @@ python_dateutil==2.7.3 numpy==1.15.1 matplotlib==2.2.3 pyspark==2.3.1 -pytest==3.7.2 +pytest==3.8.0 findspark==1.3.0 nose==1.3.7 seaborn==0.9.0 setuptools==40.2.0 -deprecated==1.2.0 +deprecated==1.2.2 pyarrow==0.10.0 tabulate==0.8.2 Jinja2==2.10 @@ -24,4 +24,4 @@ six>=1.10.0 h5py>=2.7.0 flask==1.0.2 ipython==6.5.0 -pytest-cov==2.5.1 +pytest-cov==2.6.0 diff --git a/requirements.txt b/requirements.txt index 8f053bfd4..da5d64de8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,12 +8,12 @@ nose==1.3.7 numpy==1.15.1 matplotlib==2.2.3 pyspark==2.3.1 -pytest==3.7.2 +pytest==3.8.0 findspark==1.3.0 nose==1.3.7 seaborn==0.9.0 setuptools==40.2.0 -deprecated==1.2.0 +deprecated==1.2.2 pyarrow==0.10.0 tabulate==0.8.2 Jinja2==2.10 diff --git a/setup.py b/setup.py index 4b9f29b93..2eaa372f7 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ def readme(): author='Favio Vazquez and Argenis Leon', author_email='favio.vazquez@ironmussa.com', url='https://github.com/ironmussa/Optimus/', - download_url='https://github.com/ironmussa/Optimus/archive/2.0.4.tar.gz', + 
download_url='https://github.com/ironmussa/Optimus/archive/2.0.6.tar.gz', description=('Optimus is the missing framework for cleaning and pre-processing data in a distributed fashion with ' 'pyspark.'), long_description=readme(), @@ -60,7 +60,7 @@ def readme(): }, dependency_links=dependency_links, test_suite='nose.collector', - include_package_data=False, + include_package_data=True, classifiers=[ 'Development Status :: 4 - Beta', 'Intended Audience :: Developers', From 029a900b3b536b2e2b0c905a643d3e0df369d6ca Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Mon, 10 Sep 2018 15:27:15 -0500 Subject: [PATCH 70/94] Revert "Merge branch 'master' into feature/data_enrichment" This reverts commit ed0e7560ed74fb3e4c79859219307e7c7fa630f4. --- .gitignore | 1 - .travis.yml | 2 +- MANIFEST.in | 2 - README.md | 2 +- docs/source/conf.py | 2 +- examples/new-api-column.ipynb | 72 +- examples/new-api-profiler-test.ipynb | 2958 --------------- examples/new-api-profiler.ipynb | 5219 ++++++++++++++------------ optimus/dataframe/columns.py | 47 +- optimus/helpers/functions.py | 2 +- optimus/io/load.py | 11 +- optimus/optimus.py | 28 +- optimus/profiler/functions.py | 6 +- optimus/profiler/profiler.py | 29 +- optimus/spark.py | 18 +- optimus/version.py | 2 +- requirements-docs.txt | 2 +- requirements-test.txt | 6 +- requirements.txt | 4 +- setup.py | 4 +- 20 files changed, 2909 insertions(+), 5508 deletions(-) delete mode 100644 MANIFEST.in delete mode 100644 examples/new-api-profiler-test.ipynb diff --git a/.gitignore b/.gitignore index f720a9d5e..e75b8a61d 100644 --- a/.gitignore +++ b/.gitignore @@ -31,4 +31,3 @@ examples/example.json examples/random.csv data.json .pytest_cache/README.md -examples/order_products__prior.csv diff --git a/.travis.yml b/.travis.yml index d7ed0b820..7754106a9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,7 +21,7 @@ jdk: - oraclejdk8 script: - - py.test -v --ignore=optimus/dl/ --ignore=tests/test_dl.py + - py.test -v --ignore=optimus/dl/ #deploy: # 
provider: pypi diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 6c28df639..000000000 --- a/MANIFEST.in +++ /dev/null @@ -1,2 +0,0 @@ -include optimus/templates/* -include optimus/profiler/templates/* \ No newline at end of file diff --git a/README.md b/README.md index f623f93c0..71d2b4701 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ If you want to load from a URL you just need to use load.url() with the path and ```python df = op.load.url("https://raw.githubusercontent.com/ironmussa/Optimus/feature/load_save_improvements/examples/data/foo.json", "json") ``` -## Cleaning and Processing +## Data loading, cleaning and processing Optimus V2 was created to make data cleaning a breeze. The API was designed to be super easy to newcomers and very familiar for people that comes from Pandas. Optimus expand the Spark DataFrame functionality adding .rows and .cols attributes. diff --git a/docs/source/conf.py b/docs/source/conf.py index b2e63c4f8..566770848 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -60,7 +60,7 @@ # The short X.Y version. version = '2.0' # The full version, including alpha/beta/rc tags. -release = '2.0.6' +release = '2.0.4' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/examples/new-api-column.ipynb b/examples/new-api-column.ipynb index c860cf37c..8c3d52a8e 100644 --- a/examples/new-api-column.ipynb +++ b/examples/new-api-column.ipynb @@ -2016,13 +2016,13 @@ " \n", " \n", " \n", - "
filter
\n", + "
num 2
\n", "
1 (string)
\n", "\n", " \n", " \n", " \n", - "
two strings
\n", + "
words
\n", "
2 (string)
\n", "\n", " \n", @@ -2040,13 +2040,13 @@ " \n", " \n", " \n", - "
words
\n", + "
filter
\n", "
5 (string)
\n", "\n", " \n", " \n", " \n", - "
num 2
\n", + "
two strings
\n", "
6 (string)
\n", "\n", " \n", @@ -2059,11 +2059,11 @@ " \n", " \n", " \n", - " a\n", + " 1\n", " \n", " \n", " \n", - " cat-car\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", " \n", " \n", " \n", @@ -2075,11 +2075,11 @@ " \n", " \n", " \n", - " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " a\n", " \n", " \n", " \n", - " 1\n", + " cat-car\n", " \n", " \n", " \n", @@ -2087,11 +2087,11 @@ " \n", " \n", " \n", - " b\n", + " 2\n", " \n", " \n", " \n", - " dog-tv\n", + " ⸱⸱⸱⸱zombies\n", " \n", " \n", " \n", @@ -2103,11 +2103,11 @@ " \n", " \n", " \n", - " ⸱⸱⸱⸱zombies\n", + " b\n", " \n", " \n", " \n", - " 2\n", + " dog-tv\n", " \n", " \n", " \n", @@ -2115,11 +2115,11 @@ " \n", " \n", " \n", - " 1\n", + " 3\n", " \n", " \n", " \n", - " eagle-tv-plus\n", + " simpsons⸱⸱⸱cat⸱lady\n", " \n", " \n", " \n", @@ -2131,11 +2131,11 @@ " \n", " \n", " \n", - " simpsons⸱⸱⸱cat⸱lady\n", + " 1\n", " \n", " \n", " \n", - " 3\n", + " eagle-tv-plus\n", " \n", " \n", " \n", @@ -2143,11 +2143,11 @@ " \n", " \n", " \n", - " c\n", + " 4\n", " \n", " \n", " \n", - " lion-pc\n", + " None\n", " \n", " \n", " \n", @@ -2159,11 +2159,11 @@ " \n", " \n", " \n", - " None\n", + " c\n", " \n", " \n", " \n", - " 4\n", + " lion-pc\n", " \n", " \n", " \n", @@ -3515,25 +3515,27 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 19, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" + "ename": "TypeError", + "evalue": "unhashable type: 'VectorUDT'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinalg\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mVectors\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcols\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"col_int\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mVectors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/optimus/optimus/helpers/decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mwraps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 26\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 27\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0msetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda/lib/python3.6/site-packages/multipledispatch/dispatcher.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 276\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_cache\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtypes\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 277\u001b[0m 
\u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 278\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 279\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mMDNotImplementedError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/optimus/optimus/dataframe/columns.py\u001b[0m in \u001b[0;36mcast\u001b[0;34m(columns, dtype)\u001b[0m\n\u001b[1;32m 311\u001b[0m \u001b[0mattrs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 312\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 313\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_cast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcols\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattrs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 314\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 315\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0madd_attr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcols\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/optimus/optimus/dataframe/columns.py\u001b[0m in \u001b[0;36m_cast\u001b[0;34m(cols, args)\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[0mfunc_return_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreturn_type\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[0mattrs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 
275\u001b[0;31m func_type=func_type, verbose=False)\n\u001b[0m\u001b[1;32m 276\u001b[0m )\n\u001b[1;32m 277\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/optimus/optimus/functions.py\u001b[0m in \u001b[0;36mabstract_udf\u001b[0;34m(col, func, func_return_type, attrs, func_type, verbose)\u001b[0m\n\u001b[1;32m 41\u001b[0m .format(func_type=func_type, column=col, func_name=func.__name__))\n\u001b[1;32m 42\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 43\u001b[0;31m \u001b[0mdf_func\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc_factory\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc_return_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 44\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mattrs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/optimus/optimus/functions.py\u001b[0m in \u001b[0;36mfunc_factory\u001b[0;34m(func_type, func_return_type)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;31m# if func_return_type is not None:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m \u001b[0mfunc_return_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_spark_dtypes_object\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc_return_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 57\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mpandas_udf_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mattr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mfunc\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/optimus/optimus/helpers/functions.py\u001b[0m in \u001b[0;36mget_spark_dtypes_object\u001b[0;34m(value)\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mval_to_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 50\u001b[0;31m \u001b[0mdata_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mSPARK_DTYPES_DICT_OBJECTS\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mSPARK_SHORT_DTYPES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 51\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/optimus/optimus/helpers/functions.py\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mval_to_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 50\u001b[0;31m \u001b[0mdata_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mSPARK_DTYPES_DICT_OBJECTS\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mSPARK_SHORT_DTYPES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m 
\u001b[0mvalue\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 51\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: unhashable type: 'VectorUDT'" ] - }, - { - "data": { - "text/plain": [ - "DataFrame[words: string, num: int, animals: string, thing: string, two strings: string, filter: string, num 2: string, col_array: array, col_int: vector, new_col_1: int]" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ diff --git a/examples/new-api-profiler-test.ipynb b/examples/new-api-profiler-test.ipynb deleted file mode 100644 index 3fe92a02f..000000000 --- a/examples/new-api-profiler-test.ipynb +++ /dev/null @@ -1,2958 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "sys.path.append(\"..\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Just check that Spark and all necessary environments vars are present...\n", - "-----\n", - "SPARK_HOME=C:\\opt\\spark\\spark-2.3.1-bin-hadoop2.7\n", - "HADOOP_HOME=C:\\opt\\spark\\spark-2.3.1-bin-hadoop2.7\n", - "You don't have PYSPARK_PYTHON set\n", - "You don't have PYSPARK_DRIVER_PYTHON set\n", - "JAVA_HOME=C:\\Program Files\\Java\\jdk1.8.0_181\n", - "Pyarrow Installed\n", - "-----\n", - "Starting or getting SparkSession and SparkContext...\n", - "\n", - " ____ __ _ \n", - " / __ \\____ / /_(_)___ ___ __ _______\n", - " / / / / __ \\/ __/ / __ `__ \\/ / / / ___/\n", - " / /_/ / /_/ / /_/ / / / / / / /_/ (__ ) 
\n", - " \\____/ .___/\\__/_/_/ /_/ /_/\\__,_/____/ \n", - " /_/ \n", - " \n", - "Transform and Roll out...\n", - "Setting checkpoint folder local. If you are in a cluster initialize Optimus with master='your_ip' as param\n", - "Deleting previous folder if exists...\n", - "Creating the checkpoint directory...\n", - "Optimus successfully imported. Have fun :).\n" - ] - } - ], - "source": [ - "# Create optimus\n", - "from optimus import Optimus\n", - "op = Optimus(master=\"local[*]\", app_name = \"optimus\" ,verbose =True, checkpoint= True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Benchmark " - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "df = op.load.csv(\"C:\\\\Users\\\\argenisleon\\\\Desktop\\\\order_products__prior.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
Viewing 100 of 32434489 rows / 4 columns
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", 
- " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
order_id
\n", - "
1 (int)
\n", - "\n", - "
\n", - "
product_id
\n", - "
2 (int)
\n", - "\n", - "
\n", - "
add_to_cart_order
\n", - "
3 (int)
\n", - "\n", - "
\n", - "
reordered
\n", - "
4 (int)
\n", - "\n", - "
\n", - " 2\n", - " \n", - " 33120\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 28985\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 9327\n", - " \n", - " 3\n", - " \n", - " 0\n", - "
\n", - " 2\n", - " \n", - " 45918\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 30035\n", - " \n", - " 5\n", - " \n", - " 0\n", - "
\n", - " 2\n", - " \n", - " 17794\n", - " \n", - " 6\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 40141\n", - " \n", - " 7\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 1819\n", - " \n", - " 8\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 43668\n", - " \n", - " 9\n", - " \n", - " 0\n", - "
\n", - " 3\n", - " \n", - " 33754\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 24838\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 17704\n", - " \n", - " 3\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 21903\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 17668\n", - " \n", - " 5\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 46667\n", - " \n", - " 6\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 17461\n", - " \n", - " 7\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 32665\n", - " \n", - " 8\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 46842\n", - " \n", - " 1\n", - " \n", - " 0\n", - "
\n", - " 4\n", - " \n", - " 26434\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 39758\n", - " \n", - " 3\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 27761\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 10054\n", - " \n", - " 5\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 21351\n", - " \n", - " 6\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 22598\n", - " \n", - " 7\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 34862\n", - " \n", - " 8\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 40285\n", - " \n", - " 9\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 17616\n", - " \n", - " 10\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 25146\n", - " \n", - " 11\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 32645\n", - " \n", - " 12\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 41276\n", - " \n", - " 13\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 13176\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 15005\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 47329\n", - " \n", - " 3\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 27966\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 23909\n", - " \n", - " 5\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 48370\n", - " \n", - " 6\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 13245\n", - " \n", - " 7\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 9633\n", - " \n", - " 8\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 27360\n", - " \n", - " 9\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 6348\n", - " \n", - " 10\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 40878\n", - " \n", - " 11\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 6184\n", - " \n", - " 12\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 48002\n", - " \n", - " 13\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 20914\n", - " \n", - " 14\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 37011\n", - " \n", - " 15\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 12962\n", - " \n", - " 16\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 45698\n", - " \n", - " 17\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 24773\n", - " \n", - " 18\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 18569\n", - " \n", - " 19\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 41176\n", - " \n", - " 20\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 48366\n", - " \n", - " 21\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 47209\n", - " \n", - " 22\n", - " \n", - " 0\n", - "
\n", - " 5\n", - " \n", - " 46522\n", - " \n", - " 23\n", - " \n", - " 0\n", - "
\n", - " 5\n", - " \n", - " 38693\n", - " \n", - " 24\n", - " \n", - " 0\n", - "
\n", - " 5\n", - " \n", - " 48825\n", - " \n", - " 25\n", - " \n", - " 0\n", - "
\n", - " 5\n", - " \n", - " 8479\n", - " \n", - " 26\n", - " \n", - " 0\n", - "
\n", - " 6\n", - " \n", - " 40462\n", - " \n", - " 1\n", - " \n", - " 0\n", - "
\n", - " 6\n", - " \n", - " 15873\n", - " \n", - " 2\n", - " \n", - " 0\n", - "
\n", - " 6\n", - " \n", - " 41897\n", - " \n", - " 3\n", - " \n", - " 0\n", - "
\n", - " 7\n", - " \n", - " 34050\n", - " \n", - " 1\n", - " \n", - " 0\n", - "
\n", - " 7\n", - " \n", - " 46802\n", - " \n", - " 2\n", - " \n", - " 0\n", - "
\n", - " 8\n", - " \n", - " 23423\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 21405\n", - " \n", - " 1\n", - " \n", - " 0\n", - "
\n", - " 9\n", - " \n", - " 47890\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 11182\n", - " \n", - " 3\n", - " \n", - " 0\n", - "
\n", - " 9\n", - " \n", - " 2014\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 29193\n", - " \n", - " 5\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 34203\n", - " \n", - " 6\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 14992\n", - " \n", - " 7\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 31506\n", - " \n", - " 8\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 23288\n", - " \n", - " 9\n", - " \n", - " 0\n", - "
\n", - " 9\n", - " \n", - " 44533\n", - " \n", - " 10\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 18362\n", - " \n", - " 11\n", - " \n", - " 0\n", - "
\n", - " 9\n", - " \n", - " 27366\n", - " \n", - " 12\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 432\n", - " \n", - " 13\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 3990\n", - " \n", - " 14\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 14183\n", - " \n", - " 15\n", - " \n", - " 0\n", - "
\n", - " 10\n", - " \n", - " 24852\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 10\n", - " \n", - " 4796\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 10\n", - " \n", - " 31717\n", - " \n", - " 3\n", - " \n", - " 0\n", - "
\n", - " 10\n", - " \n", - " 47766\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 10\n", - " \n", - " 4605\n", - " \n", - " 5\n", - " \n", - " 1\n", - "
\n", - " 10\n", - " \n", - " 1529\n", - " \n", - " 6\n", - " \n", - " 0\n", - "
\n", - " 10\n", - " \n", - " 21137\n", - " \n", - " 7\n", - " \n", - " 1\n", - "
\n", - " 10\n", - " \n", - " 22122\n", - " \n", - " 8\n", - " \n", - " 1\n", - "
\n", - " 10\n", - " \n", - " 34134\n", - " \n", - " 9\n", - " \n", - " 1\n", - "
\n", - " 10\n", - " \n", - " 27156\n", - " \n", - " 10\n", - " \n", - " 0\n", - "
\n", - " 10\n", - " \n", - " 14992\n", - " \n", - " 11\n", - " \n", - " 0\n", - "
\n", - " 10\n", - " \n", - " 49235\n", - " \n", - " 12\n", - " \n", - " 1\n", - "
\n", - " 10\n", - " \n", - " 26842\n", - " \n", - " 13\n", - " \n", - " 0\n", - "
\n", - " 10\n", - " \n", - " 3464\n", - " \n", - " 14\n", - " \n", - " 0\n", - "
\n", - " 10\n", - " \n", - " 25720\n", - " \n", - " 15\n", - " \n", - " 0\n", - "
\n", - " 11\n", - " \n", - " 30162\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 11\n", - " \n", - " 27085\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 11\n", - " \n", - " 5994\n", - " \n", - " 3\n", - " \n", - " 1\n", - "
\n", - " 11\n", - " \n", - " 1313\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 11\n", - " \n", - " 31506\n", - " \n", - " 5\n", - " \n", - " 1\n", - "
\n", - " 12\n", - " \n", - " 30597\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 12\n", - " \n", - " 15221\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 12\n", - " \n", - " 43772\n", - " \n", - " 3\n", - " \n", - " 1\n", - "
\n", - "\n", - "
Viewing 100 of 32434489 rows / 4 columns
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df.table()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Processing column 'product_id'...\n", - "percentile\n", - "12.428871233101177\n", - "percentile\n", - "13.294262981479164\n", - "percentile\n", - "12.101534748881022\n", - "Using 'column_exp' to process column 'product_id_buckets' with function _bucketizer\n", - "bucketizer\n", - "0.27864982148264517\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "

Overview

\n", - "
\n", - "
\n", - "
\n", - "

Dataset info

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "
Number of columns4
Number of rows32434489
Total Missing (%)0.0%
Total size in memory58.9MiB
\n", - "
\n", - "
\n", - "

Variables types

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "
Categorical0
Numeric1
Date0
Bool
Not available0
\n", - "
\n", - "
\n", - "\n", - "
\n", - "
\n", - "\n", - " \n", - "\n", - "
\n", - "
\n", - "

product_id

\n", - "
numeric
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Distinct count 45888
Unique (%) 0.141
Missing (%)0
Missing (n)0.0
\n", - "
\n", - "

\n", - " Datatypes\n", - "

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "
\n", - " String\n", - " \n", - " 0\n", - "
\n", - " Integer\n", - " \n", - " 32434489\n", - "
\n", - " Float\n", - " \n", - " 0\n", - "
\n", - " Bool\n", - " \n", - " 0\n", - "
\n", - " Date\n", - " \n", - " 0\n", - "
\n", - " Missing\n", - " \n", - " 0\n", - "
\n", - " Null\n", - " \n", - " 0\n", - "
\n", - " \n", - "
\n", - "

\n", - " Basic Stats\n", - "

\n", - "\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
Mean25576.337535424096
Minimum1
Maximum49688
Zeros(%)0
\n", - " \n", - "\n", - "
\n", - "
\n", - "

Frequency

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ValueCountFrecuency (%)
248524725651.457%
131763794501.17%
211372646830.816%
219032419210.746%
472092135840.659%
477661768150.545%
476261526570.471%
167971429510.441%
262091406270.434%
278451379050.425%
\"Missing\"00.0%
\n", - "
\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "

Quantile statistics

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Minimum1
5-th percentile1.0
Q11.0
Median1.0
Q349688.0
95-th percentile49688.0
Maximum49688
Range49687.0
Interquartile range49687.0
\n", - "
\n", - "
\n", - "

Descriptive statistics

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Standard deviation14096.689090257127
Coef of variation0.55116
Kurtosis-1.1408165030229254
Mean25576.337535424096
MAD0.0
Skewness0
Sum829555438453
Variance198716643.3073743
\n", - "
\n", - " \n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "
\n", - "\n", - "
\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "\n", - "
\n", - "
\n", - "\n", - "\n", - "\n", - "\n", - "
Viewing 10 of 32434489 rows / 4 columns
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
order_id
\n", - "
1 (int)
\n", - "\n", - "
\n", - "
product_id
\n", - "
2 (int)
\n", - "\n", - "
\n", - "
add_to_cart_order
\n", - "
3 (int)
\n", - "\n", - "
\n", - "
reordered
\n", - "
4 (int)
\n", - "\n", - "
\n", - " 2\n", - " \n", - " 33120\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 28985\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 9327\n", - " \n", - " 3\n", - " \n", - " 0\n", - "
\n", - " 2\n", - " \n", - " 45918\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 30035\n", - " \n", - " 5\n", - " \n", - " 0\n", - "
\n", - " 2\n", - " \n", - " 17794\n", - " \n", - " 6\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 40141\n", - " \n", - " 7\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 1819\n", - " \n", - " 8\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 43668\n", - " \n", - " 9\n", - " \n", - " 0\n", - "
\n", - " 3\n", - " \n", - " 33754\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - "\n", - "
Viewing 10 of 32434489 rows / 4 columns
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "161.76729593380855" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import timeit\n", - "start_time = timeit.default_timer()\n", - "op.profiler.run(df, \"product_id\", relative_error=0.5)\n", - "timeit.default_timer() - start_time" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[1.0, 1.0, 1.0, 49688.0, 49688.0]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.approxQuantile(\"product_id\", [0.05, 0.25, 0.5, 0.75, 0.95], 0.5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/new-api-profiler.ipynb b/examples/new-api-profiler.ipynb index 19c2c8b0c..dcb89d9e4 100644 --- a/examples/new-api-profiler.ipynb +++ b/examples/new-api-profiler.ipynb @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -74,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 137, "metadata": { "scrolled": false }, @@ -630,7 +630,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 150, "metadata": { "scrolled": false }, @@ -641,7 +641,34 @@ "text": [ "Processing column 'name'...\n", "Using 'column_exp' to process column 
'name_len' with function func_col_exp\n", - "Using 'column_exp' to process column 'name_len_buckets' with function _bucketizer\n" + "Using 'column_exp' to process column 'name_len_buckets' with function _bucketizer\n", + "Processing column 'id'...\n", + "Using 'column_exp' to process column 'id_buckets' with function _bucketizer\n", + "Processing column 'nametype'...\n", + "Using 'column_exp' to process column 'nametype_len' with function func_col_exp\n", + "Using 'column_exp' to process column 'nametype_len_buckets' with function _bucketizer\n", + "Processing column 'recclass'...\n", + "Using 'column_exp' to process column 'recclass_len' with function func_col_exp\n", + "Using 'column_exp' to process column 'recclass_len_buckets' with function _bucketizer\n", + "Processing column 'mass (g)'...\n", + "Using 'column_exp' to process column 'mass (g)_buckets' with function _bucketizer\n", + "Processing column 'fall'...\n", + "Using 'column_exp' to process column 'fall_len' with function func_col_exp\n", + "Using 'column_exp' to process column 'fall_len_buckets' with function _bucketizer\n", + "Processing column 'year'...\n", + "Using 'pandas_udf' to process column 'year' with function infer_date\n", + "Using 'column_exp' to process column 'year_0_buckets' with function _bucketizer\n", + "Using 'column_exp' to process column 'year_1_buckets' with function _bucketizer\n", + "Using 'column_exp' to process column 'year_2_buckets' with function _bucketizer\n", + "Using 'column_exp' to process column 'year_3_buckets' with function _bucketizer\n", + "Using 'column_exp' to process column 'year_4_buckets' with function _bucketizer\n", + "Processing column 'reclat'...\n", + "Using 'column_exp' to process column 'reclat_buckets' with function _bucketizer\n", + "Processing column 'reclong'...\n", + "Using 'column_exp' to process column 'reclong_buckets' with function _bucketizer\n", + "Processing column 'GeoLocation'...\n", + "Using 'column_exp' to process column 
'GeoLocation_len' with function func_col_exp\n", + "Using 'column_exp' to process column 'GeoLocation_len_buckets' with function _bucketizer\n" ] }, { @@ -695,7 +722,7 @@ " \n", " \n", " Total size in memory\n", - " 44.6MiB\n", + " 142.0MiB\n", "\n", " \n", " \n", @@ -707,17 +734,17 @@ " \n", " \n", " Categorical\n", - " 1\n", + " 5\n", "\n", " \n", " \n", " Numeric\n", - " 0\n", + " 4\n", "\n", " \n", " \n", " Date\n", - " 0\n", + " 1\n", "\n", " \n", " \n", @@ -977,2350 +1004,2701 @@ "\n", " \n", "\n", "\n", + "
\n", + "
\n", "\n", + " \n", "\n", + "
\n", + "
\n", + "

id

\n", + "
numeric
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Distinct count 42365
Unique (%) 92.67
Missing (%)0
Missing (n)0.0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", "\n", - "
Viewing 10 of 45716 rows / 10 columns
\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 0\n", + "
\n", + " Integer\n", + " \n", + " 45716\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "
\n", + "

\n", + " Basic Stats\n", + "

\n", "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + "\n", " \n", - " \n", + "
\n", + "\n", + "\n", + "

Quantile statistics

\n", + "
\n", - "
name
\n", - "
1 (string)
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", "\n", - " \n", - " \n", - " \n", + "
Mean26889.73510368361
Minimum1
Maximum57458
Zeros(%)0
\n", - "
id
\n", - "
2 (int)
\n", + "
\n", + " \n", "\n", - "
\n", - "
nametype
\n", - "
3 (string)
\n", + " \n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", "\n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", "\n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", "\n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", "\n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", "\n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", "\n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", "\n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", "\n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
5745810.002%
\n", - "
recclass
\n", - "
4 (string)
\n", + " \n", + "
5745710.002%
\n", - "
mass (g)
\n", - "
5 (double)
\n", + " \n", + "
5745610.002%
\n", - "
fall
\n", - "
6 (string)
\n", + " \n", + "
5745510.002%
\n", - "
year
\n", - "
7 (string)
\n", + " \n", + "
5745410.002%
\n", - "
reclat
\n", - "
8 (double)
\n", + " \n", + "
5745310.002%
\n", - "
reclong
\n", - "
9 (double)
\n", + " \n", + "
5743610.002%
\n", - "
GeoLocation
\n", - "
10 (string)
\n", + " \n", + "
5743510.002%
5743410.002%
\n", - " Aachen\n", - " \n", - " 1\n", - " \n", - " Valid\n", - " \n", - " L5\n", - " \n", - " 21.0\n", - " \n", - " Fell\n", - " \n", - " 01/01/1880⸱12:00:00⸱AM\n", - " \n", - " 50.775\n", - " \n", - " 6.08333\n", - " \n", - " (50.775000,⸱6.083330)\n", - "
\n", - " Aarhus\n", - " \n", - " 2\n", - " \n", - " Valid\n", - " \n", - " H6\n", - " \n", - " 720.0\n", - " \n", - " Fell\n", - " \n", - " 01/01/1951⸱12:00:00⸱AM\n", - " \n", - " 56.18333\n", - " \n", - " 10.23333\n", - " \n", - " (56.183330,⸱10.233330)\n", - "
\n", - " Abee\n", - " \n", - " 6\n", - " \n", - " Valid\n", - " \n", - " EH4\n", - " \n", - " 107000.0\n", - "
5743310.002%
\"Missing\"00.0%
\n", + "
\n", " \n", - "
\n", - " Fell\n", - " \n", - " 01/01/1952⸱12:00:00⸱AM\n", - "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Minimum1
5-th percentile2434.0
Q112688.0
Median24261.0
Q340656.0
95-th percentile54892.0
Maximum57458
Range57457.0
Interquartile range27968.0
\n", + "
\n", + "
\n", + "

Descriptive statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Standard deviation16860.68303027627
Coef of variation0.62703
Kurtosis-1.1602608393254032
Mean26889.73510368361
MAD13263.0
Skewness0
Sum1229291130
Variance284282632.2474462
\n", + "
\n", " \n", - " \n", - " 54.21667\n", - " \n", + "
\n", + " \n", " \n", - " \n", + " \n", + "\n", + " \n", + " \n", " \n", - " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + "\n", + " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", + "\n", + "
\n", - " -113.0\n", - "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", - " (54.216670,⸱-113.000000)\n", - "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", - " Acapulco\n", - " \n", - " 10\n", - " \n", - " Valid\n", - " \n", - " Acapulcoite\n", - " \n", - " 1914.0\n", - " \n", - " Fell\n", - "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", " \n", - " \n", - " 01/01/1976⸱12:00:00⸱AM\n", - " \n", - " \n", - " \n", - " 16.88333\n", - " \n", - " \n", - " \n", - " -99.9\n", - " \n", - " \n", - " \n", - " (16.883330,⸱-99.900000)\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " Achiras\n", - " \n", - " \n", - " \n", - " 370\n", - " \n", - " \n", - " \n", - " Valid\n", - " \n", - " \n", - " \n", - " L6\n", - " \n", - " \n", - " \n", - " 780.0\n", - " \n", - " \n", - " \n", - " Fell\n", - " \n", - " \n", - " \n", - " 01/01/1902⸱12:00:00⸱AM\n", - " \n", - " \n", - " \n", - " -33.16667\n", - " \n", - " \n", - " \n", - " -64.95\n", - " \n", - " \n", - " \n", - " (-33.166670,⸱-64.950000)\n", - " \n", + "\n", + "
\n", + "
\n", + "

nametype

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Distinct count 2
Unique (%) 0.004
Missing (%)0
Missing (n)0.0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 45716\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
Valid4564199.836%
Relict750.164%
\"Missing\"00.0%
\n", + "
\n", " \n", - " \n", - " \n", - " \n", + "\n", " \n", - " \n", - " Adhi⸱Kot\n", - " \n", + "
\n", + " \n", " \n", - " \n", + " \n", + "\n", + " \n", + " \n", " \n", - " \n", " \n", - " \n", + " \n", + " \n", + "\n", + " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", + "\n", + "
\n", - " 379\n", - "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", - " Valid\n", - " \n", - " EH4\n", - "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", - " 4239.0\n", - " \n", - " Fell\n", - " \n", - " 01/01/1919⸱12:00:00⸱AM\n", - " \n", - " 32.1\n", - " \n", - " 71.8\n", - " \n", - " (32.100000,⸱71.800000)\n", - "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " Adzhi-Bogdo⸱(stone)\n", - " \n", - " \n", - " \n", - " 390\n", - " \n", - " \n", - " \n", - " Valid\n", - " \n", - " \n", - " \n", - " LL3-6\n", - " \n", - " \n", - " \n", - " 910.0\n", - " \n", - " \n", - " \n", - " Fell\n", - " \n", + "\n", + "
\n", + "
\n", + "

recclass

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Distinct count 444
Unique (%) 0.971
Missing (%)0
Missing (n)0.0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 45716\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
L6828518.123%
H5714215.623%
L5479610.491%
H645289.905%
H442119.211%
LL527666.05%
LL620434.469%
L412532.741%
H4/54280.936%
CM24160.91%
\"Missing\"00.0%
\n", + "
\n", " \n", - " \n", - " 01/01/1949⸱12:00:00⸱AM\n", - " \n", + "\n", " \n", - " \n", - " 44.83333\n", - " \n", + "
\n", + " \n", " \n", - " \n", + " \n", + "\n", + " \n", + " \n", " \n", - " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + "\n", + " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", + "\n", + "
\n", - " 95.16667\n", - "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", - " (44.833330,⸱95.166670)\n", - "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", - " Agen\n", - " \n", - " 392\n", - " \n", - " Valid\n", - " \n", - " H5\n", - " \n", - " 30000.0\n", - " \n", - " Fell\n", - "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", " \n", - " \n", - " 01/01/1814⸱12:00:00⸱AM\n", - " \n", + "\n", + "
\n", + "
\n", + "

mass (g)

\n", + "
numeric
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Distinct count 12515
Unique (%) 27.376
Missing (%)131
Missing (n)0.29
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 0\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 45716\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "
\n", + "

\n", + " Basic Stats\n", + "

\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
Mean13278.078548580497
Minimum0.0
Maximum60000000.0
Zeros(%)19
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
1.29999995231628421710.374%
1.20000004768371581400.306%
1.3999999761581421380.302%
None1310.287%
2.09999990463256841300.284%
2.40000009536743161260.276%
1.6000000238418581200.262%
0.51190.26%
1.1000000238418581160.254%
3.7999999523162841140.249%
\"Missing\"1310.29%
\n", + "
\n", " \n", - " \n", - " 44.21667\n", - " \n", + "\n", " \n", - " \n", - " 0.61667\n", - " \n", + "
\n", + "\n", + "\n", + "

Quantile statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Minimum0.0
5-th percentile1.100000023841858
Q17.199999809265137
Median32.599998474121094
Q3202.60000610351562
95-th percentile4000.0
Maximum60000000.0
Range60000000.0
Interquartile range195.4000062942505
\n", + "
\n", + "
\n", + "

Descriptive statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Standard deviation574988.8764104772
Coef of variation43.30362
Kurtosis6796.17060791067
Mean13278.078548580497
MAD30.5
Skewness19
Sum605281210.6370419
Variance330612207995.783
\n", + "
\n", " \n", - " \n", - " (44.216670,⸱0.616670)\n", - " \n", + "
\n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + "\n", + " \n", + " \n", " \n", - " \n", " \n", - " \n", + " \n", + " \n", + "\n", + " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", - " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", - " Aguada\n", - " \n", - " 398\n", - "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", - " Valid\n", - " \n", - " L6\n", - " \n", - " 1620.0\n", - " \n", - " Fell\n", - " \n", - " 01/01/1930⸱12:00:00⸱AM\n", - " \n", - " -31.6\n", - " \n", - " -65.23333\n", - "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", " \n", - " \n", - " (-31.600000,⸱-65.233330)\n", - " \n", + "\n", + "
\n", + "
\n", + "

fall

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Distinct count 2
Unique (%) 0.004
Missing (%)0
Missing (n)0.0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 45716\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
Found4460997.579%
Fell11072.421%
\"Missing\"00.0%
\n", + "
\n", " \n", - " \n", - " \n", - " \n", + "\n", " \n", - " \n", - " Aguila⸱Blanca\n", - " \n", + "
\n", + " \n", " \n", - " \n", + " \n", + "\n", + " \n", + " \n", " \n", - " \n", " \n", - " \n", + " \n", + " \n", + "\n", + " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", + "\n", + "
\n", - " 417\n", - "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", - " Valid\n", - " \n", - " L\n", - "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", - " 1440.0\n", - " \n", - " Fell\n", - " \n", - " 01/01/1920⸱12:00:00⸱AM\n", - " \n", - " -30.86667\n", - " \n", - " -64.55\n", - " \n", - " (-30.866670,⸱-64.550000)\n", - "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", " \n", - " \n", - " \n", - " \n", - "\n", "\n", - "
Viewing 10 of 45716 rows / 10 columns
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "op.profiler.run(df, \"name\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Plot profile for a specific column" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'timeit' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mstart_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"reclat\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mstart_time\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mNameError\u001b[0m: name 'timeit' is not defined" - ] - } - ], - "source": [ - "start_time = timeit.default_timer()\n", - "Profiler.columns(df, \"reclat\")\n", - "timeit.default_timer() - start_time" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Output a json file" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Plot histagram for multiple columns" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": 
"stream", - "text": [ - "Using 'column_exp' to process column 'id_buckets' with function _bucketizer\n", - "Using 'column_exp' to process column 'reclong_buckets' with function _bucketizer\n" - ] - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAA2gAAAEHCAYAAADbK2WbAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAGi9JREFUeJzt3X2wZVdZJ+Dfa0LQIUiCaaiYDzto6ximxhB7QhgsB4mTD6ITrIKqoGMCYsXSYKnjlNXoKCDDVJhSUEqME02GoEjIoEhL4sRMxKGYIiEdDZAQYprQkDYxac0XiAYT3vnjrMZD53b37e7b9+7b93mqdp291157n7VzVnLyu2vvdaq7AwAAwMr7mpVuAAAAADMCGgAAwEQIaAAAABMhoAEAAEyEgAYAADARAhoAAMBECGgALKuqur2qXrTS7VguVXViVX2hqg7bzf7XV9XvLXe7AJgmAQ2AJVNV26rqe3cpe2VVfXjndnc/t7v/fC/nWV9VXVWHH6SmHlTjmt+RJN39ue4+srufWMRx66tq28FuHwDTJaABsOas1uAHwKFPQANgWc2PslXVaVW1paoerar7q+oto9qHxuvD4/bAF1TV11TVf6mqz1bVA1X1zqp6xtx5Lxj7/q6qfnGX93l9Vb23qn6vqh5N8srx3h+pqoer6r6q+o2qOmLufF1VP1FVd1XV56vqjVX1zeOYR6vq6vn6e7jerxoNrKqTqur/jnNen+SYpfknC8ChQEADYCX9epJf7+6vT/LNSa4e5d89Xo8atwd+JMkrx/I9SZ6T5Mgkv5EkVXVykt9M8kNJjk3yjCTH7fJe5yV5b5KjkrwryRNJfiazgPSCJGck+Yldjjk7yXcmOT3JzyW5bLzHCUn+VZJXLHRR3f2O7n7lbq7595PcMt73jUkunDtuW3ev381xAKwBAhoAS+2PxqjUw1X1cGbBaXf+Kcm3VNUx3f2F7r5xD3V/KMlbuvvu7v5CktcmOX+MTL0syR9394e7+0tJfilJ73L8R7r7j7r7y939D919S3ff2N2Pd/e2JP8jyb/b5Zg3d/ej3X17ktuS/Ol4/0eS/EmS5y3uH8lMVZ2Y5N8k+cXufqy7P5Tkj/flHAAc2gQ0AJbaS7v7qJ1LnjwqNe/VSb41yaeq6uaq+r491P3GJJ+d2/5sksOTPHvsu2fnju7+YpK/2+X4e+Y3qupbq+oDVfU347bH/5Yn3254/9z6PyywfeQe2ru7a3iou/9+l+sAgCQCGgArqLvv6u5XJHlWkjcneW9VPS1PHv1KknuTfNPc9olJHs8sNN2X5PidO6rq65J8w65vt8v2pUk+lWTDuMXy55PU/l/NotyX5OhxjTudeJDfE4BVREADYMVU1X+sqnXd/eUkD4/iJ5LsSPLlzJ412+ndSX5mTLJxZGYjXu/p7scze7bs+6vq346JO96QvYetpyd5NMkXqupfJvnxJbuw3ejuzybZkuQNVXVEVX1Xku8/2O8LwOohoAGwks5OcntVfSGzCUPO7+5/HLcovinJ/xvPsp2e5Iokv5vZDI+fSfKPSX4yScYzYj+Z5KrMRqk+n+SBJI/t4b3/c5IfHHV/O8l7lv7yFvSDSZ6f5MEkr0vyzmV6XwBWgepe6C4SAFi9xgjbw5ndvviZlW4PACyWETQADglV9f1V9S/G812/kuQTSbatbKsAYN8IaAAcKs7LbCKRe5NsyOx2SbeJALCq7DWgVdXXVtVHq+pjVXV7Vb1hlJ9
UVTdV1V1V9Z7xUHaq6qlje+vYv37uXK8d5XdW1VkH66IAWHu6+0fH1P7P6O4zuvvOlW4TAOyrxYygPZbkxd39HUlOSXL2eFj7zUne2t0bkjyU2W/ZZLw+1N3fkuSto16q6uQk5yd5bmYPhf9mVR22lBcDAACwmu01oPXMF8bmU8bSSV6c2bTGSXJlkpeO9fPGdsb+M6qqRvlV3f3YeGB7a5LTluQqAAAADgGHL6bSGOm6Jcm3JHl7kk8neXj89kySbE9y3Fg/Lsk9SdLdj1fVI5n9WOhxSW6cO+38MQs65phjev369Yu6EAAAgOV2yy23/G13r1uq8y0qoHX3E0lOqaqjkrwvybcvVG28LvTDoL2H8q9SVRcluShJTjzxxGzZsmUxTQQAAFh2VfXZpTzfPs3i2N0PJ/nzJKcnOaqqdga84zObNSuZjYydkCRj/zMy+zHOr5QvcMz8e1zW3Ru7e+O6dUsWRAEAACZvMbM4rhsjZ6mqr0vyvUnuSPLBJC8b1S5M8v6xvnlsZ+z/szHN8eYk549ZHk/KbArkjy7VhQAAAKx2i7nF8dgkV47n0L4mydXd/YGq+mSSq6rqvyb5yySXj/qXJ/ndqtqa2cjZ+UnS3bdX1dVJPpnk8SQXj1snAQAASFJT/g3PjRs3tmfQAACAqaqqW7p741Kdb5+eQQMAAODgEdAAAAAmQkADAACYCAENAABgIhb1Q9UArA7rN12z0k1Ikmy75NyVbgIArEpG0AAAACbCCBpryhRGF3YdWZhim1gcnx0AsNQENOBJphg8ptgmAICl5hZHAACAiRDQAAAAJkJAAwAAmAgBDQAAYCIENAAAgIkQ0AAAACZCQAMAAJgIAQ0AAGAiBDQAAICJOHylGwAAACtl/aZrVroJ2XbJuSvdBCbECBoAAMBEGEED4KCawl+nE3+hBmB1MIIGAAAwEQIaAADARAhoAAAAEyGgAQAATISABgAAMBECGgAAwEQIaAAAABOx14BWVSdU1Qer6o6qur2qfmqUv76q/rqqbh3LS+aOeW1Vba2qO6vqrLnys0fZ1qradHAuCQAAYHVazA9VP57kZ7v7L6rq6Uluqarrx763dvevzFeuqpOTnJ/kuUm+Mcn/qapvHbvfnuTfJ9me5Oaq2tzdn1yKCwEAAFjt9hrQuvu+JPeN9c9X1R1JjtvDIecluaq7H0vymaramuS0sW9rd9+dJFV11agroAEAAGRxI2hfUVXrkzwvyU1JXpjkNVV1QZItmY2yPZRZeLtx7rDt+edAd88u5c/fr1bzVdZvumalm5Ak2XbJuSvdBAAAWNUWPUlIVR2Z5A+S/HR3P5rk0iTfnOSUzEbYfnVn1QUO7z2U7/o+F1XVlqrasmPHjsU2DwAAYNVbVECrqqdkFs7e1d1/mCTdfX93P9HdX07y2/nn2xi3Jzlh7vDjk9y7h/Kv0t2XdffG7t64bt26fb0eAACAVWsxszhWksuT3NHdb5krP3au2g8kuW2sb05yflU9tapOSrIhyUeT3JxkQ1WdVFVHZDaRyOaluQwAAIDVbzHPoL0wyQ8n+URV3TrKfj7JK6rqlMxuU9yW5MeSpLtvr6qrM5v84/EkF3f3E0lSVa9Jcl2Sw5Jc0d23L+G1AMCq5XliAJLFzeL44Sz8/Ni1ezjmTUnetED5tXs6DgAAYC1b9CQhAAAAHFz7NM0+AABwcLnleW0zggYAADARAhoAAMBECGgAAAATIaABAABMhIAGAAAwEQIaAADARAhoAAAAEyGgAQAATISABgAAMBGHr3QDAGC5rd90zUo3IUmy7ZJzV7oJAEyMgAYArCpTCNjCNXCwCGgAAMAeTeEPI8na+OOIZ9AAAAAmQkADAACYCAENAABgIgQ0AACAiRDQAAAAJkJAAwAAmAgBDQAAYCIENAAAgIkQ0AAAACZCQAMAAJiIw1e6ARy61m+6ZqWbkG2XnLvSTQAAgEUzggYAADARAhoAAMBE7DWgVdUJVfXBqrq
jqm6vqp8a5c+squur6q7xevQor6p6W1VtraqPV9Wpc+e6cNS/q6ouPHiXBQAAsPosZgTt8SQ/293fnuT0JBdX1clJNiW5obs3JLlhbCfJOUk2jOWiJJcms0CX5HVJnp/ktCSv2xnqAAAAWERA6+77uvsvxvrnk9yR5Lgk5yW5clS7MslLx/p5Sd7ZMzcmOaqqjk1yVpLru/vB7n4oyfVJzl7SqwEAAFjF9ukZtKpan+R5SW5K8uzuvi+ZhbgkzxrVjktyz9xh20fZ7sp3fY+LqmpLVW3ZsWPHvjQPAABgVVt0QKuqI5P8QZKf7u5H91R1gbLeQ/lXF3Rf1t0bu3vjunXrFts8AACAVW9RAa2qnpJZOHtXd//hKL5/3LqY8frAKN+e5IS5w49Pcu8eygEAAMjiZnGsJJcnuaO73zK3a3OSnTMxXpjk/XPlF4zZHE9P8si4BfK6JGdW1dFjcpAzRxkAAABJDl9EnRcm+eEkn6iqW0fZzye5JMnVVfXqJJ9L8vKx79okL0myNckXk7wqSbr7wap6Y5KbR71f7u4Hl+QqAAAADgF7DWjd/eEs/PxYkpyxQP1OcvFuznVFkiv2pYEAAABrxT7N4ggAAMDBI6ABAABMhIAGAAAwEQIaAADARAhoAAAAEyGgAQAATISABgAAMBECGgAAwEQIaAAAABMhoAEAAEyEgAYAADARAhoAAMBECGgAAAATIaABAABMhIAGAAAwEQIaAADARAhoAAAAEyGgAQAATISABgAAMBECGgAAwEQIaAAAABMhoAEAAEyEgAYAADARAhoAAMBECGgAAAATIaABAABMhIAGAAAwEXsNaFV1RVU9UFW3zZW9vqr+uqpuHctL5va9tqq2VtWdVXXWXPnZo2xrVW1a+ksBAABY3RYzgvaOJGcvUP7W7j5lLNcmSVWdnOT8JM8dx/xmVR1WVYcleXuSc5KcnOQVoy4AAADD4Xur0N0fqqr1izzfeUmu6u7HknymqrYmOW3s29rddydJVV016n5yn1sMAABwiDqQZ9BeU1UfH7dAHj3Kjktyz1yd7aNsd+VPUlUXVdWWqtqyY8eOA2geAADA6rLXEbTduDTJG5P0eP3VJD+SpBao21k4CPZCJ+7uy5JcliQbN25csA4AAKvP+k3XrHQTsu2Sc1e6CbBH+xXQuvv+netV9dtJPjA2tyc5Ya7q8UnuHeu7KwcAACD7eYtjVR07t/kDSXbO8Lg5yflV9dSqOinJhiQfTXJzkg1VdVJVHZHZRCKb97/ZAAAAh569jqBV1buTvCjJMVW1Pcnrkryoqk7J7DbFbUl+LEm6+/aqujqzyT8eT3Jxdz8xzvOaJNclOSzJFd19+5JfDQAAwCq2mFkcX7FA8eV7qP+mJG9aoPzaJNfuU+sAAADWkAOZxREAAIAlJKABAABMxP5Osw8AwGD6eGCpGEEDAACYCAENAABgIgQ0AACAiRDQAAAAJkJAAwAAmAgBDQAAYCJMsw8A7Jbp4wGWlxE0AACAiRDQAAAAJkJAAwAAmAgBDQAAYCIENAAAgIkQ0AAAACZCQAMAAJgIAQ0AAGAiBDQAAICJENAAAAAmQkADAACYCAENAABgIgQ0AACAiRDQAAAAJkJAAwAAmAgBDQAAYCL2GtCq6oqqeqCqbpsre2ZVXV9Vd43Xo0d5VdXbqmprVX28qk6dO+bCUf+uqrrw4FwOAADA6rWYEbR3JDl7l7JNSW7o7g1JbhjbSXJOkg1juSjJpcks0CV5XZLnJzktyet2hjoAAABm9hrQuvtDSR7cpfi8JFeO9SuTvHSu/J09c2OSo6rq2CRnJbm+ux/s7oeSXJ8nhz4AAIA1bX+fQXt2d9+XJOP1WaP8uCT3zNXbPsp2Vw4AAMCw1JOE1AJlvYfyJ5+g6qKq2lJVW3bs2LGkjQMAAJiy/Q1o949bFzNeHxjl25OcMFfv+CT37qH8Sbr7su7e2N0b161bt5/NAwAAWH32N6BtTrJzJsYLk7x
/rvyCMZvj6UkeGbdAXpfkzKo6ekwOcuYoAwAAYDh8bxWq6t1JXpTkmKrantlsjJckubqqXp3kc0lePqpfm+QlSbYm+WKSVyVJdz9YVW9McvOo98vdvevEIwAAAGvaXgNad79iN7vOWKBuJ7l4N+e5IskV+9Q6AACANWSpJwkBAABgPwloAAAAEyGgAQAATISABgAAMBECGgAAwEQIaAAAABMhoAEAAEyEgAYAADARAhoAAMBECGgAAAATIaABAABMhIAGAAAwEQIaAADARAhoAAAAEyGgAQAATISABgAAMBECGgAAwEQIaAAAABMhoAEAAEyEgAYAADARAhoAAMBECGgAAAATIaABAABMhIAGAAAwEQIaAADARAhoAAAAEyGgAQAATMQBBbSq2lZVn6iqW6tqyyh7ZlVdX1V3jdejR3lV1duqamtVfbyqTl2KCwAAADhULMUI2vd09yndvXFsb0pyQ3dvSHLD2E6Sc5JsGMtFSS5dgvcGAAA4ZByMWxzPS3LlWL8yyUvnyt/ZMzcmOaqqjj0I7w8AALAqHWhA6yR/WlW3VNVFo+zZ3X1fkozXZ43y45LcM3fs9lEGAABAksMP8PgXdve9VfWsJNdX1af2ULcWKOsnVZoFvYuS5MQTTzzA5gEAAKweBzSC1t33jtcHkrwvyWlJ7t956+J4fWBU357khLnDj09y7wLnvKy7N3b3xnXr1h1I8wAAAFaV/Q5oVfW0qnr6zvUkZya5LcnmJBeOahcmef9Y35zkgjGb4+lJHtl5KyQAAAAHdovjs5O8r6p2nuf3u/t/V9XNSa6uqlcn+VySl4/61yZ5SZKtSb6Y5FUH8N4AAACHnP0OaN19d5LvWKD875KcsUB5J7l4f98PAADgUHcwptkHAABgPwhoAAAAEyGgAQAATMSB/g4aAAATtH7TNSvdhGy75NyVbgKsOkbQAAAAJkJAAwAAmAgBDQAAYCIENAAAgIkQ0AAAACZCQAMAAJgIAQ0AAGAiBDQAAICJENAAAAAmQkADAACYCAENAABgIgQ0AACAiRDQAAAAJkJAAwAAmAgBDQAAYCIENAAAgIkQ0AAAACZCQAMAAJgIAQ0AAGAiBDQAAICJENAAAAAmQkADAACYCAENAABgIpY9oFXV2VV1Z1VtrapNy/3+AAAAU7WsAa2qDkvy9iTnJDk5ySuq6uTlbAMAAMBULfcI2mlJtnb33d39pSRXJTlvmdsAAAAwScsd0I5Lcs/c9vZRBgAAsOZVdy/fm1W9PMlZ3f2jY/uHk5zW3T85V+eiJBeNzW9LcueyNXDPjknytyvdCFacfoA+gD6APoA+wHwf+KbuXrdUJz58qU60SNuTnDC3fXySe+crdPdlSS5bzkYtRlVt6e6NK90OVpZ+gD6APoA+gD7AwewDy32L481JNlTVSVV1RJLzk2xe5jYAAABM0rKOoHX341X1miTXJTksyRXdfftytgEAAGCqlvsWx3T3tUmuXe73XQKTu+2SFaEfoA+gD6APoA9w0PrAsk4SAgAAwO4t9zNoAAAA7IaAtghVdXZV3VlVW6tq00q3hwNTVVdU1QNVddtc2TOr6vqqumu8Hj3Kq6reNj77j1fVqXPHXDjq31VVF86Vf2dVfWIc87aqquW9Qvamqk6oqg9W1R1VdXtV/dQo1w/WiKr62qr6aFV9bPSBN4zyk6rqpvF5vmdMaJWqeurY3jr2r58712tH+Z1VddZcue+OVaCqDquqv6yqD4xtfWANqapt47/Vt1bVllHmu2ANqaqjquq9VfWp8f8FL1jxPtDdlj0smU1m8ukkz0lyRJKPJTl5pdtlOaDP9LuTnJrktrmy/55k01jflOTNY/0lSf4kSSU5PclNo/yZSe4er0eP9aPHvo8mecE45k+SnLPS12x5Uh84NsmpY/3pSf4qycn6wdpZxudy5Fh/SpKbxmd7dZLzR/lvJfnxsf4TSX5rrJ+f5D1j/eTxvfDUJCeN74vDfHesniXJf0r
y+0k+MLb1gTW0JNmW5JhdynwXrKElyZVJfnSsH5HkqJXuA0bQ9u60JFu7++7u/lKSq5Kct8Jt4gB094eSPLhL8XmZ/Qua8frSufJ39syNSY6qqmOTnJXk+u5+sLsfSnJ9krPHvq/v7o/07N/Kd86di4no7vu6+y/G+ueT3JHkuOgHa8b4LL8wNp8ylk7y4iTvHeW79oGdfeO9Sc4YfwU9L8lV3f1Yd38mydbMvjd8d6wCVXV8knOT/M7YrugD+C5YM6rq6zP7w/3lSdLdX+ruh7PCfUBA27vjktwzt719lHFoeXZ335fM/uc9ybNG+e4+/z2Vb1+gnIkatyk9L7MRFP1gDRm3tt2a5IHMvkw/neTh7n58VJn/3L7yWY/9jyT5hux732Bafi3JzyX58tj+hugDa00n+dOquqWqLhplvgvWjuck2ZHkf45bnX+nqp6WFe4DAtreLXSfqKkv147dff77Ws4EVdWRSf4gyU9396N7qrpAmX6wynX3E919SpLjMxvt+PaFqo1XfeAQU1Xfl+SB7r5lvniBqvrAoe2F3X1qknOSXFxV372HuvrAoefwzB57ubS7n5fk7zO7pXF3lqUPCGh7tz3JCXPbxye5d4XawsFz/xiGznh9YJTv7vPfU/nxC5QzMVX1lMzC2bu6+w9HsX6wBo3bWf48s+cJjqqqnb8ROv+5feWzHvufkdmt0vvaN5iOFyb5D1W1LbPbD1+c2YiaPrCGdPe94/WBJO/L7I81vgvWju1Jtnf3TWP7vZkFthXtAwLa3t2cZMOY1emIzB4M3rzCbWLpbU6yc8adC5O8f678gjFrz+lJHhlD3dclObOqjh4z+5yZ5Lqx7/NVdfp4NuGCuXMxEeOzuTzJHd39lrld+sEaUVXrquqosf51Sb43s2cRP5jkZaParn1gZ994WZI/G88TbE5yfs1m+DspyYbMHgj33TFx3f3a7j6+u9dn9vn8WXf/UPSBNaOqnlZVT9+5ntl/w2+L74I1o7v/Jsk9VfVto+iMJJ/MSveB/ZntZK0tmc3Y8leZPZ/wCyvdHssBf57vTnJfkn/K7C8br87sOYIbktw1Xp856laSt4/P/hNJNs6d50cyexh8a5JXzZVvzOw/8J9O8hsZPwhvmc6S5Lsyu8Xg40luHctL9IO1syT510n+cvSB25L80ih/Tmb/c701yf9K8tRR/rVje+vY/5y5c/3C+JzvzNzsXL47Vs+S5EX551kc9YE1sozP+mNjuX3nZ+S7YG0tSU5JsmV8H/xRZrMwrmgfqHEgAAAAK8wtjgAAABMhoAEAAEyEgAYAADARAhoAAMBECGgAAAATIaABAABMhIAGAAAwEQIaAADARPx/IF0MqUHQbtUAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df.plots.hist([\"id\", \"reclong\"], 20)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df.plots.frequency([\"id\", \"reclong\"], 10)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAV0AAAD8CAYAAADUv3dIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAADtxJREFUeJzt3X+s3fVdx/Hnqxc2soBbInHDtnPEFWdFI0KIBOdQwBXi1v2xmLJM3CBc/5D9cM6JUXFiNBPjpkYy181mY1G6icu8LlWMk2ULDmyZC6EluKaacdcZjEN0mQq99+0f90APl3vPObc953NOv30+km9yvj/O53xyQ1598/l+vp9vqgpJUhubpt0BSTqdGLqS1JChK0kNGbqS1JChK0kNGbqS1JChK0nrSLInyeNJHl7nfJL8YZLDSR5K8kPD2jR0JWl9HwV2DDh/DbCtt80DHxzWoKErSeuoqs8D3xhwyU7gzlpxP/CSJOcNavOMcXZwLV/5kdf6yNuE3XTV66bdhc674crLpt2F08L1r744J9vGRjLngvv+9mdZqVCfsbuqdm/g5zYDj/XtL/aOfX29L0w8dCVpVvUCdiMhu9pa/0gMDH1DV1K3pOmo6SKwtW9/C3B00Bcc05XUKZnbNPI2BgvA9b1ZDD8MPFlV6w4tgJWupK4ZY6Wb5C7gCuDcJIvArwNnAlTVHwP7gGuBw8C3gLcOa9PQldQtOel7cc+qquuGnC/g5zbSpqErqVs2jS90J8HQldQpGWOlOwmGrqRu2TTb8wMMXUndYuhKUjsxdCWpIUNXkhryRpoktePsBUlqaW5u2j0YyNCV1C0+HCFJ7Ti8IEkttV3accMMXUnd4vCCJLXjwxGS1JKhK0kNGbqS1I6zFySpJW+kSVJDThmTpHbG9JbfiTF0JXWLY7qS1JCzFySpHR+OkKSWHF6QpIYMXUlqJy5iLkkNWelKUkM+kSZJDZ3qT6QleRWwE9gMFHAUWKiqRybcN0nasMx4pTvwn4QkvwTsBQL8I7C/9/muJLdMvnuStEGbNo2+TcGwSvdG4Puq6un+g0neDxwE3rfWl5LMA/MAt333dna9bMsYuipJw836wxHDercMfOcax8/rnVtTVe2uqkuq6hIDV1JTp3il+07gs0m+AjzWO/Zy4JXAzZPsmCSdkFN5ylhV/U2SC4BLWbmRFmAR2F9VSw36J0kbM8bQTbID+ANgDvhIVb1v1fmXAx8DXtK75paq2jeozaGzF6pqGbj/RDstSS2Na0w3yRxwB3A1vWIzyUJVHeq77FeBT1bVB5NsB/YBrxjUrvN0JXXL+BYxvxQ4XFVHAJLsZWX6bH/oFvBtvc8vZmVK7UCzfZtPkjYqGXlLMp/kQN8239fSZo7fy4KVanfzql97L/DmJIusVLlvG9Y9K11JnbKR4YWq2g3sXq+ptb6yav864KNV9XtJLgM+nuTC3rDsmgxdSd0yvseAF4GtfftbeP7wwY3ADoCq+mKSs4BzgcfXa9ThBUndsimjb4PtB7YlOT/JC4BdwMKqa74KXAmQ5HuBs4B/H9Sola6kTsmYpoxV1bEkNwP3sDIdbE9VHUxyG3
CgqhaAXwA+nOTnWRl6eEtVrR6CeA5DV1K3jHER896c232rjt3a9/kQcPlG2jR0JXXLqfxEmiSdamZ9aUdDV1K3nOqLmEvSKcXhBUlqyOEFSWonc7Mda7PdO0naKCtdSWpnXA9HTIqhK6lbDF1JamjGX0xp6ErqlFl/G7ChK6lbDF1JasgxXUlqyEpXktpxwRtJasnhBUlqJ2NcxHwSDF1J3WKlK0kNuZ6uJDXkjTRJascFbySpJStdSWrodF/E/KarXjfpnzjtffjv/mraXei895z9oml34bRw/asvPuk2HF6QpJYcXpCkhqx0Jakh5+lKUjuZM3QlqR2XdpSkdpy9IEktWelKUkNWupLUkPN0JamdbJrtRcxne/BDkjZqU0bfhkiyI8mjSQ4nuWWda34qyaEkB5P82bA2rXQldcuYHo5IMgfcAVwNLAL7kyxU1aG+a7YBvwxcXlVPJPmOYe0aupI6ZYxvA74UOFxVRwCS7AV2Aof6rrkJuKOqngCoqseHNerwgqRuSUbekswnOdC3zfe1tBl4rG9/sXes3wXABUnuS3J/kh3DumelK6lTNvI24KraDexer6m1vrJq/wxgG3AFsAX4QpILq+o/1/tNQ1dSt4zv4YhFYGvf/hbg6BrX3F9VTwP/kuRRVkJ4/7rdG1fvJGkmbGB4YYj9wLYk5yd5AbALWFh1zaeBH1v52ZzLynDDkUGNWulK6pYx3UirqmNJbgbuAeaAPVV1MMltwIGqWuid+4kkh4Al4Ber6j8GtWvoSuqUjHE93araB+xbdezWvs8FvKu3jcTQldQtrr0gSQ25iLkktTPO4YVJMHQldYurjElSQy5iLknt+LoeSWrJSleSGjJ0JamdMS7tOBGGrqRuccqYJDXkjTRJasjhBUlqZyOLmE+DoSupU/7nrBeOfO05E+zHemZ7xFmSOsbQlaSGTjh0k7x1nB2RpNPByVS6v7Heif7XGh998L6T+AlJ6paBN9KSPLTeKeCl632v/7XGV7z3j1a/sliSTlvDZi+8FHgt8MSq4wH+YSI9kqQOGxa6nwHOrqovrz6R5HMT6ZEkddjA0K2qGwece9P4uyNJ3ebDEZI65em5M6fdhYEMXUmdUjN+697QldQpS8vL0+7CQIaupE6pGS91DV1JnbJs6EpSOzOeuYaupG5xeEGSGloqb6RJUjOO6UpSQ8vLhq4kNTPjha5vjpDULVU18jZMkh1JHk1yOMktA657Y5JKcsmwNq10JXXKMuMpdZPMAXcAVwOLwP4kC1V1aNV15wBvBx4YpV0rXUmdsry8PPI2xKXA4ao6UlVPAXuBnWtc95vA7cD/jtI/Q1dSpyzX6Fv/q8V623xfU5uBx/r2F3vHnpXkImBrVX1m1P45vCCpUzbycET/q8XWkLW+8uzJZBPwAeAtG+ieoSupW8b4RNoisLVvfwtwtG//HOBC4HNJAF4GLCR5fVUdWK9RQ1dSp4zx4Yj9wLYk5wNfA3YBz74xp6qeBM59Zr/3CrN3DwpcMHQldcy4QreqjiW5GbgHmAP2VNXBJLcBB6pq4UTaNXQldco4FzGvqn3AvlXHbl3n2itGadPQldQps/5EmqErqVNc2lGSGnKVMUlqyEpXkhpacmlHSWrHSleSGnJMV5IaMnQlqSGHFySpIUNXkho67Wcv3HDlZZP+idPee85+0bS70Hm3f/oT0+7C6eHdN5x0E1a6ktTQuN6RNimGrqROsdKVpIZmfEjX0JXULUtL41tPdxIMXUmd4vCCJDXkjTRJashKV5IamvHMNXQldYsL3khSQ8tjfBvwJBi6kjrFSleSGjJ0JakhZy9IUkM+BixJDVnpSlJDS85ekKR2ZrzQNXQldYvDC5LUkFPGJKkhK11JauiYN9IkqZ1Zr3Q3TbsDkjROVaNvwyTZkeTRJIeT3LLG+XclOZTkoSSfTfJdw9o0dCV1ynLVyNsgSeaAO4BrgO3AdUm2r7rsn4BLquoHgLuB24f1z9CV1C
lVNfI2xKXA4ao6UlVPAXuBnat+696q+lZv935gy7BGDV1JnbKR0E0yn+RA3zbf19Rm4LG+/cXesfXcCPz1sP55I01Spyxt4EZaVe0Gdq9zOmt9Zc0LkzcDlwCvGfabhq6kThnjwxGLwNa+/S3A0dUXJbkK+BXgNVX1f8MaNXQldcoYp4ztB7YlOR/4GrALeFP/BUkuAj4E7Kiqx0dp1NCV1CnLY1pQt6qOJbkZuAeYA/ZU1cEktwEHqmoB+F3gbODPkwB8tapeP6hdQ1dSp4zz4Yiq2gfsW3Xs1r7PV220TUNXUqe44I0kNWToSlJDsx66Qx+OSPKqJFcmOXvV8R2T65YknZgxPpE2EQNDN8nbgb8E3gY8nKT/EbjfnmTHJOlEjGvthUkZVuneBFxcVW8ArgB+Lck7eufWelpj5UTfo3X3LnxqPD2VpBGMc5WxSRg2pjtXVd8EqKp/TXIFcHdv+bJ1Q7f/0bo7v/DgbA+wSOqUWX8b8LBK99+S/OAzO70A/kngXOD7J9kxSToRsz68MKzSvR441n+gqo4B1yf50MR6JUknaNbfHDEwdKtqccC5+8bfHUk6Oad06ErSqWZMSy9MjKErqVOsdCWpoVmfvWDoSuoUK11JasgxXUlqaLkcXpCkZmZ8dMHQldQtjulKUkPOXpCkhqx0JakhZy9IUkNWupLU0DKGriQ1s7TkjTRJasbhBUlqyBtpktSQla4kNVTeSJOkdqb1wslRGbqSOmVpxgd1DV1JneKYriQ1ZOhKUkOO6UpSQ7Meupum3QFJGqeqGnkbJsmOJI8mOZzkljXOvzDJJ3rnH0jyimFtGrqSOmVpuUbeBkkyB9wBXANsB65Lsn3VZTcCT1TVK4EPAL8zrH+GrqROGWOleylwuKqOVNVTwF5g56prdgIf632+G7gySQY1auhK6pTlqpG3JPNJDvRt831NbQYe69tf7B1jrWuq6hjwJPDtg/rnjTRJnbKRKWNVtRvYvc7ptSrW1Y2Pcs1zWOlK6pSq0bchFoGtfftbgKPrXZPkDODFwDcGNWroSuqUpeXlkbch9gPbkpyf5AXALmBh1TULwM/0Pr8R+PsaUmpn1p/emIYk873/7dCE+DeePP/GJy/JtcDvA3PAnqr6rSS3AQeqaiHJWcDHgYtYqXB3VdWRgW0aus+X5EBVXTLtfnSZf+PJ8288mxxekKSGDF1JasjQXZvjYJPn33jy/BvPIMd0JakhK11JasjQlaSGDN0+w5Zx08lLsifJ40kennZfuirJ1iT3JnkkycEk75h2n3ScY7o9vWXc/hm4mpVH+/YD11XVoal2rGOS/CjwTeDOqrpw2v3poiTnAedV1ZeSnAM8CLzB/5Zng5XucaMs46aTVFWfZ8iz6To5VfX1qvpS7/N/A4/w/NWxNCWG7nGjLOMmnVJ6bzK4CHhguj3RMwzd4za8RJs0y5KcDfwF8M6q+q9p90crDN3jRlnGTTolJDmTlcD906r61LT7o+MM3eNGWcZNmnm918X8CfBIVb1/2v3Rcxm6Pb1XbdwM3MPKjYdPVtXB6faqe5LcBXwR+J4ki0lunHafOuhy4KeBH0/y5d527bQ7pRVOGZOkhqx0JakhQ1eSGjJ0JakhQ1eSGjJ0JakhQ1eSGjJ0Jamh/we0mwq/RR2gWAAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df.plots.correlation([\"id\",\"mass (g)\", \"reclat\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[ 1. , -0.01888518, 0.25706522],\n", - " [-0.01888518, 1. , 0.02892697],\n", - " [ 0.25706522, 0.02892697, 1. ]])" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.correlation([\"id\",\"mass (g)\", \"reclat\"], output=\"array\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Benchmark " - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "df = op.load.csv(\"order_products__prior.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
Viewing 100 of 32434489 rows / 4 columns
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", 
- " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + "
\n", + "
\n", + "

year

\n", + "
date
\n", + "
\n", + "
\n", - "
order_id
\n", - "
1 (int)
\n", - "\n", - "
\n", - "
product_id
\n", - "
2 (int)
\n", - "\n", - "
\n", - "
add_to_cart_order
\n", - "
3 (int)
\n", - "\n", - "
\n", - "
reordered
\n", - "
4 (int)
\n", - "\n", - "
\n", - " 2\n", - " \n", - " 33120\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 28985\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 9327\n", - " \n", - " 3\n", - " \n", - " 0\n", - "
\n", - " 2\n", - " \n", - " 45918\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 30035\n", - " \n", - " 5\n", - " \n", - " 0\n", - "
\n", - " 2\n", - " \n", - " 17794\n", - " \n", - " 6\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 40141\n", - " \n", - " 7\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 1819\n", - " \n", - " 8\n", - " \n", - " 1\n", - "
\n", - " 2\n", - " \n", - " 43668\n", - " \n", - " 9\n", - " \n", - " 0\n", - "
\n", - " 3\n", - " \n", - " 33754\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 24838\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 17704\n", - " \n", - " 3\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 21903\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 17668\n", - " \n", - " 5\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 46667\n", - " \n", - " 6\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 17461\n", - " \n", - " 7\n", - " \n", - " 1\n", - "
\n", - " 3\n", - " \n", - " 32665\n", - " \n", - " 8\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 46842\n", - " \n", - " 1\n", - " \n", - " 0\n", - "
\n", - " 4\n", - " \n", - " 26434\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 39758\n", - " \n", - " 3\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 27761\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 10054\n", - " \n", - " 5\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 21351\n", - " \n", - " 6\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 22598\n", - " \n", - " 7\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 34862\n", - " \n", - " 8\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 40285\n", - " \n", - " 9\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 17616\n", - " \n", - " 10\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 25146\n", - " \n", - " 11\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 32645\n", - " \n", - " 12\n", - " \n", - " 1\n", - "
\n", - " 4\n", - " \n", - " 41276\n", - " \n", - " 13\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 13176\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 15005\n", - " \n", - " 2\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 47329\n", - " \n", - " 3\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 27966\n", - " \n", - " 4\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 23909\n", - " \n", - " 5\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 48370\n", - " \n", - " 6\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 13245\n", - " \n", - " 7\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 9633\n", - " \n", - " 8\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 27360\n", - " \n", - " 9\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 6348\n", - " \n", - " 10\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 40878\n", - " \n", - " 11\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 6184\n", - " \n", - " 12\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 48002\n", - " \n", - " 13\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 20914\n", - " \n", - " 14\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 37011\n", - " \n", - " 15\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 12962\n", - " \n", - " 16\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 45698\n", - " \n", - " 17\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 24773\n", - " \n", - " 18\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 18569\n", - " \n", - " 19\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 41176\n", - " \n", - " 20\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 48366\n", - " \n", - " 21\n", - " \n", - " 1\n", - "
\n", - " 5\n", - " \n", - " 47209\n", - " \n", - " 22\n", - " \n", - " 0\n", - "
\n", - " 5\n", - " \n", - " 46522\n", - " \n", - " 23\n", - " \n", - " 0\n", - "
\n", - " 5\n", - " \n", - " 38693\n", - " \n", - " 24\n", - " \n", - " 0\n", - "
\n", - " 5\n", - " \n", - " 48825\n", - " \n", - " 25\n", - " \n", - " 0\n", - "
\n", - " 5\n", - " \n", - " 8479\n", - " \n", - " 26\n", - " \n", - " 0\n", - "
\n", - " 6\n", - " \n", - " 40462\n", - " \n", - " 1\n", - " \n", - " 0\n", - "
\n", - " 6\n", - " \n", - " 15873\n", - " \n", - " 2\n", - " \n", - " 0\n", - "
\n", - " 6\n", - " \n", - " 41897\n", - " \n", - " 3\n", - " \n", - " 0\n", - "
\n", - " 7\n", - " \n", - " 34050\n", - " \n", - " 1\n", - " \n", - " 0\n", - "
\n", - " 7\n", - " \n", - " 46802\n", - " \n", - " 2\n", - " \n", - " 0\n", - "
\n", - " 8\n", - " \n", - " 23423\n", - " \n", - " 1\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 21405\n", - "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Distinct count 265
Unique (%) 0.58
Missing (%)288
Missing (n)0.63
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 0\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 45428\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 288\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
01/01/2003 12:00:00 AM33237.269%
01/01/1979 12:00:00 AM30466.663%
01/01/1998 12:00:00 AM26975.899%
01/01/2006 12:00:00 AM24565.372%
01/01/1988 12:00:00 AM22965.022%
01/01/2002 12:00:00 AM20784.545%
01/01/2004 12:00:00 AM19404.244%
01/01/2000 12:00:00 AM17923.92%
01/01/1997 12:00:00 AM16963.71%
01/01/1999 12:00:00 AM16913.699%
\"Missing\"2880.63%
\n", + "
\n", " \n", - " \n", - " 1\n", - " \n", + "\n", " \n", - " \n", - " 0\n", - " \n", + "
\n", + " \n", " \n", - " \n", - " \n", - " \n", " \n", - " \n", + " \n", + " \n", + "\n", + " \n", " \n", - " \n", " \n", - " \n", + " \n", + " \n", + "\n", + " \n", " \n", - " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + "\n", + " \n", " \n", - " \n", " \n", - " \n", + " \n", + " \n", + "\n", + " \n", " \n", - " \n", " \n", - " \n", + " \n", + " \n", + "\n", + " \n", " \n", - " \n", - " \n", - " \n", " \n", - " \n", + " \n", + " \n", + "\n", + " \n", " \n", - " \n", + "\n", + "
\n", - " 9\n", - "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", - " 47890\n", - " \n", - " 2\n", - "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", - " 1\n", - "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", - " 9\n", - " \n", - " 11182\n", - "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", - " 3\n", - " \n", - " 0\n", - "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", - " 9\n", - "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", - " 2014\n", - "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", " \n", - " \n", - " 4\n", - " \n", + "\n", + "
\n", + "
\n", + "

reclat

\n", + "
numeric
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Distinct count 12876
Unique (%) 28.165
Missing (%)7315
Missing (n)16.0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 0\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 45716\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "
\n", + "

\n", + " Basic Stats\n", + "

\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
Mean-39.12258010110455
Minimum-87.36666870117188
Maximum81.16667175292969
Zeros(%)6438
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
None731516.001%
0.0643814.083%
-71.5476110.414%
-84.030406.65%
-72.015063.294%
-79.683326721191411302.472%
-76.716667175292976801.487%
-76.18332672119145391.179%
-84.216667175292972630.575%
-86.366668701171882260.494%
\"Missing\"731516.0%
\n", + "
\n", " \n", - " \n", - " 1\n", - " \n", + "\n", " \n", - " \n", - " \n", - " \n", + "
\n", + "\n", + "\n", + "

Quantile statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Minimum-87.36666870117188
5-th percentile-84.35516357421875
Q1-76.71424102783203
Median-71.5
Q30.0
95-th percentile34.49058151245117
Maximum81.16667175292969
Range168.53334045410156
Interquartile range76.71424102783203
\n", + "
\n", + "
\n", + "

Descriptive statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Standard deviation46.37851116080687
Coef of variation-1.18547
Kurtosis-1.4768000616006505
Mean-39.12258010110455
MAD12.76421
Skewness6438
Sum-1502346.198462516
Variance2150.966297493088
\n", + "
\n", " \n", - " \n", - " 9\n", - " \n", + "
\n", + " \n", " \n", - " \n", + " \n", + "\n", + " \n", + " \n", " \n", - " \n", " \n", - " \n", + " \n", + " \n", + "\n", + " \n", " \n", - " \n", - " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", - " \n", - " \n", + "\n", + "
\n", - " 29193\n", - "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", - " 5\n", - " \n", - " 1\n", - "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", - " 9\n", - " \n", - " 34203\n", - " \n", - " 6\n", - " \n", - " 1\n", - "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", " \n", - " \n", - " 9\n", - " \n", + "\n", + "
\n", + "
\n", + "

reclong

\n", + "
numeric
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Distinct count 14709
Unique (%) 32.175
Missing (%)7315
Missing (n)16.0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 0\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 45716\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 0\n", + "
\n", + " \n", + "
\n", + "

\n", + " Basic Stats\n", + "

\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
Mean61.07431878848027
Minimum-165.43333435058594
Maximum354.47332763671875
Zeros(%)6214
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
None731516.001%
0.0621413.593%
35.66667175292969498510.904%
168.030406.65%
26.015063.294%
159.756571.437%
159.66667175292976371.393%
157.16667175292975421.186%
155.754731.035%
160.52630.575%
\"Missing\"731516.0%
\n", + "
\n", " \n", - " \n", - " 14992\n", - " \n", + "\n", " \n", - " \n", - " 7\n", - " \n", + "
\n", + "\n", + "\n", + "

Quantile statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Minimum-165.43333435058594
5-th percentile-90.36556243896484
Q10.0
Median35.66667175292969
Q3157.1666717529297
95-th percentile168.0
Maximum354.47332763671875
Range519.9066619873047
Interquartile range157.1666717529297
\n", + "
\n", + "
\n", + "

Descriptive statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Standard deviation80.64729806550085
Coef of variation1.32048
Kurtosis-0.7312421309648038
Mean61.07431878848027
MAD39.53972
Skewness6214
Sum2345314.915796431
Variance6503.986685265737
\n", + "
\n", " \n", - " \n", - " 1\n", - " \n", + "
\n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + "\n", + " \n", + " \n", " \n", - " \n", " \n", - " \n", + " \n", + " \n", + "\n", + " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", - " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", - " 9\n", - " \n", - " 31506\n", - "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", - " 8\n", - " \n", - " 1\n", - "
\n", - " 9\n", - " \n", - " 23288\n", - " \n", - " 9\n", - "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", " \n", - " \n", - " 0\n", - " \n", + "\n", + "
\n", + "
\n", + "

GeoLocation

\n", + "
categorical
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Distinct count 16686
Unique (%) 36.499
Missing (%)7315
Missing (n)16.0
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 38401\n", + "
\n", + " Integer\n", + " \n", + " 0\n", + "
\n", + " Float\n", + " \n", + " 0\n", + "
\n", + " Bool\n", + " \n", + " 0\n", + "
\n", + " Date\n", + " \n", + " 0\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 7315\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "

Frequency

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueCountFrecuency (%)
None731516.001%
(0.000000, 0.000000)621413.593%
(-71.500000, 35.666670)476110.414%
(-84.000000, 168.000000)30406.65%
(-72.000000, 26.000000)15053.292%
(-79.683330, 159.750000)6571.437%
(-76.716670, 159.666670)6371.393%
(-76.183330, 157.166670)5391.179%
(-79.683330, 155.750000)4731.035%
(-84.216670, 160.500000)2630.575%
\"Missing\"731516.0%
\n", + "
\n", " \n", - " \n", - " \n", - " \n", + "\n", " \n", - " \n", - " 9\n", - " \n", + "
\n", + " \n", " \n", - " \n", + " \n", + "\n", + " \n", + " \n", " \n", - " \n", " \n", - " \n", + " \n", + " \n", + "\n", + " \n", " \n", - " \n", - " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", - " \n", + "\n", + "
\n", - " 44533\n", - "
\n", + "\n", + "
\n", + " \n", + "
\n", + "
\n", - " 10\n", - " \n", - " 1\n", - "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", - " 9\n", - " \n", - " 18362\n", - " \n", - " 11\n", - " \n", - " 0\n", - "
\n", + "
\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 10 of 45716 rows / 10 columns
\n", + "\n", + "\n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", + "\n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -3328,11 +3706,7 @@ " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -3388,59 +3750,43 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -3448,59 +3794,43 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -3508,59 +3838,43 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -3568,39 +3882,43 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", @@ -3608,39 +3926,43 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", @@ -3648,39 +3970,43 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", @@ -3688,39 +4014,43 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", @@ -3728,39 +4058,43 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", @@ -3768,7 +4102,7 @@ " \n", "
\n", - " 9\n", - " \n", + "
name
\n", + "
1 (string)
\n", + "\n", + "
\n", - " 27366\n", - " \n", + "
id
\n", + "
2 (int)
\n", + "\n", + "
\n", - " 12\n", - " \n", + "
nametype
\n", + "
3 (string)
\n", + "\n", + "
\n", - " 1\n", - " \n", + "
recclass
\n", + "
4 (string)
\n", + "\n", + "
\n", + "
mass (g)
\n", + "
5 (double)
\n", + "\n", + "
\n", - " 9\n", - " \n", + "
fall
\n", + "
6 (string)
\n", + "\n", + "
\n", - " 432\n", - " \n", + "
year
\n", + "
7 (string)
\n", + "\n", + "
\n", - " 13\n", - " \n", + "
reclat
\n", + "
8 (double)
\n", + "\n", + "
\n", - " 1\n", - " \n", + "
reclong
\n", + "
9 (double)
\n", + "\n", + "
\n", + "
GeoLocation
\n", + "
10 (string)
\n", + "\n", + "
\n", - " 9\n", - " \n", - " 3990\n", - " \n", - " 14\n", + " Aachen\n", " \n", " 1\n", "
\n", - " 9\n", + " Valid\n", " \n", - " 14183\n", + " L5\n", " \n", - " 15\n", + " 21.0\n", " \n", - " 0\n", + " Fell\n", "
\n", - " 10\n", + " 01/01/1880⸱12:00:00⸱AM\n", " \n", - " 24852\n", + " 50.775\n", " \n", - " 1\n", + " 6.08333\n", " \n", - " 1\n", + " (50.775000,⸱6.083330)\n", "
\n", - " 10\n", - " \n", - " 4796\n", + " Aarhus\n", " \n", @@ -3340,47 +3714,35 @@ " \n", - " 1\n", - "
\n", - " 10\n", + " Valid\n", " \n", - " 31717\n", + " H6\n", " \n", - " 3\n", + " 720.0\n", " \n", - " 0\n", + " Fell\n", "
\n", - " 10\n", + " 01/01/1951⸱12:00:00⸱AM\n", " \n", - " 47766\n", + " 56.18333\n", " \n", - " 4\n", + " 10.23333\n", " \n", - " 1\n", + " (56.183330,⸱10.233330)\n", "
\n", - " 10\n", - " \n", - " 4605\n", - " \n", - " 5\n", + " Abee\n", " \n", - " 1\n", + " 6\n", "
\n", - " 10\n", + " Valid\n", " \n", - " 1529\n", + " EH4\n", " \n", - " 6\n", + " 107000.0\n", " \n", - " 0\n", + " Fell\n", "
\n", - " 10\n", + " 01/01/1952⸱12:00:00⸱AM\n", " \n", - " 21137\n", + " 54.21667\n", " \n", - " 7\n", + " -113.0\n", " \n", - " 1\n", + " (54.216670,⸱-113.000000)\n", "
\n", - " 10\n", - " \n", - " 22122\n", - " \n", - " 8\n", + " Acapulco\n", " \n", - " 1\n", + " 10\n", "
\n", - " 10\n", + " Valid\n", " \n", - " 34134\n", + " Acapulcoite\n", " \n", - " 9\n", + " 1914.0\n", " \n", - " 1\n", + " Fell\n", "
\n", - " 10\n", + " 01/01/1976⸱12:00:00⸱AM\n", " \n", - " 27156\n", + " 16.88333\n", " \n", - " 10\n", + " -99.9\n", " \n", - " 0\n", + " (16.883330,⸱-99.900000)\n", "
\n", - " 10\n", - " \n", - " 14992\n", - " \n", - " 11\n", + " Achiras\n", " \n", - " 0\n", + " 370\n", "
\n", - " 10\n", + " Valid\n", " \n", - " 49235\n", + " L6\n", " \n", - " 12\n", + " 780.0\n", " \n", - " 1\n", + " Fell\n", "
\n", - " 10\n", + " 01/01/1902⸱12:00:00⸱AM\n", " \n", - " 26842\n", + " -33.16667\n", " \n", - " 13\n", + " -64.95\n", " \n", - " 0\n", + " (-33.166670,⸱-64.950000)\n", "
\n", - " 10\n", + " Adhi⸱Kot\n", " \n", - " 3464\n", + " 379\n", " \n", - " 14\n", + " Valid\n", " \n", - " 0\n", + " EH4\n", "
\n", + " 4239.0\n", + " \n", - " 10\n", + " Fell\n", + " \n", + " 01/01/1919⸱12:00:00⸱AM\n", " \n", - " 25720\n", + " 32.1\n", " \n", - " 15\n", + " 71.8\n", " \n", - " 0\n", + " (32.100000,⸱71.800000)\n", "
\n", - " 11\n", + " Adzhi-Bogdo⸱(stone)\n", " \n", - " 30162\n", + " 390\n", " \n", - " 1\n", + " Valid\n", " \n", - " 1\n", + " LL3-6\n", "
\n", + " 910.0\n", + " \n", - " 11\n", + " Fell\n", " \n", - " 27085\n", + " 01/01/1949⸱12:00:00⸱AM\n", " \n", - " 2\n", + " 44.83333\n", " \n", - " 1\n", + " 95.16667\n", + " \n", + " (44.833330,⸱95.166670)\n", "
\n", - " 11\n", + " Agen\n", " \n", - " 5994\n", + " 392\n", " \n", - " 3\n", + " Valid\n", " \n", - " 1\n", + " H5\n", "
\n", + " 30000.0\n", + " \n", - " 11\n", + " Fell\n", " \n", - " 1313\n", + " 01/01/1814⸱12:00:00⸱AM\n", " \n", - " 4\n", + " 44.21667\n", " \n", - " 1\n", + " 0.61667\n", + " \n", + " (44.216670,⸱0.616670)\n", "
\n", - " 11\n", + " Aguada\n", " \n", - " 31506\n", + " 398\n", " \n", - " 5\n", + " Valid\n", " \n", - " 1\n", + " L6\n", "
\n", + " 1620.0\n", + " \n", - " 12\n", + " Fell\n", " \n", - " 30597\n", + " 01/01/1930⸱12:00:00⸱AM\n", " \n", - " 1\n", + " -31.6\n", " \n", - " 1\n", + " -65.23333\n", + " \n", + " (-31.600000,⸱-65.233330)\n", "
\n", - " 12\n", + " Aguila⸱Blanca\n", " \n", - " 15221\n", + " 417\n", " \n", - " 2\n", + " Valid\n", " \n", - " 1\n", + " L\n", "
\n", + " 1440.0\n", + " \n", - " 12\n", + " Fell\n", " \n", - " 43772\n", + " 01/01/1920⸱12:00:00⸱AM\n", " \n", - " 3\n", + " -30.86667\n", " \n", - " 1\n", + " -64.55\n", + " \n", + " (-30.866670,⸱-64.550000)\n", "
\n", "\n", - "
Viewing 100 of 32434489 rows / 4 columns
\n" + "
Viewing 10 of 45716 rows / 10 columns
\n" ], "text/plain": [ "" @@ -3779,108 +4113,161 @@ } ], "source": [ - "df.table()" + "op.profiler.run(df, \"*\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plot profile for a specific column" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "start_time = timeit.default_timer()\n", + "Profiler.columns(df, \"reclat\")\n", + "timeit.default_timer() - start_time" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Output a json file" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plot histagram for multiple columns" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Error while sending.\n", - "Traceback (most recent call last):\n", - " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 1145, in send_command\n", - " self.socket.sendall(command.encode(\"utf-8\"))\n", - "ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host\n", - "Exception while sending command.\n", - "Traceback (most recent call last):\n", - " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 1145, in send_command\n", - " self.socket.sendall(command.encode(\"utf-8\"))\n", - "ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 985, in send_command\n", - " response = connection.send_command(command)\n", - " File 
\"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 1149, in send_command\n", - " \"Error while sending\", e, proto.ERROR_ON_SEND)\n", - "py4j.protocol.Py4JNetworkError: Error while sending\n", - "An error occurred while trying to connect to the Java server (127.0.0.1:50332)\n", - "Traceback (most recent call last):\n", - " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 1145, in send_command\n", - " self.socket.sendall(command.encode(\"utf-8\"))\n", - "ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 985, in send_command\n", - " response = connection.send_command(command)\n", - " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 1149, in send_command\n", - " \"Error while sending\", e, proto.ERROR_ON_SEND)\n", - "py4j.protocol.Py4JNetworkError: Error while sending\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 929, in _get_connection\n", - " connection = self.deque.pop()\n", - "IndexError: pop from an empty deque\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"C:\\Users\\argenisleon\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\", line 1067, in start\n", - " self.socket.connect((self.address, self.port))\n", - 
"ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it\n" + "Using 'column_exp' to process column 'id_buckets' with function _bucketizer\n", + "Using 'column_exp' to process column 'reclong_buckets' with function _bucketizer\n" ] }, { - "ename": "Py4JNetworkError", - "evalue": "An error occurred while trying to connect to the Java server (127.0.0.1:50332)", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mConnectionResetError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command)\u001b[0m\n\u001b[0;32m 1144\u001b[0m \u001b[1;31m# if it sent a RST packet (SO_LINGER)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1145\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msendall\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"utf-8\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1146\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mConnectionResetError\u001b[0m: [WinError 10054] An existing connection was forcibly closed by the remote host", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[1;31mPy4JNetworkError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command, retry, binary)\u001b[0m\n\u001b[0;32m 
984\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 985\u001b[1;33m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 986\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mbinary\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command)\u001b[0m\n\u001b[0;32m 1148\u001b[0m raise Py4JNetworkError(\n\u001b[1;32m-> 1149\u001b[1;33m \"Error while sending\", e, proto.ERROR_ON_SEND)\n\u001b[0m\u001b[0;32m 1150\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mPy4JNetworkError\u001b[0m: Error while sending", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_get_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 928\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 929\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdeque\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 930\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mIndexError\u001b[0m: pop from an empty deque", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[1;31mConnectionRefusedError\u001b[0m Traceback (most recent call last)", - 
"\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36mstart\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1066\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1067\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mport\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1068\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstream\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmakefile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"rb\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mConnectionRefusedError\u001b[0m: [WinError 10061] No connection could be made because the target machine actively refused it", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[1;31mPy4JNetworkError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mop\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprofiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"product_id\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\profiler\\profiler.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(self, df, columns, buckets)\u001b[0m\n\u001b[0;32m 343\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 344\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mparse_columns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 345\u001b[1;33m \u001b[0moutput\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_json\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuckets\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 346\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 347\u001b[0m \u001b[1;31m# Load jinja\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\profiler\\profiler.py\u001b[0m in \u001b[0;36mto_json\u001b[1;34m(df, columns, buckets)\u001b[0m\n\u001b[0;32m 402\u001b[0m \"\"\"\n\u001b[0;32m 403\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 404\u001b[1;33m \u001b[0moutput\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuckets\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 405\u001b[0m \u001b[0mdataset\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mProfiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdataset_info\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 406\u001b[0m \u001b[0moutput\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"summary\"\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdataset\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\profiler\\profiler.py\u001b[0m in \u001b[0;36mcolumns\u001b[1;34m(df, columns, buckets)\u001b[0m\n\u001b[0;32m 163\u001b[0m 
\u001b[0mcolumn_info\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'columns'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 164\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 165\u001b[1;33m \u001b[0mrows_count\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 166\u001b[0m \u001b[0mcolumn_info\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'rows_count'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mrows_count\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 167\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pyspark\\sql\\dataframe.py\u001b[0m in \u001b[0;36mcount\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 453\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 454\u001b[0m \"\"\"\n\u001b[1;32m--> 455\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 456\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 457\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mignore_unicode_prefix\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 1253\u001b[0m \u001b[0mproto\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mEND_COMMAND_PART\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1254\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1255\u001b[1;33m \u001b[0manswer\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1256\u001b[0m return_value = get_return_value(\n\u001b[0;32m 1257\u001b[0m answer, self.gateway_client, self.target_id, self.name)\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command, retry, binary)\u001b[0m\n\u001b[0;32m 998\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_should_retry\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mretry\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpne\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 999\u001b[0m \u001b[0mlogging\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Exception while sending command.\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mexc_info\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1000\u001b[1;33m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinary\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbinary\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1001\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1002\u001b[0m logging.exception(\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command, retry, binary)\u001b[0m\n\u001b[0;32m 981\u001b[0m 
\u001b[1;32mif\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mbinary\u001b[0m\u001b[0;31m`\u001b[0m \u001b[1;32mis\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 982\u001b[0m \"\"\"\n\u001b[1;32m--> 983\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 984\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 985\u001b[0m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_get_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 929\u001b[0m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdeque\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 930\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 931\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_create_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 932\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 933\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - 
"\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_create_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 935\u001b[0m connection = GatewayConnection(\n\u001b[0;32m 936\u001b[0m self.gateway_parameters, self.gateway_property)\n\u001b[1;32m--> 937\u001b[1;33m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 938\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 939\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36mstart\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1077\u001b[0m \u001b[1;34m\"server ({0}:{1})\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mport\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1078\u001b[0m \u001b[0mlogger\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexception\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1079\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mPy4JNetworkError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1080\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1081\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_authenticate_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mPy4JNetworkError\u001b[0m: An error occurred while trying to connect to the Java server 
(127.0.0.1:50332)" - ] + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAAEHCAYAAADBF4UVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAGfdJREFUeJzt3X+wnmWd3/H3ZxNhVboGJLqYZDesZl3B/tDNYNSOpWIhgBrakZlQV6JlJ7O7aNXZrQa3LbsqndjuyEpXmFLJAi5jZNBK1mAxRVzGVpCgVgk/mhSQZBMhbvihomLw2z+eK/Xx8CQnnHPgOufk/Zo589z3977u+/k+4R7Ih/t6rpOqQpIkSZL0zPul3g1IkiRJ0qHKQCZJkiRJnRjIJEmSJKkTA5kkSZIkdWIgkyRJkqRODGSSJEmS1ImBTJI0aUm2JDmxdx8zUZITk+zo3YckqQ8DmSTpgJLcl+QNY2pvT/KVfftVdXxVfXmc6yxOUknmPk2tPq3aZ768dx8TNdP7l6TZykAmSZoVega9mRoyJUn9GcgkSZM2/BQtyQlJNid5NMkDST7aht3UXh9O8oMkr07yS0n+bZLvJHkwyZVJnjd03bPbsb9L8u/GvM+fJLkmyV8leRR4e3vvryZ5OMmuJH+R5LCh61WSP0iyNcn3k3woyYvbOY8muXp4/AE+776nfeckuR/4UqsvS/K/2vv/7+FpnEmOSvKXSXYmeSjJ5/Zz7Zcl+XK7xpYkbx46dnmSjyfZ2Pq/JcmLh46fnOTuJI8kuTjJ3yT53XH/AUqSujGQSZKm2seAj1XVrwAvBq5u9de113lVdURVfRV4e/v5p8BvAEcAfwGQ5DjgYuCtwDHA84AFY95rBXANMA+4CngCeC9wNPBq4CTgD8acsxz4bWAZ8D7g0vYei4CXA2eN+lBVdXlVvX1M+Z8ALwNOSbIA2Ah8GDgK+CPgM0nmt7GfBJ4DHA+8ALhw7HskeRbw18AX25h3AVcleenQsLOAPwWOBLYBF7Rzj25/FucBzwfuBl4zTv+SpM4MZJKkg/G59sTm4SQPMwhK+/NT4CVJjq6qH1TVzQcY+1bgo1V1T1X9gEGYWNmmAL4F+Ouq+kpVPQ78e6DGnP/VqvpcVf2sqn5UVbdV1c1Vtbeq7gP+C4PQNOwjVfVoVW0Bbge+2N7/EeALwCsO7o8EgD+pqh9W1Y+A3wGuq6rrWj+bgM3AaUmOAU4Ffq+qHqqqn1bV34y43jIGoXRtVT1eVV8CPs8vhsTPVtXXqmovgxD6j1r9NGBLVX22HbsI+O5T+CySpA4MZJKkg3FGVc3b98OTnzoNOwf4TeCuJLcmeeMBxr4I+M7Q/neAucAL27Ht+w5U1WPA3405f/vwTpLfTPL5JN9t0xj/A4OnZcMeGNr+0Yj9Iw7Q71jD7//rwJljgus/ZvB0bxGwp6oeGud6LwK2V9XPhmrf4RefDA6HrMeG+h3751WAqzdK0jRnIJMkTamq2lpVZzGYcvcR4Jokz+XJT7cAdjIIMvv8GrCXQUjaBSzcdyDJsxlMxfuFtxuzfwlwF7CkTZn8AJCJf5pxDb//duCTw8G1qp5bVWvbsaOSzBvnejuBRUmG//v8a8DfHkQvY/+8MrwvSZqeDGSSpCmV5HeSzG9PeR5u5SeA3cDPGHxXbJ9PAe9NcmySIxg80fp0m3J3DfCmJK9pC238KeOHq78HPAr8IMlvAb8/ZR9sfH/FoN9TksxJ8svtd4wtrKpdDKZDXpzkyCTPSvK6Ede4Bfgh8L425kTgTcD6g3j/jcDfT3JGm/J5LvCrU/LJJElPGwOZJGmqLQe2JPkBgwU+VlbVj9uUwwuA/9mm9C0D1jFY7OIm4F7gxwwWsqB9x+tdDM
LILuD7wIPATw7w3n8E/Ms29r8Cn576jzdaVW1nsMjIBxiEz+3Av+Hn/619G4Pv193F4HO8Z8Q1HgfezOD7Zt9j8F29s6vqroN4/+8BZwL/kcHUzuMYfIftQH9ekqTOMphiLknS9NaeoD3MYDrivb37me7atMcdwFur6sbe/UiSRvMJmSRp2krypiTPad9B+zPg28B9fbuavtp0yXlJDufn35870CqXkqTODGSSpOlsBYOFLnYCSxhMf3Rqx/69Gvi/DKY7vonB6pg/6tuSJOlAnLIoSZIkSZ34hEySJEmSOjGQSZIkSVInc3s3MFFHH310LV68uHcbkiRJkvQkt9122/eqav5442ZsIFu8eDGbN2/u3YYkSZIkPUmS7xzMOKcsSpIkSVIn4wayJOuSPJjk9qHaf0pyV5JvJflvSeYNHTsvybYkdyc5Zai+vNW2JVkzVD82yS1Jtib5dJLDpvIDSpIkSdJ0dTBPyC4Hlo+pbQJeXlX/APg/wHkASY4DVgLHt3MuTjInyRzg48CpwHHAWW0swEeAC6tqCfAQcM6kPpEkSZIkzRDjBrKqugnYM6b2xara23ZvBha27RXA+qr6SVXdC2wDTmg/26rqnqp6HFgPrEgS4PXANe38K4AzJvmZJEmSJGlGmIrvkP0r4AttewGwfejYjlbbX/35wMND4W5ffaQkq5NsTrJ59+7dU9C6JEmSJPUzqUCW5I+BvcBV+0ojhtUE6iNV1aVVtbSqls6fP+4KkpIkSZI0rU142fskq4A3AidV1b4QtQNYNDRsIbCzbY+qfw+Yl2Rue0o2PF6SJEmSZrUJPSFLshx4P/Dmqnps6NAGYGWSw5McCywBvgbcCixpKyoexmDhjw0tyN0IvKWdvwq4dmIfRZIkSZJmlnGfkCX5FHAicHSSHcD5DFZVPBzYNFiXg5ur6veqakuSq4E7GExlPLeqnmjXeSdwPTAHWFdVW9pbvB9Yn+TDwDeAy6bw80mSJGmWWLxmY+8WDsp9a0/v3YJmkHEDWVWdNaK839BUVRcAF4yoXwdcN6J+D4NVGCVJkiTpkDIVqyxKkiRJkibAQCZJkiRJnRjIJEmSJKkTA5kkSZIkdWIgkyRJkqRODGSSJEmS1ImBTJIkSZI6MZBJkiRJUicGMkmSJEnqxEAmSZIkSZ0YyCRJkiSpEwOZJEmSJHViIJMkSZKkTgxkkiRJktSJgUySJEmSOjGQSZIkSVInBjJJkiRJ6sRAJkmSJEmdGMgkSZIkqRMDmSRJkiR1YiCTJEmSpE4MZJIkSZLUiYFMkiRJkjoxkEmSJElSJwYySZIkSerEQCZJkiRJnYwbyJKsS/JgktuHakcl2ZRka3s9stWT5KIk25J8K8krh85Z1cZvTbJqqP7bSb7dzrkoSab6Q0qSJEnSdHQwT8guB5aPqa0BbqiqJcANbR/gVGBJ+1kNXAKDAAecD7wKOAE4f1+Ia2NWD5039r0kSZIkaVYaN5BV1U3AnjHlFcAVbfsK4Iyh+pU1cDMwL8kxwCnApqraU1UPAZuA5e3Yr1TVV6uqgCuHriVJkiRJs9pEv0P2wqraBdBeX9DqC4DtQ+N2tNqB6jtG1EdKsjrJ5iSbd+/ePcHWJUmSJGl6mOpFPUZ9/6smUB+pqi6tqqVVtXT+/PkTbFGSJEmSpoeJBrIH2nRD2uuDrb4DWDQ0biGwc5z6whF1SZIkSZr1JhrINgD7VkpcBVw7VD+7rba4DHikTWm8Hjg5yZFtMY+Tgevbse8nWdZWVzx76FqSJEmSNKvNHW9Akk8BJwJHJ9nBYLXEtcDVSc4B7gfObMOvA04DtgGPAe8AqKo9ST4E3NrGfbCq9i0U8vsMVnJ8NvCF9iNJkiRJs964gayqztrPoZNGjC3g3P1cZx2wbkR9M/Dy8fqQJEmSpNlmqhf1kCRJkiQdJAOZJEmSJHViIJMkSZKkTgxkkiRJktSJgUySJEmSOjGQSZIkSVInBjJJkiRJ6sRAJkmSJE
mdGMgkSZIkqRMDmSRJkiR1YiCTJEmSpE4MZJIkSZLUiYFMkiRJkjqZ27sBSdLstHjNxt4tjOu+taf3bkGSdIjzCZkkSZIkdWIgkyRJkqRODGSSJEmS1ImBTJIkSZI6MZBJkiRJUicGMkmSJEnqxEAmSZIkSZ0YyCRJkiSpEwOZJEmSJHViIJMkSZKkTgxkkiRJktTJpAJZkvcm2ZLk9iSfSvLLSY5NckuSrUk+neSwNvbwtr+tHV88dJ3zWv3uJKdM7iNJkiRJ0sww4UCWZAHwr4GlVfVyYA6wEvgIcGFVLQEeAs5pp5wDPFRVLwEubONIclw773hgOXBxkjkT7UuSJEmSZorJTlmcCzw7yVzgOcAu4PXANe34FcAZbXtF26cdPylJWn19Vf2kqu4FtgEnTLIvSZIkSZr2JhzIqupvgT8D7mcQxB4BbgMerqq9bdgOYEHbXgBsb+fubeOfP1wfcY4kSZIkzVqTmbJ4JIOnW8cCLwKeC5w6YmjtO2U/x/ZXH/Weq5NsTrJ59+7dT71pSZIkSZpGJjNl8Q3AvVW1u6p+CnwWeA0wr01hBFgI7GzbO4BFAO3484A9w/UR5/yCqrq0qpZW1dL58+dPonVJkiRJ6m8ygex+YFmS57Tvgp0E3AHcCLyljVkFXNu2N7R92vEvVVW1+sq2CuOxwBLga5PoS5IkSZJmhLnjDxmtqm5Jcg3wdWAv8A3gUmAjsD7Jh1vtsnbKZcAnk2xj8GRsZbvOliRXMwhze4Fzq+qJifYlSZIkSTPFhAMZQFWdD5w/pnwPI1ZJrKofA2fu5zoXABdMphdJkiRJmmkmu+y9JEmSJGmCDGSSJEmS1ImBTJIkSZI6MZBJkiRJUicGMkmSJEnqxEAmSZIkSZ0YyCRJkiSpEwOZJEmSJHViIJMkSZKkTgxkkiRJktSJgUySJEmSOjGQSZIkSVInc3s3IEnSdLJ4zcbeLRyU+9ae3rsFSdIU8AmZJEmSJHViIJMkSZKkTgxkkiRJktSJgUySJEmSOjGQSZIkSVInBjJJkiRJ6sRAJkmSJEmdGMgkSZIkqRMDmSRJkiR1YiCTJEmSpE4MZJIkSZLUiYFMkiRJkjoxkEmSJElSJ5MKZEnmJbkmyV1J7kzy6iRHJdmUZGt7PbKNTZKLkmxL8q0krxy6zqo2fmuSVZP9UJIkSZI0E0z2CdnHgP9eVb8F/EPgTmANcENVLQFuaPsApwJL2s9q4BKAJEcB5wOvAk4Azt8X4iRJkiRpNptwIEvyK8DrgMsAqurxqnoYWAFc0YZdAZzRtlcAV9bAzcC8JMcApwCbqmpPVT0EbAKWT7QvSZIkSZopJvOE7DeA3cBfJvlGkk8keS7wwqraBdBeX9DGLwC2D52/o9X2V5ckSZKkWW0ygWwu8Ergkqp6BfBDfj49cZSMqNUB6k++QLI6yeYkm3fv3v1U+5UkSZKkaWUygWwHsKOqbmn71zAIaA+0qYi01weHxi8aOn8hsPMA9SepqkuramlVLZ0/f/4kWpckSZKk/iYcyKrqu8D2JC9tpZOAO4ANwL6VElcB17btDcDZbbXFZcAjbUrj9cDJSY5si3mc3GqSJEmSNKvNneT57wKuSnIYcA/wDgYh7+ok5wD3A2e2sdcBpwHbgMfaWKpqT5IPAbe2cR+sqj2T7EuSJEmSpr1JBbKq+iawdMShk0aMLeDc/VxnHbBuMr1IkiRJ0kwz2d9DJkmSJEmaIAOZJEmSJHViIJMkSZKkTgxkkiRJktSJgUySJEmSOjGQSZIkSVInBjJJkiRJ6sRAJkmSJEmdGMgkSZIkqRMDmSRJkiR1YiCTJEmSpE4MZJIkSZLUiYFMkiRJkjoxkEmSJElSJwYySZIkSerEQCZJkiRJnRjIJEmSJKkTA5kkSZIkdWIgkyRJkqRODGSSJEmS1ImBTJIkSZI6MZBJkiRJUicGMkmSJEnqxEAmSZIkSZ0YyCRJkiSpEwOZJEmSJHUy6UCWZE6SbyT5fN
s/NsktSbYm+XSSw1r98La/rR1fPHSN81r97iSnTLYnSZIkSZoJpuIJ2buBO4f2PwJcWFVLgIeAc1r9HOChqnoJcGEbR5LjgJXA8cBy4OIkc6agL0mSJEma1iYVyJIsBE4HPtH2A7weuKYNuQI4o22vaPu04ye18SuA9VX1k6q6F9gGnDCZviRJkiRpJpjsE7I/B94H/KztPx94uKr2tv0dwIK2vQDYDtCOP9LG///6iHMkSZIkadaacCBL8kbgwaq6bbg8YmiNc+xA54x9z9VJNifZvHv37qfUryRJkiRNN5N5QvZa4M1J7gPWM5iq+OfAvCRz25iFwM62vQNYBNCOPw/YM1wfcc4vqKpLq2ppVS2dP3/+JFqXJEmSpP4mHMiq6ryqWlhVixksyvGlqnorcCPwljZsFXBt297Q9mnHv1RV1eor2yqMxwJLgK9NtC9JkiRJminmjj/kKXs/sD7Jh4FvAJe1+mXAJ5NsY/BkbCVAVW1JcjVwB7AXOLeqnnga+pIkSZKkaWVKAllVfRn4ctu+hxGrJFbVj4Ez93P+BcAFU9GLJEmSJM0UU/F7yCRJkiRJE2AgkyRJkqRODGSSJEmS1ImBTJIkSZI6MZBJkiRJUicGMkmSJEnqxEAmSZIkSZ0YyCRJkiSpkyn5xdCSJKm/xWs29m5hXPetPb13C5I0rfiETJIkSZI6MZBJkiRJUicGMkmSJEnqxEAmSZIkSZ0YyCRJkiSpEwOZJEmSJHViIJMkSZKkTgxkkiRJktSJgUySJEmSOpnbuwFJknRoW7xmY+8WDsp9a0/v3YKkWcgnZJIkSZLUiYFMkiRJkjoxkEmSJElSJwYySZIkSerEQCZJkiRJnRjIJEmSJKkTA5kkSZIkdTLhQJZkUZIbk9yZZEuSd7f6UUk2JdnaXo9s9SS5KMm2JN9K8sqha61q47cmWTX5jyVJkiRJ099knpDtBf6wql4GLAPOTXIcsAa4oaqWADe0fYBTgSXtZzVwCQwCHHA+8CrgBOD8fSFOkiRJkmazCQeyqtpVVV9v298H7gQWACuAK9qwK4Az2vYK4MoauBmYl+QY4BRgU1XtqaqHgE3A8on2JUmSJEkzxZR8hyzJYuAVwC3AC6tqFwxCG/CCNmwBsH3otB2ttr+6JEmSJM1qkw5kSY4APgO8p6oePdDQEbU6QH3Ue61OsjnJ5t27dz/1ZiVJkiRpGplUIEvyLAZh7Kqq+mwrP9CmItJeH2z1HcCiodMXAjsPUH+Sqrq0qpZW1dL58+dPpnVJkiRJ6m4yqywGuAy4s6o+OnRoA7BvpcRVwLVD9bPbaovLgEfalMbrgZOTHNkW8zi51SRJkiRpVps7iXNfC7wN+HaSb7baB4C1wNVJzgHuB85sx64DTgO2AY8B7wCoqj1JPgTc2sZ9sKr2TKIvSZIkSZoRJhzIquorjP7+F8BJI8YXcO5+rrUOWDfRXiRJkiRpJpqSVRYlSZIkSU+dgUySJEmSOpnMd8gkHWIWr9nYu4WDct/a03u3IEmSdFB8QiZJkiRJnRjIJEmSJKkTA5kkSZIkdWIgkyRJkqRODGSSJEmS1ImrLEoduFqhJEmSwCdkkiRJktSNgUySJEmSOjGQSZIkSVInBjJJkiRJ6sRAJkmSJEmdGMgkSZIkqRMDmSRJkiR1YiCTJEmSpE4MZJIkSZLUydzeDUiSJOmZtXjNxt4tHJT71p7euwXpaecTMkmSJEnqxEAmSZIkSZ0YyCRJkiSpE79DphnL+e+SJEma6XxCJkmSJEmdGMgkSZIkqROnLMqpf5IkSVIn0yaQJVkOfAyYA3yiqtZ2bkmSJEmaEv4PcO3PtAhkSeYAHwf+GbADuDXJhqq6o29nkiRJo/kXbElTYVoEMuAEYFtV3QOQZD2wAjCQSZoSM/UvTjO1b0mSdHCmy6IeC4DtQ/s7Wk2SJEmSZq1UVe8eSHImcEpV/W7bfxtwQlW9a8y41cDqtvtS4O5ntFEdKo4Gvte7CW
kKeC9rNvA+1mzhvXzo+fWqmj/eoOkyZXEHsGhofyGwc+ygqroUuPSZakqHpiSbq2pp7z6kyfJe1mzgfazZwntZ+zNdpizeCixJcmySw4CVwIbOPUmSJEnS02paPCGrqr1J3glcz2DZ+3VVtaVzW5IkSZL0tJoWgQygqq4Druvdh4TTYjV7eC9rNvA+1mzhvayRpsWiHpIkSZJ0KJou3yGTJEmSpEOOgUyHtCRnJtmS5GdJlo45dl6SbUnuTnLKUH15q21LsuaZ71o6MO9RzSRJ1iV5MMntQ7WjkmxKsrW9HtnqSXJRu7e/leSV/TqXfi7JoiQ3Jrmz/b3i3a3uvaxxGch0qLsd+BfATcPFJMcxWO3zeGA5cHGSOUnmAB8HTgWOA85qY6VpwXtUM9DlDP49O2wNcENVLQFuaPswuK+XtJ/VwCXPUI/SePYCf1hVLwOWAee2f/d6L2tcBjId0qrqzqoa9QvGVwDrq+onVXUvsA04of1sq6p7qupxYH0bK00X3qOaUarqJmDPmPIK4Iq2fQVwxlD9yhq4GZiX5JhnplNp/6pqV1V9vW1/H7gTWID3sg6CgUwabQGwfWh/R6vtry5NF96jmg1eWFW7YPAXXeAFre79rWkvyWLgFcAteC/rIEybZe+lp0uS/wH86ohDf1xV1+7vtBG1YvT/xHCpUk0n+7t3pdnA+1vTWpIjgM8A76mqR5NRt+xg6Iia9/IhykCmWa+q3jCB03YAi4b2FwI72/b+6tJ0cKB7V5opHkhyTFXtatO4Hmx1729NW0mexSCMXVVVn21l72WNyymL0mgbgJVJDk9yLIMv3X4NuBVYkuTYJIcxWPhjQ8c+pbG8RzUbbABWte1VwLVD9bPbCnXLgEf2TQeTesrgUdhlwJ1V9dGhQ97LGpdPyHRIS/LPgf8MzAc2JvlmVZ1SVVuSXA3cwWDlpHOr6ol2zjuB64E5wLqq2tKpfelJqmqv96hmkiSfAk4Ejk6yAzgfWAtcneQc4H7gzDb8OuA0BgstPQa84xlvWBrttcDbgG8n+WarfQDvZR2EVDldVZIkSZJ6cMqiJEmSJHViIJMkSZKkTgxkkiRJktSJgUySJEmSOjGQSZIkSVInBjJJkiRJ6sRAJkmSJEmdGMgkSZIkqZP/B2qzDZVN+lDTAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.plots.hist([\"id\", \"reclong\"], 20)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.plots.frequency([\"id\", \"reclong\"], 10)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAV0AAAD8CAYAAADUv3dIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAADtxJREFUeJzt3X+s3fVdx/Hnqxc2soBbInHDtnPEFWdFI0KIBOdQwBXi1v2xmLJM3CBc/5D9cM6JUXFiNBPjpkYy181mY1G6icu8LlWMk2ULDmyZC6EluKaacdcZjEN0mQq99+0f90APl3vPObc953NOv30+km9yvj/O53xyQ1598/l+vp9vqgpJUhubpt0BSTqdGLqS1JChK0kNGbqS1JChK0kNGbqS1JChK0nrSLInyeNJHl7nfJL8YZLDSR5K8kPD2jR0JWl9HwV2DDh/DbCtt80DHxzWoKErSeuoqs8D3xhwyU7gzlpxP/CSJOcNavOMcXZwLV/5kdf6yNuE3XTV66bdhc674crLpt2F08L1r744J9vGRjLngvv+9mdZqVCfsbuqdm/g5zYDj/XtL/aOfX29L0w8dCVpVvUCdiMhu9pa/0gMDH1DV1K3pOmo6SKwtW9/C3B00Bcc05XUKZnbNPI2BgvA9b1ZDD8MPFlV6w4tgJWupK4ZY6Wb5C7gCuDcJIvArwNnAlTVHwP7gGuBw8C3gLcOa9PQldQtOel7cc+qquuGnC/g5zbSpqErqVs2jS90J8HQldQpGWOlOwmGrqRu2TTb8wMMXUndYuhKUjsxdCWpIUNXkhryRpoktePsBUlqaW5u2j0YyNCV1C0+HCFJ7Ti8IEkttV3accMMXUnd4vCCJLXjwxGS1JKhK0kNGbqS1I6zFySpJW+kSVJDThmTpHbG9JbfiTF0JXWLY7qS1JCzFySpHR+OkKSWHF6QpIYMXUlqJy5iLkkNWelKUkM+kSZJDZ3qT6QleRWwE9gMFHAUWKiqRybcN0nasMx4pTvwn4QkvwTsBQL8I7C/9/muJLdMvnuStEGbNo2+TcGwSvdG4Puq6un+g0neDxwE3rfWl5LMA/MAt333dna9bMsYuipJw836wxHDercMfOcax8/rnVtTVe2uqkuq6hIDV1JTp3il+07gs0m+AjzWO/Zy4JXAzZPsmCSdkFN5ylhV/U2SC4BLWbmRFmAR2F9VSw36J0kbM8bQTbID+ANgDvhIVb1v1fmXAx8DXtK75paq2jeozaGzF6pqGbj/RDstSS2Na0w3yRxwB3A1vWIzyUJVHeq77FeBT1bVB5NsB/YBrxjUrvN0JXXL+BYxvxQ4XFVHAJLsZWX6bH/oFvBtvc8vZmVK7UCzfZtPkjYqGXlLMp/kQN8239fSZo7fy4KVanfzql97L/DmJIusVLlvG9Y9K11JnbKR4YWq2g3sXq+ptb6yav864KNV9XtJLgM+nuTC3rDsmgxdSd0yvseAF4GtfftbeP7wwY3ADoCq+mKSs4BzgcfXa9ThBUndsimjb4PtB7YlOT/JC4BdwMKqa74KXAmQ5HuBs4B/H9Sola6kTsmYpoxV1bEkNwP3sDIdbE9VHUxyG3
CgqhaAXwA+nOTnWRl6eEtVrR6CeA5DV1K3jHER896c232rjt3a9/kQcPlG2jR0JXXLqfxEmiSdamZ9aUdDV1K3nOqLmEvSKcXhBUlqyOEFSWonc7Mda7PdO0naKCtdSWpnXA9HTIqhK6lbDF1JamjGX0xp6ErqlFl/G7ChK6lbDF1JasgxXUlqyEpXktpxwRtJasnhBUlqJ2NcxHwSDF1J3WKlK0kNuZ6uJDXkjTRJascFbySpJStdSWrodF/E/KarXjfpnzjtffjv/mraXei895z9oml34bRw/asvPuk2HF6QpJYcXpCkhqx0Jakh5+lKUjuZM3QlqR2XdpSkdpy9IEktWelKUkNWupLUkPN0JamdbJrtRcxne/BDkjZqU0bfhkiyI8mjSQ4nuWWda34qyaEkB5P82bA2rXQldcuYHo5IMgfcAVwNLAL7kyxU1aG+a7YBvwxcXlVPJPmOYe0aupI6ZYxvA74UOFxVRwCS7AV2Aof6rrkJuKOqngCoqseHNerwgqRuSUbekswnOdC3zfe1tBl4rG9/sXes3wXABUnuS3J/kh3DumelK6lTNvI24KraDexer6m1vrJq/wxgG3AFsAX4QpILq+o/1/tNQ1dSt4zv4YhFYGvf/hbg6BrX3F9VTwP/kuRRVkJ4/7rdG1fvJGkmbGB4YYj9wLYk5yd5AbALWFh1zaeBH1v52ZzLynDDkUGNWulK6pYx3UirqmNJbgbuAeaAPVV1MMltwIGqWuid+4kkh4Al4Ber6j8GtWvoSuqUjHE93araB+xbdezWvs8FvKu3jcTQldQtrr0gSQ25iLkktTPO4YVJMHQldYurjElSQy5iLknt+LoeSWrJSleSGjJ0JamdMS7tOBGGrqRuccqYJDXkjTRJasjhBUlqZyOLmE+DoSupU/7nrBeOfO05E+zHemZ7xFmSOsbQlaSGTjh0k7x1nB2RpNPByVS6v7Heif7XGh998L6T+AlJ6paBN9KSPLTeKeCl632v/7XGV7z3j1a/sliSTlvDZi+8FHgt8MSq4wH+YSI9kqQOGxa6nwHOrqovrz6R5HMT6ZEkddjA0K2qGwece9P4uyNJ3ebDEZI65em5M6fdhYEMXUmdUjN+697QldQpS8vL0+7CQIaupE6pGS91DV1JnbJs6EpSOzOeuYaupG5xeEGSGloqb6RJUjOO6UpSQ8vLhq4kNTPjha5vjpDULVU18jZMkh1JHk1yOMktA657Y5JKcsmwNq10JXXKMuMpdZPMAXcAVwOLwP4kC1V1aNV15wBvBx4YpV0rXUmdsry8PPI2xKXA4ao6UlVPAXuBnWtc95vA7cD/jtI/Q1dSpyzX6Fv/q8V623xfU5uBx/r2F3vHnpXkImBrVX1m1P45vCCpUzbycET/q8XWkLW+8uzJZBPwAeAtG+ieoSupW8b4RNoisLVvfwtwtG//HOBC4HNJAF4GLCR5fVUdWK9RQ1dSp4zx4Yj9wLYk5wNfA3YBz74xp6qeBM59Zr/3CrN3DwpcMHQldcy4QreqjiW5GbgHmAP2VNXBJLcBB6pq4UTaNXQldco4FzGvqn3AvlXHbl3n2itGadPQldQps/5EmqErqVNc2lGSGnKVMUlqyEpXkhpacmlHSWrHSleSGnJMV5IaMnQlqSGHFySpIUNXkho67Wcv3HDlZZP+idPee85+0bS70Hm3f/oT0+7C6eHdN5x0E1a6ktTQuN6RNimGrqROsdKVpIZmfEjX0JXULUtL41tPdxIMXUmd4vCCJDXkjTRJashKV5IamvHMNXQldYsL3khSQ8tjfBvwJBi6kjrFSleSGjJ0JakhZy9IUkM+BixJDVnpSlJDS85ekKR2ZrzQNXQldYvDC5LUkFPGJKkhK11JauiYN9IkqZ1Zr3Q3TbsDkjROVaNvwyTZkeTRJIeT3LLG+XclOZTkoSSfTfJdw9o0dCV1ynLVyNsgSeaAO4BrgO3AdUm2r7rsn4BLquoHgLuB24f1z9CV1C
lVNfI2xKXA4ao6UlVPAXuBnat+696q+lZv935gy7BGDV1JnbKR0E0yn+RA3zbf19Rm4LG+/cXesfXcCPz1sP55I01Spyxt4EZaVe0Gdq9zOmt9Zc0LkzcDlwCvGfabhq6kThnjwxGLwNa+/S3A0dUXJbkK+BXgNVX1f8MaNXQldcoYp4ztB7YlOR/4GrALeFP/BUkuAj4E7Kiqx0dp1NCV1CnLY1pQt6qOJbkZuAeYA/ZU1cEktwEHqmoB+F3gbODPkwB8tapeP6hdQ1dSp4zz4Yiq2gfsW3Xs1r7PV220TUNXUqe44I0kNWToSlJDsx66Qx+OSPKqJFcmOXvV8R2T65YknZgxPpE2EQNDN8nbgb8E3gY8nKT/EbjfnmTHJOlEjGvthUkZVuneBFxcVW8ArgB+Lck7eufWelpj5UTfo3X3LnxqPD2VpBGMc5WxSRg2pjtXVd8EqKp/TXIFcHdv+bJ1Q7f/0bo7v/DgbA+wSOqUWX8b8LBK99+S/OAzO70A/kngXOD7J9kxSToRsz68MKzSvR441n+gqo4B1yf50MR6JUknaNbfHDEwdKtqccC5+8bfHUk6Oad06ErSqWZMSy9MjKErqVOsdCWpoVmfvWDoSuoUK11JasgxXUlqaLkcXpCkZmZ8dMHQldQtjulKUkPOXpCkhqx0JakhZy9IUkNWupLU0DKGriQ1s7TkjTRJasbhBUlqyBtpktSQla4kNVTeSJOkdqb1wslRGbqSOmVpxgd1DV1JneKYriQ1ZOhKUkOO6UpSQ7Meupum3QFJGqeqGnkbJsmOJI8mOZzkljXOvzDJJ3rnH0jyimFtGrqSOmVpuUbeBkkyB9wBXANsB65Lsn3VZTcCT1TVK4EPAL8zrH+GrqROGWOleylwuKqOVNVTwF5g56prdgIf632+G7gySQY1auhK6pTlqpG3JPNJDvRt831NbQYe69tf7B1jrWuq6hjwJPDtg/rnjTRJnbKRKWNVtRvYvc7ptSrW1Y2Pcs1zWOlK6pSq0bchFoGtfftbgKPrXZPkDODFwDcGNWroSuqUpeXlkbch9gPbkpyf5AXALmBh1TULwM/0Pr8R+PsaUmpn1p/emIYk873/7dCE+DeePP/GJy/JtcDvA3PAnqr6rSS3AQeqaiHJWcDHgYtYqXB3VdWRgW0aus+X5EBVXTLtfnSZf+PJ8288mxxekKSGDF1JasjQXZvjYJPn33jy/BvPIMd0JakhK11JasjQlaSGDN0+w5Zx08lLsifJ40kennZfuirJ1iT3JnkkycEk75h2n3ScY7o9vWXc/hm4mpVH+/YD11XVoal2rGOS/CjwTeDOqrpw2v3poiTnAedV1ZeSnAM8CLzB/5Zng5XucaMs46aTVFWfZ8iz6To5VfX1qvpS7/N/A4/w/NWxNCWG7nGjLOMmnVJ6bzK4CHhguj3RMwzd4za8RJs0y5KcDfwF8M6q+q9p90crDN3jRlnGTTolJDmTlcD906r61LT7o+MM3eNGWcZNmnm918X8CfBIVb1/2v3Rcxm6Pb1XbdwM3MPKjYdPVtXB6faqe5LcBXwR+J4ki0lunHafOuhy4KeBH0/y5d527bQ7pRVOGZOkhqx0JakhQ1eSGjJ0JakhQ1eSGjJ0JakhQ1eSGjJ0Jamh/we0mwq/RR2gWAAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.plots.correlation([\"id\",\"mass (g)\", \"reclat\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 1. , -0.01888518, 0.25706522],\n", + " [-0.01888518, 1. , 0.02892697],\n", + " [ 0.25706522, 0.02892697, 1. ]])" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "op.profiler.run(df, \"product_id\")" + "df.correlation([\"id\",\"mass (g)\", \"reclat\"], output=\"array\")" ] }, { diff --git a/optimus/dataframe/columns.py b/optimus/dataframe/columns.py index f281e4108..e931557f8 100644 --- a/optimus/dataframe/columns.py +++ b/optimus/dataframe/columns.py @@ -2,7 +2,6 @@ import itertools import re import string -import timeit import unicodedata from fastnumbers import fast_float from functools import reduce @@ -17,8 +16,9 @@ from optimus.functions import abstract_udf as audf, concat from optimus.functions import filter_row_by_data_type as fbdt -from optimus.helpers.checkit import is_num_or_str, is_list, is_tuple, is_list_of_dataframes, is_list_of_tuples, \ - is_function, is_one_element, is_type, is_int, is_dict, is_str, is_ +from optimus.helpers.checkit \ + import is_num_or_str, is_list, is_, is_tuple, is_list_of_dataframes, is_list_of_tuples, \ + is_function, is_one_element, is_type, is_int, is_dict, is_str, has_ # Helpers from optimus.helpers.constants import * from optimus.helpers.decorators import add_attr @@ -83,8 +83,7 @@ def append(cols_values=None): col_name = c[0] value = c[1] df_result = df_result.cols.append(col_name, value) - else: - raise Exception("Must be List of dataframes or list of tuples") + return df_result @add_attr(cols) @@ -230,10 +229,10 @@ def rename(old_column, new_column, func=None): def rename(old_column, new_column): return rename([(old_column, new_column)], None) - def _cast(column, 
args): + def _cast(cols, args): """ Helper function to support the multiple params implementation - :param column: + :param cols: :param args: :return: """ @@ -244,21 +243,8 @@ def _cast(column, args): # if parse_spark_dtypes(attr[0]) def cast_factory(cls): - # Parse to Vector - func_return_type = None - cast_to_vectors = None - func_type = None - - if is_type(cls, Vectors): - func_type = "udf" - - def cast_to_vectors(val, attr): - return Vectors.dense(val) - - func_return_type = VectorUDT() # Parse standard data types - elif get_spark_dtypes_object(cls): - + if get_spark_dtypes_object(cls): func_type = "column_exp" def cast_to_vectors(col_name, attr): @@ -266,6 +252,15 @@ def cast_to_vectors(col_name, attr): func_return_type = None + # Parse to Vector + elif is_type(cls, Vectors): + func_type = "udf" + + def cast_to_vectors(val, attr): + return Vectors.dense(val) + + func_return_type = VectorUDT() + # Add here any other parse you want else: RaiseIt.value_error(cls) @@ -273,7 +268,7 @@ def cast_to_vectors(col_name, attr): return func_return_type, cast_to_vectors, func_type df = self - for col, args in zip(column, args): + for col, args in zip(cols, args): return_type, func, func_type = cast_factory(args[0]) df = df.withColumn(col, audf(col, func, func_return_type=return_type, @@ -528,15 +523,13 @@ def median(columns): return percentile(columns, [0.5]) @add_attr(cols) - def percentile(columns, values=None, error=1): + def percentile(columns, values=None, error=0): """ Return the percentile of a dataframe :param columns: '*', list of columns names or a single column name. 
:param values: list of percentiles to be calculated :return: percentiles per columns """ - start_time = timeit.default_timer() - if values is None: values = [0.05, 0.25, 0.5, 0.75, 0.95] @@ -554,8 +547,6 @@ def percentile(columns, values=None, error=1): percentile_results = dict(zip(columns, percentile_results)) - logging.info("percentile") - logging.info(timeit.default_timer() - start_time) return format_dict(percentile_results) # Descriptive Analytics @@ -903,6 +894,7 @@ def count_zeros(columns): """ Return the NAN and Null count in a Column :param columns: '*', list of columns names or a single column name. + :param type: Accepts integer, float, string or None :return: """ columns = parse_columns(self, columns) @@ -1228,7 +1220,6 @@ def hist(columns, min_value, max_value, buckets=10): """ columns = parse_columns(self, columns) - for col_name in columns: # Create splits splits = create_buckets(min_value, max_value, buckets) diff --git a/optimus/helpers/functions.py b/optimus/helpers/functions.py index 8634e654c..ca0df71fb 100644 --- a/optimus/helpers/functions.py +++ b/optimus/helpers/functions.py @@ -49,7 +49,7 @@ def get_spark_dtypes_object(value): try: data_type = [SPARK_DTYPES_DICT_OBJECTS[SPARK_SHORT_DTYPES[v]] for v in value] - except (KeyError, TypeError): + except KeyError: data_type = value data_type = one_list_to_val(data_type) diff --git a/optimus/io/load.py b/optimus/io/load.py index 6b3eee641..680f29a3e 100644 --- a/optimus/io/load.py +++ b/optimus/io/load.py @@ -109,9 +109,14 @@ def parquet(path, *args, **kwargs): @staticmethod def avro(path, *args, **kwargs): - print("Not yet implemented") - return - + try: + df = (Spark.instance.spark.read + .format("com.databricks.spark.avro") + .load(path, *args, **kwargs)) + except IOError as error: + logging.error(error) + raise + return df """ diff --git a/optimus/optimus.py b/optimus/optimus.py index b08596d20..ecf218ba0 100644 --- a/optimus/optimus.py +++ b/optimus/optimus.py @@ -41,6 +41,7 @@ def 
__init__(self, master="local[*]", app_name="optimus", checkpoint=False, path from optimus.dl.models import DL self.dl = DL() else: + Optimus.add_spark_packages(["com.databricks:spark-avro_2.11:4.0.0 pyspark-shell"]) Spark.instance = Spark(master, app_name) pass @@ -84,34 +85,19 @@ def enrich(self, df, func_request, func_response): @property def spark(self): - """ - Return a Spark session object - :return: - """ return Spark.instance.spark @property def sc(self): - """ - Return a Spark Context object - :return: - """ return Spark.instance.sc - def stop(self): - """ - Stop Spark Session - :return: - """ - Spark.instance.spark.stop() + @staticmethod + def concat(dfs, like): + return concat(dfs, like) @staticmethod def add_spark_packages(packages): - """ - Define the Spark packages that must be loaded at start time - :param packages: - :return: - """ + p = "--packages " + " ".join(packages) os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages " + " ".join(packages) @staticmethod @@ -190,7 +176,3 @@ def delete_check_point_folder(path, file_system): logging.info("Folder deleted.") else: RaiseIt.value_error(file_system, ["hadoop", "local"]) - - @staticmethod - def concat(dfs, like): - return concat(dfs, like) diff --git a/optimus/profiler/functions.py b/optimus/profiler/functions.py index dcc081863..674cc0468 100644 --- a/optimus/profiler/functions.py +++ b/optimus/profiler/functions.py @@ -1,6 +1,5 @@ import json import math -import timeit from pyspark.sql import functions as F from pyspark.sql.functions import when @@ -110,7 +109,6 @@ def bucketizer(df, columns, splits): :param splits: :return: """ - start_time = timeit.default_timer() columns = parse_columns(df, columns) def _bucketizer(col_name, args): @@ -139,11 +137,9 @@ def _bucketizer(col_name, args): return expr output_columns = [c + "_buckets" for c in columns] - # TODO: This seems weird but I can not find another way. 
Send the actual column name to the func not seems right df = df.cols.apply_expr(output_columns, _bucketizer, [splits, dict(zip(output_columns, columns))]) - logging.info("bucketizer") - logging.info(timeit.default_timer() - start_time) + return df diff --git a/optimus/profiler/profiler.py b/optimus/profiler/profiler.py index 1cbc74240..2c468f433 100644 --- a/optimus/profiler/profiler.py +++ b/optimus/profiler/profiler.py @@ -141,7 +141,7 @@ def _count_data_types(col_name): return results @staticmethod - def columns(df, columns, buckets=40, relative_error=1): + def columns(df, columns, buckets=10): """ Return statistical information about a specific column in json format count_data_type() @@ -242,9 +242,7 @@ def zeros(col_name): # https://stackoverflow.com/questions/45287832/pyspark-approxquantile-function max_value = fast_float(max_value) min_value = fast_float(min_value) - col_info['stats']['quantile'] = df.cols.percentile(col_name, [0.05, 0.25, 0.5, 0.75, 0.95], - relative_error) - + col_info['stats']['quantile'] = df.cols.percentile(col_name, [0.05, 0.25, 0.5, 0.75, 0.95]) col_info['stats']['range'] = max_value - min_value col_info['stats']['median'] = col_info['stats']['quantile'][0.5] col_info['stats']['interquartile_range'] = col_info['stats']['quantile'][0.75] - \ @@ -330,19 +328,17 @@ def infer_date(value, args): return column_info - def run(self, df, columns, buckets=40, relative_error=1): + def run(self, df, columns, buckets=40): """ - Return dataframe statistical information in HTML Format - + Return statistical information in HTML Format :param df: Dataframe to be analyzed :param columns: Columns to be analized - :param buckets: Number of buckets calculated to print the histogram - :param relative_error: Relative Error for quantile discretizer calculation + :param buckets: number of buckets calculated to print the histogram :return: """ columns = parse_columns(df, columns) - output = Profiler.to_json(df, columns, buckets, relative_error) + output = 
Profiler.to_json(df, columns, buckets) # Load jinja path = os.path.dirname(os.path.abspath(__file__)) @@ -383,6 +379,7 @@ def run(self, df, columns, buckets=40, relative_error=1): html = html + template.render(data=col, freq_pic=freq_pic, **hist_pic) html = html + df.table_html(10) + # df.plots.correlation(columns) # Display HTML display(HTML(html)) @@ -391,7 +388,7 @@ def run(self, df, columns, buckets=40, relative_error=1): write_json(output, self.path) @staticmethod - def to_json(df, columns, buckets=40, relative_error=1): + def to_json(df, columns, buckets=20): """ Return the profiling data in json format :param df: Dataframe to be processed @@ -400,14 +397,12 @@ def to_json(df, columns, buckets=40, relative_error=1): :return: json file """ - # Get the stats for all the columns - output = Profiler.columns(df, columns, buckets, relative_error) - - # Add the data summary to the output - output["summary"] = Profiler.dataset_info(df) + output = Profiler.columns(df, columns, buckets) + dataset = Profiler.dataset_info(df) + output["summary"] = dataset - # Get a data sample and transform it to friendly json format data = [] + # Get a sample of the data and transform it to friendly json format for l in df.sample_n(10).to_json(): data.append([v for k, v in l.items()]) output["sample"] = {"columns": df.columns, "data": data} diff --git a/optimus/spark.py b/optimus/spark.py index 2b5d60f53..94456f60e 100644 --- a/optimus/spark.py +++ b/optimus/spark.py @@ -1,3 +1,5 @@ +from functools import lru_cache + from pyspark.sql import SparkSession from optimus.helpers.constants import * @@ -30,11 +32,8 @@ def __init__(self, master="local[*]", app_name="optimus"): logging.info(STARTING_SPARK) # Build the spark session - self._spark = (SparkSession - .builder - .master(self.master) - .appName(self.app_name) - .getOrCreate()) + self.spark + @property def spark(self): @@ -43,7 +42,12 @@ def spark(self): :return: None """ - return self._spark + return (SparkSession + .builder + 
.master(self.master) + .appName(self.app_name) + .getOrCreate() + ) @property def sc(self): @@ -51,4 +55,4 @@ def sc(self): Return the Spark Context :return: """ - return self._spark.sparkContext + return self.spark.sparkContext diff --git a/optimus/version.py b/optimus/version.py index 59f084ee3..c13da0f32 100644 --- a/optimus/version.py +++ b/optimus/version.py @@ -5,5 +5,5 @@ def _safe_int(string): return string -__version__ = '2.0.6' +__version__ = '2.0.4' VERSION = tuple(_safe_int(x) for x in __version__.split('.')) diff --git a/requirements-docs.txt b/requirements-docs.txt index d4ad1024a..a66a5ed5a 100644 --- a/requirements-docs.txt +++ b/requirements-docs.txt @@ -1,5 +1,5 @@ findspark==1.3.0 -pytest==3.8.0 +pytest==3.7.2 numpy==1.15.1 matplotlib==2.2.3 ipython==6.5.0 diff --git a/requirements-test.txt b/requirements-test.txt index 1a5e55cb6..7251c66c3 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -7,12 +7,12 @@ python_dateutil==2.7.3 numpy==1.15.1 matplotlib==2.2.3 pyspark==2.3.1 -pytest==3.8.0 +pytest==3.7.2 findspark==1.3.0 nose==1.3.7 seaborn==0.9.0 setuptools==40.2.0 -deprecated==1.2.2 +deprecated==1.2.0 pyarrow==0.10.0 tabulate==0.8.2 Jinja2==2.10 @@ -24,4 +24,4 @@ six>=1.10.0 h5py>=2.7.0 flask==1.0.2 ipython==6.5.0 -pytest-cov==2.6.0 +pytest-cov==2.5.1 diff --git a/requirements.txt b/requirements.txt index da5d64de8..8f053bfd4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,12 +8,12 @@ nose==1.3.7 numpy==1.15.1 matplotlib==2.2.3 pyspark==2.3.1 -pytest==3.8.0 +pytest==3.7.2 findspark==1.3.0 nose==1.3.7 seaborn==0.9.0 setuptools==40.2.0 -deprecated==1.2.2 +deprecated==1.2.0 pyarrow==0.10.0 tabulate==0.8.2 Jinja2==2.10 diff --git a/setup.py b/setup.py index 2eaa372f7..4b9f29b93 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ def readme(): author='Favio Vazquez and Argenis Leon', author_email='favio.vazquez@ironmussa.com', url='https://github.com/ironmussa/Optimus/', - 
download_url='https://github.com/ironmussa/Optimus/archive/2.0.6.tar.gz', + download_url='https://github.com/ironmussa/Optimus/archive/2.0.4.tar.gz', description=('Optimus is the missing framework for cleaning and pre-processing data in a distributed fashion with ' 'pyspark.'), long_description=readme(), @@ -60,7 +60,7 @@ def readme(): }, dependency_links=dependency_links, test_suite='nose.collector', - include_package_data=True, + include_package_data=False, classifiers=[ 'Development Status :: 4 - Beta', 'Intended Audience :: Developers', From cc1860f007694c6bc3f5d70868c891a9216e57b7 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Mon, 10 Sep 2018 15:42:41 -0500 Subject: [PATCH 71/94] Fix merge --- tests/test_cols.py | 356 ++++++++++++++++++++++----------------------- 1 file changed, 172 insertions(+), 184 deletions(-) diff --git a/tests/test_cols.py b/tests/test_cols.py index aa0084116..398d5026f 100644 --- a/tests/test_cols.py +++ b/tests/test_cols.py @@ -557,188 +557,176 @@ def test_sort(): assert (actual_df.collect() == expected_df.collect()) @staticmethod + def test_nest(): + source_df = op.create.df( + rows=[ + ("happy", 1), + ("excited", 2) + ], + cols=[ + ("emotion", StringType(), True), + ("num", IntegerType(), True) + ] + ) + + actual_df = source_df.cols.nest(["emotion", "num"], "new", separator=" ") + + expected_df = op.create.df( + rows=[ + (1, "happy", "1 happy"), + (2, "excited", "2 excited") + ], + cols=[ + ("emotion", StringType(), True), + ("num", IntegerType(), True), + ("new", StringType(), True) + + ] + ) + + assert (actual_df.collect() == expected_df.collect()) + + @staticmethod + def test_nest_mix(): + source_df = op.create.df( + rows=[ + ("happy", 1), + ("excited", 2) + ], + cols=[ + ("emotion", StringType(), True), + ("num", IntegerType(), True) + ] + ) + + actual_df = source_df.cols.nest([F.Column("emotion"), "---", F.Column("num")], separator="new") -<< << << < HEAD - - -def test_nest(): - source_df = op.create.df( - rows=[ - ("happy", 
1), - ("excited", 2) - ], - cols=[ - ("emotion", StringType(), True), - ("num", IntegerType(), True) - ] - ) - - actual_df = source_df.cols.nest(["emotion", "num"], "new", separator=" ") - - expected_df = op.create.df( - rows=[ - (1, "happy", "1 happy"), - (2, "excited", "2 excited") - ], - cols=[ - ("emotion", StringType(), True), - ("num", IntegerType(), True), - ("new", StringType(), True) - - ] - ) - - assert (actual_df.collect() == expected_df.collect()) - - -@staticmethod -def test_nest_mix(): - source_df = op.create.df( - rows=[ - ("happy", 1), - ("excited", 2) - ], - cols=[ - ("emotion", StringType(), True), - ("num", IntegerType(), True) - ] - ) - - actual_df = source_df.cols.nest([F.Column("emotion"), "---", F.Column("num")], separator="new") - - expected_df = op.create.df( - rows=[ - (1, "happy", "1---happy"), - (2, "excited", "2---excited") - ], - cols=[ - ("emotion", StringType(), True), - ("num", IntegerType(), True), - ("new", StringType(), True) - - == == == = - - -def test_fill_na(): - source_df = op.create.df( - rows=[ - ("happy", 1, None), - ("excited", 2, 8) - ], - cols=[ - ("emotion", StringType(), True), - ("num1", IntegerType(), True), - ("num2", IntegerType(), True) - ] - ) - - actual_df = source_df.cols.fill_na("*", "N/A") - - expected_df = op.create.df( - rows=[ - ("happy", 1, "N/A"), - ("excited", 2, 8) - ], - cols=[ - ("emotion", StringType(), True), - ("num1", IntegerType(), True), - ("num2", IntegerType(), True) - >> >> >> > develop - ] - ) - - assert (actual_df.collect() == expected_df.collect()) - - -@staticmethod - -<< << << < HEAD - - -def test_nest_vector(): - source_df = op.create.df( - rows=[ - ("happy", 1), - ("excited", 2) - ], - cols=[ - ("emotion", StringType(), True), - ("num", IntegerType(), True) - ] - ) - - actual_df = source_df.cols.nest(["emotion", "num"], "new", shape="vector") - - expected_df = op.create.df( - rows=[ - (1, "happy", [1, "happy"]), - (2, "excited", [2, "excited"]) - ], - cols=[ - ("emotion", 
StringType(), True), - ("num", IntegerType(), True), - ("new", VectorUDT(), True) - - ] - ) - - assert (actual_df.collect() == expected_df.collect()) - - -@staticmethod -def test_nest_array(): - source_df = op.create.df( - rows=[ - ("happy", 1), - ("excited", 2) - ], - cols=[ - ("emotion", StringType(), True), - ("num", IntegerType(), True) - ] - ) - - actual_df = source_df.cols.nest(["emotion", "num"], "new", shape="array") - - expected_df = op.create.df( - rows=[ - (1, "happy", [1, "happy"]), - (2, "excited", [2, "excited"]) - ], - cols=[ - ("emotion", StringType(), True), - ("num", IntegerType(), True), - ("new", ArrayType(), True) - - == == == = - - -def test_is_na(): - source_df = op.create.df( - rows=[ - ("happy", None, 1), - ("excited", 2, 8) - ], - cols=[ - ("emotion", StringType(), True), - ("num1", IntegerType(), True), - ("num2", IntegerType(), True) - ] - ) - - actual_df = source_df.cols.fill_na("*", "N/A") - - expected_df = op.create.df( - rows=[ - (False, True, False), - (False, False, False) - ], - cols=[ - ("emotion", StringType(), True), - ("num1", IntegerType(), True), - ("num2", IntegerType(), True) - >> >> >> > develop - ] - ) - - assert (actual_df.collect() == expected_df.collect()) + expected_df = op.create.df( + rows=[ + (1, "happy", "1---happy"), + (2, "excited", "2---excited") + ], + cols=[ + ("emotion", StringType(), True), + ("num", IntegerType(), True), + ("new", StringType(), True)]) + + assert (actual_df.collect() == expected_df.collect()) + + @staticmethod + def test_fill_na(): + source_df = op.create.df( + rows=[ + ("happy", 1, None), + ("excited", 2, 8) + ], + cols=[ + ("emotion", StringType(), True), + ("num1", IntegerType(), True), + ("num2", IntegerType(), True) + ] + ) + + actual_df = source_df.cols.fill_na("*", "N/A") + + expected_df = op.create.df( + rows=[ + ("happy", 1, "N/A"), + ("excited", 2, 8) + ], + cols=[ + ("emotion", StringType(), True), + ("num1", IntegerType(), True), + ("num2", IntegerType(), True) + ] + ) + + 
assert (actual_df.collect() == expected_df.collect()) + + @staticmethod + def test_nest_vector(): + source_df = op.create.df( + rows=[ + ("happy", 1), + ("excited", 2) + ], + cols=[ + ("emotion", StringType(), True), + ("num", IntegerType(), True) + ] + ) + + actual_df = source_df.cols.nest(["emotion", "num"], "new", shape="vector") + + expected_df = op.create.df( + rows=[ + (1, "happy", [1, "happy"]), + (2, "excited", [2, "excited"]) + ], + cols=[ + ("emotion", StringType(), True), + ("num", IntegerType(), True), + ("new", VectorUDT(), True) + + ] + ) + + assert (actual_df.collect() == expected_df.collect()) + + @staticmethod + def test_nest_array(): + source_df = op.create.df( + rows=[ + ("happy", 1), + ("excited", 2) + ], + cols=[ + ("emotion", StringType(), True), + ("num", IntegerType(), True) + ] + ) + + actual_df = source_df.cols.nest(["emotion", "num"], "new", shape="array") + + expected_df = op.create.df( + rows=[ + (1, "happy", [1, "happy"]), + (2, "excited", [2, "excited"]) + ], + cols=[ + ("emotion", StringType(), True), + ("num", IntegerType(), True), + ("new", ArrayType(), True)]) + + assert (actual_df.collect() == expected_df.collect()) + + @staticmethod + def test_is_na(): + source_df = op.create.df( + rows=[ + ("happy", None, 1), + ("excited", 2, 8) + ], + cols=[ + ("emotion", StringType(), True), + ("num1", IntegerType(), True), + ("num2", IntegerType(), True) + ] + ) + + actual_df = source_df.cols.fill_na("*", "N/A") + + expected_df = op.create.df( + rows=[ + (False, True, False), + (False, False, False) + ], + cols=[ + ("emotion", StringType(), True), + ("num1", IntegerType(), True), + ("num2", IntegerType(), True) + + ] + ) + + assert (actual_df.collect() == expected_df.collect()) From 54237d6abf0b7dca87d3ae957ff269bc6ab3dd94 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 11 Sep 2018 10:47:28 -0500 Subject: [PATCH 72/94] Fix to handle one column dataframe creation with list. 
Test Added --- optimus/create.py | 4 +++- tests/test_optimus.py | 34 +++++++++++++++++++++++++--------- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/optimus/create.py b/optimus/create.py index c9a0ed9dc..4ca38b3ce 100644 --- a/optimus/create.py +++ b/optimus/create.py @@ -2,7 +2,7 @@ from pyspark.sql.types import StructField, StructType, StringType # Helpers -from optimus.helpers.checkit import is_tuple +from optimus.helpers.checkit import is_tuple, is_list_of_tuples from optimus.helpers.functions import get_spark_dtypes_object from optimus.spark import Spark @@ -18,6 +18,8 @@ def data_frame(cols, rows): :param rows: List of Tuples if vals with the same number and types that cols :return: Dataframe """ + if not is_list_of_tuples(rows): + rows = [(i,) for i in rows] specs = [] for c in cols: diff --git a/tests/test_optimus.py b/tests/test_optimus.py index 407bd0956..05d55f00d 100644 --- a/tests/test_optimus.py +++ b/tests/test_optimus.py @@ -9,18 +9,34 @@ class TestDataFrameCols(object): @staticmethod - def test_create_data_frames_plain(): + def test_create_data_frames_one_column(): source_df = op.create.df( - rows=[ - ("BOB", 1), - ("JoSe", 2) - ], - cols=[ - "name", - "age" - ] + rows=["Argenis", "Favio", "Matthew"], + cols=["name"] + ) + + actual_df = source_df + + expected_df = op.create.df( + rows=["Argenis", "Favio", "Matthew"], + cols=["name"] ) + assert (expected_df.collect() == actual_df.collect()) + + @staticmethod + def test_create_data_frames_plain(): + source_df = op.create.df( + rows=[ + ("BOB", 1), + ("JoSe", 2) + ], + cols=[ + "name", + "age" + ] + ) + actual_df = source_df expected_df = op.create.df( From a51f3590d890a356aa729a771508e0d227897fc7 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 11 Sep 2018 12:54:57 -0500 Subject: [PATCH 73/94] Fixed run(). 
Was not working --- optimus/dataframe/extension.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/optimus/dataframe/extension.py b/optimus/dataframe/extension.py index 371b69197..ad973ffda 100644 --- a/optimus/dataframe/extension.py +++ b/optimus/dataframe/extension.py @@ -128,23 +128,14 @@ def run(self): Sometimes when transformations are numerous, the computations are very extensive because the high number of operations that spark needs to run in order to get the results. - Other important thing is that apache spark usually save task but not result of dataFrame, so tasks are + Other important thing is that Apache Spark save task but not result of dataFrame, so tasks are accumulated and the same situation happens. - - The problem can be deal it with the checkPoint method. This method save the resulting dataFrame in disk, so - the lineage is cut. """ - # Check pointing of dataFrame. One question can be thought. Why not use cache() or persist() instead of - # checkpoint. 
This is because cache() and persis() apparently do not break the lineage of operations, - - logging.info("Saving changes at disk by checkpoint...") - self.cache().count + self.count() - logging.info("Done.") - - return True + return self @add_method(DataFrame) From 561e53edbfff5179c6cc95c932e98f8b3c0888e1 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Tue, 11 Sep 2018 12:55:45 -0500 Subject: [PATCH 74/94] Now Enricher is full integrated with Optimus --- examples/Enricher.ipynb | 804 ++++++++++++++++++++++++++++++ examples/new-api-column.ipynb | 7 + examples/new-api-enrichment.ipynb | 587 +++++----------------- optimus/dataframe/columns.py | 3 - optimus/enricher.py | 209 +++----- optimus/optimus.py | 6 +- requirements.txt | 3 +- 7 files changed, 1001 insertions(+), 618 deletions(-) create mode 100644 examples/Enricher.ipynb diff --git a/examples/Enricher.ipynb b/examples/Enricher.ipynb new file mode 100644 index 000000000..744d713d7 --- /dev/null +++ b/examples/Enricher.ipynb @@ -0,0 +1,804 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"..\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from optimus import Optimus\n", + "op = Optimus()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a Spark Dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 9 of 9 rows / 1 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
todo_id
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " 1\n", + "
\n", + " 2\n", + "
\n", + " 3\n", + "
\n", + " 4\n", + "
\n", + " 5\n", + "
\n", + " 6\n", + "
\n", + " 7\n", + "
\n", + " 8\n", + "
\n", + " 9\n", + "
\n", + "\n", + "
Viewing 9 of 9 rows / 1 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# make some test data\n", + "columns = ['todo_id']\n", + "vals = [1,2,3,4,5,6,7,8,9]\n", + "\n", + "# create DataFrame\n", + "df = op.create.df(columns,vals).repartition(1).cache()\n", + "df.table()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2bc92342affa42e2b9dd4cae8bf2dec2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, description='Processing...', max=9), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "import requests\n", + "\n", + "def func_request(params):\n", + " # You can use here whatever header or auth info you need to send. \n", + " # For more information see the requests library\n", + " url= \"https://jsonplaceholder.typicode.com/todos/\" + str(params[\"todo_id\"])\n", + "\n", + " return requests.get(url)\n", + "\n", + "def func_response(response):\n", + " # Here you can parse de response\n", + " return response[\"title\"]\n", + "\n", + "df_result = op.enrich(df, func_request= func_request, func_response= func_response)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 9 of 9 rows / 2 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
todo_id
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
jazz_results
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " 1\n", + " \n", + " delectus⸱aut⸱autem\n", + "
\n", + " 2\n", + " \n", + " quis⸱ut⸱nam⸱facilis⸱et⸱officia⸱qui\n", + "
\n", + " 3\n", + " \n", + " fugiat⸱veniam⸱minus\n", + "
\n", + " 4\n", + " \n", + " et⸱porro⸱tempora\n", + "
\n", + " 5\n", + " \n", + " laboriosam⸱mollitia⸱et⸱enim⸱quasi⸱adipisci⸱quia⸱provident⸱illum\n", + "
\n", + " 6\n", + " \n", + " qui⸱ullam⸱ratione⸱quibusdam⸱voluptatem⸱quia⸱omnis\n", + "
\n", + " 7\n", + " \n", + " illo⸱expedita⸱consequatur⸱quia⸱in\n", + "
\n", + " 8\n", + " \n", + " quo⸱adipisci⸱enim⸱quam⸱ut⸱ab\n", + "
\n", + " 9\n", + " \n", + " molestiae⸱perspiciatis⸱ipsa\n", + "
\n", + "\n", + "
Viewing 9 of 9 rows / 2 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df_result.table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enricher without Optimus" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a pandas daraframe" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
todo_iddescurl
06badhttps://jsonplaceholder.typicode.com/todos/6
17goodhttps://jsonplaceholder.typicode.com/todos/7
28uglyhttps://jsonplaceholder.typicode.com/todos/8
39tallhttps://jsonplaceholder.typicode.com/todos/9
410shorthttps://jsonplaceholder.typicode.com/todos/10
\n", + "
" + ], + "text/plain": [ + " todo_id desc url\n", + "0 6 bad https://jsonplaceholder.typicode.com/todos/6\n", + "1 7 good https://jsonplaceholder.typicode.com/todos/7\n", + "2 8 ugly https://jsonplaceholder.typicode.com/todos/8\n", + "3 9 tall https://jsonplaceholder.typicode.com/todos/9\n", + "4 10 short https://jsonplaceholder.typicode.com/todos/10" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "data = {\"todo_id\": [6, 7, 8, 9,10], \n", + " \"desc\": [\"bad\", \"good\", \"ugly\", \"tall\",\"short\"],\n", + " \"url\": [\"https://jsonplaceholder.typicode.com/todos/6\", \n", + " \"https://jsonplaceholder.typicode.com/todos/7\", \n", + " \"https://jsonplaceholder.typicode.com/todos/8\", \n", + " \"https://jsonplaceholder.typicode.com/todos/9\",\n", + " \"https://jsonplaceholder.typicode.com/todos/10\"]}\n", + "pdf =pd.DataFrame.from_dict(data)\n", + "pdf.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "from optimus.enricher import Enricher\n", + "\n", + "e = Enricher()\n", + "#e = Enricher(\"localhost\",27017, \"enricher\",\"optimus\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run the enrichment process" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 5 of 5 rows / 1 columns
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
todo_id
\n", + "
1 (bigint)
\n", + "\n", + "
\n", + " 1\n", + "
\n", + " 2\n", + "
\n", + " 3\n", + "
\n", + " 4\n", + "
\n", + " 5\n", + "
\n", + "\n", + "
Viewing 5 of 5 rows / 1 columns
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.table()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'e' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 17\u001b[1;33m \u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc_request\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mfunc_request\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc_response\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mfunc_response\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"jazz.csv\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mNameError\u001b[0m: name 'e' is not defined" + ] + } + ], + "source": [ + "import requests \n", + "\n", + "\n", + "def func_request(params):\n", + " # You can use here whatever header or auth info you need to send. 
\n", + " # For mor information see the requests library\n", + " url= \"https://jsonplaceholder.typicode.com/todos/\" + str(params[\"todo_id\"])\n", + "\n", + " return requests.get(url)\n", + "\n", + "\n", + "def func_response(response):\n", + " # Here you can parse de response\n", + " return response[\"title\"]\n", + "\n", + "\n", + "e.run(df, func_request= func_request, func_response= func_response, filename=\"jazz.csv\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Merge with dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Removed 10 documents\n" + ] + } + ], + "source": [ + "e.flush()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Some other operations" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['url', 'result', 'todo_id', '_id']" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "e.show_keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total documents:15\n", + "{'_id': ObjectId('5b940c37e254121740b5d19f'), 'todo_id': 1, 'url': 'https://jsonplaceholder.typicode.com/todos/1', 'result': 'delectus aut autem'}\n", + "{'_id': ObjectId('5b940c37e254121740b5d1a0'), 'todo_id': 2, 'url': 'https://jsonplaceholder.typicode.com/todos/2', 'result': 'quis ut nam facilis et officia qui'}\n", + "{'_id': ObjectId('5b940c37e254121740b5d1a1'), 'todo_id': 3, 'url': 'https://jsonplaceholder.typicode.com/todos/3', 'result': 'fugiat veniam minus'}\n", + "{'_id': ObjectId('5b940c37e254121740b5d1a2'), 'todo_id': 4, 'url': 'https://jsonplaceholder.typicode.com/todos/4', 'result': 'et porro tempora'}\n", + "{'_id': 
ObjectId('5b940c37e254121740b5d1a3'), 'todo_id': 5, 'url': 'https://jsonplaceholder.typicode.com/todos/5', 'result': 'laboriosam mollitia et enim quasi adipisci quia provident illum'}\n" + ] + } + ], + "source": [ + "e.head(\"BOB\",5)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/new-api-column.ipynb b/examples/new-api-column.ipynb index c860cf37c..cb139372e 100644 --- a/examples/new-api-column.ipynb +++ b/examples/new-api-column.ipynb @@ -1,5 +1,12 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/examples/new-api-enrichment.ipynb b/examples/new-api-enrichment.ipynb index 2ed793c88..b8f6233fa 100644 --- a/examples/new-api-enrichment.ipynb +++ b/examples/new-api-enrichment.ipynb @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -39,40 +39,62 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 9 of 9 rows / 1 columns
\n", + "
1 partition(s)
\n", "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -147,7 +169,7 @@ " \n", " \n", " \n", " \n", " \n", @@ -155,7 +177,8 @@ " \n", "
\n", "
todo_id
\n", - "
1 (bigint)
\n", - "\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", - " 94\n", + " 9\n", "
\n", "\n", - "
Viewing 9 of 9 rows / 1 columns
\n" + "
Viewing 9 of 9 rows / 1 columns
\n", + "
1 partition(s)
\n" ], "text/plain": [ "" @@ -168,48 +191,37 @@ "source": [ "# make some test data\n", "columns = ['todo_id']\n", - "vals = [\n", - " (1, ),\n", - " (2, ),\n", - " (3, ),\n", - " (4, ),\n", - " (5, ),\n", - " (6, ),\n", - " (7, ),\n", - " (8, ),\n", - " (94, ),\n", - " \n", - "\n", - "]\n", + "vals = [1,2,3,4,5,6,7,8,9]\n", "\n", "# create DataFrame\n", - "df = op.spark.createDataFrame(vals, columns).repartition(1).cache()\n", + "df = op.create.df(columns,vals).repartition(1).cache()\n", "df.table()" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 20, + "execution_count": 39, "metadata": {}, "outputs": [ { - "ename": "AttributeError", - "evalue": "'function' object has no attribute 'create_id'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 14\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 16\u001b[1;33m \u001b[0mdf_result\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mop\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0menrich\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc_request\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mfunc_request\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc_response\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mfunc_response\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\optimus.py\u001b[0m in \u001b[0;36menrich\u001b[1;34m(self, df, func_request, func_response)\u001b[0m\n\u001b[0;32m 82\u001b[0m \"\"\"\n\u001b[0;32m 83\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 84\u001b[1;33m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0menricher\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc_request\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mfunc_request\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc_response\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mfunc_response\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 85\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 86\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\enricher.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(self, df, collection_name, func_request, func_response, return_type, filename, calls, period)\u001b[0m\n\u001b[0;32m 71\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 72\u001b[0m \u001b[1;31m# Load the dataframe data in the enricher\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 73\u001b[1;33m \u001b[0mdf_result\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcols\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcreate_id\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 74\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_result\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 75\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mAttributeError\u001b[0m: 'function' object has no attribute 'create_id'" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c8890ac5987a42d0916a029cf55abdaa", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, description='Processing...', max=9), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "\n" ] } ], @@ -223,198 +235,81 @@ "\n", " return requests.get(url)\n", "\n", - "\n", "def func_response(response):\n", " # Here you can parse de response\n", " return response[\"title\"]\n", "\n", - "\n", - "df_result = op.enrich(df, func_request= func_request, func_response= func_response)\n" + "df_result = op.enrich(df, func_request= func_request, func_response= func_response)" ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "'NoneType' object has no attribute 'table'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdf_result\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'table'" - ] - } - ], - "source": [ - "df_result.table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Enricher without Optimus" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a pandas daraframe" - ] - }, - { - "cell_type": "code", - "execution_count": 126, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
todo_iddescurl
06badhttps://jsonplaceholder.typicode.com/todos/6
17goodhttps://jsonplaceholder.typicode.com/todos/7
28uglyhttps://jsonplaceholder.typicode.com/todos/8
39tallhttps://jsonplaceholder.typicode.com/todos/9
410shorthttps://jsonplaceholder.typicode.com/todos/10
\n", - "
" - ], - "text/plain": [ - " todo_id desc url\n", - "0 6 bad https://jsonplaceholder.typicode.com/todos/6\n", - "1 7 good https://jsonplaceholder.typicode.com/todos/7\n", - "2 8 ugly https://jsonplaceholder.typicode.com/todos/8\n", - "3 9 tall https://jsonplaceholder.typicode.com/todos/9\n", - "4 10 short https://jsonplaceholder.typicode.com/todos/10" - ] - }, - "execution_count": 126, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "data = {\"todo_id\": [6, 7, 8, 9,10], \n", - " \"desc\": [\"bad\", \"good\", \"ugly\", \"tall\",\"short\"],\n", - " \"url\": [\"https://jsonplaceholder.typicode.com/todos/6\", \n", - " \"https://jsonplaceholder.typicode.com/todos/7\", \n", - " \"https://jsonplaceholder.typicode.com/todos/8\", \n", - " \"https://jsonplaceholder.typicode.com/todos/9\",\n", - " \"https://jsonplaceholder.typicode.com/todos/10\"]}\n", - "pdf =pd.DataFrame.from_dict(data)\n", - "pdf.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 169, - "metadata": {}, - "outputs": [], - "source": [ - "from optimus.enricher import Enricher\n", - "\n", - "e = Enricher()\n", - "#e = Enricher(\"localhost\",27017, \"enricher\",\"optimus\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run the enrichment process" - ] - }, - { - "cell_type": "code", - "execution_count": 173, + "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", - "
Viewing 5 of 5 rows / 1 columns
\n", + "
Viewing 9 of 9 rows / 2 columns
\n", + "
1 partition(s)
\n", "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", @@ -428,6 +323,10 @@ " 1\n", " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -436,12 +335,8 @@ " 2\n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -449,208 +344,35 @@ " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", - " \n", - "
\n", "
todo_id
\n", - "
1 (bigint)
\n", - "\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
jazz_results
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", + " delectus⸱aut⸱autem\n", + "
\n", - " 3\n", + " quis⸱ut⸱nam⸱facilis⸱et⸱officia⸱qui\n", "
\n", - " 4\n", + " 3\n", "
\n", - " 5\n", + " fugiat⸱veniam⸱minus\n", "
\n", - "\n", - "
Viewing 5 of 5 rows / 1 columns
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df.table()" - ] - }, - { - "cell_type": "code", - "execution_count": 176, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "3782e14c59674612a19663282bd7d31c", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Processing...', max=5), HTML(value='')))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0faffca247b24402a52b3eee067d1d98", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Saving...', max=5), HTML(value='')))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Removed 5 documents\n" - ] - }, - { - "ename": "AttributeError", - "evalue": "'DataFrame' object has no attribute 'load'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 17\u001b[1;33m \u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc_request\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mfunc_request\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc_response\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mfunc_response\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mfilename\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"jazz.csv\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\enricher.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(self, df, collection_name, func_request, func_response, return_type, filename, calls, period)\u001b[0m\n\u001b[0;32m 124\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msave_to_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 125\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 126\u001b[1;33m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcsv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 127\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 128\u001b[0m \u001b[1;31m#\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\pyspark\\sql\\dataframe.py\u001b[0m in \u001b[0;36m__getattr__\u001b[1;34m(self, name)\u001b[0m\n\u001b[0;32m 1180\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1181\u001b[0m raise AttributeError(\n\u001b[1;32m-> 1182\u001b[1;33m \"'%s' object has no attribute '%s'\" % (self.__class__.__name__, name))\n\u001b[0m\u001b[0;32m 1183\u001b[0m \u001b[0mjc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 
1184\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mColumn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'load'" - ] - } - ], - "source": [ - "import requests \n", - "\n", - "\n", - "def func_request(params):\n", - " # You can use here whatever header or auth info you need to send. \n", - " # For mor information see the requests library\n", - " url= \"https://jsonplaceholder.typicode.com/todos/\" + str(params[\"todo_id\"])\n", - "\n", - " return requests.get(url)\n", - "\n", - "\n", - "def func_response(response):\n", - " # Here you can parse de response\n", - " return response[\"title\"]\n", - "\n", - "\n", - "e.run(df, func_request= func_request, func_response= func_response, filename=\"jazz.csv\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Merge with dataframe" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Removed 10 documents\n" - ] - } - ], - "source": [ - "e.flush()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "### You can prepare a rul using nest" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
Viewing 5 of 5 rows / 2 columns
\n", - "\n", - "\n", - " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - "\n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -658,11 +380,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -670,11 +392,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -682,11 +404,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -694,11 +416,11 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -706,7 +428,8 @@ " \n", "
\n", - "
todo_id
\n", - "
1 (bigint)
\n", - "\n", - "
\n", + " 4\n", + " \n", - "
url
\n", - "
2 (string)
\n", - "\n", - "
\n", + " et⸱porro⸱tempora\n", + "
\n", - " 1\n", + " 5\n", " \n", - " https://jsonplaceholder.typicode.com/todos/1\n", + " laboriosam⸱mollitia⸱et⸱enim⸱quasi⸱adipisci⸱quia⸱provident⸱illum\n", "
\n", - " 2\n", + " 6\n", " \n", - " https://jsonplaceholder.typicode.com/todos/2\n", + " qui⸱ullam⸱ratione⸱quibusdam⸱voluptatem⸱quia⸱omnis\n", "
\n", - " 3\n", + " 7\n", " \n", - " https://jsonplaceholder.typicode.com/todos/3\n", + " illo⸱expedita⸱consequatur⸱quia⸱in\n", "
\n", - " 4\n", + " 8\n", " \n", - " https://jsonplaceholder.typicode.com/todos/4\n", + " quo⸱adipisci⸱enim⸱quam⸱ut⸱ab\n", "
\n", - " 5\n", + " 9\n", " \n", - " https://jsonplaceholder.typicode.com/todos/5\n", + " molestiae⸱perspiciatis⸱ipsa\n", "
\n", "\n", - "
Viewing 5 of 5 rows / 2 columns
\n" + "
Viewing 9 of 9 rows / 2 columns
\n", + "
1 partition(s)
\n" ], "text/plain": [ "" @@ -717,61 +440,15 @@ } ], "source": [ - "from pyspark.sql import functions as F\n", - " \n", - "# Prepare the URL\n", - "df_url = df.cols.nest((\"https://jsonplaceholder.typicode.com/todos/\",F.col(\"todo_id\")),\"url\")\n", - "df_url.table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Some other operations" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['url', 'result', 'todo_id', '_id']" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "e.show_keys()" + "df_result.table()" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total documents:15\n", - "{'_id': ObjectId('5b940c37e254121740b5d19f'), 'todo_id': 1, 'url': 'https://jsonplaceholder.typicode.com/todos/1', 'result': 'delectus aut autem'}\n", - "{'_id': ObjectId('5b940c37e254121740b5d1a0'), 'todo_id': 2, 'url': 'https://jsonplaceholder.typicode.com/todos/2', 'result': 'quis ut nam facilis et officia qui'}\n", - "{'_id': ObjectId('5b940c37e254121740b5d1a1'), 'todo_id': 3, 'url': 'https://jsonplaceholder.typicode.com/todos/3', 'result': 'fugiat veniam minus'}\n", - "{'_id': ObjectId('5b940c37e254121740b5d1a2'), 'todo_id': 4, 'url': 'https://jsonplaceholder.typicode.com/todos/4', 'result': 'et porro tempora'}\n", - "{'_id': ObjectId('5b940c37e254121740b5d1a3'), 'todo_id': 5, 'url': 'https://jsonplaceholder.typicode.com/todos/5', 'result': 'laboriosam mollitia et enim quasi adipisci quia provident illum'}\n" - ] - } - ], - "source": [ - "e.head(\"BOB\",5)" - ] + "outputs": [], + "source": [] } ], "metadata": { @@ -779,18 +456,6 @@ "display_name": "Python 3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - 
}, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" } }, "nbformat": 4, diff --git a/optimus/dataframe/columns.py b/optimus/dataframe/columns.py index 51b5c200d..aaec0fbab 100644 --- a/optimus/dataframe/columns.py +++ b/optimus/dataframe/columns.py @@ -1124,18 +1124,15 @@ def nest(input_cols, output_col, shape="string", separator=""): columns = parse_columns(self, input_cols) if shape is "vector": - vector_assembler = VectorAssembler( inputCols=input_cols, outputCol=output_col) df = vector_assembler.transform(self) elif shape is "array": - df = apply_expr(output_col, F.array(*columns)) elif shape is "string": - df = apply_expr(output_col, F.concat_ws(separator, *columns)) else: RaiseIt.value_error(shape, ["vector", "array", "string"]) diff --git a/optimus/enricher.py b/optimus/enricher.py index f3cdeef7f..50c2cb865 100644 --- a/optimus/enricher.py +++ b/optimus/enricher.py @@ -4,12 +4,18 @@ import pandas as pd import requests +from backoff import on_exception, expo from pymongo import MongoClient from pyspark.sql.functions import DataFrame +from pyspark.sql.functions import pandas_udf, PandasUDFType +from ratelimit import limits, RateLimitException from tqdm import tqdm_notebook from optimus.helpers.checkit import is_function, is_ -from optimus.helpers.functions import random_int + +# Temporal col used to create a temporal ID to join the enriched data in mongo with the dataframe. 
+COL_ID = "jazz_id" +COL_RESULTS = "jazz_results" class Enricher: @@ -17,7 +23,7 @@ class Enricher: Enrich data from a Pandas or Spark dataframe """ - def __init__(self, host="localhost", port=27017, db_name="jazz", collection_name="data", op=None, *args, + def __init__(self, op=None, host="localhost", port=27017, db_name="jazz", collection_name="data", *args, **kwargs): """ @@ -38,9 +44,6 @@ def __init__(self, host="localhost", port=27017, db_name="jazz", collection_name self.client = MongoClient(host, port, *args, **kwargs) self.op = op - # FIFTEEN_MINUTES = 900 - # @limits(calls=15, period=FIFTEEN_MINUTES) - def send(self, df): """ Send the dataframe to the mongo collection @@ -55,8 +58,8 @@ def send(self, df): else: raise Exception("df must by a Spark Dataframe or Pandas Dataframe") - def run(self, df, collection_name=None, func_request=None, func_response=None, return_type="json", filename=None, - calls=None, period=60): + def run(self, df, collection_name=None, func_request=None, func_response=None, return_type="json", calls=60, + period=60, max_tries=8): """ Read a the url key from a mongo collection an make a request to a service :param df: Dataframe to me loaded to the enricher collection. 
@@ -66,29 +69,40 @@ def run(self, df, collection_name=None, func_request=None, func_response=None, r :param return_type: :param calls: how many call can you make :param period: in which period ot time can the call be made + :param max_tries: how many retries should we do :return: """ # Load the dataframe data in the enricher - df_result = df.cols.create_id() - self.send(df_result) + if is_(df, DataFrame): + df = df.create_id(COL_ID) + + # Load the dataframe data in the enricher + self.send(df) if collection_name is None: collection_name = self.collection_name - collection = self.get_collection(collection_name) - cursor = collection.find({"result": {"$exists": False}}) + # Get data that is not yet enriched + cursor = collection.find({COL_RESULTS: {"$exists": False}}) total_docs = cursor.count(True) - if total_docs > 0: - if func_request is None: - func_request = requests.get + if func_request is None: + func_request = requests.get + collection = self.get_collection(collection_name) + + @on_exception(expo, RateLimitException, max_tries=max_tries) + @limits(calls=calls, period=period) + def _func_request(v): + return func_request(v) + + if total_docs > 0: for c in tqdm_notebook(cursor, total=total_docs, desc='Processing...'): # Send request to the API - response = func_request(c) + response = _func_request(c) mongo_id = c["_id"] @@ -102,33 +116,42 @@ def run(self, df, collection_name=None, func_request=None, func_response=None, r if is_function(func_response): response = func_response(response) - # update the mongo id with the result - self.get_collection(collection_name).find_and_modify(query={"_id": mongo_id}, - update={"$set": {'result': response}}, - upsert=False, full_response=True) + # Update the mongo id with the result + collection.find_and_modify(query={"_id": mongo_id}, + update={"$set": {COL_RESULTS: response}}, + upsert=False, full_response=True) else: # The response key will remain blank so we can filter it to try in future request - 
print(response.status_code) + logging.info(response.status_code) - # Save a temporal data file to be merged with the dataframe. - # If someone knows a way get the data form the collection and merge it the source dataframe - # please open an issue. - if filename is None: - filename = random_int() + ".csv" + # Append the data in enrichment to the dataframe - # Save temporal file from mongo to - # self.save_to_csv(filename, collection_name) + logging.info("Appending collection info into the dataframe") + # TODO: An elegant way to handle pickling? + # take care to the pickling + host = self.host + port = self.port + db_name = self.db_name - # Load from the temporal fgi - # df_result = self.op.load.csv(filename) + @pandas_udf('string', PandasUDFType.SCALAR) + def func(value): + # More about pickling + from pymongo import MongoClient + _client = MongoClient(host, port) + _db = _client[db_name] + _collection = _db[collection_name] - # join both the actual dataframe an the temp csv + def func_serie(serie): + _cursor = _collection.find_one({COL_ID: serie}, projection={"_id": 0, COL_RESULTS: 1}) + return _cursor[COL_RESULTS] - # Flush the mongo collection - # self.flush() - return True + return value.apply(func_serie) + + df = df.withColumn(COL_RESULTS, func(df[COL_ID])).cols.drop(COL_ID).run() - # + # If the process is finished, flush the Mongo collection + self.flush() + return df else: print("No records available to process") @@ -148,7 +171,7 @@ def flush(self): """ count = self.count() self.drop_collection(self.collection_name) - print("Removed {count} documents".format(count=count)) + logging.info("Removed {count} documents".format(count=count)) def collection_exists(self, collection_name): """ @@ -319,118 +342,6 @@ def save_to_csv(self, filename, collection_name=None, projection=None, limit=0): file.close() - # CSV https: // gist.github.com / jxub / f722e0856ed461bf711684b0960c8458 - - def save_to_json(self, filename, projection=None, limit=0): - """ - Save collection 
to json file - :param filename: - :param projection: - :param limit: - :return: - """ - - _collection = self.get_collection(self.collection_name) - - file = open(filename, "w") - file.write('[') - - i = 0 - - # dump all the data - documents = _collection.find({}, projection).limit(limit) - count = documents.count(True) - - for r in tqdm_notebook(documents, total=count, desc='Processing records'): - - i = i + 1 - # FIX: we should remove the id in the projection - r.pop('_id') - - file.write(json.dumps(r)) - if (i < count): - file.write(',') - - file.write(']') - file.close() - - def save_to_geojson(self, filename, coordinates_keys, projection=None, limit=0): - """ - Save collection to geojson file - :param filename: Output file - :param coordinates_keys: - :param projection: - :return: - """ - - _collection = self.get_collection(self.collection_name) - - file = open(filename, "w") - file.write('{"type": "FeatureCollection","features":[') - - i = 0 - - # dump all the data - projection = coordinates_keys + projection - documents = _collection.find({}, projection).limit(limit) - count = documents.count(True) - - for r in tqdm_notebook(documents, total=count, desc='Processing records'): - - i = i + 1 - - lon_key = coordinates_keys[0] - # Verify if the key exist and is a float number - if (lon_key in r) and (isinstance(r[lon_key], float)): - lon = r[lon_key] - - lat_key = coordinates_keys[1] - if (lat_key in r) and (isinstance(r[lat_key], float)): - lat = r[lat_key] - - # FIX: we should remove the id in the projection - r.pop('_id') - r.pop(lon_key) - r.pop(lat_key) - - features = {"type": "Feature", "properties": r, "geometry": {"type": "Point", "coordinates": [lon, lat]}} - - file.write(json.dumps(features)) - if (i < count): - file.write(',') - - file.write(']}') - file.close() - - # FIX: not work need to find how to implement in a cursor object - def head_cursor(self, collection_name_or_cursor, n=1): - """ - - :param collection_name_or_cursor: - :param n: - :return: 
- """ - if not int(n): raise Exception('n must be an integer') - - if isinstance(collection_name_or_cursor, str): - # try to bring a cursor from a collection - cursor = self.get_collection(collection_name_or_cursor).find({}).limit(n) - count = cursor.count(True) - else: - cursor = collection_name_or_cursor - cursor.rewind() - count = n - - # FIX: and elegant way to make it in python? - i = 0 - - for c in cursor: - if (i < count): - print(c) - else: - break - i = i + 1 - def insert_to_collection(self, cursor, dest_collection_name, drop=False): """ Insert a cursor into a collection @@ -453,7 +364,7 @@ def insert_to_collection(self, cursor, dest_collection_name, drop=False): def create_missing_fields(self, cols, collection_name=None): """ - + Helper function to fill missing keys in a json :param cols: :param collection_name: :return: @@ -490,8 +401,6 @@ def cast(self, collection_name, field, convert_to): cursor = collection.find({field: {'$exists': True}}).limit(0) desc = 'Converting', field, 'to', convert_to - # for c in tqdm_notebook(cursor, total = cursor.count(), desc = 'sad'): - if convert_to == 'int': data_type = float elif convert_to == 'float': diff --git a/optimus/optimus.py b/optimus/optimus.py index 1b9ec2a50..38660b417 100644 --- a/optimus/optimus.py +++ b/optimus/optimus.py @@ -20,7 +20,6 @@ class Optimus: def __init__(self, master="local[*]", app_name="optimus", checkpoint=False, path=None, file_system="local", - verbose=False, dl=False, server=False, repositories=None, @@ -40,6 +39,7 @@ def __init__(self, master="local[*]", app_name="optimus", checkpoint=False, path :param file_system: 'local' or 'hadoop' :param additional_options: + :param options: Configuration options that are passed to spark-submit. See `the list of possible options `_. 
@@ -123,7 +123,7 @@ def __init__(self, master="local[*]", app_name="optimus", checkpoint=False, path self.read = self.spark.read self.profiler = Profiler() self.ml = ML() - self.enricher = Enricher(enricher_localhost, enricher_port, op=self) + self.enricher = Enricher(op=self, host=enricher_localhost, port=enricher_port, ) def enrich(self, df, func_request, func_response): """ @@ -134,7 +134,7 @@ def enrich(self, df, func_request, func_response): :return: """ - self.enricher.run(df, func_request=func_request, func_response=func_response) + return self.enricher.run(df, func_request=func_request, func_response=func_response) @property def spark(self): diff --git a/requirements.txt b/requirements.txt index 39421366c..32df9f7f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,4 +27,5 @@ flask==1.0.2 ipython==6.5.0 ratelimit==2.2.0 humanize==0.5.1 -psutil \ No newline at end of file +psutil==5.4.7 +backoff=1.6.0 \ No newline at end of file From f2d62147184de6aa538c0c2fbf0e71f49aea1120 Mon Sep 17 00:00:00 2001 From: faviovazquez Date: Wed, 12 Sep 2018 10:16:18 -0500 Subject: [PATCH 75/94] Add backoff requirement --- requirements-test.txt | 1 + requirements.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 361200ed5..dfdf9d2da 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -27,3 +27,4 @@ ipython==6.5.0 humanize==0.5.1 pytest-cov==2.6.0 psutil +backoff==1.6.0 diff --git a/requirements.txt b/requirements.txt index 32df9f7f7..9285226a3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,4 +28,4 @@ ipython==6.5.0 ratelimit==2.2.0 humanize==0.5.1 psutil==5.4.7 -backoff=1.6.0 \ No newline at end of file +backoff==1.6.0 \ No newline at end of file From 2da2db20d1cac76c14b7e9ddef5f62a2fd073b78 Mon Sep 17 00:00:00 2001 From: faviovazquez Date: Wed, 12 Sep 2018 10:29:29 -0500 Subject: [PATCH 76/94] Add pymongo as requirement --- requirements-test.txt | 1 + 
requirements.txt | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index dfdf9d2da..a14c01a40 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -28,3 +28,4 @@ humanize==0.5.1 pytest-cov==2.6.0 psutil backoff==1.6.0 +pymongo diff --git a/requirements.txt b/requirements.txt index 9285226a3..d15c7a07e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,4 +28,5 @@ ipython==6.5.0 ratelimit==2.2.0 humanize==0.5.1 psutil==5.4.7 -backoff==1.6.0 \ No newline at end of file +backoff==1.6.0 +pymongo \ No newline at end of file From d7538e30417dc638a4b11e9653c3db28f16ddc50 Mon Sep 17 00:00:00 2001 From: faviovazquez Date: Wed, 12 Sep 2018 10:56:33 -0500 Subject: [PATCH 77/94] Add ratelimit as dependency --- requirements-test.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-test.txt b/requirements-test.txt index a14c01a40..b401a9045 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -29,3 +29,4 @@ pytest-cov==2.6.0 psutil backoff==1.6.0 pymongo +ratelimit==2.2.0 From 608ab00c1d88cd0e38ade350616adee172fea220 Mon Sep 17 00:00:00 2001 From: faviovazquez Date: Wed, 12 Sep 2018 11:14:06 -0500 Subject: [PATCH 78/94] Add tqdm as requirement --- requirements-test.txt | 1 + requirements.txt | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index b401a9045..ea4dd4ceb 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -30,3 +30,4 @@ psutil backoff==1.6.0 pymongo ratelimit==2.2.0 +tqdm \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index d15c7a07e..99e937e38 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,4 +29,5 @@ ratelimit==2.2.0 humanize==0.5.1 psutil==5.4.7 backoff==1.6.0 -pymongo \ No newline at end of file +pymongo +tqdm \ No newline at end of file From e09568e5dac990666c1607648259970240c529ac Mon Sep 17 00:00:00 2001 From: faviovazquez Date: 
Wed, 12 Sep 2018 11:25:14 -0500 Subject: [PATCH 79/94] Add kombu as req --- requirements-test.txt | 3 ++- requirements.txt | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements-test.txt b/requirements-test.txt index ea4dd4ceb..9d4576c9a 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -30,4 +30,5 @@ psutil backoff==1.6.0 pymongo ratelimit==2.2.0 -tqdm \ No newline at end of file +tqdm +kombu \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 99e937e38..b25a6b1f0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,4 +30,4 @@ humanize==0.5.1 psutil==5.4.7 backoff==1.6.0 pymongo -tqdm \ No newline at end of file +kombu \ No newline at end of file From e3fde04b87939c15d559f8bbaff332de2c367dd6 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Wed, 12 Sep 2018 11:36:59 -0500 Subject: [PATCH 80/94] Remove junk file --- optimus/tasks.py | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 optimus/tasks.py diff --git a/optimus/tasks.py b/optimus/tasks.py deleted file mode 100644 index 45f04299c..000000000 --- a/optimus/tasks.py +++ /dev/null @@ -1,11 +0,0 @@ -from celery import Celery - -app = Celery('tasks', broker='pyamqp://guest@localhost//') - -# To run -# Install erglang -# Install rabiitmq -# >> celery -A tasks worker --loglevel=info -@app.task -def add(x, y): - return x + y From accc59fa0e5f7f7153775c5882b2bfb63b07423b Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Wed, 12 Sep 2018 11:46:37 -0500 Subject: [PATCH 81/94] Removed junk file --- optimus/enrichment/worker.py | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100644 optimus/enrichment/worker.py diff --git a/optimus/enrichment/worker.py b/optimus/enrichment/worker.py deleted file mode 100644 index e8a79e02f..000000000 --- a/optimus/enrichment/worker.py +++ /dev/null @@ -1,15 +0,0 @@ -from celery import Celery -import requests -import os - -# Create the app and set the broker location 
(RabbitMQ) -app = Celery('worker', - backend='rpc://', - broker='redis://localhost:6379') - - -@app.task -def download(url): - response = requests.get(url) - data = response.text() - print(data) \ No newline at end of file From 0939e1cae5e44f34945897407f2ad429ee5b54d0 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Wed, 12 Sep 2018 13:13:12 -0500 Subject: [PATCH 82/94] data_frame() now use keyword arguments instead of dispatch --- optimus/create.py | 67 ++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 36 deletions(-) diff --git a/optimus/create.py b/optimus/create.py index 4ca38b3ce..69ef3cd54 100644 --- a/optimus/create.py +++ b/optimus/create.py @@ -1,8 +1,8 @@ -from multipledispatch import dispatch +import pandas as pd from pyspark.sql.types import StructField, StructType, StringType # Helpers -from optimus.helpers.checkit import is_tuple, is_list_of_tuples +from optimus.helpers.checkit import is_tuple, is_list_of_tuples, is_ from optimus.helpers.functions import get_spark_dtypes_object from optimus.spark import Spark @@ -10,54 +10,49 @@ class Create: @staticmethod - @dispatch(list, list) - def data_frame(cols, rows): + def data_frame(cols=None, rows=None, pdf=None): """ Helper to create a Spark dataframe: :param cols: List of Tuple with name, data type and a flag to accept null :param rows: List of Tuples if vals with the same number and types that cols + :param pdf: :return: Dataframe """ - if not is_list_of_tuples(rows): - rows = [(i,) for i in rows] + if is_(pdf, pd.DataFrame): + result = Spark.instance.spark.createDataFrame(pdf) + else: + if not is_list_of_tuples(rows): + rows = [(i,) for i in rows] - specs = [] - for c in cols: + specs = [] - # Get columns name - if not is_tuple(c): - col_name = c - else: - col_name = c[0] + for c in cols: - # Get columns data type - if len(c) == 2: - var_type = get_spark_dtypes_object(c[1]) - else: - var_type = StringType() + # Get columns name + if not is_tuple(c): + col_name = c + 
else: + col_name = c[0] - # Get column nullable flag. It's just to tell if a column accept nulls as values - if len(c) == 3: - nullable = c[2] - else: - nullable = True + # Get columns data type + if len(c) == 2: + var_type = get_spark_dtypes_object(c[1]) + else: + var_type = StringType() - # If tuple has not the third param with put it to true to accepts Null in columns - specs.append([col_name, var_type, nullable]) + # Get column nullable flag. It's just to tell if a column accept nulls as values + if len(c) == 3: + nullable = c[2] + else: + nullable = True - struct_fields = list(map(lambda x: StructField(*x), specs)) + # If tuple has not the third param with put it to true to accepts Null in columns + specs.append([col_name, var_type, nullable]) - return Spark.instance.spark.createDataFrame(rows, StructType(struct_fields)) + struct_fields = list(map(lambda x: StructField(*x), specs)) - @staticmethod - @dispatch(object) - def data_frame(pdf): - """ - Helper to create a Spark dataframe: - :param pdf: Panda Dataframe - :return: Dataframe - """ + result = Spark.instance.spark.createDataFrame(rows, StructType(struct_fields)) - return Spark.instance.spark.createDataFrame(pdf) + return result df = data_frame From 2373b6ec50d9080abadc018661f189b516861af0 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Wed, 12 Sep 2018 13:40:55 -0500 Subject: [PATCH 83/94] Fixed wrong data type in dataframe create --- optimus/create.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/optimus/create.py b/optimus/create.py index 69ef3cd54..eb566ce58 100644 --- a/optimus/create.py +++ b/optimus/create.py @@ -35,7 +35,7 @@ def data_frame(cols=None, rows=None, pdf=None): col_name = c[0] # Get columns data type - if len(c) == 2: + if len(c) >= 2: var_type = get_spark_dtypes_object(c[1]) else: var_type = StringType() @@ -43,6 +43,7 @@ def data_frame(cols=None, rows=None, pdf=None): # Get column nullable flag. 
It's just to tell if a column accept nulls as values if len(c) == 3: nullable = c[2] + var_type = get_spark_dtypes_object(c[1]) else: nullable = True From 962d46a6cdf32551677c848c6a653282d8980b12 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Wed, 12 Sep 2018 13:41:26 -0500 Subject: [PATCH 84/94] Fixed keyword argument --- tests/test_optimus.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_optimus.py b/tests/test_optimus.py index 05d55f00d..39139fd33 100644 --- a/tests/test_optimus.py +++ b/tests/test_optimus.py @@ -121,7 +121,7 @@ def test_create_data_frames_pandas(): # Create pandas dataframe pdf = pd.DataFrame.from_records(data, columns=labels) - actual_df = op.create.df(pdf) + actual_df = op.create.df(pdf=pdf) expected_df = op.create.df( rows=[ @@ -136,3 +136,4 @@ def test_create_data_frames_pandas(): ) assert (expected_df.collect() == actual_df.collect()) + From 3a2677dd49cf4e716d4a9041d17ed274e728b7bc Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Wed, 12 Sep 2018 16:16:55 -0500 Subject: [PATCH 85/94] News tests fixed --- tests/test_cols.py | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/tests/test_cols.py b/tests/test_cols.py index 398d5026f..c0ec85078 100644 --- a/tests/test_cols.py +++ b/tests/test_cols.py @@ -1,4 +1,6 @@ -from pyspark.ml.linalg import Vectors, VectorUDT +import logging + +from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector from pyspark.sql import Row from pyspark.sql import functions as F from pyspark.sql.types import * @@ -6,6 +8,10 @@ from optimus import Optimus op = Optimus() +# op.sc.setLogLevel("INFO") + +s_logger = logging.getLogger('py4j.java_gateway') +s_logger.setLevel(logging.INFO) class TestDataFrameCols(object): @@ -397,24 +403,23 @@ def test_cast_vector(): source_df = op.create.df( rows=[ ("happy", [1, 2, 3]), - ("excited", 2) + ("excited", [4, 5, 6]) ], cols=[ - ("emotion", ArrayType(), True), - ("num", 
IntegerType(), True) + ("emotion", StringType(), True), + ("num", ArrayType(IntegerType()), True) ] ) - actual_df = source_df.cols.cast("happy", Vectors) + actual_df = source_df.cols.cast("num", Vectors) expected_df = op.create.df( rows=[ - ("happy", [1, 2, 3]), - ("excited", 2) - ], + ("happy", DenseVector([1, 2, 3])), + ("excited", DenseVector([4, 5, 6]))], cols=[ - ("emotion", VectorUDT(), True), - ("num", StringType(), True) + ("emotion", StringType(), True), + ("num", VectorUDT(), True) ] ) @@ -573,8 +578,8 @@ def test_nest(): expected_df = op.create.df( rows=[ - (1, "happy", "1 happy"), - (2, "excited", "2 excited") + ("happy", 1, "happy 1"), + ("excited", 2, "excited 2") ], cols=[ ("emotion", StringType(), True), @@ -599,12 +604,12 @@ def test_nest_mix(): ] ) - actual_df = source_df.cols.nest([F.Column("emotion"), "---", F.Column("num")], separator="new") + actual_df = source_df.cols.nest([F.col("emotion"), F.col("num")], "new", separator="--") expected_df = op.create.df( rows=[ - (1, "happy", "1---happy"), - (2, "excited", "2---excited") + ("happy", 1, "happy--1"), + ("excited", 2, "excited--2") ], cols=[ ("emotion", StringType(), True), @@ -714,7 +719,7 @@ def test_is_na(): ] ) - actual_df = source_df.cols.fill_na("*", "N/A") + actual_df = source_df.cols.is_na("*") expected_df = op.create.df( rows=[ @@ -722,9 +727,9 @@ def test_is_na(): (False, False, False) ], cols=[ - ("emotion", StringType(), True), - ("num1", IntegerType(), True), - ("num2", IntegerType(), True) + ("emotion", BooleanType(), True), + ("num1", BooleanType(), True), + ("num2", BooleanType(), True) ] ) From 528f8eec096f4aeeaf4be19ab36ab08f82238c51 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Wed, 12 Sep 2018 16:52:30 -0500 Subject: [PATCH 86/94] Fiz fill na function --- optimus/dataframe/columns.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimus/dataframe/columns.py b/optimus/dataframe/columns.py index aaec0fbab..95eba40e6 100644 --- 
a/optimus/dataframe/columns.py +++ b/optimus/dataframe/columns.py @@ -853,7 +853,7 @@ def fill_na(columns, value): columns = parse_columns(self, columns) def _fill_na(_col_name, _value): - return F.when(F.isnan(_col_name) | F.col(_col_name).isNull(), _value).otherwise(_col_name) + return F.when(F.isnan(_col_name) | F.col(_col_name).isNull(), _value).otherwise(F.col(_col_name)) df = self for col_name in columns: From 6688bdf08497008e6340ac1f1b1d2ad1d39f388d Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Wed, 12 Sep 2018 18:48:25 -0500 Subject: [PATCH 87/94] Additional fixes --- optimus/create.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/optimus/create.py b/optimus/create.py index eb566ce58..d72660822 100644 --- a/optimus/create.py +++ b/optimus/create.py @@ -2,7 +2,7 @@ from pyspark.sql.types import StructField, StructType, StringType # Helpers -from optimus.helpers.checkit import is_tuple, is_list_of_tuples, is_ +from optimus.helpers.checkit import is_tuple, is_, is_list, is_one_element, is_list_of_tuples from optimus.helpers.functions import get_spark_dtypes_object from optimus.spark import Spark @@ -15,37 +15,38 @@ def data_frame(cols=None, rows=None, pdf=None): Helper to create a Spark dataframe: :param cols: List of Tuple with name, data type and a flag to accept null :param rows: List of Tuples if vals with the same number and types that cols - :param pdf: + :param pdf: a pandas dataframe :return: Dataframe """ if is_(pdf, pd.DataFrame): result = Spark.instance.spark.createDataFrame(pdf) else: - if not is_list_of_tuples(rows): - rows = [(i,) for i in rows] specs = [] + # Process the rows + if not is_list_of_tuples(rows): + rows = [(i,) for i in rows] + # Process the columns for c in cols: - # Get columns name - if not is_tuple(c): - col_name = c - else: - col_name = c[0] - # Get columns data type - if len(c) >= 2: - var_type = get_spark_dtypes_object(c[1]) - else: + if is_one_element(c): + 
col_name = c var_type = StringType() + nullable = True + + elif is_tuple(c): - # Get column nullable flag. It's just to tell if a column accept nulls as values - if len(c) == 3: - nullable = c[2] + # Get columns data type + col_name = c[0] var_type = get_spark_dtypes_object(c[1]) - else: - nullable = True + + count = len(c) + if count == 2: + nullable = True + elif count == 3: + nullable = c[2] # If tuple has not the third param with put it to true to accepts Null in columns specs.append([col_name, var_type, nullable]) From bbfcaad13bf5049740cc2f6c7195deb8d25f048b Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Wed, 12 Sep 2018 18:48:33 -0500 Subject: [PATCH 88/94] Fixes to test --- tests/test_cols.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/tests/test_cols.py b/tests/test_cols.py index c0ec85078..3afd89ca7 100644 --- a/tests/test_cols.py +++ b/tests/test_cols.py @@ -637,12 +637,12 @@ def test_fill_na(): expected_df = op.create.df( rows=[ ("happy", 1, "N/A"), - ("excited", 2, 8) + ("excited", 2, "8") ], cols=[ ("emotion", StringType(), True), - ("num1", IntegerType(), True), - ("num2", IntegerType(), True) + ("num1", StringType(), True), + ("num2", StringType(), True) ] ) @@ -652,25 +652,29 @@ def test_fill_na(): def test_nest_vector(): source_df = op.create.df( rows=[ - ("happy", 1), - ("excited", 2) + ("happy", 1, 4), + ("excited", 2, 5), + ("sad", 3, 6) ], cols=[ ("emotion", StringType(), True), - ("num", IntegerType(), True) + ("num", IntegerType(), True), + ("num 2", IntegerType(), True) ] ) - actual_df = source_df.cols.nest(["emotion", "num"], "new", shape="vector") + actual_df = source_df.cols.nest(["num", "num 2"], "new", shape="vector") expected_df = op.create.df( rows=[ - (1, "happy", [1, "happy"]), - (2, "excited", [2, "excited"]) + ("happy", 1, 4, DenseVector([1, 4])), + ("excited", 2, 5, DenseVector([2, 5])), + ("sad", 3, 6, DenseVector([3, 6])) ], cols=[ ("emotion", StringType(), True), ("num", 
IntegerType(), True), + ("num 2", IntegerType(), True), ("new", VectorUDT(), True) ] @@ -695,13 +699,13 @@ def test_nest_array(): expected_df = op.create.df( rows=[ - (1, "happy", [1, "happy"]), - (2, "excited", [2, "excited"]) + ("happy", 1, ["happy", "1"]), + ("excited", 2, ["excited", "2"]) ], cols=[ ("emotion", StringType(), True), ("num", IntegerType(), True), - ("new", ArrayType(), True)]) + ("new", ArrayType(StringType()), True)]) assert (actual_df.collect() == expected_df.collect()) From 4eef2ff76dbb1e7c3a23f2853e246e7e7e7fff44 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Wed, 12 Sep 2018 18:48:54 -0500 Subject: [PATCH 89/94] Added SyntaxError Exception --- optimus/functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimus/functions.py b/optimus/functions.py index 2258a3743..6aa3d199b 100644 --- a/optimus/functions.py +++ b/optimus/functions.py @@ -271,7 +271,7 @@ def str_to_date(value): def str_to_array(value): """ - Check if value can be pased to tupple or arrays. + Check if value can be parsed to a tuple or and array. 
Because Spark can handle tuples we will try to transform tuples to arrays :param value: :return: @@ -279,7 +279,7 @@ def str_to_array(value): try: if isinstance(literal_eval((value.encode('ascii', 'ignore')).decode("utf-8")), (list, tuple)): return True - except ValueError: + except (ValueError, SyntaxError,): pass def func(value): From 95320cba59eecf7a0ff62c8a9110006a456fe185 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Wed, 12 Sep 2018 18:49:06 -0500 Subject: [PATCH 90/94] isin() rename to is_in() --- optimus/dataframe/rows.py | 2 +- tests/test_rows.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/optimus/dataframe/rows.py b/optimus/dataframe/rows.py index 215c5953c..1edafb858 100644 --- a/optimus/dataframe/rows.py +++ b/optimus/dataframe/rows.py @@ -178,7 +178,7 @@ def drop_first(): return self.zipWithIndex().filter(lambda tup: tup[1] > 0).map(lambda tup: tup[0]) @add_attr(rows) - def isin(columns, values): + def is_in(columns, values): """ Filter rows which columns that match a specific value :return: Spark DataFrame diff --git a/tests/test_rows.py b/tests/test_rows.py index 41883fa68..8151db308 100644 --- a/tests/test_rows.py +++ b/tests/test_rows.py @@ -3,7 +3,6 @@ from optimus.functions import abstract_udf as audf op = Optimus() -sc = op.sc source_df = op.create.df([ ("words", "str", True), @@ -164,7 +163,7 @@ def test_sort(): assert (expected_df.collect() == actual_df.collect()) @staticmethod - def test_isin(): + def test_is_in(): actual_df = source_df.rows.isin("num", 2) expected_df = op.create.df([ @@ -176,7 +175,7 @@ def test_isin(): ("filter", StringType(), True) ], [ - (" zombies", 2, "cat", "tv", 6, "b") + (" zombies", 2, "cat", "tv", 6, "b"), ]) assert (expected_df.collect() == actual_df.collect()) From bdfa6fe4815e0382d61ec6680cbf2401d45209d9 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Wed, 12 Sep 2018 18:50:06 -0500 Subject: [PATCH 91/94] This requiered additional work. 
Will be added in a next release --- tests/test_session.py | 197 ------------------------------------------ 1 file changed, 197 deletions(-) delete mode 100644 tests/test_session.py diff --git a/tests/test_session.py b/tests/test_session.py deleted file mode 100644 index cb9a74d79..000000000 --- a/tests/test_session.py +++ /dev/null @@ -1,197 +0,0 @@ -import sys -import unittest - -try: - from unittest import mock -except ImportError: - import mock - -from pyspark import SparkContext - -from optimus import Optimus - - -class TestOptimusSession(unittest.TestCase): - maxDiff = None - - def setUp(self): - super(TestOptimusSession, self).setUp() - self.spark_context_mock = mock.Mock(spec=SparkContext) - - self.patches = [ - mock.patch('optimus.sc', self.spark_context_mock), - ] - [p.start() for p in self.patches] - - def tearDown(self): - [p.stop() for p in self.patches] - super(TestOptimusSession, self).tearDown() - - def test_has_package(self): - op = Optimus() - self.assertFalse(op.has_package('datastax:spark-cassandra-connector')) - - op.packages = ['datastax:spark-cassandra-connector:1.6.1-s_2.10'] - self.assertTrue(op.has_package('datastax:spark-cassandra-connector')) - - def test_has_jar(self): - op = Optimus() - self.assertFalse(op.has_jar('mysql-connector-java')) - - op.jars = ['mysql-connector-java-5.1.39-bin.jar'] - self.assertTrue(op.has_jar('mysql-connector-java')) - - @mock.patch('optimus.os') - def test_session_with_packages(self, os_mock): - os_mock.environ = {} - - Optimus(packages=['package1', 'package2']) - - self.assertEqual(os_mock.environ, { - 'PYSPARK_PYTHON': sys.executable, - 'PYSPARK_SUBMIT_ARGS': ( - '--packages package1,package2 ' - '--conf "spark.sql.catalogImplementation=hive" ' - 'pyspark-shell' - ), - }) - - @mock.patch('optimus.os') - def test_session_with_repositories(self, os_mock): - os_mock.environ = {} - - Optimus( - packages=['package1', 'package2'], - repositories=[ - 'http://my.maven.repo', - 'http://another.maven.repo', - ]) - - 
self.assertEqual(os_mock.environ, { - 'PYSPARK_PYTHON': sys.executable, - 'PYSPARK_SUBMIT_ARGS': ( - '--repositories http://my.maven.repo,http://another.maven.repo ' - '--packages package1,package2 ' - '--conf "spark.sql.catalogImplementation=hive" ' - 'pyspark-shell' - ), - }) - - @mock.patch('optimus.os') - def test_session_with_jars(self, os_mock): - os_mock.environ = {} - - Optimus(jars=['file_a.jar', 'file_b.jar']) - - self.assertEqual(os_mock.environ, { - 'PYSPARK_PYTHON': sys.executable, - 'PYSPARK_SUBMIT_ARGS': ( - '--jars file_a.jar,file_b.jar ' - '--conf "spark.sql.catalogImplementation=hive" ' - 'pyspark-shell' - ), - }) - - @mock.patch('optimus.os') - def test_session_with_options(self, os_mock): - os_mock.environ = {} - - # test options attached to class definition - Optimus( - options={ - 'spark.option.a': 'value_a', - 'spark.option.b': 'value_b', - }) - - self.assertEqual(os_mock.environ, { - 'PYSPARK_PYTHON': sys.executable, - 'PYSPARK_SUBMIT_ARGS': ( - '--conf "spark.option.a=value_a" ' - '--conf "spark.option.b=value_b" ' - '--conf "spark.sql.catalogImplementation=hive" ' - 'pyspark-shell' - ), - }) - - # test additional_options override/extend options attached to class definition - os_mock.environ = {} - - Optimus(additional_options={ - 'spark.option.b': 'value_0', - 'spark.option.c': 'value_c', - }) - - self.assertEqual(os_mock.environ, { - 'PYSPARK_PYTHON': sys.executable, - 'PYSPARK_SUBMIT_ARGS': ( - '--conf "spark.option.a=value_a" ' - '--conf "spark.option.b=value_0" ' - '--conf "spark.option.c=value_c" ' - '--conf "spark.sql.catalogImplementation=hive" ' - 'pyspark-shell' - ), - }) - - # test catalog implementation is respected - os_mock.environ = {} - - Optimus(options={ - 'spark.sql.catalogImplementation': 'my_fancy_catalog', - }) - - self.assertEqual(os_mock.environ, { - 'PYSPARK_PYTHON': sys.executable, - 'PYSPARK_SUBMIT_ARGS': ( - '--conf "spark.sql.catalogImplementation=my_fancy_catalog" ' - 'pyspark-shell' - ), - }) - - 
@mock.patch('optimus.os') - def test_session_without_packages_jars_and_options(self, os_mock): - os_mock.environ = {} - - Optimus() - - self.assertEqual(os_mock.environ, { - 'PYSPARK_PYTHON': sys.executable, - 'PYSPARK_SUBMIT_ARGS': '--conf "spark.sql.catalogImplementation=hive" pyspark-shell', - }) - - @mock.patch('optimus.os') - def test_session_appends_to_pyspark_submit_args(self, os_mock): - os_mock.environ = { - 'PYSPARK_SUBMIT_ARGS': '--conf "my.conf.here=5g" --and-other-properties', - } - - Optimus() - - self.assertEqual(os_mock.environ, { - 'PYSPARK_PYTHON': sys.executable, - 'PYSPARK_SUBMIT_ARGS': ( - '--conf "my.conf.here=5g" --and-other-properties ' - '--conf "spark.sql.catalogImplementation=hive" ' - 'pyspark-shell' - ), - }) - - # test more complicated session - os_mock.environ = { - 'PYSPARK_SUBMIT_ARGS': '--conf "my.conf.here=5g" --and-other-properties', - } - - Optimus(options={'my.conf.here': '10g'}) - - self.assertEqual(os_mock.environ, { - 'PYSPARK_PYTHON': sys.executable, - 'PYSPARK_SUBMIT_ARGS': ( - '--conf "my.conf.here=5g" --and-other-properties ' - # Note that spark honors the first conf it sees when multiple - # are defined - '--conf "my.conf.here=10g" ' - '--conf "spark.sql.catalogImplementation=hive" ' - 'pyspark-shell' - ), - }) - - From 5e26e1726af843b80cdeb4e48d364782d59eb817 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Wed, 12 Sep 2018 19:01:35 -0500 Subject: [PATCH 92/94] More fixes --- tests/test_optimus.py | 2 +- tests/test_rows.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_optimus.py b/tests/test_optimus.py index 39139fd33..3f4c7c769 100644 --- a/tests/test_optimus.py +++ b/tests/test_optimus.py @@ -47,7 +47,7 @@ def test_create_data_frames_plain(): cols= [ ("name", StringType(), True), - ("age", IntegerType(), False) + ("age", StringType(), False) ] ) diff --git a/tests/test_rows.py b/tests/test_rows.py index 8151db308..19a34aa5e 100644 --- a/tests/test_rows.py +++ b/tests/test_rows.py 
@@ -164,7 +164,7 @@ def test_sort(): @staticmethod def test_is_in(): - actual_df = source_df.rows.isin("num", 2) + actual_df = source_df.rows.is_in("num", 2) expected_df = op.create.df([ ("words", "str", True), @@ -176,6 +176,7 @@ def test_is_in(): ], [ (" zombies", 2, "cat", "tv", 6, "b"), + ("simpsons cat lady", 2, "frog", "table", 7, "1") ]) assert (expected_df.collect() == actual_df.collect()) From 1e96fb78070308a652e9097ba2b0de5d6a15c843 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Wed, 12 Sep 2018 19:21:34 -0500 Subject: [PATCH 93/94] Fixes proposed by codacy --- optimus/create.py | 2 +- optimus/dataframe/columns.py | 2 +- optimus/enricher.py | 13 +++++++------ optimus/io/load.py | 18 ------------------ 4 files changed, 9 insertions(+), 26 deletions(-) diff --git a/optimus/create.py b/optimus/create.py index d72660822..f9158e955 100644 --- a/optimus/create.py +++ b/optimus/create.py @@ -2,7 +2,7 @@ from pyspark.sql.types import StructField, StructType, StringType # Helpers -from optimus.helpers.checkit import is_tuple, is_, is_list, is_one_element, is_list_of_tuples +from optimus.helpers.checkit import is_tuple, is_, is_one_element, is_list_of_tuples from optimus.helpers.functions import get_spark_dtypes_object from optimus.spark import Spark diff --git a/optimus/dataframe/columns.py b/optimus/dataframe/columns.py index 95eba40e6..fa29524b2 100644 --- a/optimus/dataframe/columns.py +++ b/optimus/dataframe/columns.py @@ -1118,7 +1118,7 @@ def nest(input_cols, output_col, shape="string", separator=""): df = self if has_(input_cols, F.Column): - "Transform non Column data to lit" + # Transform non Column data to lit columns = [F.lit(col) if not is_(col, F.Column) else col for col in input_cols] else: columns = parse_columns(self, input_cols) diff --git a/optimus/enricher.py b/optimus/enricher.py index 50c2cb865..4771b8626 100644 --- a/optimus/enricher.py +++ b/optimus/enricher.py @@ -217,14 +217,15 @@ def copy_collection(self, source_name, dest_name): 
source = self.db[source_name] - logging.info('Dropping', dest_name, 'collection') + logging.info("Dropping {dest_name} collection".format(dest_name=dest_name)) self.db[dest_name].drop() # if data exist in the collection drop it pipeline = [{"$match": {}}, {"$out": dest_name}, ] - logging.info('Copying', source_name, 'collection to', dest_name, 'collection ...') + logging.info("Copying {source_name} collection to {dest_name} collection ...".format(source_name=source_name, + dest_name=dest_name)) source.aggregate(pipeline) logging.info('Done') @@ -284,7 +285,7 @@ def drop_keys(collection_name, keys): :return: """ for key in tqdm_notebook(keys, desc='Processing cols'): - logging.info('Dropping', key, 'field') + logging.info("Dropping {key}".format(key=key)) collection_name.update_many({}, {'$unset': {key: 1}}) def drop_collection(self, collection_name): @@ -376,7 +377,7 @@ def create_missing_fields(self, cols, collection_name=None): source = self.collection for c in tqdm_notebook(cols, total=len(cols), desc='Processing cols'): - logging.info('Inserting', c) + logging.info("Inserting {c}".format(c=c)) if c: source.update_many( {c: {'$exists': False}}, @@ -387,7 +388,7 @@ def create_missing_fields(self, cols, collection_name=None): } ); else: - logging.info('Field', c, 'could not be added') + logging.info("Field {c} could not be added".format(c=c)) def cast(self, collection_name, field, convert_to): """ @@ -417,4 +418,4 @@ def cast(self, collection_name, field, convert_to): collection.update_one({'_id': c['_id']}, {'$set': {field: val}}) except ValueError: - logging.info('Could not convert "', val, '" to', convert_to) + logging.info("Could not convert '{val}' to '{convert_to}'".format(val=val, convert_to=convert_to)) diff --git a/optimus/io/load.py b/optimus/io/load.py index a2804982a..33edac0a0 100644 --- a/optimus/io/load.py +++ b/optimus/io/load.py @@ -2,8 +2,6 @@ import tempfile from urllib.request import Request, urlopen -from kombu import Consumer - from 
optimus.helpers.raiseit import RaiseIt from optimus.spark import Spark @@ -118,22 +116,6 @@ def avro(path, *args, **kwargs): return df - -""" - @staticmethod - def rabbit_mq(): - def process_message(body, message): - print("The body is {}".format(body)) - message.ack() - - with Consumer(conn, queues=queue, callbacks=[process_message], accept=["application/json"]): - line = conn.drain_events(timeout=5) - print(line) - - # conn.heartbeat_check() -""" - - class Downloader(object): def __init__(self, data_def): self.data_def = data_def From f1d31646304b603b892f293b9e3bcec8a4cc33f8 Mon Sep 17 00:00:00 2001 From: faviovazquez Date: Wed, 12 Sep 2018 23:49:42 -0500 Subject: [PATCH 94/94] Bump version --- docs/source/conf.py | 2 +- optimus/version.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index b2e63c4f8..933231ccf 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -60,7 +60,7 @@ # The short X.Y version. version = '2.0' # The full version, including alpha/beta/rc tags. -release = '2.0.6' +release = '2.0.7' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/optimus/version.py b/optimus/version.py index 59f084ee3..45b7a8f6d 100644 --- a/optimus/version.py +++ b/optimus/version.py @@ -5,5 +5,5 @@ def _safe_int(string): return string -__version__ = '2.0.6' +__version__ = '2.0.7' VERSION = tuple(_safe_int(x) for x in __version__.split('.')) diff --git a/setup.py b/setup.py index 2eaa372f7..f7e3ef8c3 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ def readme(): author='Favio Vazquez and Argenis Leon', author_email='favio.vazquez@ironmussa.com', url='https://github.com/ironmussa/Optimus/', - download_url='https://github.com/ironmussa/Optimus/archive/2.0.6.tar.gz', + download_url='https://github.com/ironmussa/Optimus/archive/2.0.7.tar.gz', description=('Optimus is the missing framework for cleaning and pre-processing data in a distributed fashion with ' 'pyspark.'), long_description=readme(),