Merge pull request #130 from ironmussa/1.0.3
Build 1.0.3. Closes #129, closes #131
FavioVazquez committed Oct 2, 2017
2 parents 46c9d05 + a6d3c47 commit 224303d
Showing 11 changed files with 1,289 additions and 1,348 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -4,7 +4,8 @@
[![built_by iron](https://img.shields.io/badge/built_by-iron-FF69A4.svg)](http://ironmussa.com) [![Updates](https://pyup.io/repos/github/ironmussa/Optimus/shield.svg)](https://pyup.io/repos/github/ironmussa/Optimus/)
[![GitHub release](https://img.shields.io/github/release/ironmussa/optimus.svg)](https://github.com/ironmussa/Optimus/) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/e01572e2af5640fcbcdd58e7408f3ea0)](https://www.codacy.com/app/favio.vazquezp/Optimus?utm_source=github.com&utm_medium=referral&utm_content=ironmussa/Optimus&utm_campaign=badger) [![StackShare](https://img.shields.io/badge/tech-stack-0690fa.svg?style=flat)](https://stackshare.io/iron-mussa/devops)

[![Platforms](https://img.shields.io/badge/platform-Linux%20%7C%20Mac%20OS%20%7C%20Windows-blue.svg)](https://spark.apache.org/docs/2.2.0/#downloading) [![Dependency Status](https://gemnasium.com/badges/github.com/ironmussa/Optimus.svg)](https://gemnasium.com/github.com/ironmussa/Optimus) [![Quality Gate](https://sonarqube.com/api/badges/gate?key=ironmussa-optimus:optimus)](https://sonarqube.com/dashboard/index/ironmussa-optimus:optimus)
[![Platforms](https://img.shields.io/badge/platform-Linux%20%7C%20Mac%20OS%20%7C%20Windows-blue.svg)](https://spark.apache.org/docs/2.2.0/#downloading) [![Dependency Status](https://gemnasium.com/badges/github.com/ironmussa/Optimus.svg)](https://gemnasium.com/github.com/ironmussa/Optimus) [![Quality Gate](https://sonarqube.com/api/badges/gate?key=ironmussa-optimus:optimus)](https://sonarqube.com/dashboard/index/ironmussa-optimus:optimus) [![Code Health](https://landscape.io/github/ironmussa/Optimus/develop/landscape.svg?style=flat)](https://landscape.io/github/ironmussa/Optimus/develop)


[![Join the chat at https://gitter.im/optimuspyspark/Lobby](https://badges.gitter.im/optimuspyspark/Lobby.svg)](https://gitter.im/optimuspyspark/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)

110 changes: 103 additions & 7 deletions docs/index.rst
@@ -108,7 +108,7 @@ Lets assume you have the following dataset, called foo.csv, in your current dire
# case, local file system (hard drive of the pc) is used.
filePath = "file:///" + os.getcwd() + "/foo.csv"
df = tools.read_dataset_csv(path=filePath,
df = tools.read_csv(path=filePath,
delimiter_mark=',')
# Instance of profiler class
@@ -135,6 +135,7 @@ dataFrames.
- DataFrameAnalyzer.get_numerical_hist(df_one_col, num_bars)
- DataFrameAnalyzer.unique_values_col(column)
- DataFrameAnalyzer.write_json(json_cols, path_to_json_file)
- DataFrameAnalyzer.get_frequency(columns, sort_by_count=True)

Let's assume you have the following dataset, called foo.csv, in your current directory:

@@ -180,23 +181,23 @@ Lets assume you have the following dataset, called foo.csv, in your current dire
| 19 | JAMES | Chadwick | 467 | null | 10 | 1921/05/03 | # |
+----+----------------------+-------------+-----------+------------+-------+------------+----------+

The following code shows how to instanciate the class to analyze a dataFrame:
The following code shows how to instantiate the class to analyze a dataFrame:

.. code:: python
# Import optimus
import optimus as op
# Instance of Utilities class
tools = op.Utilites()
tools = op.Utilities()
# Reading dataframe. os.getcwd() returns the current directory of the notebook
# 'file:///' is a prefix that specifies the type of file system used, in this
# case, local file system (hard drive of the pc) is used.
filePath = "file:///" + os.getcwd() + "/foo.csv"
df = tools.read_dataset_csv(path=filePath, delimiter_mark=',')
df = tools.read_csv(path=filePath, delimiter_mark=',')
analyzer = op.DataFrameAnalizer(df=df,pathFile=filePath)
analyzer = op.DataFrameAnalyzer(df=df,pathFile=filePath)
Methods
--------
@@ -534,7 +535,7 @@ Example:
Analyzer.write_json(json_cols, path_to_json_file)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

This functions ... and outputs a JSON in the specified path.
This function outputs a JSON for the DataFrame in the specified path.

Input:

@@ -556,6 +557,101 @@ Example:
analyzer.write_json(json_cols=json_cols, path_to_json_file= os.getcwd() + "/foo.json")
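For context, the resulting file is ordinary JSON, so it can be inspected with Python's standard ``json`` module. A minimal sketch follows; the column-summary structure shown is an assumption for illustration, since the actual keys produced by ``write_json`` depend on the analyzed DataFrame:

```python
import json
import os
import tempfile

# Hypothetical per-column summary; the real keys written by write_json
# depend on the DataFrame and the Optimus version in use.
json_cols = {
    "firstName": {"type": "string", "missing": 1},
    "billingId": {"type": "integer", "missing": 0},
}

# Write the summary to a JSON file, as write_json would.
path = os.path.join(tempfile.gettempdir(), "foo.json")
with open(path, "w") as f:
    json.dump(json_cols, f)

# Read it back to verify the round trip.
with open(path) as f:
    loaded = json.load(f)
```

Loading the file back this way is a quick sanity check that the profile was written where you expected.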
Analyzer.get_frequency(self, columns, sort_by_count=True)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

This function gets the frequencies for values inside the specified columns.

Input:

``columns``: String or List of columns to analyze

``sort_by_count``: Boolean; if True, the counts will be sorted in descending order.

The method outputs a Spark DataFrame with counts for the existing values in each column.

To use it, first let's create a sample DataFrame:

.. code:: python
import random
import optimus as op
from pyspark.sql.types import StringType, StructType, IntegerType, FloatType, DoubleType, StructField
schema = StructType(
[
StructField("strings", StringType(), True),
StructField("integers", IntegerType(), True),
StructField("integers2", IntegerType(), True),
StructField("floats", FloatType(), True),
StructField("double", DoubleType(), True)
]
)
size = 200
# Generating strings column:
foods = [' pizza! ', 'pizza', 'PIZZA;', 'pizza', 'pízza¡', 'Pizza', 'Piz;za']
foods = [foods[random.randint(0,6)] for count in range(size)]
# Generating integer column:
num_col_1 = [random.randint(0,9) for number in range(size)]
# Generating integer column:
num_col_2 = [random.randint(0,9) for number in range(size)]
# Generating float column:
num_col_3 = [random.random() for number in range(size)]
# Generating double column:
num_col_4 = [random.random() for number in range(size)]
# Building DataFrame
df = op.spark.createDataFrame(list(zip(foods, num_col_1, num_col_2, num_col_3, num_col_4)),schema=schema)
# Instantiate Analyzer
analyzer = op.DataFrameAnalyzer(df)
# Get frequency DataFrame
df_counts = analyzer.get_frequency(["strings", "integers"], True)
And you will get (note that these are randomly generated values):

+-----------------+-----+
| strings|count|
+-----------------+-----+
| pizza| 48|
+-----------------+-----+
| Piz;za| 38|
+-----------------+-----+
| Pizza| 37|
+-----------------+-----+
| pízza¡| 29|
+-----------------+-----+
| pizza! | 25|
+-----------------+-----+
| PIZZA;| 23|
+-----------------+-----+

+--------+-----+
|integers|count|
+--------+-----+
| 8| 31|
+--------+-----+
| 5| 24|
+--------+-----+
| 1| 24|
+--------+-----+
| 9| 20|
+--------+-----+
| 6| 20|
+--------+-----+
| 2| 19|
+--------+-----+
| 3| 19|
+--------+-----+
| 0| 17|
+--------+-----+
| 4| 14|
+--------+-----+
| 7| 12|
+--------+-----+

DataFrameTransformer class
--------------------------

@@ -589,7 +685,7 @@ DataFrameTransformer class
- DataFrameTransformer.set_col(columns, func, dataType)

* **Others**:
- DataFrameTransformer.explode_table(coldId, col, new_col_feature)
- DataFrameTransformer.count_items(col_id, col_search, new_col_feature, search_string)
- DataFrameTransformer.age_calculate(column)

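The renamed ``count_items`` method takes an id column, a column to search, an output column name, and a search string. Its assumed semantics (counting, per id, how many rows match the search string) can be sketched in plain Python; this is an illustration with hypothetical data, not the Optimus implementation:

```python
from collections import Counter

def count_items(rows, col_id, col_search, search_string):
    """Count, per id, how many rows contain search_string in col_search.

    `rows` is a list of dicts standing in for DataFrame rows; this sketches
    the assumed counting logic only, not the real Optimus method.
    """
    counts = Counter()
    for row in rows:
        # Substring match against the searched column.
        if search_string in str(row[col_search]):
            counts[row[col_id]] += 1
    return dict(counts)

# Hypothetical bill data: two pizzas on bill 1, one on bill 2.
rows = [
    {"bill_id": 1, "item": "pizza"},
    {"bill_id": 1, "item": "pizza"},
    {"bill_id": 2, "item": "soda"},
    {"bill_id": 2, "item": "pizza"},
]
result = count_items(rows, "bill_id", "item", "pizza")  # {1: 2, 2: 1}
```

In Optimus itself the equivalent work happens on a Spark DataFrame, with the per-id counts landing in the ``new_col_feature`` column.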
DataFrameTransformer class receives a dataFrame as an argument. This
15 changes: 6 additions & 9 deletions examples/Impute_Missing_Data_With_Optimus.ipynb
@@ -74,9 +74,7 @@
],
"source": [
"# Import optimus\n",
"import optimus as op\n",
"# Import os for reading from local\n",
"import os"
"import optimus as op"
]
},
{
@@ -105,8 +103,7 @@
},
"outputs": [],
"source": [
"path = \"file:///\" + os.getcwd() + \"/impute_data.csv\"\n",
"df = tools.read_dataset_csv(path, delimiter_mark=\",\", header=\"true\")"
"df = tools.read_csv(\"impute_data.csv\", delimiter_mark=\",\", header=\"true\")"
]
},
{
@@ -461,17 +458,17 @@
" </div>\n",
" </div>\n",
"</div>\n",
"<div class=\"col-md-3 collapse in\" id=\"minihistogram-3373932852551852457\">\n",
"<div class=\"col-md-3 collapse in\" id=\"minihistogram-2176875969815125993\">\n",
" <img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMgAAABLCAYAAAA1fMjoAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAAPYQAAD2EBqD%2BnaQAAAP9JREFUeJzt1bEJQkEQQEG/WJJF2JOxPVmEPa25yEOju2AmP25ZeOwxM3MCvjqvHgB2dlk9wKfr/fn3m9fjtu0/O7Lj37kgEAQCQSAQBAJBIBAEAkEgEAQCQSAQBAJBIBAEAkEgEAQCQSAQBAJBIBAEAkEgEAQCQSAQBAJBIBAEAkEgEAQCQSAQBAJBIBAEAkEgEAQCQSAQBAJBIBAEAkEgEAQCQSAQBAJBIBAEAkEgEAQCQSAQBALhmJlZPQTsygWBIBAIAoEgEAgCgSAQCAKBIBAIAoEgEAgCgSAQCAKBIBAIAoEgEAgCgSAQCAKBIBAIAoEgEAgCgSAQCAKB8AYfZxSPSASiNAAAAABJRU5ErkJggg%3D%3D\">\n",
"\n",
"</div>\n",
"<div class=\"col-md-12 text-right\">\n",
" <a role=\"button\" data-toggle=\"collapse\" data-target=\"#descriptives-3373932852551852457,#minihistogram-3373932852551852457\"\n",
" <a role=\"button\" data-toggle=\"collapse\" data-target=\"#descriptives-2176875969815125993,#minihistogram-2176875969815125993\"\n",
" aria-expanded=\"false\" aria-controls=\"collapseExample\">\n",
" Toggle details\n",
" </a>\n",
"</div>\n",
"<div class=\"row collapse col-md-12\" id=\"descriptives-3373932852551852457\">\n",
"<div class=\"row collapse col-md-12\" id=\"descriptives-2176875969815125993\">\n",
" <div class=\"col-sm-4\">\n",
" <p class=\"h4\">Quantile statistics</p>\n",
" <table class=\"stats indent\">\n",
@@ -619,7 +616,7 @@
"</div>"
],
"text/plain": [
"<spark_df_profiling_optimus.ProfileReport at 0x112e295f8>"
"<spark_df_profiling_optimus.ProfileReport at 0x107aa3358>"
]
},
"execution_count": 6,
