Skip to content

Commit

Permalink
Merge branch 'master' into pyup-update-deprecated-1.2.3-to-1.2.4
Browse files Browse the repository at this point in the history
  • Loading branch information
FavioVazquez committed Nov 16, 2018
2 parents fa89d64 + 57f313b commit 2466e3a
Show file tree
Hide file tree
Showing 14 changed files with 3,979 additions and 541 deletions.
44 changes: 23 additions & 21 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,15 @@ op = Optimus()
def func(value, arg):
return "this was a number"

df =op.load.url("https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/foo.csv")
df =op.load.url("https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.csv")



df\
.rows.sort("product","desc")\
.cols.lower(["firstName","lastName"])\
.cols.date_transform("birth", "new_date", "yyyy/MM/dd", "dd-MM-YYYY")\
.cols.years_between("birth", "years_between", "yyyy/MM/dd")\
.cols.date_transform("birth", "yyyy/MM/dd", "dd-MM-YYYY")\
.cols.years_between("birth", "yyyy/MM/dd")\
.cols.remove_accents("lastName")\
.cols.remove_special_chars("lastName")\
.cols.replace("product","taaaccoo","taco")\
Expand All @@ -91,7 +93,7 @@ df\
.cols.rename(str.lower)\
.cols.apply_by_dtypes("product",func,"string", data_type="integer")\
.cols.trim("*")\
.show()
.table()
```

You transform this:
Expand Down Expand Up @@ -125,23 +127,23 @@ You transform this:
into this:

```
+---+---------+--------+---------+-----------------+-----+----------+----------+-------------+
| id|firstname|lastname|billingid| product|price| birth| new_date|years_between|
+---+---------+--------+---------+-----------------+-----+----------+----------+-------------+
| 10| james| maxwell| 875| taco| 3|1923/03/12|12-03-1923| 95.4355|
| 11| isaac| newton| 992| pasta| 9|1999/02/15|15-02-1999| 19.5108|
| 12| emmy| noether| 234| pasta| 9|1993/12/08|08-12-1993| 24.6962|
| 13| max| planck| 111| hamburguer| 4|1994/01/04|04-01-1994| 24.6237|
| 14| fred| hoyle| 553| pizza| 8|1997/06/27|27-06-1997| 21.1452|
| 15| heinrich| hertz| 116| pizza| 8|1956/11/30|30-11-1956| 61.7204|
| 16| william| gilbert| 886| BEER| 2|1958/03/26|26-03-1958| 60.3978|
| 17| marie| curie| 912| Rice| 1|2000/03/22|22-03-2000| 18.4086|
| 18| arthur| compton| 812|this was a number| 5|1899/01/01|01-01-1899| 119.6317|
| 19| james|chadwick| 467| null| 10|1921/05/03|03-05-1921| 97.293|
| 7| carl| gauss| 323| taco| 3|1970/07/13|13-07-1970| 48.0995|
| 8| david| hilbert| 624| taco| 3|1950/07/14|14-07-1950| 68.0968|
| 9| johannes| kepler| 735| taco| 3|1920/04/22|22-04-1920| 98.3253|
+---+---------+--------+---------+-----------------+-----+----------+----------+-------------+
+---+--------------------+--------+---------+-----------------+-----+----------+--------------------+-------------------+
| id| firstname|lastname|billingid| product|price| birth|birth_data_transform|birth_years_between|
+---+--------------------+--------+---------+-----------------+-----+----------+--------------------+-------------------+
| 9| johannes| kepler| 735| taco| 3|1920/04/22| 22-04-1920| 98.5511|
| 7| carl| gauss| 323| taco| 3|1970/07/13| 13-07-1970| 48.3253|
| 10| james| maxwell| 875| taco| 3|1923/03/12| 12-03-1923| 95.6613|
| 8| david| hilbert| 624| taco| 3|1950/07/14| 14-07-1950| 68.3226|
| 14| fred| hoyle| 553| pizza| 8|1997/06/27| 27-06-1997| 21.371|
| 15|((( heinrich )))))| hertz| 116| pizza| 8|1956/11/30| 30-11-1956| 61.9462|
| 12| emmy%%| noether| 234| pasta| 9|1993/12/08| 08-12-1993| 24.922|
| 11| isaac| newton| 992| pasta| 9|1999/02/15| 15-02-1999| 19.7366|
| 19| james|chadwick| 467| null| 10|1921/05/03| 03-05-1921| 97.5188|
| 13| max!!!| planck| 111| hamburguer| 4|1994/01/04| 04-01-1994| 24.8495|
| 17| marie| curie| 912| Rice| 1|2000/03/22| 22-03-2000| 18.6344|
| 16| william| gilbert| 886| BEER| 2|1958/03/26| 26-03-1958| 60.6237|
| 18| arthur| compton| 812|this was a number| 5|1899/01/01| 01-01-1899| 119.8575|
+---+--------------------+--------+---------+-----------------+-----+----------+--------------------+-------------------+
```
Note that you can use Optimus functions and Spark functions (`.withColumn()`) at the same time. To learn about all of Optimus's functionality, please see these [notebooks](examples/).

Expand Down
2 changes: 1 addition & 1 deletion optimus/dataframe/columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -863,7 +863,7 @@ def _years_between(_new_col_name, attr):
df = self
for col_name in columns:
new_col_name = col_name + "_years_between"
df.cols.apply_expr(new_col_name, _years_between, [date_format, col_name]).cols.cast(new_col_name, "float")
df = df.cols.apply_expr(new_col_name, _years_between, [date_format, col_name]).cols.cast(new_col_name, "float")
return df

@add_attr(cols)
Expand Down
2 changes: 2 additions & 0 deletions optimus/helpers/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ def run(self, *args):
cls = "class Test" + self.name + "(object):\n"

test_file.write(cls)

# Write test to file
for t in args:
test_file.write(t)

Expand Down
4 changes: 2 additions & 2 deletions optimus/optimus.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ def __init__(self, master="local[*]", app_name="optimus", checkpoint=False, path
options=None,
additional_options=None,
enricher_host="localhost", enricher_port=27017,
queue_url="",
queue_exchange="",
queue_url=None,
queue_exchange=None,
queue_routing_key="optimus"
):

Expand Down
17 changes: 16 additions & 1 deletion optimus/profiler/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,21 @@ def write_json(data, path):
pass


def write_html(data, path):
    """
    Write the profiler result to an HTML file
    :param data: HTML markup (a string) to be written
    :param path: Path of the file to create or overwrite
    :return: None
    """
    import logging

    try:
        # Always encode as UTF-8 so the output does not depend on the platform default.
        with open(path, 'w', encoding='utf-8') as outfile:
            outfile.write(data)
    except IOError as error:
        # Writing stays best effort (no exception reaches the caller), but the
        # failure is logged instead of being discarded silently.
        logging.getLogger(__name__).warning("Could not write HTML file %s: %s", path, error)


def sample_size(population_size, confidence_level, confidence_interval):
"""
Get a sample number of the whole population
Expand Down Expand Up @@ -144,5 +159,5 @@ def create_buckets(lower_bound, upper_bound, bins):
# ensure that the upper bound is exactly the higher value.
# Because floating point calculation it can miss the upper bound in the final sum

buckets[bins-1]["upper"] = upper_bound
buckets[bins - 1]["upper"] = upper_bound
return buckets
63 changes: 60 additions & 3 deletions optimus/profiler/profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@
import jinja2
import pika
import pyspark.sql.functions as F
from IPython.core.display import display, HTML
from pyspark.sql.types import ArrayType, LongType

from optimus.functions import filter_row_by_data_type as fbdt, plot_hist, plot_freq
from optimus.helpers.decorators import time_it
from optimus.helpers.functions import parse_columns, print_html
from optimus.helpers.raiseit import RaiseIt
from optimus.profiler.functions import fill_missing_var_types, fill_missing_col_types, \
write_json
write_json, write_html


class Profiler:
Expand All @@ -42,6 +42,8 @@ def __init__(self, output_path=None, queue_url=None, queue_exchange=None, queue_
output_path = "data.json"
pass

self.html = None
self.json = None
self.path = output_path
self.queue_url = queue_url
self.queue_exchange = queue_exchange
Expand Down Expand Up @@ -238,9 +240,64 @@ def run(self, df, columns, buckets=40, infer=False, relative_error=1):
if self.queue_url is not None:
self.to_queue(output)

# Save to file
# JSON
# Save in case we want to output to a json file
self.json = output

# Save file in json format
write_json(output, self.path)

# Save in case we want to output to a html file
self.html = html

def to_file(self, path=None, output=None):
    """
    Save profiler data to a file in the specified format (html, json)
    :param path: filename in which the data will be saved
    :param output: html or json
    :return:
    """

    if path is None:
        # NOTE(review): RaiseIt.value_error presumably raises; confirm against optimus.helpers.raiseit
        RaiseIt.value_error(path, ["Invalid file path"])

    # Compare by value: identity ("is") only matches strings that happen to be
    # interned, so an equal string built at runtime would be rejected.
    if output == "html":
        # Raised explicitly (not via an assert statement) so the check survives "python -O".
        if self.html is None:
            raise AssertionError("Please run the profiler first")

        # We need to append some extra html tags to display it correctly in the browser.
        header = '''<!doctype html>
<html class="no-js" lang="">
<head>
<meta charset="utf-8">
<meta http-equiv="x-ua-compatible" content="ie=edge">
<title></title>
<meta name="description" content="">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<link rel="manifest" href="site.webmanifest">
<link rel="apple-touch-icon" href="icon.png">
<!-- Place favicon.ico in the root directory -->
<link rel="stylesheet" href="css/normalize.css">
<link rel="stylesheet" href="css/main.css">
</head>
<body>'''

        footer = '''</body></html>'''

        write_html(header + self.html + footer, path)
    elif output == "json":
        if self.json is None:
            raise AssertionError("Please run the profiler first")

        write_json(self.json, path)
    else:
        # Any other value (including the default None) is not a supported format.
        RaiseIt.type_error(output, ["html", "json"])

def to_queue(self, message):
"""
Send the profiler information to a queue. By default it use a public encryted queue.
Expand Down
4 changes: 2 additions & 2 deletions requirements-docs.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
findspark==1.3.0
numpy==1.15.2
pytest==3.8.1
pytest==4.0.0
matplotlib==3.0.1
ipython==7.0.1
### pyspark==2.2.0 # it has no wheel
setuptools==40.5.0
setuptools==40.6.2
4 changes: 2 additions & 2 deletions requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@ python_dateutil==2.7.4
numpy==1.15.2
matplotlib==3.0.1
pyspark==2.3.1
pytest==3.8.1
pytest==4.0.0
findspark==1.3.0
nose==1.3.7
seaborn==0.9.0
setuptools==40.5.0
deprecated==1.2.4
setuptools==40.6.2
pyarrow==0.10.0
tabulate==0.8.2
Jinja2==2.10
Expand Down
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
requests==2.19.1
requests==2.20.1
tqdm==4.28.1
pymongo==3.7.2
fastnumbers==2.1.1
Expand All @@ -8,12 +8,12 @@ nose==1.3.7
numpy==1.15.2
matplotlib==3.0.1
pyspark==2.3.1
pytest==3.8.1
pytest==4.0.0
findspark==1.3.0
nose==1.3.7
seaborn==0.9.0
setuptools==40.5.0
deprecated==1.2.4
setuptools==40.6.2
pyarrow==0.10.0
tabulate==0.8.2
Jinja2==2.10
Expand Down
Loading

0 comments on commit 2466e3a

Please sign in to comment.