-
Notifications
You must be signed in to change notification settings - Fork 233
/
test_load.py
136 lines (103 loc) · 5.89 KB
/
test_load.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from optimus.tests.base import TestBase
class TestLoadPandas(TestBase):
    """Loader checks for the example fixtures, three tests per file format.

    Each format is loaded without ``n_rows``, with a small ``n_rows`` and with
    ``n_rows=50``. Every test asserts the row count first and the column names
    second.
    """

    # Column names asserted for the json/xml/parquet/avro/xls loads.
    _FOO_COLUMNS = (
        "id",
        "firstName",
        "lastName",
        "billingId",
        "product",
        "price",
        "birth",
        "dummyCol",
    )

    # Column names asserted for the tsv loads.
    _TSV_COLUMNS = (
        "Sepal length",
        "Sepal width",
        "Petal length",
        "Petal width",
        "Species",
    )

    # ------------------------------------------------------------------ json

    def test_json(self):
        # No n_rows given: expects 19 rows.
        frame = self.load_dataframe("examples/data/foo.json", type="json")
        self.assertEqual(frame.rows.count(), 19)
        self.assertEqual(frame.cols.names(), list(self._FOO_COLUMNS))

    def test_json_13rows(self):
        # n_rows=13: expects exactly 13 rows.
        frame = self.load_dataframe(
            "examples/data/foo.json", type="json", n_rows=13
        )
        self.assertEqual(frame.rows.count(), 13)
        self.assertEqual(frame.cols.names(), list(self._FOO_COLUMNS))

    def test_json_50rows(self):
        # n_rows=50: the loaded row count must stay below 50.
        frame = self.load_dataframe(
            "examples/data/foo.json", type="json", n_rows=50
        )
        self.assertLess(frame.rows.count(), 50)
        self.assertEqual(frame.cols.names(), list(self._FOO_COLUMNS))

    # ------------------------------------------------------------------- xml

    def test_xml(self):
        # No n_rows given: expects 19 rows.
        frame = self.load_dataframe("examples/data/foo.xml", type="xml")
        self.assertEqual(frame.rows.count(), 19)
        self.assertEqual(frame.cols.names(), list(self._FOO_COLUMNS))

    def test_xml_13rows(self):
        # n_rows=13: expects exactly 13 rows.
        frame = self.load_dataframe(
            "examples/data/foo.xml", type="xml", n_rows=13
        )
        self.assertEqual(frame.rows.count(), 13)
        self.assertEqual(frame.cols.names(), list(self._FOO_COLUMNS))

    def test_xml_50rows(self):
        # n_rows=50: the loaded row count must stay below 50.
        frame = self.load_dataframe(
            "examples/data/foo.xml", type="xml", n_rows=50
        )
        self.assertLess(frame.rows.count(), 50)
        self.assertEqual(frame.cols.names(), list(self._FOO_COLUMNS))

    # --------------------------------------------------------------- parquet

    def test_parquet(self):
        # No n_rows given: expects 19 rows.
        frame = self.load_dataframe(
            "examples/data/foo.parquet", type="parquet"
        )
        self.assertEqual(frame.rows.count(), 19)
        self.assertEqual(frame.cols.names(), list(self._FOO_COLUMNS))

    def test_parquet_13rows(self):
        # n_rows=13: expects exactly 13 rows.
        frame = self.load_dataframe(
            "examples/data/foo.parquet", type="parquet", n_rows=13
        )
        self.assertEqual(frame.rows.count(), 13)
        self.assertEqual(frame.cols.names(), list(self._FOO_COLUMNS))

    def test_parquet_50rows(self):
        # n_rows=50: the loaded row count must stay below 50.
        frame = self.load_dataframe(
            "examples/data/foo.parquet", type="parquet", n_rows=50
        )
        self.assertLess(frame.rows.count(), 50)
        self.assertEqual(frame.cols.names(), list(self._FOO_COLUMNS))

    # ------------------------------------------------------------------ avro

    def test_avro(self):
        # No n_rows given: expects 19 rows.
        frame = self.load_dataframe("examples/data/foo.avro", type="avro")
        self.assertEqual(frame.rows.count(), 19)
        self.assertEqual(frame.cols.names(), list(self._FOO_COLUMNS))

    def test_avro_13rows(self):
        # n_rows=13: expects exactly 13 rows.
        frame = self.load_dataframe(
            "examples/data/foo.avro", type="avro", n_rows=13
        )
        self.assertEqual(frame.rows.count(), 13)
        self.assertEqual(frame.cols.names(), list(self._FOO_COLUMNS))

    def test_avro_50rows(self):
        # n_rows=50: the loaded row count must stay below 50.
        frame = self.load_dataframe(
            "examples/data/foo.avro", type="avro", n_rows=50
        )
        self.assertLess(frame.rows.count(), 50)
        self.assertEqual(frame.cols.names(), list(self._FOO_COLUMNS))

    # ------------------------------------------------------------------- tsv

    def test_tsv(self):
        # No n_rows given: expects 5 rows.
        frame = self.load_dataframe("examples/data/foo.tsv", type="tsv")
        self.assertEqual(frame.rows.count(), 5)
        self.assertEqual(frame.cols.names(), list(self._TSV_COLUMNS))

    def test_tsv_3rows(self):
        # n_rows=3: expects exactly 3 rows.
        frame = self.load_dataframe(
            "examples/data/foo.tsv", type="tsv", n_rows=3
        )
        self.assertEqual(frame.rows.count(), 3)
        self.assertEqual(frame.cols.names(), list(self._TSV_COLUMNS))

    def test_tsv_50rows(self):
        # n_rows=50: the loaded row count must stay below 50.
        frame = self.load_dataframe(
            "examples/data/foo.tsv", type="tsv", n_rows=50
        )
        self.assertLess(frame.rows.count(), 50)
        self.assertEqual(frame.cols.names(), list(self._TSV_COLUMNS))

    # ------------------------------------------------------------------- xls
    # NOTE(review): these tests load titanic3.xls yet assert the same row
    # count and column names as the foo.* fixtures -- confirm the fixture
    # path and expectations are intended.

    def test_xls(self):
        # No n_rows given: expects 19 rows.
        frame = self.load_dataframe("examples/data/titanic3.xls", type="xls")
        self.assertEqual(frame.rows.count(), 19)
        self.assertEqual(frame.cols.names(), list(self._FOO_COLUMNS))

    def test_xls_13rows(self):
        # n_rows=13: expects exactly 13 rows.
        frame = self.load_dataframe(
            "examples/data/titanic3.xls", type="xls", n_rows=13
        )
        self.assertEqual(frame.rows.count(), 13)
        self.assertEqual(frame.cols.names(), list(self._FOO_COLUMNS))

    def test_xls_50rows(self):
        # n_rows=50: the loaded row count must stay below 50.
        frame = self.load_dataframe(
            "examples/data/titanic3.xls", type="xls", n_rows=50
        )
        self.assertLess(frame.rows.count(), 50)
        self.assertEqual(frame.cols.names(), list(self._FOO_COLUMNS))
class TestLoadDask(TestLoadPandas):
    """Inherit every TestLoadPandas test, with the dask engine and 1 partition."""

    # Presumably consumed by the test base class to pick the engine -- confirm.
    config = dict(engine="dask", n_partitions=1)
class TestLoadPartitionDask(TestLoadPandas):
    """Inherit every TestLoadPandas test, with the dask engine and 2 partitions."""

    # Presumably consumed by the test base class to pick the engine -- confirm.
    config = dict(engine="dask", n_partitions=2)
# Only define the cudf-backed suite when cudf can actually be imported.
try:
    import cudf  # pyright: reportMissingImports=false
except Exception:
    # Best-effort skip. `Exception` (not a bare `except`) so KeyboardInterrupt
    # and SystemExit still propagate. Kept broader than ImportError because a
    # GPU library may presumably fail at import time for other reasons.
    pass
else:
    class TestLoadCUDF(TestLoadPandas):
        """Inherit every TestLoadPandas test, with the cudf engine."""

        config = {'engine': 'cudf'}
# Only define the dask_cudf-backed suite when dask_cudf can be imported.
try:
    import dask_cudf  # pyright: reportMissingImports=false
except Exception:
    # Best-effort skip. `Exception` (not a bare `except`) so KeyboardInterrupt
    # and SystemExit still propagate. Kept broader than ImportError because a
    # GPU library may presumably fail at import time for other reasons.
    pass
else:
    class TestLoadDC(TestLoadPandas):
        """Inherit every TestLoadPandas test, with dask_cudf and 1 partition."""

        config = {'engine': 'dask_cudf', 'n_partitions': 1}
# Only define the partitioned dask_cudf suite when dask_cudf can be imported.
try:
    import dask_cudf  # pyright: reportMissingImports=false
except Exception:
    # Best-effort skip. `Exception` (not a bare `except`) so KeyboardInterrupt
    # and SystemExit still propagate. Kept broader than ImportError because a
    # GPU library may presumably fail at import time for other reasons.
    pass
else:
    class TestLoadPartitionDC(TestLoadPandas):
        """Inherit every TestLoadPandas test, with dask_cudf and 2 partitions."""

        config = {'engine': 'dask_cudf', 'n_partitions': 2}
class TestLoadSpark(TestLoadPandas):
    """Inherit every TestLoadPandas test, with the spark engine."""

    # Presumably consumed by the test base class to pick the engine -- confirm.
    config = dict(engine="spark")
class TestLoadVaex(TestLoadPandas):
    """Inherit every TestLoadPandas test, with the vaex engine."""

    # Presumably consumed by the test base class to pick the engine -- confirm.
    config = dict(engine="vaex")