Skip to content

Commit

Permalink
added some python unit tests
Browse files Browse the repository at this point in the history
added more conversion tests

short type should have a bit-width of 16

closes apache#17
  • Loading branch information
BryanCutler committed Feb 23, 2017
1 parent 5dbad22 commit 5837b38
Show file tree
Hide file tree
Showing 17 changed files with 1,117 additions and 18 deletions.
42 changes: 29 additions & 13 deletions python/pyspark/sql/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2354,27 +2354,43 @@ class ArrowTests(ReusedPySparkTestCase):
def setUpClass(cls):
    """Prepare the Spark session, schema and sample rows shared by the Arrow tests.

    Runs ``ReusedPySparkTestCase.setUpClass()`` first, then stores on the class:
    ``spark`` (a ``SparkSession`` built from ``cls.sc``), ``schema`` (five
    nullable columns) and ``data`` (three rows with one value per column).
    """
    ReusedPySparkTestCase.setUpClass()
    cls.spark = SparkSession(cls.sc)
    # (column name, Spark SQL type) pairs; every field is declared nullable.
    column_types = [
        ("str_t", StringType()),
        ("int_t", IntegerType()),
        ("long_t", LongType()),
        ("float_t", FloatType()),
        ("double_t", DoubleType()),
    ]
    cls.schema = StructType(
        [StructField(name, data_type, True) for name, data_type in column_types])
    cls.data = [
        ("a", 1, 10, 0.2, 2.0),
        ("b", 2, 20, 0.4, 4.0),
        ("c", 3, 30, 0.8, 6.0),
    ]

def assertFramesEqual(self, df_with_arrow, df_without):
    """Assert that the frame produced with Arrow equals the one produced without it.

    :param df_with_arrow: frame obtained through the Arrow code path
    :param df_without: frame obtained through the regular code path

    On failure the assertion message shows both frames and their dtypes.
    """
    template = ("DataFrame from Arrow is not equal"
                "\n\nWith Arrow:\n%s\n%s"
                "\n\nWithout:\n%s\n%s")
    failure_msg = template % (df_with_arrow, df_with_arrow.dtypes,
                              df_without, df_without.dtypes)
    frames_match = df_without.equals(df_with_arrow)
    self.assertTrue(frames_match, msg=failure_msg)

def test_arrow_toPandas(self):
schema = StructType([
StructField("str_t", StringType(), True), # Fails in conversion
StructField("int_t", IntegerType(), True), # Fails, without is converted to int64
StructField("long_t", LongType(), True), # Fails if nullable=False
StructField("double_t", DoubleType(), True)])
data = [("a", 1, 10, 2.0),
("b", 2, 20, 4.0),
("c", 3, 30, 6.0)]
def test_null_conversion(self):
    """Check that an all-None row comes back as exactly one null per column.

    A row of ``None`` values, as wide as the first sample row, is placed
    ahead of ``self.data`` before converting with ``toPandas(useArrow=True)``.
    """
    all_null_row = tuple(None for _ in self.data[0])
    df_null = self.spark.createDataFrame([all_null_row] + self.data)
    pdf = df_null.toPandas(useArrow=True)
    nulls_per_column = pdf.isnull().sum().tolist()
    self.assertTrue(all(count == 1 for count in nulls_per_column))

def test_toPandas_arrow_toggle(self):
    """Check that toPandas gives equal frames with ``useArrow`` off and on.

    Only the ``str_t``, ``long_t`` and ``double_t`` columns are compared.
    """
    full_df = self.spark.createDataFrame(self.data, schema=self.schema)
    # NOTE - toPandas(useArrow=False) will infer standard data types
    selected = full_df.select("str_t", "long_t", "double_t")
    expected = selected.toPandas(useArrow=False)
    actual = selected.toPandas(useArrow=True)
    self.assertFramesEqual(actual, expected)

df = self.spark.createDataFrame(data, schema=schema)
df = df.select("long_t", "double_t")
pdf = df.toPandas(useArrow=False)
pdf_arrow = df.toPandas(useArrow=True)
def test_pandas_round_trip(self):
    """Check that a pandas frame survives createDataFrame -> toPandas(useArrow=True).

    The source frame is built column by column from ``self.data``, keyed by
    the field names of ``self.schema``.
    """
    import pandas as pd
    columns = {}
    for position, column_name in enumerate(self.schema.names):
        columns[column_name] = [row[position] for row in self.data]
    pdf = pd.DataFrame(data=columns)
    spark_df = self.spark.createDataFrame(pdf)
    pdf_arrow = spark_df.toPandas(useArrow=True)
    self.assertFramesEqual(pdf_arrow, pdf)


Expand Down
4 changes: 2 additions & 2 deletions sql/core/src/main/scala/org/apache/spark/sql/Arrow.scala
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ object Arrow {
buf.writeBoolean(row.getBoolean(ordinal)))
case ShortType =>
TypeFuncs(
() => new ArrowType.Int(4 * ShortType.defaultSize, true), // TODO - check on this
() => new ArrowType.Int(8 * ShortType.defaultSize, true),
(buf: ArrowBuf) => buf.writeShort(0),
(row: InternalRow, ordinal: Int, buf: ArrowBuf) => buf.writeShort(row.getShort(ordinal)))
case IntegerType =>
Expand Down Expand Up @@ -127,7 +127,7 @@ object Arrow {
val numOfRows = rows.length

field.dataType match {
case IntegerType | LongType | DoubleType | FloatType | BooleanType | ByteType =>
case ShortType | IntegerType | LongType | DoubleType | FloatType | BooleanType | ByteType =>
val validityVector = new BitVector("validity", allocator)
val validityMutator = validityVector.getMutator
validityVector.allocateNew(numOfRows)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
{
"schema": {
"fields": [
{
"name": "a",
"type": {"name": "floatingpoint", "precision": "DOUBLE"},
"nullable": false,
"children": [],
"typeLayout": {
"vectors": [
{"type": "VALIDITY", "typeBitWidth": 1},
{"type": "DATA", "typeBitWidth": 64}
]
}
},
{
"name": "b",
"type": {"name": "floatingpoint", "precision": "DOUBLE"},
"nullable": false,
"children": [],
"typeLayout": {
"vectors": [
{"type": "VALIDITY", "typeBitWidth": 1},
{"type": "DATA", "typeBitWidth": 64}
]
}
}
]
},

"batches": [
{
"count": 6,
"columns": [
{
"name": "a",
"count": 6,
"VALIDITY": [1, 1, 1, 1, 1, 1],
"DATA": [1, 1, 2, 2, 3, 3]
},
{
"name": "b",
"count": 6,
"VALIDITY": [1, 1, 1, 1, 1, 1],
"DATA": [1, 2, 1, 2, 1, 2]
}
]
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
{
"schema": {
"fields": [
{
"name": "i",
"type": {"name": "int", "isSigned": true, "bitWidth": 32},
"nullable": false,
"children": [],
"typeLayout": {
"vectors": [
{"type": "VALIDITY", "typeBitWidth": 1},
{"type": "DATA", "typeBitWidth": 32}
]
}
},
{
"name": "a_d",
"type": {"name": "floatingpoint", "precision": "DOUBLE"},
"nullable": false,
"children": [],
"typeLayout": {
"vectors": [
{"type": "VALIDITY", "typeBitWidth": 1},
{"type": "DATA", "typeBitWidth": 64}
]
}
},
{
"name": "b_d",
"type": {"name": "floatingpoint", "precision": "DOUBLE"},
"nullable": true,
"children": [],
"typeLayout": {
"vectors": [
{"type": "VALIDITY", "typeBitWidth": 1},
{"type": "DATA", "typeBitWidth": 64}
]
}
}
]
},

"batches": [
{
"count": 6,
"columns": [
{
"name": "i",
"count": 6,
"VALIDITY": [1, 1, 1, 1, 1, 1],
"DATA": [1, 2, 3, 4, 5, 6]
},
{
"name": "a_d",
"count": 6,
"VALIDITY": [1, 1, 1, 1, 1, 1],
"DATA": [1.0, 2.0, 0.01, 200.0, 0.0001, 20000.0]
},
{
"name": "b_d",
"count": 6,
"VALIDITY": [1, 0, 0, 1, 0, 1],
"DATA": [1.1, 0, 0, 2.2, 0, 3.3]
}
]
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
{
"schema": {
"fields": [
{
"name": "i",
"type": {"name": "int", "isSigned": true, "bitWidth": 32},
"nullable": false,
"children": [],
"typeLayout": {
"vectors": [
{"type": "VALIDITY", "typeBitWidth": 1},
{"type": "DATA", "typeBitWidth": 32}
]
}
},
{
"name": "a_f",
"type": {"name": "floatingpoint", "precision": "SINGLE"},
"nullable": false,
"children": [],
"typeLayout": {
"vectors": [
{"type": "VALIDITY", "typeBitWidth": 1},
{"type": "DATA", "typeBitWidth": 32}
]
}
},
{
"name": "b_f",
"type": {"name": "floatingpoint", "precision": "SINGLE"},
"nullable": true,
"children": [],
"typeLayout": {
"vectors": [
{"type": "VALIDITY", "typeBitWidth": 1},
{"type": "DATA", "typeBitWidth": 32}
]
}
}
]
},

"batches": [
{
"count": 6,
"columns": [
{
"name": "i",
"count": 6,
"VALIDITY": [1, 1, 1, 1, 1, 1],
"DATA": [1, 2, 3, 4, 5, 6]
},
{
"name": "a_f",
"count": 6,
"VALIDITY": [1, 1, 1, 1, 1, 1],
"DATA": [1.0, 2.0, 0.01, 200.0, 0.0001, 20000.0]
},
{
"name": "b_f",
"count": 6,
"VALIDITY": [1, 0, 0, 1, 0, 1],
"DATA": [1.1, 0, 0, 2.2, 0, 3.3]
}
]
}
]
}
32 changes: 32 additions & 0 deletions sql/core/src/test/resources/test-data/arrow/indexData-ints.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
"schema": {
"fields": [
{
"name": "i",
"type": {"name": "int", "isSigned": true, "bitWidth": 32},
"nullable": false,
"children": [],
"typeLayout": {
"vectors": [
{"type": "VALIDITY", "typeBitWidth": 1},
{"type": "DATA", "typeBitWidth": 32}
]
}
}
]
},

"batches": [
{
"count": 6,
"columns": [
{
"name": "i",
"count": 6,
"VALIDITY": [1, 1, 1, 1, 1, 1],
"DATA": [1, 2, 3, 4, 5, 6]
}
]
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
{
"schema": {
"fields": [
{
"name": "i",
"type": {"name": "int", "isSigned": true, "bitWidth": 32},
"nullable": false,
"children": [],
"typeLayout": {
"vectors": [
{"type": "VALIDITY", "typeBitWidth": 1},
{"type": "DATA", "typeBitWidth": 32}
]
}
},
{
"name": "a_i",
"type": {"name": "int", "isSigned": true, "bitWidth": 32},
"nullable": false,
"children": [],
"typeLayout": {
"vectors": [
{"type": "VALIDITY", "typeBitWidth": 1},
{"type": "DATA", "typeBitWidth": 32}
]
}
},
{
"name": "b_i",
"type": {"name": "int", "isSigned": true, "bitWidth": 32},
"nullable": true,
"children": [],
"typeLayout": {
"vectors": [
{"type": "VALIDITY", "typeBitWidth": 1},
{"type": "DATA", "typeBitWidth": 32}
]
}
}
]
},

"batches": [
{
"count": 6,
"columns": [
{
"name": "i",
"count": 6,
"VALIDITY": [1, 1, 1, 1, 1, 1],
"DATA": [1, 2, 3, 4, 5, 6]
},
{
"name": "a_i",
"count": 6,
"VALIDITY": [1, 1, 1, 1, 1, 1],
"DATA": [1, -1, 2, -2, 2147483647, -2147483648]
},
{
"name": "b_i",
"count": 6,
"VALIDITY": [1, 0, 0, 1, 0, 1],
"DATA": [1, -1, 2, -2, 2147483647, -2147483648]
}
]
}
]
}
Loading

0 comments on commit 5837b38

Please sign in to comment.