From 72759d9b41d3bc1bcdd3bfb389afbf135d8a20d3 Mon Sep 17 00:00:00 2001
From: julbouln
Date: Thu, 17 Nov 2016 00:20:13 +0100
Subject: [PATCH] wrong fork ...

---
 drm/etnaviv_drm.h    |   8 +
 etnaviv_buffer.c     | 267 +++++++++++++++++++++------------
 etnaviv_drv.h        |   3 -
 etnaviv_dump.c       |   6 +-
 etnaviv_gem.c        |  96 +++++++-----
 etnaviv_gem.h        |  18 ++-
 etnaviv_gem_prime.c  |   7 +
 etnaviv_gem_submit.c |  54 ++++---
 etnaviv_gpu.c        | 348 ++++++++++++++++++++++---------------------
 etnaviv_gpu.h        |   8 +-
 etnaviv_iommu.c      |  16 +-
 etnaviv_iommu.h      |   9 +-
 etnaviv_mmu.c        | 152 +++++++++++++++----
 etnaviv_mmu.h        |   9 +-
 state_3d.xml.h       |   9 ++
 15 files changed, 619 insertions(+), 391 deletions(-)
 create mode 100644 state_3d.xml.h

diff --git a/drm/etnaviv_drm.h b/drm/etnaviv_drm.h
index 01b8bfc..075e822 100644
--- a/drm/etnaviv_drm.h
+++ b/drm/etnaviv_drm.h
@@ -19,6 +19,10 @@
 
 #include "drm/drm.h"
 
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
 /* Please note that modifications to all structs defined here are
  * subject to backwards-compatibility constraints:
  *  1) Do not use pointers, use __u64 instead for 32 bit / 64 bit
@@ -222,4 +226,8 @@ struct drm_etnaviv_gem_wait {
 #define DRM_IOCTL_ETNAVIV_GEM_USERPTR  DRM_IOWR(DRM_COMMAND_BASE + DRM_ETNAVIV_GEM_USERPTR, struct drm_etnaviv_gem_userptr)
 #define DRM_IOCTL_ETNAVIV_GEM_WAIT     DRM_IOW(DRM_COMMAND_BASE + DRM_ETNAVIV_GEM_WAIT, struct drm_etnaviv_gem_wait)
 
+#if defined(__cplusplus)
+}
+#endif
+
 #endif /* __ETNAVIV_DRM_H__ */
diff --git a/etnaviv_buffer.c b/etnaviv_buffer.c
index bddbf6c..d923013 100644
--- a/etnaviv_buffer.c
+++ b/etnaviv_buffer.c
@@ -21,8 +21,9 @@
 
 #include "common.xml.h"
 #include "state.xml.h"
-#include "cmdstream.xml.h"
 #include "state_hi.xml.h"
+#include "state_3d.xml.h"
+#include "cmdstream.xml.h"
 
 /*
  * Command Buffer helper:
@@ -86,7 +87,6 @@ static inline void CMD_STALL(struct etnaviv_cmdbuf *buffer,
 	OUT(buffer, VIV_FE_STALL_TOKEN_FROM(from) | VIV_FE_STALL_TOKEN_TO(to));
 }
 
-
 static inline void CMD_SEM(struct etnaviv_cmdbuf *buffer, u32 from, u32 to)
 {
 	CMD_LOAD_STATE(buffer, VIVS_GL_SEMAPHORE_TOKEN,
@@ -94,10 +94,10 @@ static inline void CMD_SEM(struct etnaviv_cmdbuf *buffer, u32 from, u32 to)
 		       VIVS_GL_SEMAPHORE_TOKEN_TO(to));
 }
 
-static void etnaviv_cmd_select_pipe(struct etnaviv_cmdbuf *buffer, u8 pipe)
+static void etnaviv_cmd_select_pipe(struct etnaviv_gpu *gpu,
+	struct etnaviv_cmdbuf *buffer, u8 pipe)
 {
-	u32 flush;
-	u32 stall;
+	u32 flush = 0;
 
 	/*
 	 * This assumes that if we're switching to 2D, we're switching
@@ -105,28 +105,19 @@ static void etnaviv_cmd_select_pipe(struct etnaviv_cmdbuf *buffer, u8 pipe)
 	 * the 2D core, we need to flush the 3D depth and color caches,
 	 * otherwise we need to flush the 2D pixel engine cache.
 	 */
-	if (pipe == ETNA_PIPE_2D)
-		flush = VIVS_GL_FLUSH_CACHE_DEPTH | VIVS_GL_FLUSH_CACHE_COLOR;
-	else
+	if (gpu->exec_state == ETNA_PIPE_2D)
 		flush = VIVS_GL_FLUSH_CACHE_PE2D;
-
-	stall = VIVS_GL_SEMAPHORE_TOKEN_FROM(SYNC_RECIPIENT_FE) |
-		VIVS_GL_SEMAPHORE_TOKEN_TO(SYNC_RECIPIENT_PE);
+	else if (gpu->exec_state == ETNA_PIPE_3D)
+		flush = VIVS_GL_FLUSH_CACHE_DEPTH | VIVS_GL_FLUSH_CACHE_COLOR;
 
 	CMD_LOAD_STATE(buffer, VIVS_GL_FLUSH_CACHE, flush);
-	CMD_LOAD_STATE(buffer, VIVS_GL_SEMAPHORE_TOKEN, stall);
-
+	CMD_SEM(buffer, SYNC_RECIPIENT_FE, SYNC_RECIPIENT_PE);
 	CMD_STALL(buffer, SYNC_RECIPIENT_FE, SYNC_RECIPIENT_PE);
 
 	CMD_LOAD_STATE(buffer, VIVS_GL_PIPE_SELECT,
 		       VIVS_GL_PIPE_SELECT_PIPE(pipe));
 }
 
-static u32 gpu_va(struct etnaviv_gpu *gpu, struct etnaviv_cmdbuf *buf)
-{
-	return buf->paddr - gpu->memory_base;
-}
-
 static void etnaviv_buffer_dump(struct etnaviv_gpu *gpu,
 	struct etnaviv_cmdbuf *buf, u32 off, u32 len)
 {
@@ -134,12 +125,42 @@ static void etnaviv_buffer_dump(struct etnaviv_gpu *gpu,
 	u32 *ptr = buf->vaddr + off;
 
 	dev_info(gpu->dev, "virt %p phys 0x%08x free 0x%08x\n",
-			ptr, gpu_va(gpu, buf) + off, size - len * 4 - off);
+			ptr, etnaviv_iommu_get_cmdbuf_va(gpu, buf) + off, size - len * 4 - off);
 
 	print_hex_dump(KERN_INFO, "cmd ", DUMP_PREFIX_OFFSET, 16, 4,
 			ptr, len * 4, 0);
 }
 
+/*
+ * Safely replace the WAIT of a waitlink with a new command and argument.
+ * The GPU may be executing this WAIT while we're modifying it, so we have
+ * to write it in a specific order to avoid the GPU branching to somewhere
+ * else.  'wl_offset' is the offset to the first byte of the WAIT command.
+ */
+static void etnaviv_buffer_replace_wait(struct etnaviv_cmdbuf *buffer,
+	unsigned int wl_offset, u32 cmd, u32 arg)
+{
+	u32 *lw = buffer->vaddr + wl_offset;
+
+	lw[1] = arg;
+	mb();
+	lw[0] = cmd;
+	mb();
+}
+
+/*
+ * Ensure that there is space in the command buffer to contiguously write
+ * 'cmd_dwords' 64-bit words into the buffer, wrapping if necessary.
+ */
+static u32 etnaviv_buffer_reserve(struct etnaviv_gpu *gpu,
+	struct etnaviv_cmdbuf *buffer, unsigned int cmd_dwords)
+{
+	if (buffer->user_size + cmd_dwords * sizeof(u64) > buffer->size)
+		buffer->user_size = 0;
+
+	return etnaviv_iommu_get_cmdbuf_va(gpu, buffer) + buffer->user_size;
+}
+
 u16 etnaviv_buffer_init(struct etnaviv_gpu *gpu)
 {
 	struct etnaviv_cmdbuf *buffer = gpu->buffer;
@@ -148,12 +169,12 @@ u16 etnaviv_buffer_init(struct etnaviv_gpu *gpu)
 	buffer->user_size = 0;
 
 	CMD_WAIT(buffer);
-	CMD_LINK(buffer, 2, gpu_va(gpu, buffer) + buffer->user_size - 4);
+	CMD_LINK(buffer, 2, etnaviv_iommu_get_cmdbuf_va(gpu, buffer) +
+		 buffer->user_size - 4);
 
 	return buffer->user_size / 8;
 }
 
-
 u16 etnaviv_buffer_config_mmuv2(struct etnaviv_gpu *gpu, u32 mtlb_addr, u32 safe_addr)
 {
 	struct etnaviv_cmdbuf *buffer = gpu->buffer;
@@ -190,121 +211,175 @@ u16 etnaviv_buffer_config_mmuv2(struct etnaviv_gpu *gpu, u32 mtlb_addr, u32 safe
 void etnaviv_buffer_end(struct etnaviv_gpu *gpu)
 {
 	struct etnaviv_cmdbuf *buffer = gpu->buffer;
+	unsigned int waitlink_offset = buffer->user_size - 16;
+	u32 link_target, flush = 0;
 
-	/* Replace the last WAIT with an END */
-	buffer->user_size -= 16;
+	if (gpu->exec_state == ETNA_PIPE_2D)
+		flush = VIVS_GL_FLUSH_CACHE_PE2D;
+	else if (gpu->exec_state == ETNA_PIPE_3D)
+		flush = VIVS_GL_FLUSH_CACHE_DEPTH |
+			VIVS_GL_FLUSH_CACHE_COLOR |
+			VIVS_GL_FLUSH_CACHE_TEXTURE |
+			VIVS_GL_FLUSH_CACHE_TEXTUREVS |
+			VIVS_GL_FLUSH_CACHE_SHADER_L2;
 
-	CMD_END(buffer);
-	mb();
+	if (flush) {
+		unsigned int dwords = 7;
+
+		link_target = etnaviv_buffer_reserve(gpu, buffer, dwords);
+
+		CMD_SEM(buffer, SYNC_RECIPIENT_FE, SYNC_RECIPIENT_PE);
+		CMD_STALL(buffer, SYNC_RECIPIENT_FE, SYNC_RECIPIENT_PE);
+		CMD_LOAD_STATE(buffer, VIVS_GL_FLUSH_CACHE, flush);
+		if (gpu->exec_state == ETNA_PIPE_3D)
+			CMD_LOAD_STATE(buffer, VIVS_TS_FLUSH_CACHE,
+				       VIVS_TS_FLUSH_CACHE_FLUSH);
+		CMD_SEM(buffer, SYNC_RECIPIENT_FE, SYNC_RECIPIENT_PE);
+		CMD_STALL(buffer, SYNC_RECIPIENT_FE, SYNC_RECIPIENT_PE);
+		CMD_END(buffer);
+
+		etnaviv_buffer_replace_wait(buffer, waitlink_offset,
+					    VIV_FE_LINK_HEADER_OP_LINK |
+					    VIV_FE_LINK_HEADER_PREFETCH(dwords),
+					    link_target);
+	} else {
+		/* Replace the last link-wait with an "END" command */
+		etnaviv_buffer_replace_wait(buffer, waitlink_offset,
+					    VIV_FE_END_HEADER_OP_END, 0);
+	}
 }
 
+/* Append a command buffer to the ring buffer. */
 void etnaviv_buffer_queue(struct etnaviv_gpu *gpu, unsigned int event,
 	struct etnaviv_cmdbuf *cmdbuf)
 {
 	struct etnaviv_cmdbuf *buffer = gpu->buffer;
-	u32 *lw = buffer->vaddr + buffer->user_size - 16;
-	u32 back, link_target, link_size, reserve_size, extra_size = 0;
+	unsigned int waitlink_offset = buffer->user_size - 16;
+	u32 return_target, return_dwords;
+	u32 link_target, link_dwords;
 
 	if (drm_debug & DRM_UT_DRIVER)
 		etnaviv_buffer_dump(gpu, buffer, 0, 0x50);
 
+	link_target = etnaviv_iommu_get_cmdbuf_va(gpu, cmdbuf);
+	link_dwords = cmdbuf->size / 8;
+
 	/*
-	 * If we need to flush the MMU prior to submitting this buffer, we
-	 * will need to append a mmu flush load state, followed by a new
+	 * If we need maintenance prior to submitting this buffer, we will
+	 * need to append an MMU flush load state, followed by a new
 	 * link to this buffer - a total of four additional words.
 	 */
 	if (gpu->mmu->need_flush || gpu->switch_context) {
+		u32 target, extra_dwords;
+
 		/* link command */
-		extra_size += 2;
+		extra_dwords = 1;
+
 		/* flush command */
-		if (gpu->mmu->need_flush)
-			extra_size += 2;
+		if (gpu->mmu->need_flush) {
+			if (gpu->mmu->version == ETNAVIV_IOMMU_V1)
+				extra_dwords += 1;
+			else
+				extra_dwords += 3;
+		}
+
 		/* pipe switch commands */
 		if (gpu->switch_context)
-			extra_size += 8;
-	}
+			extra_dwords += 4;
 
-	reserve_size = (6 + extra_size) * 4;
-
-	/*
-	 * if we are going to completely overflow the buffer, we need to wrap.
-	 */
-	if (buffer->user_size + reserve_size > buffer->size)
-		buffer->user_size = 0;
-
-	/* save offset back into main buffer */
-	back = buffer->user_size + reserve_size - 6 * 4;
-	link_target = gpu_va(gpu, buffer) + buffer->user_size;
-	link_size = 6;
-
-	/* Skip over any extra instructions */
-	link_target += extra_size * sizeof(u32);
-
-	if (drm_debug & DRM_UT_DRIVER)
-		pr_info("stream link to 0x%08x @ 0x%08x %p\n",
-			link_target, gpu_va(gpu, cmdbuf), cmdbuf->vaddr);
-
-	/* jump back from cmd to main buffer */
-	CMD_LINK(cmdbuf, link_size, link_target);
-
-	link_target = gpu_va(gpu, cmdbuf);
-	link_size = cmdbuf->size / 8;
-
-
-
-	if (drm_debug & DRM_UT_DRIVER) {
-		print_hex_dump(KERN_INFO, "cmd ", DUMP_PREFIX_OFFSET, 16, 4,
-			       cmdbuf->vaddr, cmdbuf->size, 0);
-
-		pr_info("link op: %p\n", lw);
-		pr_info("link addr: %p\n", lw + 1);
-		pr_info("addr: 0x%08x\n", link_target);
-		pr_info("back: 0x%08x\n", gpu_va(gpu, buffer) + back);
-		pr_info("event: %d\n", event);
-	}
-
-	if (gpu->mmu->need_flush || gpu->switch_context) {
-		u32 new_target = gpu_va(gpu, buffer) + buffer->user_size;
+		target = etnaviv_buffer_reserve(gpu, buffer, extra_dwords);
 
 		if (gpu->mmu->need_flush) {
 			/* Add the MMU flush */
-			CMD_LOAD_STATE(buffer, VIVS_GL_FLUSH_MMU,
-				       VIVS_GL_FLUSH_MMU_FLUSH_FEMMU |
-				       VIVS_GL_FLUSH_MMU_FLUSH_UNK1 |
-				       VIVS_GL_FLUSH_MMU_FLUSH_UNK2 |
-				       VIVS_GL_FLUSH_MMU_FLUSH_PEMMU |
-				       VIVS_GL_FLUSH_MMU_FLUSH_UNK4);
+			if (gpu->mmu->version == ETNAVIV_IOMMU_V1) {
+				CMD_LOAD_STATE(buffer, VIVS_GL_FLUSH_MMU,
+					       VIVS_GL_FLUSH_MMU_FLUSH_FEMMU |
+					       VIVS_GL_FLUSH_MMU_FLUSH_UNK1 |
+					       VIVS_GL_FLUSH_MMU_FLUSH_UNK2 |
+					       VIVS_GL_FLUSH_MMU_FLUSH_PEMMU |
+					       VIVS_GL_FLUSH_MMU_FLUSH_UNK4);
+			} else {
+				CMD_LOAD_STATE(buffer, VIVS_MMUv2_CONFIGURATION,
+					VIVS_MMUv2_CONFIGURATION_MODE_MASK |
+					VIVS_MMUv2_CONFIGURATION_ADDRESS_MASK |
+					VIVS_MMUv2_CONFIGURATION_FLUSH_FLUSH);
+				CMD_SEM(buffer, SYNC_RECIPIENT_FE,
+					SYNC_RECIPIENT_PE);
+				CMD_STALL(buffer, SYNC_RECIPIENT_FE,
+					SYNC_RECIPIENT_PE);
+			}
 
 			gpu->mmu->need_flush = false;
 		}
 
 		if (gpu->switch_context) {
-			etnaviv_cmd_select_pipe(buffer, cmdbuf->exec_state);
+			etnaviv_cmd_select_pipe(gpu, buffer, cmdbuf->exec_state);
+			gpu->exec_state = cmdbuf->exec_state;
 			gpu->switch_context = false;
 		}
 
-		/* And the link to the first buffer */
-		CMD_LINK(buffer, link_size, link_target);
+		/* And the link to the submitted buffer */
+		CMD_LINK(buffer, link_dwords, link_target);
 
 		/* Update the link target to point to above instructions */
-		link_target = new_target;
-		link_size = extra_size;
+		link_target = target;
+		link_dwords = extra_dwords;
 	}
 
-	/* trigger event */
+	/*
+	 * Append a LINK to the submitted command buffer to return to
+	 * the ring buffer.  return_target is the ring target address.
+	 * We need at most 7 dwords in the return target: 2 cache flush +
+	 * 2 semaphore stall + 1 event + 1 wait + 1 link.
	 */
+	return_dwords = 7;
+	return_target = etnaviv_buffer_reserve(gpu, buffer, return_dwords);
+	CMD_LINK(cmdbuf, return_dwords, return_target);
+
+	/*
+	 * Append a cache flush, stall, event, wait and link pointing back to
+	 * the wait command to the ring buffer.
+	 */
+	if (gpu->exec_state == ETNA_PIPE_2D) {
+		CMD_LOAD_STATE(buffer, VIVS_GL_FLUSH_CACHE,
+			       VIVS_GL_FLUSH_CACHE_PE2D);
+	} else {
+		CMD_LOAD_STATE(buffer, VIVS_GL_FLUSH_CACHE,
+			       VIVS_GL_FLUSH_CACHE_DEPTH |
+			       VIVS_GL_FLUSH_CACHE_COLOR);
+		CMD_LOAD_STATE(buffer, VIVS_TS_FLUSH_CACHE,
+			       VIVS_TS_FLUSH_CACHE_FLUSH);
+	}
+	CMD_SEM(buffer, SYNC_RECIPIENT_FE, SYNC_RECIPIENT_PE);
+	CMD_STALL(buffer, SYNC_RECIPIENT_FE, SYNC_RECIPIENT_PE);
 	CMD_LOAD_STATE(buffer, VIVS_GL_EVENT, VIVS_GL_EVENT_EVENT_ID(event) |
 		       VIVS_GL_EVENT_FROM_PE);
-
-	/* append WAIT/LINK to main buffer */
 	CMD_WAIT(buffer);
-	CMD_LINK(buffer, 2, gpu_va(gpu, buffer) + (buffer->user_size - 4));
+	CMD_LINK(buffer, 2, etnaviv_iommu_get_cmdbuf_va(gpu, buffer) +
+			    buffer->user_size - 4);
 
-	/* Change WAIT into a LINK command; write the address first. */
-	*(lw + 1) = link_target;
-	mb();
-	*(lw) = VIV_FE_LINK_HEADER_OP_LINK |
-		VIV_FE_LINK_HEADER_PREFETCH(link_size);
-	mb();
+	if (drm_debug & DRM_UT_DRIVER)
+		pr_info("stream link to 0x%08x @ 0x%08x %p\n",
+			return_target, etnaviv_iommu_get_cmdbuf_va(gpu, cmdbuf), cmdbuf->vaddr);
+
+	if (drm_debug & DRM_UT_DRIVER) {
+		print_hex_dump(KERN_INFO, "cmd ", DUMP_PREFIX_OFFSET, 16, 4,
+			       cmdbuf->vaddr, cmdbuf->size, 0);
+
+		pr_info("link op: %p\n", buffer->vaddr + waitlink_offset);
+		pr_info("addr: 0x%08x\n", link_target);
+		pr_info("back: 0x%08x\n", return_target);
+		pr_info("event: %d\n", event);
+	}
+
+	/*
+	 * Kick off the submitted command by replacing the previous
+	 * WAIT with a link to the address in the ring buffer.
	 */
+	etnaviv_buffer_replace_wait(buffer, waitlink_offset,
+				    VIV_FE_LINK_HEADER_OP_LINK |
+				    VIV_FE_LINK_HEADER_PREFETCH(link_dwords),
+				    link_target);
 
 	if (drm_debug & DRM_UT_DRIVER)
 		etnaviv_buffer_dump(gpu, buffer, 0, 0x50);
diff --git a/etnaviv_drv.h b/etnaviv_drv.h
index ea80f19..65e0576 100644
--- a/etnaviv_drv.h
+++ b/etnaviv_drv.h
@@ -75,9 +75,6 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data,
 int etnaviv_gem_mmap(struct file *filp, struct vm_area_struct *vma);
 int etnaviv_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
 int etnaviv_gem_mmap_offset(struct drm_gem_object *obj, u64 *offset);
-int etnaviv_gem_get_iova(struct etnaviv_gpu *gpu,
-	struct drm_gem_object *obj, u32 *iova);
-void etnaviv_gem_put_iova(struct etnaviv_gpu *gpu, struct drm_gem_object *obj);
 struct sg_table *etnaviv_gem_prime_get_sg_table(struct drm_gem_object *obj);
 void *etnaviv_gem_prime_vmap(struct drm_gem_object *obj);
 void etnaviv_gem_prime_vunmap(struct drm_gem_object *obj, void *vaddr);
diff --git a/etnaviv_dump.c b/etnaviv_dump.c
index 4a29eea..2bef501 100644
--- a/etnaviv_dump.c
+++ b/etnaviv_dump.c
@@ -175,11 +175,13 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
 	etnaviv_core_dump_registers(&iter, gpu);
 	etnaviv_core_dump_mmu(&iter, gpu, mmu_size);
 	etnaviv_core_dump_mem(&iter, ETDUMP_BUF_RING, gpu->buffer->vaddr,
-			      gpu->buffer->size, gpu->buffer->paddr);
+			      gpu->buffer->size,
+			      etnaviv_iommu_get_cmdbuf_va(gpu, gpu->buffer));
 
 	list_for_each_entry(cmd, &gpu->active_cmd_list, node)
 		etnaviv_core_dump_mem(&iter, ETDUMP_BUF_CMD, cmd->vaddr,
-				      cmd->size, cmd->paddr);
+				      cmd->size,
+				      etnaviv_iommu_get_cmdbuf_va(gpu, cmd));
 
 	/* Reserve space for the bomap */
 	if (n_bomap_pages) {
diff --git a/etnaviv_gem.c b/etnaviv_gem.c
index 4b519e4..f7cac70 100644
--- a/etnaviv_gem.c
+++ b/etnaviv_gem.c
@@ -129,10 +129,9 @@ void etnaviv_gem_put_pages(struct etnaviv_gem_object *etnaviv_obj)
 	/* when we start tracking the pin count, then do something here */
 }
 
-static int etnaviv_gem_mmap_obj(struct drm_gem_object *obj,
+static int etnaviv_gem_mmap_obj(struct etnaviv_gem_object *etnaviv_obj,
 		struct vm_area_struct *vma)
 {
-	struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj);
 	pgprot_t vm_page_prot;
 
 	vma->vm_flags &= ~VM_PFNMAP;
@@ -151,9 +150,9 @@ static int etnaviv_gem_mmap_obj(struct drm_gem_object *obj,
 		 * in particular in the case of mmap'd dmabufs)
 		 */
 		fput(vma->vm_file);
-		get_file(obj->filp);
+		get_file(etnaviv_obj->base.filp);
 		vma->vm_pgoff = 0;
-		vma->vm_file = obj->filp;
+		vma->vm_file = etnaviv_obj->base.filp;
 
 		vma->vm_page_prot = vm_page_prot;
 	}
@@ -173,7 +172,7 @@ int etnaviv_gem_mmap(struct file *filp, struct vm_area_struct *vma)
 	}
 
 	obj = to_etnaviv_bo(vma->vm_private_data);
-	return etnaviv_gem_mmap_obj(vma->vm_private_data, vma);
+	return obj->ops->mmap(obj, vma);
 }
 
 int etnaviv_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -260,8 +259,32 @@ etnaviv_gem_get_vram_mapping(struct etnaviv_gem_object *obj,
 	return NULL;
 }
 
-int etnaviv_gem_get_iova(struct etnaviv_gpu *gpu,
-	struct drm_gem_object *obj, u32 *iova)
+void etnaviv_gem_mapping_reference(struct etnaviv_vram_mapping *mapping)
+{
+	struct etnaviv_gem_object *etnaviv_obj = mapping->object;
+
+	drm_gem_object_reference(&etnaviv_obj->base);
+
+	mutex_lock(&etnaviv_obj->lock);
+	WARN_ON(mapping->use == 0);
+	mapping->use += 1;
+	mutex_unlock(&etnaviv_obj->lock);
+}
+
+void etnaviv_gem_mapping_unreference(struct etnaviv_vram_mapping *mapping)
+{
+	struct etnaviv_gem_object *etnaviv_obj = mapping->object;
+
+	mutex_lock(&etnaviv_obj->lock);
+	WARN_ON(mapping->use == 0);
+	mapping->use -= 1;
+	mutex_unlock(&etnaviv_obj->lock);
+
+	drm_gem_object_unreference_unlocked(&etnaviv_obj->base);
+}
+
+struct etnaviv_vram_mapping *etnaviv_gem_mapping_get(
+	struct drm_gem_object *obj, struct etnaviv_gpu *gpu)
 {
 	struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj);
 	struct etnaviv_vram_mapping *mapping;
@@ -329,28 +352,12 @@ int etnaviv_gem_get_iova(struct etnaviv_gpu *gpu,
 out:
 	mutex_unlock(&etnaviv_obj->lock);
 
-	if (!ret) {
-		/* Take a reference on the object */
-		drm_gem_object_reference(obj);
-		*iova = mapping->iova;
-	}
-
-	return ret;
-}
-
-void etnaviv_gem_put_iova(struct etnaviv_gpu *gpu, struct drm_gem_object *obj)
-{
-	struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj);
-	struct etnaviv_vram_mapping *mapping;
-
-	mutex_lock(&etnaviv_obj->lock);
-	mapping = etnaviv_gem_get_vram_mapping(etnaviv_obj, gpu->mmu);
-
-	WARN_ON(mapping->use == 0);
-	mapping->use -= 1;
-	mutex_unlock(&etnaviv_obj->lock);
+	if (ret)
+		return ERR_PTR(ret);
 
-	drm_gem_object_unreference_unlocked(obj);
+	/* Take a reference on the object */
+	drm_gem_object_reference(obj);
+	return mapping;
 }
 
 void *etnaviv_gem_vmap(struct drm_gem_object *obj)
@@ -528,8 +535,7 @@ void etnaviv_gem_describe_objects(struct etnaviv_drm_private *priv,
 
 static void etnaviv_gem_shmem_release(struct etnaviv_gem_object *etnaviv_obj)
 {
-	if (etnaviv_obj->vaddr)
-		vunmap(etnaviv_obj->vaddr);
+	vunmap(etnaviv_obj->vaddr);
 	put_pages(etnaviv_obj);
 }
 
@@ -537,6 +543,7 @@ static const struct etnaviv_gem_ops etnaviv_gem_shmem_ops = {
 	.get_pages = etnaviv_gem_shmem_get_pages,
 	.release = etnaviv_gem_shmem_release,
 	.vmap = etnaviv_gem_vmap_impl,
+	.mmap = etnaviv_gem_mmap_obj,
 };
 
 void etnaviv_gem_free_object(struct drm_gem_object *obj)
@@ -652,7 +659,7 @@ static struct drm_gem_object *__etnaviv_gem_new(struct drm_device *dev,
 		 * why this is required _and_ expected if you're
 		 * going to pin these pages.
		 */
-		mapping = file_inode(obj->filp)->i_mapping;
+		mapping = obj->filp->f_mapping;
 		mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
 	}
 
@@ -662,9 +669,7 @@ static struct drm_gem_object *__etnaviv_gem_new(struct drm_device *dev,
 	return obj;
 
 fail:
-	if (obj)
-		drm_gem_object_unreference_unlocked(obj);
-
+	drm_gem_object_unreference_unlocked(obj);
 	return ERR_PTR(ret);
 }
 
@@ -743,11 +748,15 @@ static struct page **etnaviv_gem_userptr_do_get_pages(
 	int ret = 0, pinned, npages = etnaviv_obj->base.size >> PAGE_SHIFT;
 	struct page **pvec;
 	uintptr_t ptr;
+	unsigned int flags = 0;
 
 	pvec = drm_malloc_ab(npages, sizeof(struct page *));
 	if (!pvec)
 		return ERR_PTR(-ENOMEM);
 
+	if (!etnaviv_obj->userptr.ro)
+		flags |= FOLL_WRITE;
+
 	pinned = 0;
 	ptr = etnaviv_obj->userptr.ptr;
 
@@ -756,6 +765,9 @@ static struct page **etnaviv_gem_userptr_do_get_pages(
 		ret = get_user_pages(task, mm, ptr, npages - pinned,
 				     !etnaviv_obj->userptr.ro, 0,
 				     pvec + pinned, NULL);
+/*		ret = get_user_pages_remote(task, mm, ptr, npages - pinned,
+				     flags, pvec + pinned, NULL);
+*/
 		if (ret < 0)
 			break;
 
@@ -878,10 +890,17 @@ static void etnaviv_gem_userptr_release(struct etnaviv_gem_object *etnaviv_obj)
 		put_task_struct(etnaviv_obj->userptr.task);
 }
 
+static int etnaviv_gem_userptr_mmap_obj(struct etnaviv_gem_object *etnaviv_obj,
+		struct vm_area_struct *vma)
+{
+	return -EINVAL;
+}
+
 static const struct etnaviv_gem_ops etnaviv_gem_userptr_ops = {
 	.get_pages = etnaviv_gem_userptr_get_pages,
 	.release = etnaviv_gem_userptr_release,
 	.vmap = etnaviv_gem_vmap_impl,
+	.mmap = etnaviv_gem_userptr_mmap_obj,
 };
 
 int etnaviv_gem_new_userptr(struct drm_device *dev, struct drm_file *file,
@@ -901,15 +920,12 @@ int etnaviv_gem_new_userptr(struct drm_device *dev, struct drm_file *file,
 	get_task_struct(current);
 
 	ret = etnaviv_gem_obj_add(dev, &etnaviv_obj->base);
-	if (ret) {
-		drm_gem_object_unreference_unlocked(&etnaviv_obj->base);
-		return ret;
-	}
+	if (ret)
+		goto unreference;
 
 	ret = drm_gem_handle_create(file, &etnaviv_obj->base, handle);
-
+unreference:
 	/* drop reference from allocate - handle holds it now */
 	drm_gem_object_unreference_unlocked(&etnaviv_obj->base);
-
 	return ret;
 }
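A note on the etnaviv_buffer_replace_wait() helper added in etnaviv_buffer.c above: the two-store ordering is what makes the patching safe, since the front end may still be spinning on the WAIT while the kernel rewrites it. Below is a minimal userspace sketch of the same protocol; the fe_wait[] array and the opcode/argument values are invented stand-ins, and atomic_thread_fence() plays the role of the kernel's mb().

#include <stdatomic.h>
#include <stdint.h>

/* fe_wait[0] models the opcode of a WAIT the GPU may be executing right
 * now, fe_wait[1] its argument (hypothetical names, for illustration). */
static volatile uint32_t fe_wait[2];

static void replace_wait(uint32_t cmd, uint32_t arg)
{
	fe_wait[1] = arg;	/* publish the argument first; the old WAIT ignores it */
	atomic_thread_fence(memory_order_seq_cst);	/* stands in for mb() */
	fe_wait[0] = cmd;	/* only now flip the opcode: the FE sees a complete command */
	atomic_thread_fence(memory_order_seq_cst);
}

int main(void)
{
	replace_wait(0x40000000u /* e.g. a LINK header */, 0x1000u /* target */);
	return (int)(fe_wait[1] - 0x1000u);	/* 0 on success */
}

Writing the argument before the opcode means the FE can never fetch a LINK header paired with a stale branch target.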
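The wrap rule in etnaviv_buffer_reserve() above is easy to get wrong by one slot, so here is a self-contained model of the arithmetic. The struct, the base_va value and the buffer sizes are assumptions for the sketch, not driver state.

#include <assert.h>
#include <stdint.h>

/* Model of etnaviv_buffer_reserve(): 'user_size' is the write offset in a
 * ring of 'size' bytes; commands are written as 64-bit words and must not
 * wrap mid-stream, so a request that would overflow restarts at offset 0. */
struct ring { uint32_t base_va, size, user_size; };

static uint32_t ring_reserve(struct ring *r, unsigned int cmd_dwords)
{
	if (r->user_size + cmd_dwords * sizeof(uint64_t) > r->size)
		r->user_size = 0;
	return r->base_va + r->user_size;	/* GPU VA of the write point */
}

int main(void)
{
	struct ring r = { .base_va = 0x40000000, .size = 4096, .user_size = 4088 };

	/* 7 dwords (56 bytes) no longer fit before the end, so we wrap. */
	assert(ring_reserve(&r, 7) == 0x40000000);
	return 0;
}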
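etnaviv_gem.c above replaces the get_iova/put_iova pair with explicit reference counting on the vram mapping itself. A toy model of the invariant follows; plain counters stand in for the real drm_gem object references and the object mutex.

#include <assert.h>

/* A cmdbuf reference on a mapping pins both the mapping's use count and
 * the backing GEM object, so neither can go away while the GPU still
 * uses the IOVA. */
struct obj { int refcount; };
struct mapping { struct obj *object; int use; };

static void mapping_reference(struct mapping *m)
{
	m->object->refcount++;		/* drm_gem_object_reference() */
	assert(m->use > 0);		/* mirrors WARN_ON(mapping->use == 0) */
	m->use++;
}

static void mapping_unreference(struct mapping *m)
{
	assert(m->use > 0);
	m->use--;
	m->object->refcount--;		/* drm_gem_object_unreference_unlocked() */
}

int main(void)
{
	struct obj o = { .refcount = 1 };
	struct mapping m = { .object = &o, .use = 1 };	/* as mapping_get() returns it */

	mapping_reference(&m);		/* submit path */
	mapping_unreference(&m);	/* retire path */
	assert(o.refcount == 1 && m.use == 1);
	return 0;
}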
diff --git a/etnaviv_gem.h b/etnaviv_gem.h
index ab5df81..e63ff11 100644
--- a/etnaviv_gem.h
+++ b/etnaviv_gem.h
@@ -79,6 +79,7 @@ struct etnaviv_gem_ops {
 	int (*get_pages)(struct etnaviv_gem_object *);
 	void (*release)(struct etnaviv_gem_object *);
 	void *(*vmap)(struct etnaviv_gem_object *);
+	int (*mmap)(struct etnaviv_gem_object *, struct vm_area_struct *);
 };
 
 static inline bool is_active(struct etnaviv_gem_object *etnaviv_obj)
@@ -88,6 +89,12 @@ static inline bool is_active(struct etnaviv_gem_object *etnaviv_obj)
 
 #define MAX_CMDS 4
 
+struct etnaviv_gem_submit_bo {
+	u32 flags;
+	struct etnaviv_gem_object *obj;
+	struct etnaviv_vram_mapping *mapping;
+};
+
 /* Created per submit-ioctl, to track bo's and cmdstream bufs, etc,
  * associated with the cmdstream submission for synchronization (and
  * make it easier to unwind when things go wrong, etc).  This only
@@ -99,11 +106,7 @@ struct etnaviv_gem_submit {
 	struct ww_acquire_ctx ticket;
 	u32 fence;
 	unsigned int nr_bos;
-	struct {
-		u32 flags;
-		struct etnaviv_gem_object *obj;
-		u32 iova;
-	} bos[0];
+	struct etnaviv_gem_submit_bo bos[0];
 };
 
 int etnaviv_gem_wait_bo(struct etnaviv_gpu *gpu, struct drm_gem_object *obj,
@@ -115,4 +118,9 @@ int etnaviv_gem_obj_add(struct drm_device *dev, struct drm_gem_object *obj);
 struct page **etnaviv_gem_get_pages(struct etnaviv_gem_object *obj);
 void etnaviv_gem_put_pages(struct etnaviv_gem_object *obj);
 
+struct etnaviv_vram_mapping *etnaviv_gem_mapping_get(
+	struct drm_gem_object *obj, struct etnaviv_gpu *gpu);
+void etnaviv_gem_mapping_reference(struct etnaviv_vram_mapping *mapping);
+void etnaviv_gem_mapping_unreference(struct etnaviv_vram_mapping *mapping);
+
 #endif /* __ETNAVIV_GEM_H__ */
diff --git a/etnaviv_gem_prime.c b/etnaviv_gem_prime.c
index 4e67395..b93618c 100644
--- a/etnaviv_gem_prime.c
+++ b/etnaviv_gem_prime.c
@@ -84,10 +84,17 @@ static void *etnaviv_gem_prime_vmap_impl(struct etnaviv_gem_object *etnaviv_obj)
 	return dma_buf_vmap(etnaviv_obj->base.import_attach->dmabuf);
 }
 
+static int etnaviv_gem_prime_mmap_obj(struct etnaviv_gem_object *etnaviv_obj,
+		struct vm_area_struct *vma)
+{
+	return dma_buf_mmap(etnaviv_obj->base.dma_buf, vma, 0);
+}
+
 static const struct etnaviv_gem_ops etnaviv_gem_prime_ops = {
 	/* .get_pages should never be called */
 	.release = etnaviv_gem_prime_release,
 	.vmap = etnaviv_gem_prime_vmap_impl,
+	.mmap = etnaviv_gem_prime_mmap_obj,
 };
 
 struct drm_gem_object *etnaviv_gem_prime_import_sg_table(struct drm_device *dev,
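etnaviv_gem_submit.c below replaces the open-coded to_user_ptr() with a u64_to_user_ptr() macro. The double cast matters because user pointers cross the ioctl ABI as __u64 so that 32-bit and 64-bit userspace share one struct layout. A plain C sketch of the round trip, with ordinary pointers standing in for __user pointers:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	int object;

	/* as userspace fills e.g. drm_etnaviv_gem_submit::stream */
	uint64_t abi_value = (uintptr_t)&object;

	/* as the kernel side recovers it: u64 -> uintptr_t -> pointer,
	 * avoiding truncation or sign-extension surprises */
	void *ptr = (void *)(uintptr_t)abi_value;

	assert(ptr == (void *)&object);
	return 0;
}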
diff --git a/etnaviv_gem_submit.c b/etnaviv_gem_submit.c
index 1aba01a..44454c5 100644
--- a/etnaviv_gem_submit.c
+++ b/etnaviv_gem_submit.c
@@ -19,6 +19,13 @@
 #include "etnaviv_gpu.h"
 #include "etnaviv_gem.h"
 
+
+#define u64_to_user_ptr(x) (		\
+{					\
+	typecheck(u64, x);		\
+	(void __user *)(uintptr_t)x;	\
+}					\
+)
 /*
  * Cmdstream submission:
  */
@@ -28,11 +35,6 @@
 #define BO_LOCKED   0x4000
 #define BO_PINNED   0x2000
 
-static inline void __user *to_user_ptr(u64 address)
-{
-	return (void __user *)(uintptr_t)address;
-}
-
 static struct etnaviv_gem_submit *submit_create(struct drm_device *dev,
 		struct etnaviv_gpu *gpu, size_t nr)
 {
@@ -187,12 +189,10 @@ static void submit_unpin_objects(struct etnaviv_gem_submit *submit)
 	int i;
 
 	for (i = 0; i < submit->nr_bos; i++) {
-		struct etnaviv_gem_object *etnaviv_obj = submit->bos[i].obj;
-
 		if (submit->bos[i].flags & BO_PINNED)
-			etnaviv_gem_put_iova(submit->gpu, &etnaviv_obj->base);
+			etnaviv_gem_mapping_unreference(submit->bos[i].mapping);
 
-		submit->bos[i].iova = 0;
+		submit->bos[i].mapping = NULL;
 		submit->bos[i].flags &= ~BO_PINNED;
 	}
 }
@@ -203,22 +203,24 @@ static int submit_pin_objects(struct etnaviv_gem_submit *submit)
 
 	for (i = 0; i < submit->nr_bos; i++) {
 		struct etnaviv_gem_object *etnaviv_obj = submit->bos[i].obj;
-		u32 iova;
+		struct etnaviv_vram_mapping *mapping;
 
-		ret = etnaviv_gem_get_iova(submit->gpu, &etnaviv_obj->base,
-					   &iova);
-		if (ret)
+		mapping = etnaviv_gem_mapping_get(&etnaviv_obj->base,
+						  submit->gpu);
+		if (IS_ERR(mapping)) {
+			ret = PTR_ERR(mapping);
 			break;
+		}
 
 		submit->bos[i].flags |= BO_PINNED;
-		submit->bos[i].iova = iova;
+		submit->bos[i].mapping = mapping;
 	}
 
 	return ret;
 }
 
 static int submit_bo(struct etnaviv_gem_submit *submit, u32 idx,
-	struct etnaviv_gem_object **obj, u32 *iova)
+	struct etnaviv_gem_submit_bo **bo)
 {
 	if (idx >= submit->nr_bos) {
 		DRM_ERROR("invalid buffer index: %u (out of %u)\n",
@@ -226,10 +228,7 @@ static int submit_bo(struct etnaviv_gem_submit *submit, u32 idx,
 		return -EINVAL;
 	}
 
-	if (obj)
-		*obj = submit->bos[idx].obj;
-	if (iova)
-		*iova = submit->bos[idx].iova;
+	*bo = &submit->bos[idx];
 
 	return 0;
 }
@@ -245,8 +244,8 @@ static int submit_reloc(struct etnaviv_gem_submit *submit, void *stream,
 
 	for (i = 0; i < nr_relocs; i++) {
 		const struct drm_etnaviv_gem_submit_reloc *r = relocs + i;
-		struct etnaviv_gem_object *bobj;
-		u32 iova, off;
+		struct etnaviv_gem_submit_bo *bo;
+		u32 off;
 
 		if (unlikely(r->flags)) {
 			DRM_ERROR("invalid reloc flags\n");
@@ -268,17 +267,16 @@ static int submit_reloc(struct etnaviv_gem_submit *submit, void *stream,
 			return -EINVAL;
 		}
 
-		ret = submit_bo(submit, r->reloc_idx, &bobj, &iova);
+		ret = submit_bo(submit, r->reloc_idx, &bo);
 		if (ret)
 			return ret;
 
-		if (r->reloc_offset >=
-		    bobj->base.size - sizeof(*ptr)) {
+		if (r->reloc_offset >= bo->obj->base.size - sizeof(*ptr)) {
 			DRM_ERROR("relocation %u outside object", i);
 			return -EINVAL;
 		}
 
-		ptr[off] = iova + r->reloc_offset;
+		ptr[off] = bo->mapping->iova + r->reloc_offset;
 
 		last_offset = off;
 	}
@@ -351,21 +349,21 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data,
 	cmdbuf->exec_state = args->exec_state;
 	cmdbuf->ctx = file->driver_priv;
 
-	ret = copy_from_user(bos, to_user_ptr(args->bos),
+	ret = copy_from_user(bos, u64_to_user_ptr(args->bos),
 			     args->nr_bos * sizeof(*bos));
 	if (ret) {
 		ret = -EFAULT;
 		goto err_submit_cmds;
 	}
 
-	ret = copy_from_user(relocs, to_user_ptr(args->relocs),
+	ret = copy_from_user(relocs, u64_to_user_ptr(args->relocs),
 			     args->nr_relocs * sizeof(*relocs));
 	if (ret) {
 		ret = -EFAULT;
 		goto err_submit_cmds;
 	}
 
-	ret = copy_from_user(stream, to_user_ptr(args->stream),
+	ret = copy_from_user(stream, u64_to_user_ptr(args->stream),
 			     args->stream_size);
 	if (ret) {
 		ret = -EFAULT;
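etnaviv_gpu_init() in etnaviv_gpu.c below stops hard-coding the linear window base and instead derives it from the DMA mask, placing the 2 GiB window at the end of the DMA-addressable range where the CMA area tends to live. A worked example of that arithmetic; the PHYS_OFFSET value is assumed for illustration:

#include <stdint.h>
#include <stdio.h>

#define SZ_2G		0x80000000u
#define PHYS_OFFSET	0x10000000u	/* assumed SoC RAM base, for the sketch */

/* Model of the linear-window placement: use the RAM base when everything
 * already fits under base + 2 GiB, otherwise slide the window up so it
 * ends exactly at the DMA mask. */
static uint32_t pick_memory_base(uint32_t dma_mask)
{
	if (dma_mask < PHYS_OFFSET + SZ_2G)
		return PHYS_OFFSET;
	return dma_mask - SZ_2G + 1;
}

int main(void)
{
	/* 32-bit DMA mask: window covers 0x80000000..0xffffffff */
	printf("0x%08x\n", pick_memory_base(0xffffffffu));	/* 0x80000000 */
	/* 30-bit mask (1 GiB): window starts at the RAM base instead */
	printf("0x%08x\n", pick_memory_base(0x3fffffffu));	/* 0x10000000 */
	return 0;
}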
diff --git a/etnaviv_gpu.c b/etnaviv_gpu.c
index 3a6eace..b430f30 100644
--- a/etnaviv_gpu.c
+++ b/etnaviv_gpu.c
@@ -18,20 +18,17 @@
 #include
 #include
 #include
-
-//#define PHYS_OFFSET 0x80000000
-
 #include "etnaviv_dump.h"
 #include "etnaviv_gpu.h"
 #include "etnaviv_gem.h"
 #include "etnaviv_mmu.h"
-#include "etnaviv_iommu.h"
-#include "etnaviv_iommu_v2.h"
 #include "common.xml.h"
 #include "state.xml.h"
 #include "state_hi.xml.h"
 #include "cmdstream.xml.h"
 
+//#define PHYS_OFFSET 0x80000000
+
 static const struct platform_device_id gpu_ids[] = {
 	{ .name = "etnaviv-gpu,2d" },
 	{ },
@@ -332,6 +329,18 @@ static void etnaviv_hw_identify(struct etnaviv_gpu *gpu)
 				gpu->identity.revision = 0x1051;
 		}
 	}
+
+	/*
+	 * NXP likes to call the GPU on the i.MX6QP GC2000+, but in
+	 * reality it's just a re-branded GC3000. We can identify this
+	 * core by the upper half of the revision register being all 1.
+	 * Fix model/rev here, so all other places can refer to this
+	 * core by its real identity.
	 */
+	if (etnaviv_is_model_rev(gpu, GC2000, 0xffff5450)) {
+		gpu->identity.model = chipModel_GC3000;
+		gpu->identity.revision &= 0xffff;
+	}
 
 	dev_info(gpu->dev, "model: GC%x, revision: %x\n",
@@ -490,6 +499,46 @@ static int etnaviv_hw_reset(struct etnaviv_gpu *gpu)
 	return 0;
 }
 
+static void etnaviv_gpu_enable_mlcg(struct etnaviv_gpu *gpu)
+{
+	u32 pmc, ppc;
+
+	/* enable clock gating */
+	ppc = gpu_read(gpu, VIVS_PM_POWER_CONTROLS);
+	ppc |= VIVS_PM_POWER_CONTROLS_ENABLE_MODULE_CLOCK_GATING;
+
+	/* Disable stall module clock gating for 4.3.0.1 and 4.3.0.2 revs */
+	if (gpu->identity.revision == 0x4301 ||
+	    gpu->identity.revision == 0x4302)
+		ppc |= VIVS_PM_POWER_CONTROLS_DISABLE_STALL_MODULE_CLOCK_GATING;
+
+	gpu_write(gpu, VIVS_PM_POWER_CONTROLS, ppc);
+
+	pmc = gpu_read(gpu, VIVS_PM_MODULE_CONTROLS);
+
+	/* Disable PA clock gating for GC400+ except for GC420 */
+	if (gpu->identity.model >= chipModel_GC400 &&
+	    gpu->identity.model != chipModel_GC420)
+		pmc |= VIVS_PM_MODULE_CONTROLS_DISABLE_MODULE_CLOCK_GATING_PA;
+
+	/*
+	 * Disable PE clock gating on revs < 5.0.0.0 when HZ is
+	 * present without a bug fix.
+	 */
+	if (gpu->identity.revision < 0x5000 &&
+	    gpu->identity.minor_features0 & chipMinorFeatures0_HZ &&
+	    !(gpu->identity.minor_features1 &
+	      chipMinorFeatures1_DISABLE_PE_GATING))
+		pmc |= VIVS_PM_MODULE_CONTROLS_DISABLE_MODULE_CLOCK_GATING_PE;
+
+	if (gpu->identity.revision < 0x5422)
+		pmc |= BIT(15); /* Unknown bit */
+
+	pmc |= VIVS_PM_MODULE_CONTROLS_DISABLE_MODULE_CLOCK_GATING_RA_HZ;
+	pmc |= VIVS_PM_MODULE_CONTROLS_DISABLE_MODULE_CLOCK_GATING_RA_EZ;
+
+	gpu_write(gpu, VIVS_PM_MODULE_CONTROLS, pmc);
+}
 
 void etnaviv_gpu_start_fe(struct etnaviv_gpu *gpu, u32 address, u16 prefetch)
 {
@@ -499,7 +548,6 @@ void etnaviv_gpu_start_fe(struct etnaviv_gpu *gpu, u32 address, u16 prefetch)
 		  VIVS_FE_COMMAND_CONTROL_PREFETCH(prefetch));
 }
 
-
 static void etnaviv_gpu_hw_init(struct etnaviv_gpu *gpu)
 {
 	u16 prefetch;
@@ -519,6 +567,9 @@ static void etnaviv_gpu_hw_init(struct etnaviv_gpu *gpu)
 		gpu_write(gpu, VIVS_MC_DEBUG_MEMORY, mc_memory_debug);
 	}
 
+	/* enable module-level clock gating */
+	etnaviv_gpu_enable_mlcg(gpu);
+
 	/*
 	 * Update GPU AXI cache atttribute to "cacheable, no allocate".
 	 * This is necessary to prevent the iMX6 SoC locking up.
@@ -537,37 +588,26 @@ static void etnaviv_gpu_hw_init(struct etnaviv_gpu *gpu)
 		gpu_write(gpu, VIVS_MC_BUS_CONFIG, bus_config);
 	}
 
-	/* set base addresses */
-	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_RA, gpu->memory_base);
-	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_FE, gpu->memory_base);
-	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_TX, gpu->memory_base);
-	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_PEZ, gpu->memory_base);
-	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_PE, gpu->memory_base);
-
-	/* setup the MMU page table pointers */
-	etnaviv_iommu_domain_restore(gpu, gpu->mmu->domain);
+	/* setup the MMU */
+	etnaviv_iommu_restore(gpu);
 
 	/* Start command processor */
 	prefetch = etnaviv_buffer_init(gpu);
 
 	gpu_write(gpu, VIVS_HI_INTR_ENBL, ~0U);
-	gpu_write(gpu, VIVS_FE_COMMAND_ADDRESS,
-		  gpu->buffer->paddr - gpu->memory_base);
-	gpu_write(gpu, VIVS_FE_COMMAND_CONTROL,
-		  VIVS_FE_COMMAND_CONTROL_ENABLE |
-		  VIVS_FE_COMMAND_CONTROL_PREFETCH(prefetch));
+	etnaviv_gpu_start_fe(gpu, etnaviv_iommu_get_cmdbuf_va(gpu, gpu->buffer),
+			     prefetch);
 }
 
 int etnaviv_gpu_init(struct etnaviv_gpu *gpu)
 {
 	int ret, i;
-	struct iommu_domain *iommu;
-	enum etnaviv_iommu_version version;
-	bool mmuv2;
 
 	ret = pm_runtime_get_sync(gpu->dev);
-	if (ret < 0)
+	if (ret < 0) {
+		dev_err(gpu->dev, "Failed to enable GPU power domain\n");
 		return ret;
+	}
 
 	etnaviv_hw_identify(gpu);
@@ -585,36 +625,34 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu)
 		goto fail;
 	}
 
-	ret = etnaviv_hw_reset(gpu);
-	if (ret)
-		goto fail;
-
-	/* Setup IOMMU.. eventually we will (I think) do this once per context
-	 * and have separate page tables per context.  For now, to keep things
-	 * simple and to get something working, just use a single address space:
+	/*
+	 * Set the GPU linear window to be at the end of the DMA window, where
+	 * the CMA area is likely to reside. This ensures that we are able to
+	 * map the command buffers while having the linear window overlap as
+	 * much RAM as possible, so we can optimize mappings for other buffers.
+	 *
+	 * For 3D cores only do this if MC2.0 is present, as with MC1.0 it leads
+	 * to different views of the memory on the individual engines.
	 */
-	mmuv2 = gpu->identity.minor_features1 & chipMinorFeatures1_MMU_VERSION;
-	dev_info(gpu->dev, "mmuv2: %d\n", mmuv2);
-
-	if (!mmuv2) {
-		iommu = etnaviv_iommu_domain_alloc(gpu);
-		version = ETNAVIV_IOMMU_V1;
-	} else {
-		iommu = etnaviv_iommuv2_domain_alloc(gpu);
-		version = ETNAVIV_IOMMU_V2;
+	if (!(gpu->identity.features & chipFeatures_PIPE_3D) ||
+	    (gpu->identity.minor_features0 & chipMinorFeatures0_MC20)) {
+		u32 dma_mask = (u32)dma_get_required_mask(gpu->dev);
+		if (dma_mask < PHYS_OFFSET + SZ_2G)
+			gpu->memory_base = PHYS_OFFSET;
+		else
+			gpu->memory_base = dma_mask - SZ_2G + 1;
 	}
 
-	if (!iommu) {
-		dev_err(gpu->dev, "no mmu\n");
-		ret = -ENOMEM;
+	ret = etnaviv_hw_reset(gpu);
+	if (ret) {
+		dev_err(gpu->dev, "GPU reset failed\n");
 		goto fail;
 	}
 
-	gpu->mmu = etnaviv_iommu_new(gpu, iommu, version);
-	if (!gpu->mmu) {
-		dev_err(gpu->dev, "mmu init fail\n");
-		iommu_domain_free(iommu);
-		ret = -ENOMEM;
+	gpu->mmu = etnaviv_iommu_new(gpu);
+	if (IS_ERR(gpu->mmu)) {
+		dev_err(gpu->dev, "Failed to instantiate GPU IOMMU\n");
+		ret = PTR_ERR(gpu->mmu);
 		goto fail;
 	}
 
@@ -625,7 +663,9 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu)
 		dev_err(gpu->dev, "could not create command buffer\n");
 		goto destroy_iommu;
 	}
-	if (gpu->buffer->paddr - gpu->memory_base > 0x80000000) {
+
+	if (gpu->mmu->version == ETNAVIV_IOMMU_V1 &&
+	    gpu->buffer->paddr - gpu->memory_base > 0x80000000) {
 		ret = -EINVAL;
 		dev_err(gpu->dev,
 			"command buffer outside valid memory window\n");
@@ -643,6 +683,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu)
 	/* Now program the hardware */
 	mutex_lock(&gpu->lock);
 	etnaviv_gpu_hw_init(gpu);
+	gpu->exec_state = -1;
 	mutex_unlock(&gpu->lock);
 
 	pm_runtime_mark_last_busy(gpu->dev);
@@ -792,9 +833,9 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m)
 		   debug.state[0] == debug.state[1]) {
 		seq_puts(m, "seems to be stuck\n");
 	} else if (debug.address[0] == debug.address[1]) {
-		seq_puts(m, "adress is constant\n");
+		seq_puts(m, "address is constant\n");
 	} else {
-		seq_puts(m, "is runing\n");
+		seq_puts(m, "is running\n");
 	}
 
 	seq_printf(m, "\t address 0: 0x%08x\n", debug.address[0]);
@@ -813,45 +854,6 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m)
 }
 #endif
 
-/*
- * Power Management:
- */
-static int enable_clk(struct etnaviv_gpu *gpu)
-{
-	if (gpu->clk_core)
-		clk_prepare_enable(gpu->clk_core);
-	if (gpu->clk_shader)
-		clk_prepare_enable(gpu->clk_shader);
-
-	return 0;
-}
-
-static int disable_clk(struct etnaviv_gpu *gpu)
-{
-	if (gpu->clk_core)
-		clk_disable_unprepare(gpu->clk_core);
-	if (gpu->clk_shader)
-		clk_disable_unprepare(gpu->clk_shader);
-
-	return 0;
-}
-
-static int enable_axi(struct etnaviv_gpu *gpu)
-{
-	if (gpu->clk_bus)
-		clk_prepare_enable(gpu->clk_bus);
-
-	return 0;
-}
-
-static int disable_axi(struct etnaviv_gpu *gpu)
-{
-	if (gpu->clk_bus)
-		clk_disable_unprepare(gpu->clk_bus);
-
-	return 0;
-}
-
 /*
  * Hangcheck detection for locked gpu:
  */
@@ -886,17 +888,13 @@ static void recover_worker(struct work_struct *work)
 		gpu->event[i].fence = NULL;
 		gpu->event[i].used = false;
 		complete(&gpu->event_free);
-		/*
-		 * Decrement the PM count for each stuck event. This is safe
-		 * even in atomic context as we use ASYNC RPM here.
		 */
-		pm_runtime_put_autosuspend(gpu->dev);
 	}
 	spin_unlock_irqrestore(&gpu->event_spinlock, flags);
 	gpu->completed_fence = gpu->active_fence;
 
 	etnaviv_gpu_hw_init(gpu);
-	gpu->switch_context = true;
+	gpu->lastctx = NULL;
+	gpu->exec_state = -1;
 
 	mutex_unlock(&gpu->lock);
 	pm_runtime_mark_last_busy(gpu->dev);
@@ -1121,15 +1119,18 @@ struct etnaviv_cmdbuf *etnaviv_gpu_cmdbuf_new(struct etnaviv_gpu *gpu, u32 size,
 	size_t nr_bos)
 {
 	struct etnaviv_cmdbuf *cmdbuf;
-	size_t sz = size_vstruct(nr_bos, sizeof(cmdbuf->bo[0]),
+	size_t sz = size_vstruct(nr_bos, sizeof(cmdbuf->bo_map[0]),
				 sizeof(*cmdbuf));
 
 	cmdbuf = kzalloc(sz, GFP_KERNEL);
 	if (!cmdbuf)
 		return NULL;
 
+	if (gpu->mmu->version == ETNAVIV_IOMMU_V2)
+		size = ALIGN(size, SZ_4K);
+
 	cmdbuf->vaddr = dma_alloc_writecombine(gpu->dev, size, &cmdbuf->paddr,
-			GFP_KERNEL);
+					       GFP_KERNEL);
 	if (!cmdbuf->vaddr) {
 		kfree(cmdbuf);
 		return NULL;
@@ -1143,8 +1144,9 @@ struct etnaviv_cmdbuf *etnaviv_gpu_cmdbuf_new(struct etnaviv_gpu *gpu, u32 size,
 
 void etnaviv_gpu_cmdbuf_free(struct etnaviv_cmdbuf *cmdbuf)
 {
-	dma_free_writecombine(cmdbuf->gpu->dev, cmdbuf->size,
-			      cmdbuf->vaddr, cmdbuf->paddr);
+	etnaviv_iommu_put_cmdbuf_va(cmdbuf->gpu, cmdbuf);
+	dma_free_writecombine(cmdbuf->gpu->dev, cmdbuf->size, cmdbuf->vaddr,
+			      cmdbuf->paddr);
 	kfree(cmdbuf);
 }
 
@@ -1165,14 +1167,23 @@ static void retire_worker(struct work_struct *work)
 		fence_put(cmdbuf->fence);
 
 		for (i = 0; i < cmdbuf->nr_bos; i++) {
-			struct etnaviv_gem_object *etnaviv_obj = cmdbuf->bo[i];
+			struct etnaviv_vram_mapping *mapping = cmdbuf->bo_map[i];
+			struct etnaviv_gem_object *etnaviv_obj = mapping->object;
 
 			atomic_dec(&etnaviv_obj->gpu_active);
 			/* drop the refcount taken in etnaviv_gpu_submit */
-			etnaviv_gem_put_iova(gpu, &etnaviv_obj->base);
+			etnaviv_gem_mapping_unreference(mapping);
 		}
 
 		etnaviv_gpu_cmdbuf_free(cmdbuf);
+		/*
+		 * We need to balance the runtime PM count caused by
+		 * each submission.  Upon submission, we increment
+		 * the runtime PM counter, and allocate one event.
+		 * So here, we put the runtime PM count for each
+		 * completed event.
		 */
+		pm_runtime_put_autosuspend(gpu->dev);
 	}
 
 	gpu->retired_fence = fence;
@@ -1274,8 +1285,6 @@ int etnaviv_gpu_submit(struct etnaviv_gpu *gpu,
 	if (ret < 0)
 		return ret;
 
-	mutex_lock(&gpu->lock);
-
 	/*
 	 * TODO
 	 *
@@ -1289,16 +1298,18 @@ int etnaviv_gpu_submit(struct etnaviv_gpu *gpu,
 	if (unlikely(event == ~0U)) {
 		DRM_ERROR("no free event\n");
 		ret = -EBUSY;
-		goto out_unlock;
+		goto out_pm_put;
 	}
 
 	fence = etnaviv_gpu_fence_alloc(gpu);
 	if (!fence) {
 		event_free(gpu, event);
 		ret = -ENOMEM;
-		goto out_unlock;
+		goto out_pm_put;
 	}
 
+	mutex_lock(&gpu->lock);
+
 	gpu->event[event].fence = fence;
 	submit->fence = fence->seqno;
 	gpu->active_fence = submit->fence;
@@ -1319,11 +1330,10 @@ int etnaviv_gpu_submit(struct etnaviv_gpu *gpu,
 
 	for (i = 0; i < submit->nr_bos; i++) {
 		struct etnaviv_gem_object *etnaviv_obj = submit->bos[i].obj;
-		u32 iova;
 
-		/* Each cmdbuf takes a refcount on the iova */
-		etnaviv_gem_get_iova(gpu, &etnaviv_obj->base, &iova);
-		cmdbuf->bo[i] = etnaviv_obj;
+		/* Each cmdbuf takes a refcount on the mapping */
+		etnaviv_gem_mapping_reference(submit->bos[i].mapping);
+		cmdbuf->bo_map[i] = submit->bos[i].mapping;
 		atomic_inc(&etnaviv_obj->gpu_active);
 
 		if (submit->bos[i].flags & ETNA_SUBMIT_BO_WRITE)
@@ -1337,9 +1347,9 @@ int etnaviv_gpu_submit(struct etnaviv_gpu *gpu,
 	hangcheck_timer_reset(gpu);
 	ret = 0;
 
-out_unlock:
 	mutex_unlock(&gpu->lock);
 
+out_pm_put:
 	etnaviv_gpu_pm_put(gpu);
 
 	return ret;
@@ -1367,6 +1377,21 @@ static irqreturn_t irq_handler(int irq, void *data)
 			intr &= ~VIVS_HI_INTR_ACKNOWLEDGE_AXI_BUS_ERROR;
 		}
 
+		if (intr & VIVS_HI_INTR_ACKNOWLEDGE_MMU_EXCEPTION) {
+			int i;
+
+			dev_err_ratelimited(gpu->dev,
+				"MMU fault status 0x%08x\n",
+				gpu_read(gpu, VIVS_MMUv2_STATUS));
+			for (i = 0; i < 4; i++) {
+				dev_err_ratelimited(gpu->dev,
+					"MMU %d fault addr 0x%08x\n",
+					i, gpu_read(gpu,
+					VIVS_MMUv2_EXCEPTION_ADDR(i)));
+			}
+			intr &= ~VIVS_HI_INTR_ACKNOWLEDGE_MMU_EXCEPTION;
+		}
+
 		while ((event = ffs(intr)) != 0) {
 			struct fence *fence;
 
@@ -1393,15 +1418,6 @@ static irqreturn_t irq_handler(int irq, void *data)
 			gpu->completed_fence = fence->seqno;
 
 			event_free(gpu, event);
-
-			/*
-			 * We need to balance the runtime PM count caused by
-			 * each submission. Upon submission, we increment
-			 * the runtime PM counter, and allocate one event.
-			 * So here, we put the runtime PM count for each
-			 * completed event.
			 */
-			pm_runtime_put_autosuspend(gpu->dev);
 		}
 
 		/* Retire the buffer objects in a work */
@@ -1417,30 +1433,44 @@ static int etnaviv_gpu_clk_enable(struct etnaviv_gpu *gpu)
 {
 	int ret;
 
-	ret = enable_clk(gpu);
-	if (ret)
-		return ret;
+	if (gpu->clk_bus) {
+		ret = clk_prepare_enable(gpu->clk_bus);
+		if (ret)
+			return ret;
+	}
 
-	ret = enable_axi(gpu);
-	if (ret) {
-		disable_clk(gpu);
-		return ret;
+	if (gpu->clk_core) {
+		ret = clk_prepare_enable(gpu->clk_core);
+		if (ret)
+			goto disable_clk_bus;
+	}
+
+	if (gpu->clk_shader) {
+		ret = clk_prepare_enable(gpu->clk_shader);
+		if (ret)
+			goto disable_clk_core;
 	}
 
 	return 0;
+
+disable_clk_core:
+	if (gpu->clk_core)
+		clk_disable_unprepare(gpu->clk_core);
+disable_clk_bus:
+	if (gpu->clk_bus)
+		clk_disable_unprepare(gpu->clk_bus);
+
+	return ret;
 }
 
 static int etnaviv_gpu_clk_disable(struct etnaviv_gpu *gpu)
 {
-	int ret;
-
-	ret = disable_axi(gpu);
-	if (ret)
-		return ret;
-
-	ret = disable_clk(gpu);
-	if (ret)
-		return ret;
+	if (gpu->clk_shader)
+		clk_disable_unprepare(gpu->clk_shader);
+	if (gpu->clk_core)
+		clk_disable_unprepare(gpu->clk_core);
+	if (gpu->clk_bus)
+		clk_disable_unprepare(gpu->clk_bus);
 
 	return 0;
 }
@@ -1469,8 +1499,6 @@ int etnaviv_gpu_wait_idle(struct etnaviv_gpu *gpu, unsigned int timeout_ms)
 static int etnaviv_gpu_hw_suspend(struct etnaviv_gpu *gpu)
 {
 	if (gpu->buffer) {
-		unsigned long timeout;
-
 		/* Replace the last WAIT with END */
 		etnaviv_buffer_end(gpu);
 
@@ -1479,22 +1507,7 @@ static int etnaviv_gpu_hw_suspend(struct etnaviv_gpu *gpu)
 		 * happen quickly (as the WAIT is only 200 cycles).  If
 		 * we fail, just warn and continue.
		 */
-		timeout = jiffies + msecs_to_jiffies(100);
-		do {
-			u32 idle = gpu_read(gpu, VIVS_HI_IDLE_STATE);
-
-			if ((idle & gpu->idle_mask) == gpu->idle_mask)
-				break;
-
-			if (time_is_before_jiffies(timeout)) {
-				dev_warn(gpu->dev,
-					 "timed out waiting for idle: idle=0x%x\n",
-					 idle);
-				break;
-			}
-
-			udelay(5);
-		} while (1);
+		etnaviv_gpu_wait_idle(gpu, 100);
 	}
 
 	return etnaviv_gpu_clk_disable(gpu);
@@ -1517,6 +1530,7 @@ static int etnaviv_gpu_hw_resume(struct etnaviv_gpu *gpu)
 	etnaviv_gpu_hw_init(gpu);
 
 	gpu->switch_context = true;
+	gpu->exec_state = -1;
 
 	mutex_unlock(&gpu->lock);
 
@@ -1551,7 +1565,9 @@ static int etnaviv_gpu_bind(struct device *dev, struct device *master,
 
 	setup_timer(&gpu->hangcheck_timer, hangcheck_handler,
 			(unsigned long)gpu);
-
+/*	setup_deferrable_timer(&gpu->hangcheck_timer, hangcheck_handler,
+			(unsigned long)gpu);
+*/
 	priv->gpu[priv->num_gpus++] = gpu;
 
 	pm_runtime_mark_last_busy(gpu->dev);
@@ -1605,7 +1621,7 @@ static int etnaviv_gpu_platform_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
 	struct etnaviv_gpu *gpu;
-	int err = 0;
+	int err;
 
 	gpu = devm_kzalloc(dev, sizeof(*gpu), GFP_KERNEL);
 	if (!gpu)
@@ -1614,14 +1630,6 @@ static int etnaviv_gpu_platform_probe(struct platform_device *pdev)
 	gpu->dev = &pdev->dev;
 	mutex_init(&gpu->lock);
 
-	/*
-	 * Set the GPU base address to the start of physical memory. This
-	 * ensures that if we have up to 2GB, the v1 MMU can address the
-	 * highest memory. This is important as command buffers may be
-	 * allocated outside of this limit.
	 */
-	gpu->memory_base = PHYS_OFFSET;
-
 	/* Map registers: */
 	gpu->mmio = etnaviv_ioremap(pdev, NULL, dev_name(gpu->dev));
 	if (IS_ERR(gpu->mmio))
@@ -1630,16 +1638,15 @@ static int etnaviv_gpu_platform_probe(struct platform_device *pdev)
 	/* Get Interrupt: */
 	gpu->irq = platform_get_irq(pdev, 0);
 	if (gpu->irq < 0) {
-		err = gpu->irq;
-		dev_err(dev, "failed to get irq: %d\n", err);
-		goto fail;
+		dev_err(dev, "failed to get irq: %d\n", gpu->irq);
+		return gpu->irq;
 	}
 
 	err = devm_request_irq(&pdev->dev, gpu->irq, irq_handler, 0,
			       dev_name(gpu->dev), gpu);
 	if (err) {
 		dev_err(dev, "failed to request IRQ%u: %d\n", gpu->irq, err);
-		goto fail;
+		return err;
 	}
 
 	/* Get Clocks: */
@@ -1673,13 +1680,10 @@ static int etnaviv_gpu_platform_probe(struct platform_device *pdev)
 	err = component_add(&pdev->dev, &gpu_ops);
 	if (err < 0) {
 		dev_err(&pdev->dev, "failed to register component: %d\n", err);
-		goto fail;
+		return err;
 	}
 
 	return 0;
-
-fail:
-	return err;
 }
 
 static int etnaviv_gpu_platform_remove(struct platform_device *pdev)
diff --git a/etnaviv_gpu.h b/etnaviv_gpu.h
index 17abaa0..73c278d 100644
--- a/etnaviv_gpu.h
+++ b/etnaviv_gpu.h
@@ -23,6 +23,7 @@
 #include "etnaviv_drv.h"
 
 struct etnaviv_gem_submit;
+struct etnaviv_vram_mapping;
 
 struct etnaviv_chip_identity {
 	/* Chip model. */
@@ -103,6 +104,7 @@ struct etnaviv_gpu {
 
 	/* 'ring'-buffer: */
 	struct etnaviv_cmdbuf *buffer;
+	int exec_state;
 
 	/* bus base address of memory  */
 	u32 memory_base;
@@ -123,7 +125,7 @@ struct etnaviv_gpu {
 	u32 completed_fence;
 	u32 retired_fence;
 	wait_queue_head_t fence_event;
-	unsigned int fence_context;
+	u64 fence_context;
 	spinlock_t fence_spinlock;
 
 	/* worker for handling active-list retiring: */
@@ -158,6 +160,8 @@ struct etnaviv_cmdbuf {
 	dma_addr_t paddr;
 	u32 size;
 	u32 user_size;
+	/* vram node used if the cmdbuf is mapped through the MMUv2 */
+	struct drm_mm_node vram_node;
 	/* fence after which this buffer is to be disposed */
 	struct fence *fence;
 	/* target exec state */
@@ -166,7 +170,7 @@ struct etnaviv_cmdbuf {
 	struct list_head node;
 	/* BOs attached to this command buffer */
 	unsigned int nr_bos;
-	struct etnaviv_gem_object *bo[0];
+	struct etnaviv_vram_mapping *bo_map[0];
 };
 
 static inline void gpu_write(struct etnaviv_gpu *gpu, u32 reg, u32 data)
diff --git a/etnaviv_iommu.c b/etnaviv_iommu.c
index 522cfd4..f769af7 100644
--- a/etnaviv_iommu.c
+++ b/etnaviv_iommu.c
@@ -196,12 +196,19 @@ static struct etnaviv_iommu_ops etnaviv_iommu_ops = {
 	.dump = etnaviv_iommuv1_dump,
 };
 
-void etnaviv_iommu_domain_restore(struct etnaviv_gpu *gpu,
-	struct iommu_domain *domain)
+void etnaviv_iommuv1_restore(struct etnaviv_gpu *gpu)
 {
-	struct etnaviv_iommu_domain *etnaviv_domain = to_etnaviv_domain(domain);
+	struct etnaviv_iommu_domain *etnaviv_domain =
+			to_etnaviv_domain(gpu->mmu->domain);
 	u32 pgtable;
 
+	/* set base addresses */
+	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_RA, gpu->memory_base);
+	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_FE, gpu->memory_base);
+	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_TX, gpu->memory_base);
+	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_PEZ, gpu->memory_base);
+	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_PE, gpu->memory_base);
+
 	/* set page table address in MC */
 	pgtable = (u32)etnaviv_domain->pgtable.paddr;
 
@@ -212,7 +219,7 @@ void etnaviv_iommu_domain_restore(struct etnaviv_gpu *gpu,
 	gpu_write(gpu, VIVS_MC_MMU_RA_PAGE_TABLE, pgtable);
 }
 
-struct iommu_domain *etnaviv_iommu_domain_alloc(struct etnaviv_gpu *gpu)
+struct iommu_domain *etnaviv_iommuv1_domain_alloc(struct etnaviv_gpu *gpu)
 {
 	struct etnaviv_iommu_domain *etnaviv_domain;
 	int ret;
@@ -225,6 +232,7 @@ struct iommu_domain *etnaviv_iommu_domain_alloc(struct etnaviv_gpu *gpu)
 
 	etnaviv_domain->domain.type = __IOMMU_DOMAIN_PAGING;
 	etnaviv_domain->domain.ops = &etnaviv_iommu_ops.ops;
+//	etnaviv_domain->domain.pgsize_bitmap = SZ_4K;
 	etnaviv_domain->domain.geometry.aperture_start = GPU_MEM_START;
 	etnaviv_domain->domain.geometry.aperture_end = GPU_MEM_START + PT_ENTRIES * SZ_4K - 1;
diff --git a/etnaviv_iommu.h b/etnaviv_iommu.h
index 00350e3..8b51e7c 100644
--- a/etnaviv_iommu.h
+++ b/etnaviv_iommu.h
@@ -17,13 +17,12 @@
 #ifndef __ETNAVIV_IOMMU_H__
 #define __ETNAVIV_IOMMU_H__
 
-#include
 struct etnaviv_gpu;
 
-struct iommu_domain *etnaviv_iommu_domain_alloc(struct etnaviv_gpu *gpu);
-void etnaviv_iommu_domain_restore(struct etnaviv_gpu *gpu,
-	struct iommu_domain *domain);
-struct iommu_domain *etnaviv_iommu_v2_domain_alloc(struct etnaviv_gpu *gpu);
+struct iommu_domain *etnaviv_iommuv1_domain_alloc(struct etnaviv_gpu *gpu);
+void etnaviv_iommuv1_restore(struct etnaviv_gpu *gpu);
+
+struct iommu_domain *etnaviv_iommuv2_domain_alloc(struct etnaviv_gpu *gpu);
 void etnaviv_iommuv2_restore(struct etnaviv_gpu *gpu);
 
 #endif /* __ETNAVIV_IOMMU_H__ */
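etnaviv_iommu_get_cmdbuf_va() in etnaviv_mmu.c below spaces MMUv2 command buffers 64 KiB apart so a stale FE TLB prefetch past the end of one buffer can never land in a neighbouring one. A toy model of the placement rule; the bump allocator stands in for the real drm_mm search:

#include <assert.h>
#include <stdint.h>

#define SZ_64K 0x10000u

static uint32_t last_iova;	/* models mmu->last_iova, the search hint */

/* Place a buffer and leave a 64 KiB guard gap after it. */
static uint32_t place_cmdbuf(uint32_t size)
{
	uint32_t start = last_iova;
	last_iova = start + size + SZ_64K;
	return start;
}

int main(void)
{
	uint32_t a = place_cmdbuf(0x1000);
	uint32_t b = place_cmdbuf(0x1000);

	assert(b - (a + 0x1000) == SZ_64K);	/* 64 KiB guard between buffers */
	return 0;
}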
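The rewritten etnaviv_gpu_clk_enable() in etnaviv_gpu.c above follows the usual kernel unwind idiom: enable bus, core and shader in order, and on failure disable in exact reverse order. A runnable sketch of the pattern; clk_on()/clk_off() are stand-ins, not the kernel clk API:

#include <stdio.h>

static int clk_on(const char *name)   { printf("on  %s\n", name); return 0; }
static void clk_off(const char *name) { printf("off %s\n", name); }

static int gpu_clk_enable(void)
{
	int ret;

	if ((ret = clk_on("bus")))
		return ret;
	if ((ret = clk_on("core")))
		goto disable_bus;
	if ((ret = clk_on("shader")))
		goto disable_core;

	return 0;

disable_core:
	clk_off("core");	/* undo only what was enabled, in reverse */
disable_bus:
	clk_off("bus");
	return ret;
}

int main(void) { return gpu_clk_enable(); }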
diff --git a/etnaviv_mmu.c b/etnaviv_mmu.c
index 6743bc6..169ac96 100644
--- a/etnaviv_mmu.c
+++ b/etnaviv_mmu.c
@@ -14,9 +14,11 @@
  * this program. If not, see .
  */
 
+#include "common.xml.h"
 #include "etnaviv_drv.h"
 #include "etnaviv_gem.h"
 #include "etnaviv_gpu.h"
+#include "etnaviv_iommu.h"
 #include "etnaviv_mmu.h"
 
 static int etnaviv_fault_handler(struct iommu_domain *iommu, struct device *dev,
@@ -101,40 +103,21 @@ static void etnaviv_iommu_remove_mapping(struct etnaviv_iommu *mmu,
 	drm_mm_remove_node(&mapping->vram_node);
 }
 
-int etnaviv_iommu_map_gem(struct etnaviv_iommu *mmu,
-	struct etnaviv_gem_object *etnaviv_obj, u32 memory_base,
-	struct etnaviv_vram_mapping *mapping)
+static int etnaviv_iommu_find_iova(struct etnaviv_iommu *mmu,
+	struct drm_mm_node *node, size_t size)
 {
 	struct etnaviv_vram_mapping *free = NULL;
-	struct sg_table *sgt = etnaviv_obj->sgt;
-	struct drm_mm_node *node;
 	int ret;
 
-	lockdep_assert_held(&etnaviv_obj->lock);
-
-	mutex_lock(&mmu->lock);
-
-	/* v1 MMU can optimize single entry (contiguous) scatterlists */
-	if (sgt->nents == 1 && !(etnaviv_obj->flags & ETNA_BO_FORCE_MMU)) {
-		u32 iova;
-
-		iova = sg_dma_address(sgt->sgl) - memory_base;
-		if (iova < 0x80000000 - sg_dma_len(sgt->sgl)) {
-			mapping->iova = iova;
-			list_add_tail(&mapping->mmu_node, &mmu->mappings);
-			mutex_unlock(&mmu->lock);
-			return 0;
-		}
-	}
+	lockdep_assert_held(&mmu->lock);
 
-	node = &mapping->vram_node;
 	while (1) {
 		struct etnaviv_vram_mapping *m, *n;
 		struct list_head list;
 		bool found;
 
 		ret = drm_mm_insert_node_in_range(&mmu->mm, node,
-			etnaviv_obj->base.size, 0, mmu->last_iova, ~0UL,
+			size, 0, mmu->last_iova, ~0UL,
 			DRM_MM_SEARCH_DEFAULT);
 
 		if (ret != -ENOSPC)
@@ -151,7 +134,7 @@ int etnaviv_iommu_map_gem(struct etnaviv_iommu *mmu,
 		}
 
 		/* Try to retire some entries */
-		drm_mm_init_scan(&mmu->mm, etnaviv_obj->base.size, 0, 0);
+		drm_mm_init_scan(&mmu->mm, size, 0, 0);
 
 		found = 0;
 		INIT_LIST_HEAD(&list);
@@ -193,7 +176,7 @@ int etnaviv_iommu_map_gem(struct etnaviv_iommu *mmu,
 
 		/*
 		 * Unmap the blocks which need to be reaped from the MMU.
-		 * Clear the mmu pointer to prevent the get_iova finding
+		 * Clear the mmu pointer to prevent the mapping_get finding
 		 * this mapping.
		 */
 		list_for_each_entry_safe(m, n, &list, scan_node) {
@@ -212,6 +195,38 @@ int etnaviv_iommu_map_gem(struct etnaviv_iommu *mmu,
 		mmu->need_flush = true;
 	}
 
+	return ret;
+}
+
+int etnaviv_iommu_map_gem(struct etnaviv_iommu *mmu,
+	struct etnaviv_gem_object *etnaviv_obj, u32 memory_base,
+	struct etnaviv_vram_mapping *mapping)
+{
+	struct sg_table *sgt = etnaviv_obj->sgt;
+	struct drm_mm_node *node;
+	int ret;
+
+	lockdep_assert_held(&etnaviv_obj->lock);
+
+	mutex_lock(&mmu->lock);
+
+	/* v1 MMU can optimize single entry (contiguous) scatterlists */
+	if (mmu->version == ETNAVIV_IOMMU_V1 &&
+	    sgt->nents == 1 && !(etnaviv_obj->flags & ETNA_BO_FORCE_MMU)) {
+		u32 iova;
+
+		iova = sg_dma_address(sgt->sgl) - memory_base;
+		if (iova < 0x80000000 - sg_dma_len(sgt->sgl)) {
+			mapping->iova = iova;
+			list_add_tail(&mapping->mmu_node, &mmu->mappings);
+			mutex_unlock(&mmu->lock);
+			return 0;
+		}
+	}
+
+	node = &mapping->vram_node;
+
+	ret = etnaviv_iommu_find_iova(mmu, node, etnaviv_obj->base.size);
 	if (ret < 0) {
 		mutex_unlock(&mmu->lock);
 		return ret;
@@ -256,30 +271,103 @@ void etnaviv_iommu_destroy(struct etnaviv_iommu *mmu)
 	kfree(mmu);
 }
 
-struct etnaviv_iommu *etnaviv_iommu_new(struct etnaviv_gpu *gpu,
-	struct iommu_domain *domain, enum etnaviv_iommu_version version)
+struct etnaviv_iommu *etnaviv_iommu_new(struct etnaviv_gpu *gpu)
 {
+	enum etnaviv_iommu_version version;
 	struct etnaviv_iommu *mmu;
 
 	mmu = kzalloc(sizeof(*mmu), GFP_KERNEL);
 	if (!mmu)
 		return ERR_PTR(-ENOMEM);
 
-	mmu->domain = domain;
+	if (!(gpu->identity.minor_features1 & chipMinorFeatures1_MMU_VERSION)) {
+		mmu->domain = etnaviv_iommuv1_domain_alloc(gpu);
+		version = ETNAVIV_IOMMU_V1;
+	} else {
+		mmu->domain = etnaviv_iommuv2_domain_alloc(gpu);
+		version = ETNAVIV_IOMMU_V2;
+	}
+
+	if (!mmu->domain) {
+		dev_err(gpu->dev, "Failed to allocate GPU IOMMU domain\n");
+		kfree(mmu);
+		return ERR_PTR(-ENOMEM);
+	}
+
 	mmu->gpu = gpu;
 	mmu->version = version;
 	mutex_init(&mmu->lock);
 	INIT_LIST_HEAD(&mmu->mappings);
 
-	drm_mm_init(&mmu->mm, domain->geometry.aperture_start,
-		    domain->geometry.aperture_end -
-		      domain->geometry.aperture_start + 1);
+	drm_mm_init(&mmu->mm, mmu->domain->geometry.aperture_start,
+		    mmu->domain->geometry.aperture_end -
+		    mmu->domain->geometry.aperture_start + 1);
 
-	iommu_set_fault_handler(domain, etnaviv_fault_handler, gpu->dev);
+	iommu_set_fault_handler(mmu->domain, etnaviv_fault_handler, gpu->dev);
 
 	return mmu;
 }
 
+void etnaviv_iommu_restore(struct etnaviv_gpu *gpu)
+{
+	if (gpu->mmu->version == ETNAVIV_IOMMU_V1)
+		etnaviv_iommuv1_restore(gpu);
+	else
+		etnaviv_iommuv2_restore(gpu);
+}
+
+u32 etnaviv_iommu_get_cmdbuf_va(struct etnaviv_gpu *gpu,
+				struct etnaviv_cmdbuf *buf)
+{
+	struct etnaviv_iommu *mmu = gpu->mmu;
+
+	if (mmu->version == ETNAVIV_IOMMU_V1) {
+		return buf->paddr - gpu->memory_base;
+	} else {
+		int ret;
+
+		if (buf->vram_node.allocated)
+			return (u32)buf->vram_node.start;
+
+		mutex_lock(&mmu->lock);
+		ret = etnaviv_iommu_find_iova(mmu, &buf->vram_node,
+					      buf->size + SZ_64K);
+		if (ret < 0) {
+			mutex_unlock(&mmu->lock);
+			return 0;
+		}
+		ret = iommu_map(mmu->domain, buf->vram_node.start, buf->paddr,
+				buf->size, IOMMU_READ);
+		if (ret < 0) {
+			drm_mm_remove_node(&buf->vram_node);
+			mutex_unlock(&mmu->lock);
+			return 0;
+		}
+		/*
+		 * At least on GC3000 the FE MMU doesn't properly flush old TLB
+		 * entries. Make sure to space the command buffers out in a way
+		 * that the FE MMU prefetch won't load invalid entries.
		 */
+		mmu->last_iova = buf->vram_node.start + buf->size + SZ_64K;
+		gpu->mmu->need_flush = true;
+		mutex_unlock(&mmu->lock);
+
+		return (u32)buf->vram_node.start;
+	}
+}
+
+void etnaviv_iommu_put_cmdbuf_va(struct etnaviv_gpu *gpu,
+				 struct etnaviv_cmdbuf *buf)
+{
+	struct etnaviv_iommu *mmu = gpu->mmu;
+
+	if (mmu->version == ETNAVIV_IOMMU_V2 && buf->vram_node.allocated) {
+		mutex_lock(&mmu->lock);
+		iommu_unmap(mmu->domain, buf->vram_node.start, buf->size);
+		drm_mm_remove_node(&buf->vram_node);
+		mutex_unlock(&mmu->lock);
+	}
+}
+
 size_t etnaviv_iommu_dump_size(struct etnaviv_iommu *iommu)
 {
 	struct etnaviv_iommu_ops *ops;
diff --git a/etnaviv_mmu.h b/etnaviv_mmu.h
index fff215a..e787e49 100644
--- a/etnaviv_mmu.h
+++ b/etnaviv_mmu.h
@@ -62,10 +62,15 @@ void etnaviv_iommu_unmap_gem(struct etnaviv_iommu *mmu,
 	struct etnaviv_vram_mapping *mapping);
 void etnaviv_iommu_destroy(struct etnaviv_iommu *iommu);
 
+u32 etnaviv_iommu_get_cmdbuf_va(struct etnaviv_gpu *gpu,
+				struct etnaviv_cmdbuf *buf);
+void etnaviv_iommu_put_cmdbuf_va(struct etnaviv_gpu *gpu,
+				 struct etnaviv_cmdbuf *buf);
+
 size_t etnaviv_iommu_dump_size(struct etnaviv_iommu *iommu);
 void etnaviv_iommu_dump(struct etnaviv_iommu *iommu, void *buf);
 
-struct etnaviv_iommu *etnaviv_iommu_new(struct etnaviv_gpu *gpu,
-	struct iommu_domain *domain, enum etnaviv_iommu_version version);
+struct etnaviv_iommu *etnaviv_iommu_new(struct etnaviv_gpu *gpu);
+void etnaviv_iommu_restore(struct etnaviv_gpu *gpu);
 
 #endif /* __ETNAVIV_MMU_H__ */
diff --git a/state_3d.xml.h b/state_3d.xml.h
new file mode 100644
index 0000000..d7146fd
--- /dev/null
+++ b/state_3d.xml.h
@@ -0,0 +1,9 @@
+#ifndef STATE_3D_XML
+#define STATE_3D_XML
+
+/* This is a cut-down version of the state_3d.xml.h file */
+
+#define VIVS_TS_FLUSH_CACHE				0x00001650
+#define VIVS_TS_FLUSH_CACHE_FLUSH			0x00000001
+
+#endif /* STATE_3D_XML */