# Body of the code

This code is intended to be run on Google Colab, because I am unable to run Python efficiently locally. <br />
No more than the free tier is required, as the T4 GPU offers plenty of computational power for the code below. In fact, the free tier runs out of memory long before any computation takes on the order of 10 s. <br />
The *!uv pip install* part is needed because newer versions of CUDA (as of Sep 2025) do not run properly on Google Colab. Thanks to Dr. Ahmed Al-Refaie for the code snippet that allows the implementation below to run.

In [None]:
import numpy as np
import cupy as cp
import matplotlib.pyplot as plt

!uv pip install -q --system numba-cuda==0.4.0
from numba import config
config.CUDA_ENABLE_PYNVJITLINK = 1

import numba
from numba import cuda
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_normal_float32

## Parameters

In [None]:
# Simulation size and time step; these are forwarded to launch_EMOL_M32 below.
# The launcher allocates an (M, N) float32 array, i.e. 4*M*N bytes.
M = 100          # trajectories simulated in parallel

N = 5 * 10**6    # points stored per trajectory (N-1 integration steps)

dt = 0.002       # integration step size

## Potential

In [None]:
# Device function (runs on GPU only) - this will inform the CUDA simulations
@cuda.jit(device=True)
def device_Vprime(x, a):
  """Return V'(x), the derivative of the piecewise potential, at position x.

  Branches (a sets where the outer wells start):
    x >  a : 3*(x-(a+2))*(x-a)**2 + (x-a)**3, the derivative of (x-(a+2))*(x-a)**3
    x < -a : 3*(x+(a+2))*(x+a)**2 + (x+a)**3, the derivative of (x+(a+2))*(x+a)**3
    else   : 2*x/(5*a**2),                    the derivative of x**2/(5*a**2)

  The central branch divides by a*a, so a must be non-zero whenever |x| <= |a|
  can be reached.

  NOTE(review): the branches do not agree at |x| == a (the outer branches give
  0 there, the central one gives +/- 2/(5*a)), so V' has a jump. This is kept
  as in the original definition; confirm it is intended.
  """
  # Python float literals are typed as float64 by Numba, and `**` with an
  # integer exponent may promote as well, which would silently drag the whole
  # trajectory update into double precision. Using float32 constants and
  # explicit products keeps the arithmetic in the precision of x and a.
  two = np.float32(2.0)
  three = np.float32(3.0)
  five = np.float32(5.0)

  if x > a:
    d = x - a
    return three*(x - (a + two))*d*d + d*d*d
  elif x < -a:
    s = x + a
    return three*(x + (a + two))*s*s + s*s*s
  else:
    return two/(five*a*a)*x


#@cuda.jit(device=True)
#def device_Vprime(x, a):
#  return x*x*x - 3*x

## RNG

In [None]:
# Default seed for the device-side xoroshiro128+ generator states; it is the
# default value of `rngseed` in launch_EMOL_M32. Value: largest signed 64-bit int.
host_seed = (1 << 63) - 1

## Simulation Functions

Initiate M parallel processes, each simulating N steps of the Overdamped Langevin equation, from the given $V'(x)$ above.

Check that at no point do we convert to double, or else we lose a lot of the optimisation... In any case, the current bottleneck is memory allocation due to the size of the arrays, not speed... Still!

In [None]:
def launch_EMOL_M32(a=2.0, N=25_000, M=200, dt=np.single(1e-3), eps=0.5, blockdim=256, ic=0.0, rngseed=host_seed):
  """Simulate M overdamped-Langevin trajectories on the GPU in float32.

  Each trajectory is advanced with kernel_update_traj32_devrng, one GPU thread
  per trajectory, using the drift -V'(x) from device_Vprime.

  Parameters
    a        : shape parameter forwarded to device_Vprime
    N        : number of points stored per trajectory (N-1 integration steps)
    M        : number of trajectories
    dt       : integration step size
    eps      : noise strength; each step adds sqrt(2*eps*dt) * (standard normal)
    blockdim : CUDA threads per block
    ic       : initial condition written to column 0 of every trajectory
    rngseed  : seed for the per-thread xoroshiro128+ states

  Returns
    A NumPy float32 array of shape (M, N); row m is trajectory m.

  Raises
    ValueError if M, N or blockdim is smaller than 1.
  """
  if M < 1 or N < 1:
    raise ValueError("M and N must both be at least 1")
  if blockdim < 1:
    raise ValueError("blockdim must be at least 1")

  # Enough blocks to give every trajectory its own thread (integer ceil-divide).
  griddim = (M + blockdim - 1) // blockdim

  host_X = np.empty((M, N), dtype=np.single)
  host_X[:, 0] = ic
  dev_X = cuda.to_device(host_X)

  rng_states = create_xoroshiro128p_states(M, seed=rngseed) # necessary for device rng

  # The noise amplitude is computed from the caller's dt before any rounding;
  # every scalar handed to the kernel is then cast to float32 so the device
  # arithmetic is not promoted to double by a Python-float argument.
  sqrt2epsdt = np.single(np.sqrt(2*eps*dt))
  a = np.single(a)
  dt = np.single(dt)

  kernel_update_traj32_devrng[griddim, blockdim](dev_X, a, N, M, dt, sqrt2epsdt, rng_states)
  cuda.synchronize()

  # Copy back into the existing host buffer instead of letting copy_to_host
  # allocate a second (M, N) array; this halves the peak host memory.
  dev_X.copy_to_host(host_X)
  return host_X


@cuda.jit
def kernel_update_traj32_devrng(dev_X, a, N, M, dt, sqrt2epsdt, rng_states):
  """CUDA kernel (launched from the host): one thread integrates one trajectory.

  Thread `tid` starts from dev_X[tid, 0], draws one normal variate per step
  from its own xoroshiro128+ state, applies kernel_1stepEMOL and stores the
  result in the next column of row `tid`, filling columns 1 .. N-1.
  Threads whose index is M or larger do nothing.
  """
  tid = cuda.grid(1)
  if tid < M:
    pos = dev_X[tid, 0]
    for col in range(1, N):
      xi = xoroshiro128p_normal_float32(rng_states, tid)
      pos = kernel_1stepEMOL(pos, a, sqrt2epsdt, dt, xi)
      dev_X[tid, col] = pos


@cuda.jit(device=True)
def kernel_1stepEMOL(x, a, sqrt2epsdt, dt, noise):
  """Device function: one Euler-Maruyama step of the overdamped Langevin equation.

  Returns x + (-V'(x)*dt + sqrt2epsdt*noise), with V' given by device_Vprime.
  """
  return x + (-device_Vprime(x, a) * dt + sqrt2epsdt * noise)

## Running Simulations

In [None]:
# Run the simulation with the parameters defined above; X has shape (M, N),
# one row per trajectory.
X = launch_EMOL_M32(
  a=2.0,
  N=N,
  M=M,
  dt=dt,
  blockdim=128,
)

## Plots

In [None]:
# Normalised histogram of every stored position, pooled over all trajectories
# and all time steps.
plt.figure(figsize=(12, 5))

# ravel() returns a view of the contiguous (M, N) array, whereas flatten()
# always makes a full copy; with arrays this large the copy alone can exhaust
# memory. The histogram itself is unchanged.
plt.hist(X.ravel(), bins=800, density=True, alpha=0.7, align="mid")
plt.xlabel("x")
plt.ylabel("Density")
plt.title("Histogram of X values")