1+ /* **************************************************************************************************
2+  * Copyright (C) 2025 Intel Corporation, All rights reserved. 
3+  * SPDX-License-Identifier: BSD-3-Clause 
4+  * 
5+  * Redistribution and use in source and binary forms, with or without 
6+  * modification, are permitted provided that the following conditions are met: 
7+  * 
8+  * 1. Redistributions of source code must retain the above copyright notice, this 
9+  * list of conditions and the following disclaimer. 
10+  * 
11+  * 2. Redistributions in binary form must reproduce the above copyright notice, 
12+  * this list of conditions and the following disclaimer in the documentation 
13+  * and/or other materials provided with the distribution. 
14+  * 
15+  * 3. Neither the name of the copyright holder nor the names of its 
16+  * contributors may be used to endorse or promote products derived from 
17+  * this software without specific prior written permission. 
18+  * 
19+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
20+  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
21+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
22+  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 
23+  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
24+  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 
25+  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
26+  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 
27+  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
28+  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29+  * 
30+  **************************************************************************************************/  
31+ 
32+ #pragma  once
33+ 
34+ namespace  cute  {
35+ 
36+ /*  Flat copies */ 
37+ template  <class  SrcEngine , class  SrcLayout ,
38+           class  DstEngine , class  DstLayout >
39+ CUTE_HOST_DEVICE
40+ void 
41+ copy_block_r2s (Tensor<SrcEngine, SrcLayout> const & src,
42+                Tensor<DstEngine, DstLayout>      & dst)
43+ {
44+   static_assert (is_rmem_v<SrcEngine> && is_smem_v<DstEngine>, " Expected rmem->smem copy"  );
45+ 
46+   auto  atom_r2s = Copy_Atom<XE_1D_STSM<float >, float >{};    //  TODO: larger block messages
47+ 
48+   auto  atom_shape = make_shape (_1{}, size (src));
49+   auto  src_v = src.compose (make_layout (atom_shape));
50+   auto  dst_v = dst.compose (make_layout (atom_shape, Stride<_0, _16>{}));
51+ 
52+   copy (atom_r2s, src_v, dst_v);
53+ }
54+ 
55+ template  <class  SrcEngine , class  SrcLayout ,
56+           class  DstEngine , class  DstLayout >
57+ CUTE_HOST_DEVICE
58+ void 
59+ copy_block_s2r (Tensor<SrcEngine, SrcLayout> const & src,
60+                Tensor<DstEngine, DstLayout>      & dst)
61+ {
62+   static_assert (is_smem_v<SrcEngine> && is_rmem_v<DstEngine>, " Expected smem->rmem copy"  );
63+ 
64+   auto  atom_s2r = Copy_Atom<XE_1D_LDSM<float >, float >{};
65+ 
66+   auto  atom_shape = make_shape (_1{}, size (dst));
67+   auto  src_v = src.compose (make_layout (atom_shape, Stride<_0, _16>{}));
68+   auto  dst_v = dst.compose (make_layout (atom_shape));
69+ 
70+   copy (atom_s2r, src_v, dst_v);
71+ }
72+ 
73+ /*  Coordinate-aware copies */ 
/// Coordinate-aware rmem->smem copy: stores a subgroup-distributed register
/// fragment to shared memory, remapping elements so that shared memory is
/// addressed according to the destination coordinate layout `dst_c`.
///
/// @param src    Subgroup-distributed source tensor in registers.
/// @param dst    Destination tensor in shared local memory.
/// @param dst_c  Coordinate layout describing the logical coordinates of dst.
template <class SrcEngine, class SrcLayout, class SrcCoordLayout,
          class DstEngine, class DstLayout, class DstCoordLayout>
CUTE_HOST_DEVICE
void
copy_block_r2s(SubgroupTensor<SrcEngine, SrcLayout, SrcCoordLayout> const& src,
               Tensor<DstEngine, DstLayout>                              & dst,
               DstCoordLayout                                       const& dst_c)
{
  using _SG = intel::_SGSize;

  static_assert(is_rmem_v<SrcEngine> && is_smem_v<DstEngine>, "Expected rmem->smem copy");
  // The copy atom below operates on 32-bit lanes only.
  static_assert(sizeof_bits_v<typename SrcEngine::value_type> == 32, "Only 32-bit data supported");

  auto atom_r2s = Copy_Atom<XE_1D_STSM<float>, float>{};   // TODO: larger block messages

  // 1D view of the per-work-item fragment: (1, fragment_size).
  auto atom_shape = make_shape(_1{}, size(SrcLayout{}));

  // Logical coordinates covered by one work-item's fragment slots
  // (stride _SG steps one subgroup-width per slot; the name suggests this is
  // work-item 0's view, with other work-items offset implicitly — TODO confirm).
  auto src_c_wi0 = composition(project_strides(SrcCoordLayout{}), make_layout(atom_shape, Stride<_0, _SG>{}));
  // Map fragment slots -> dst offsets: invert dst's coord layout, then chain
  // with src's coords, yielding where each fragment element lands in dst.
  auto rlayout = composition(right_inverse(project_strides(dst_c)), src_c_wi0);

  auto src_v = src.compose(make_layout(atom_shape));
  auto dst_v = dst.compose(rlayout);

  copy(atom_r2s, src_v, dst_v);
}
99+ 
/// Coordinate-aware smem->rmem copy: loads from shared memory into a
/// subgroup-distributed register fragment, remapping elements so that shared
/// memory is read according to the source coordinate layout `src_c`.
///
/// @param src    Source tensor in shared local memory.
/// @param src_c  Coordinate layout describing the logical coordinates of src.
/// @param dst    Subgroup-distributed destination tensor in registers.
template <class SrcEngine, class SrcLayout, class SrcCoordLayout,
          class DstEngine, class DstLayout, class DstCoordLayout>
CUTE_HOST_DEVICE
void
copy_block_s2r(Tensor<SrcEngine, SrcLayout>                         const& src,
               SrcCoordLayout                                       const& src_c,
               SubgroupTensor<DstEngine, DstLayout, DstCoordLayout>      & dst)
{
  using _SG = intel::_SGSize;

  static_assert(is_smem_v<SrcEngine> && is_rmem_v<DstEngine>, "Expected smem->rmem copy");
  // The copy atom below operates on 32-bit lanes only.
  static_assert(sizeof_bits_v<typename SrcEngine::value_type> == 32, "Only 32-bit data supported");

  auto atom_s2r = Copy_Atom<XE_1D_LDSM<float>, float>{};

  // 1D view of the per-work-item fragment: (1, fragment_size).
  auto atom_shape = make_shape(_1{}, size(DstLayout{}));

  // Logical coordinates covered by one work-item's fragment slots
  // (stride _SG steps one subgroup-width per slot; mirrors the r2s overload —
  // presumably work-item 0's view, TODO confirm).
  auto dst_c_wi0 = composition(project_strides(DstCoordLayout{}), make_layout(atom_shape, Stride<_0, _SG>{}));
  // Map fragment slots -> src offsets: invert src's coord layout, then chain
  // with dst's coords, yielding where each fragment element is read from.
  auto rlayout = composition(right_inverse(project_strides(src_c)), dst_c_wi0);

  auto src_v = src.compose(rlayout);
  auto dst_v = dst.compose(make_layout(atom_shape));

  copy(atom_s2r, src_v, dst_v);
}
125+ 
126+ /*  Variants accepting rvalue dst */ 
/// Rvalue-destination convenience overload: forwards to the lvalue overload.
/// (The named rvalue reference `dst` binds as an lvalue inside this function.)
template <class SrcEngine, class SrcLayout,
          class DstEngine, class DstLayout>
CUTE_HOST_DEVICE
void
copy_block_r2s(Tensor<SrcEngine, SrcLayout> const& src,
               Tensor<DstEngine, DstLayout>     && dst)
{
  return copy_block_r2s(src, dst);
}
136+ 
/// Rvalue-destination convenience overload: forwards to the lvalue overload.
/// (The named rvalue reference `dst` binds as an lvalue inside this function.)
template <class SrcEngine, class SrcLayout,
          class DstEngine, class DstLayout>
CUTE_HOST_DEVICE
void
copy_block_s2r(Tensor<SrcEngine, SrcLayout> const& src,
               Tensor<DstEngine, DstLayout>     && dst)
{
  return copy_block_s2r(src, dst);
}
146+ 
/// Rvalue-destination convenience overload of the coordinate-aware r2s copy:
/// forwards all three arguments to the lvalue overload.
template <class SrcEngine, class SrcLayout, class SrcCoordLayout,
          class DstEngine, class DstLayout, class DstCoordLayout>
CUTE_HOST_DEVICE
void
copy_block_r2s(SubgroupTensor<SrcEngine, SrcLayout, SrcCoordLayout> const& src,
               Tensor<DstEngine, DstLayout>                             && dst,
               DstCoordLayout                                       const& dst_c)
{
  return copy_block_r2s(src, dst, dst_c);
}
157+ 
158+ template  <class  SrcEngine , class  SrcLayout , class  SrcCoordLayout ,
159+           class  DstEngine , class  DstLayout , class  DstCoordLayout >
160+ CUTE_HOST_DEVICE
161+ void 
162+ copy_block_s2r (Tensor<SrcEngine, SrcLayout>                         const & src,
163+                SrcCoordLayout                                       const & src_c,
164+                SubgroupTensor<DstEngine, DstLayout, DstCoordLayout>     && dst)
165+ {
166+   return  copy_block_s2r (src, dst);
167+ }
168+ 
169+ } /*  namespace cute */ 
0 commit comments