In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
spark = SparkSession.builder.getOrCreate()

In [78]:
# Two-row sample frame with integer columns 'id', 'value' and 'amount'; the renaming examples below run against it.
df = spark.createDataFrame([{'id': 1, 'value': 1,'amount':2}, {'id': 2, 'value': 2,'amount':3}])

In [145]:
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from typing import List

def addColumnPrefix(df:DataFrame,
                    prefix:str='prefix',
                    colsList:List=[])->DataFrame:
    """
    Adds a prefix to Column names
    Args:
        df       : pyspark DataFrame
        prefix   : text placed in front of each selected column name, joined with '_'
        colsList : names of the columns to rename; when empty, every column is renamed
    Return:
        pyspark Dataframe with the same columns in the same order; names found in
        colsList become '<prefix>_<name>', all other names are left unchanged
    Example:
    >>>df = spark.createDataFrame([{'id': 1, 'value': 1,'amount':2}, {'id': 2, 'value': 2,'amount':3}])
    >>>addColumnPrefix(df,prefix='test',colsList=['id']).show()
        +------+-------+-----+
        |amount|test_id|value|
        +------+-------+-----+
        |     2|      1|    1|
        |     3|      2|    2|
        +------+-------+-----+

    """
    # The shared [] default is safe to keep: colsList is only read, never mutated.
    selected = colsList if colsList else df.columns
    new_names = ['{0}_{1}'.format(prefix, name) if name in selected else name
                 for name in df.columns]
    # toDF renames by position, so existing names are never re-parsed as column
    # expressions; names containing dots, or duplicated names, rename cleanly.
    return df.toDF(*new_names)

def addColumnSuffix(df:DataFrame,
                    suffix:str='suffix',colsList:List=[])->DataFrame:
    """
    Adds a suffix to Column names
    Args:
        df       : pyspark DataFrame
        suffix   : text placed after each selected column name, joined with '_'
        colsList : names of the columns to rename; when empty, every column is renamed
    Return:
        pyspark Dataframe with the same columns in the same order; names found in
        colsList become '<name>_<suffix>', all other names are left unchanged
    Example:
    >>>df = spark.createDataFrame([{'id': 1, 'value': 1,'amount':2}, {'id': 2, 'value': 2,'amount':3}])
    >>>addColumnSuffix(df,suffix='test',colsList=['id']).show()
        +------+-------+-----+
        |amount|id_test|value|
        +------+-------+-----+
        |     2|      1|    1|
        |     3|      2|    2|
        +------+-------+-----+

    """
    # The shared [] default is safe to keep: colsList is only read, never mutated.
    selected = colsList if colsList else df.columns
    new_names = ['{0}_{1}'.format(name, suffix) if name in selected else name
                 for name in df.columns]
    # toDF renames by position, so existing names are never re-parsed as column
    # expressions; names containing dots, or duplicated names, rename cleanly.
    return df.toDF(*new_names)

In [144]:
addColumnPrefix(df,prefix='test',colsList=['id']).show()

+------+-------+-----+
|amount|test_id|value|
+------+-------+-----+
|     2|      1|    1|
|     3|      2|    2|
+------+-------+-----+



In [103]:
addColumnPrefix(df,prefix='test').show()

+-----------+-------+----------+
|test_amount|test_id|test_value|
+-----------+-------+----------+
|          2|      1|         1|
|          3|      2|         2|
+-----------+-------+----------+



In [80]:
# NOTE(review): absolute, machine-specific Windows path — this cell only runs where that exact file exists.
data = spark.read.parquet(r'C:\work\python-packages\mimic\mimic\examples\orders.parquet')

In [83]:
addColumnPrefix(data,prefix='test').show()

['orderID', 'OrderDate', 'OrderValue', 'OrderType', 'OrderCategory', 'city']
+------------+--------------------+---------------+--------------+------------------+---------+
|test_orderID|      test_OrderDate|test_OrderValue|test_OrderType|test_OrderCategory|test_city|
+------------+--------------------+---------------+--------------+------------------+---------+
| ORDLBWA5119|2004-10-02 09:28:...|        8135.52|           COD|              Home|   durham|
| ORDJQOW0469|2013-08-30 12:04:...|        6013.05|           COD|              Home|  toronto|
| ORDADHV6229|2017-09-13 20:33:...|        5115.79|           COD|           Furnish|  toronto|
| ORDOPWV1894|2004-03-03 15:36:...|        5361.52|           COD|       Electronics|  toronto|
| ORDENUU3212|2004-05-26 21:57:...|        2505.97|           COD|           Furnish|   durham|
| ORDKWLI8989|2013-05-30 07:26:...|        6577.39|           COD|              Home|  halifax|
| ORDNTEZ1527|2013-02-18 01:17:...|         5928.3|        

In [88]:
addColumnPrefix(df,prefix='test',colsList=['id']).show()

['amount', 'id', 'value']
+-----------+---+----------+
|test_amount| id|test_value|
+-----------+---+----------+
|          2|  1|         1|
|          3|  2|         2|
+-----------+---+----------+



In [89]:
df.columns

['amount', 'id', 'value']

In [90]:
colsList = ['id']

In [93]:
cols = [x for x in df.columns if x not in colsList]

In [94]:
cols

['amount', 'value']

In [104]:
import re

def removeColumnSpaces(df:DataFrame)->DataFrame:
    """
    Removes all whitespace from Column names
    Args:
        df       : pyspark DataFrame
    Return:
        pyspark Dataframe with the same columns in the same order, every run of
        whitespace stripped out of each column name
    Note:
        Names that differ only by whitespace (e.g. 'a b' and 'ab') end up identical.
    Example:
    >>>df = spark.createDataFrame([{'id': 1, 'goods value': 1,'total amount':2}])
    >>>removeColumnSpaces(df).show()
        +----------+---+-----------+
        |goodsvalue| id|totalamount|
        +----------+---+-----------+
        |         1|  1|          2|
        +----------+---+-----------+
    
    """
    new_names = [re.sub(r"\s+", "", name, flags=re.UNICODE) for name in df.columns]
    # toDF renames by position, so existing names are never re-parsed as column
    # expressions; names containing dots, or duplicated names, rename cleanly.
    return df.toDF(*new_names)

In [108]:
cd = spark.createDataFrame([{'id': 1, 'goods value': 1,'total amount':2}])

In [109]:
cd.show()

+-----------+---+------------+
|goods value| id|total amount|
+-----------+---+------------+
|          1|  1|           2|
+-----------+---+------------+



In [110]:
removeColumnSpaces(cd).show()

+----------+---+-----------+
|goodsvalue| id|totalamount|
+----------+---+-----------+
|         1|  1|          2|
+----------+---+-----------+



In [127]:
mapping = {'amount':'cash','id':'uniqueID','value':'transaction'}

In [122]:
# New column names held in the rename mapping (the dict is named `mapping`, not `columnsMapping`).
list(mapping.values())

['cash', 'uniqueID', 'transaction']

In [115]:
# Replacement name for the 'amount' column (the dict is named `mapping`, not `columnsMapping`).
mapping['amount']

'cash'

In [112]:
df.columns

['amount', 'id', 'value']

In [128]:
def returnKeys(s):
    """Look up the replacement name for column `s` in the notebook-level `mapping`."""
    return mapping[s]

def ifKeyExists(s):
    """Tell whether column `s` has an entry in the notebook-level `mapping`."""
    return s in mapping

# One Column expression per column of df: aliased to its mapped name when an
# entry exists, passed through unchanged otherwise.
cols = [
    F.col(name).alias(returnKeys(name)) if ifKeyExists(name) else F.col(name)
    for name in df.columns
]

In [129]:
cols

[Column<b'amount AS `cash`'>,
 Column<b'id AS `uniqueID`'>,
 Column<b'value AS `transaction`'>]

In [138]:
from typing import Dict,Callable

def withSomeColumnsRenamed(df:DataFrame,
                              mapping:Dict)->DataFrame:
    """
    Renames the Columns listed in a mapping
    Args:
        df       : pyspark DataFrame
        mapping  : dict of {existing column name: new column name}
    Return:
        pyspark Dataframe with the same columns in the same order; columns whose
        name is a key of mapping take the mapped name, all others are unchanged
    Example:
    >>>df = spark.createDataFrame([{'id': 1, 'value': 1,'amount':2}, {'id': 2, 'value': 2,'amount':3}])
    >>>withSomeColumnsRenamed(df,{'amount':'cash','id':'uniqueID','value':'transaction'}).show()
        +----+--------+-----------+
        |cash|uniqueID|transaction|
        +----+--------+-----------+
        |   2|       1|          1|
        |   3|       2|          2|
        +----+--------+-----------+

    """
    new_names = [mapping[name] if name in mapping else name for name in df.columns]
    # toDF renames by position, so existing names are never re-parsed as column
    # expressions; names containing dots, or duplicated names, rename cleanly.
    return df.toDF(*new_names)

def withColumnsRenamedFunc(df:DataFrame,func:Callable)->DataFrame:
    """
    Renames every Column by passing its name through a function
    Args:
        df       : pyspark DataFrame
        func     : callable taking a column name and returning the new name;
                   return the name unchanged to leave a column as it is
    Return:
        pyspark Dataframe with the same columns in the same order, each named
        func(<original name>)
    Example:
    >>>df = spark.createDataFrame([{'id': 1, 'value': 1,'amount':2}, {'id': 2, 'value': 2,'amount':3}])
    >>>withColumnsRenamedFunc(df,lambda s: 'cash' if 'amount' in s else s).show()
        +----+---+-----+
        |cash| id|value|
        +----+---+-----+
        |   2|  1|    1|
        |   3|  2|    2|
        +----+---+-----+

    """
    new_names = [func(name) for name in df.columns]
    # toDF renames by position, so existing names are never re-parsed as column
    # expressions; names containing dots, or duplicated names, rename cleanly.
    return df.toDF(*new_names)

In [136]:
withSomeColumnsRenamed(df,mapping).show()

+----+--------+-----------+
|cash|uniqueID|transaction|
+----+--------+-----------+
|   2|       1|          1|
|   3|       2|          2|
+----+--------+-----------+



In [146]:
def renameF(s):
    """Map any name containing 'amount' to 'cash'; every other name is returned as given."""
    return 'cash' if 'amount' in s else s

withColumnsRenamedFunc(df,renameF).show()

+----+---+-----+
|cash| id|value|
+----+---+-----+
|   2|  1|    1|
|   3|  2|    2|
+----+---+-----+



In [134]:
from typing import function

ImportError: cannot import name 'function' from 'typing' (C:\programming\miniconda\lib\typing.py)