In [1]:
import pathlib
# Remember the current working directory; when the notebook is launched from
# a directory named 'drafts', make the parent directory importable.
cwd = pathlib.Path.cwd()
if cwd.name == 'drafts':
    import sys
    sys.path.append(cwd.parent.as_posix())

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.feature import PCA, VectorAssembler

from pyspark.sql.functions import col
from pyspark.ml.functions import vector_to_array

# Location of the data directory: one level up when running from 'drafts',
# otherwise directly under the working directory.
data_folder = pathlib.Path('../data') if cwd.name == 'drafts' else pathlib.Path('data')

In [2]:
# Spark configuration for this notebook: only the driver memory is set.
# A 'spark.executor.memory' = '12g' entry is intentionally left disabled.
config = SparkConf().set('spark.driver.memory', '12g')

In [4]:
# Build the Spark session for this notebook using the settings held in `config`.
spark = (
    SparkSession.builder
    .appName('PySpark_PCA_compressor')
    .config(conf=config)
    .getOrCreate()
)
# Last expression of the cell: display the session object.
spark

In [4]:
# load
# Read the tab-separated feature table (header row, inferred schema) and drop
# the 'Unnamed: 0' column together with the feature columns in drop_feats.
drop_feats = ['75', '81', '85', '139', '203']
features_path = (data_folder / 'features.csv').as_posix()
feats = (
    spark.read
    .csv(features_path, sep='\t', header=True, inferSchema=True)
    .drop('Unnamed: 0', *drop_feats)
)
# Every column from the fourth one onward is used as a PCA input.
# NOTE(review): only 'id' and 'buy_time' are treated as non-feature columns
# elsewhere in this notebook, so an offset of 3 may skip one feature column —
# confirm against the CSV header.
columns = feats.columns[3:]

                                                                                

In [32]:
# apply PCA
# Pack the selected input columns into a single vector column named 'features'.
assembler = VectorAssembler(outputCol='features', inputCols=columns)
assembled = assembler.transform(feats)

# Fit a 3-component PCA on the assembled vectors. `pca` holds the fitted model;
# applying it adds the projected vector in the 'compressed' column.
# NOTE(review): the inputs are not scaled before PCA, so large-magnitude
# columns may dominate the components — confirm this is intended.
pca_estimator = PCA(inputCol='features', outputCol='compressed', k=3)
pca = pca_estimator.fit(assembled)
compressed = pca.transform(assembled)

                                                                                

In [25]:
# collect features
# Unpack the PCA output vector into one scalar column per component (named
# '0', '1', ...), keeping the 'id' and 'buy_time' columns alongside them.
# The number of components is read from the fitted model rather than being
# hard-coded, so this cell stays in sync with the `k` used when fitting PCA.
n_components = pca.getK()
features = (
    compressed
    .withColumn('f', vector_to_array('compressed'))
    .select(['id', 'buy_time'] + [col('f')[i].alias(f'{i}') for i in range(n_components)])
)

In [27]:
# save
# Bring the result into a single partition, then write it as comma-separated
# CSV with a header row, overwriting any existing output at the target path.
# NOTE(review): the target path is relative to the working directory rather
# than `data_folder` — confirm this is the intended location.
(
    features
    .repartition(1)
    .write
    .mode('overwrite')
    .csv('compressed_features.csv', header=True, sep=',')
)

                                                                                

In [28]:
# Preview the first 20 rows; long cell values are truncated in the printout.
features.show(n=20, truncate=True)

+-------+----------+--------------------+--------------------+--------------------+
|     id|  buy_time|                   0|                   1|                   2|
+-------+----------+--------------------+--------------------+--------------------+
|2013026|1531688400|-8.444253424006808E8|-6.478667382867271E8|1.3497696940835571E9|
|2014722|1539550800|-1.21539417372582...| 7.114047412611544E8| -1.89668587513758E8|
|2015199|1545598800|-1.236285956670907E9| 7.607866375670496E8|-1.92144971960505...|
|2021765|1534107600|-7.928709258349836E8|-6.997295869577612E8|-1.25276320669852...|
|2027465|1533502800|  6.77941325548863E8|-2.63496346347243...|-8.669098698450881E7|
|2028410|1534107600|   6.7794132555685E8|-2.63496346386207...| -8.66909870059191E7|
|2030773|1544994000|-8.029734124193027E8|-7.027259167663276E8|-1.25541348890979...|
|2032337|1537736400|-7.963488313823535E8|-7.007611096505101E8|-1.25367559456098...|
|2033486|1545598800| 6.779413255575365E8|-2.63496346374613...|-8.66909869963

In [None]:
# Note to self: see https://habr.com/ru/company/otus/blog/579008/