Ноутбук по проекту chicago_spark.  
Агрегация данных по гео-ключам (районы, округа) и временным промежуткам

## Импорты

In [1]:
import os
import sys
import warnings

In [2]:
import pandas as pd
from scipy import stats
from scipy.signal import welch
from sklearn.neighbors import NearestNeighbors
import numpy as np

In [3]:
from importlib import reload
import time
from tqdm import tqdm
from functools import reduce
from itertools import islice
from collections import defaultdict

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
from IPython.display import display, HTML

In [5]:
from enviserv.dictan import DictAnalyzer # анализ словарей
import pandserv as pds # форматирование небольших пандас ДФ

In [6]:
from sparkserv import SparkApp, Cols
# в SparkApp упакованы функции создания спарк приложения 
# с определением IP мастер-ноды и с подключением к кластеру

# Col - класс для формирования коротких псевдонимов имен столбцов
# при этом исходные имена полей не меняются

In [57]:
from pyspark.sql import DataFrame
import pyspark.sql.functions as f
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, LongType
from pyspark import StorageLevel
from pyspark.sql.window import Window
from pyspark.sql import Row
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [8]:
# гео библиотеки
import geopandas as gpd
from shapely import wkt
from shapely.geometry import Point

## Создание сессии, загрузка данных

In [9]:
spark_app = SparkApp(my_logger_create_level = 'INFO')

INFO:spark_app:spark_master_ip: 172.18.0.2
INFO:spark_app:pyspark version: 3.4.1
INFO:spark_app:starting building spark app object: pyspark-taxi-forecasting
INFO:spark_app:Spark app object built as: <pyspark.sql.session.SparkSession object at 0x7f7e92efd710>
INFO:spark_app:Spark object can be accessed as the SparkApp_object.spark property


In [10]:
spark_master_ip = spark_app.get_spark_master_ip()
# print(spark_master_ip)

INFO:spark_app:spark_master_ip: 172.18.0.2


In [11]:
# spark = spark_app.build_spark_app(spark_master_ip=spark_master_ip)
# spark = spark_app.spark

Для корректного завершения спарк-сессии (например, для переключения между ноутбуками) следует останавливать сессию полностью. Для этого использую метод .stop_spark_app() класса SparkApp

In [12]:
# spark_app.stop_spark_app()

In [13]:
print(spark_app.spark)

<pyspark.sql.session.SparkSession object at 0x7f7e92efd710>


In [14]:
spark_app.build_spark_app()

INFO:spark_app:pyspark version: 3.4.1
INFO:spark_app:starting building spark app object: pyspark-taxi-forecasting
INFO:spark_app:Spark app object built as: <pyspark.sql.session.SparkSession object at 0x7f7e92efd710>
INFO:spark_app:Spark object can be accessed as the SparkApp_object.spark property


Получим стандартный объект `spark` из созданного выше объекта `spark_app`

In [15]:
spark = spark_app.spark

Проверка работы спарк-объекта на кластере. Если все в порядке, то тест должен выполниться достаточно быстро и отобразить тестовый ДФ.  
```txt
+------------+-----------+
|student_name|student_age|
+------------+-----------+
|       Alice|         10|
|         Bob|         20|
+------------+-----------+
```

Если исходные образы кластера собраны с ошибкой, возможно "зависание" работы теста.

In [16]:
spark_app.test_spark_functionality()

Spark session created successfully.
DataFrame created successfully.
Alias DataFrame created successfully.
DataFrame data matches expected result.
+------------+-----------+
|student_name|student_age|
+------------+-----------+
|       Alice|         10|
|         Bob|         20|
+------------+-----------+

DataFrame show output matches expected output.

*      ____              __    *
*     / __/__  ___ _____/ /__  *
*    _\ \/ _ \/ _ `/ __/  '_/  *
*   /__ / .__/\_,_/_/ /_/\_\   *
*      /_/                     * 
        


In [17]:
da = DictAnalyzer()

In [18]:
# Функция простой рандомизированной выборки
def random_sample_dataframe(dataframe, percentage):
    # Генерируем случайные числа от 0 до 1 и фильтруем строки
    df = dataframe.filter(f.rand() < percentage)
    
    return df

In [19]:
# %%time
# agg_hour.coalesce(1).write.csv("/work/data/taxis_agg_hour_growth.csv", header=True, mode="overwrite")

### Загрузка данных

In [20]:
%%time
agg_hour = spark.read.load('/work/data/taxis_agg_hour_growth.csv', 
                       format='csv', header='true'
                        , inferSchema='true'
                       )

CPU times: user 11.6 ms, sys: 10.6 ms, total: 22.2 ms
Wall time: 43.9 s


In [21]:
agg_hour.count()

3079450

In [22]:
agg_hour.cache()

DataFrame[ct: bigint, ca: int, hour_start: timestamp, trips_p: int, time_p: int, miles_p: double, velocity_p: double, farem_p: double, tipsm_p: double, tollsm_p: double, extrasm_p: double, totalm_p: double, comp1_p: int, comp2_p: int, comp3_p: int, comp4_p: int, comp5_p: int, compless5_p: int, trips_d: int, time_d: int, miles_d: double, velocity_d: double, farem_d: double, tipsm_d: double, tollsm_d: double, extrasm_d: double, totalm_d: double, comp1_d: int, comp2_d: int, comp3_d: int, comp4_d: int, comp5_d: int, compless5_d: int, cumulative_balance: int, trips_p_growth_1_to_0: double, trips_p_growth_2_to_1: double, trips_p_growth_3_to_2: double, trips_p_growth_4_to_3: double, trips_d_growth_1_to_0: double, trips_d_growth_2_to_1: double, trips_d_growth_3_to_2: double, trips_d_growth_4_to_3: double, velocity_p_growth_1_to_0: double, velocity_p_growth_2_to_1: double, velocity_p_growth_3_to_2: double, velocity_p_growth_4_to_3: double, velocity_d_growth_1_to_0: double, velocity_d_growth_2_t

In [23]:
agg_null = agg_hour.select([f.count(f.when(f.col(c).isNull(), c)).alias(c) for c in agg_hour.columns]).toPandas()

In [24]:
agg_null = agg_null.transpose()

In [25]:
agg_null[agg_null.iloc[:, 0] > 0]

Unnamed: 0,0


Загрузим данные по исключаемым (высокоррелированным) полям

In [26]:
excl_f = pd.read_csv('/work/data/excluded_fields.csv')

In [27]:
# Функция для преобразования DataFrame в словарь
def get_excuded_fields_to_dict(df):
    excluded_fields_tot = {}
    grouped = df.groupby(['ct', 'ca'])
    for (ct, ca), group in grouped:
        excluded_fields_tot[(ct, ca)] = group['excluded_field'].tolist()
    return excluded_fields_tot

In [28]:
excluded_fields_tot = get_excuded_fields_to_dict(excl_f)

In [29]:
da.print_dict(dict(islice(excluded_fields_tot.items(), 3)))

{
    (10000000000, 91): ['time_p', 'miles_p', 'farem_p', 'comp1_p', 'comp3_p', 'comp4_p', 'comp5_p', 'extrasm_p']
    (12000000001, 53): ['time_p', 'farem_p', 'trips_d', 'time_d', 'miles_d', 'farem_d', 'totalm_p', 'velocity_p_growth_3_to_2']
    (12000000002, 75): ['time_p', 'miles_p', 'farem_p', 'tipsm_p', 'comp5_p', 'trips_d', 'time_d', 'miles_d', 'farem_d', 'tipsm_d', 'comp4_p']
}


Будем считать, что от мультиколлинеарности в линейных моделях с помощью исключения этих полей получится избавиться.

In [30]:
# %%time
# agg_hour.coalesce(1).write.csv("/work/data/taxis_agg_hour_result.csv", header=True, mode="overwrite")

In [31]:
%%time
data = spark.read.load('/work/data/taxis_agg_hour_result.csv', 
                       format='csv', header='true'
                        , inferSchema='true'
                       )

CPU times: user 19.1 ms, sys: 11 ms, total: 30.1 ms
Wall time: 45 s


In [32]:
print(pds.gvf(data.count()))
data.cache()

3'062'043


DataFrame[ct: bigint, ca: int, hour_start: timestamp, time_p: int, miles_p: double, velocity_p: double, farem_p: double, tipsm_p: double, tollsm_p: double, extrasm_p: double, totalm_p: double, comp1_p: int, comp2_p: int, comp3_p: int, comp4_p: int, comp5_p: int, compless5_p: int, trips_d: int, time_d: int, miles_d: double, velocity_d: double, farem_d: double, tipsm_d: double, tollsm_d: double, extrasm_d: double, totalm_d: double, comp1_d: int, comp2_d: int, comp3_d: int, comp4_d: int, comp5_d: int, compless5_d: int, cumulative_balance: int, trips_p_growth_1_to_0: double, trips_p_growth_2_to_1: double, trips_p_growth_3_to_2: double, trips_p_growth_4_to_3: double, trips_d_growth_1_to_0: double, trips_d_growth_2_to_1: double, trips_d_growth_3_to_2: double, trips_d_growth_4_to_3: double, velocity_p_growth_1_to_0: double, velocity_p_growth_2_to_1: double, velocity_p_growth_3_to_2: double, velocity_p_growth_4_to_3: double, velocity_d_growth_1_to_0: double, velocity_d_growth_2_to_1: double, v

### Фрагмент данных

Отберем несколько полей и один район для проверки

In [33]:
f_to_sel = [
    'ct', 'ca', 'hour_start', 'trips_target', 'time_p', 'miles_p', 'velocity_p',
]

data_sample = data.select(*f_to_sel).filter(f.col('ct')==17031090200)
data_sample.show(3)
# data_sample.cache()

+-----------+---+-------------------+------------+------+-------+--------------------+
|         ct| ca|         hour_start|trips_target|time_p|miles_p|          velocity_p|
+-----------+---+-------------------+------------+------+-------+--------------------+
|17031090200|  6|2021-01-08 01:00:00|           1|     0|    0.0|                 0.0|
|17031090200|  6|2021-01-08 02:00:00|           2|   706|   3.24|0.004589235127478754|
|17031090200|  6|2021-01-08 03:00:00|           1|   990|    2.9|0.002929292929292929|
+-----------+---+-------------------+------------+------+-------+--------------------+
only showing top 3 rows



In [34]:
" ".join(excluded_fields_tot[(17031090200,6)])

'time_p miles_p farem_p tipsm_p comp1_p comp3_p comp4_p trips_d time_d miles_d farem_d tipsm_d extrasm_d comp2_d comp3_d comp4_d extrasm_p comp5_p'

Данные загрузились. Общее количество и отображение фрагмента соответствуют ожиданиям. Исключаемые поля в доступе.

## Подготовка данных к МО

Векторизация признаков

In [35]:
# data.cache()

DataFrame[ct: bigint, ca: int, hour_start: timestamp, time_p: int, miles_p: double, velocity_p: double, farem_p: double, tipsm_p: double, tollsm_p: double, extrasm_p: double, totalm_p: double, comp1_p: int, comp2_p: int, comp3_p: int, comp4_p: int, comp5_p: int, compless5_p: int, trips_d: int, time_d: int, miles_d: double, velocity_d: double, farem_d: double, tipsm_d: double, tollsm_d: double, extrasm_d: double, totalm_d: double, comp1_d: int, comp2_d: int, comp3_d: int, comp4_d: int, comp5_d: int, compless5_d: int, cumulative_balance: int, trips_p_growth_1_to_0: double, trips_p_growth_2_to_1: double, trips_p_growth_3_to_2: double, trips_p_growth_4_to_3: double, trips_d_growth_1_to_0: double, trips_d_growth_2_to_1: double, trips_d_growth_3_to_2: double, trips_d_growth_4_to_3: double, velocity_p_growth_1_to_0: double, velocity_p_growth_2_to_1: double, velocity_p_growth_3_to_2: double, velocity_p_growth_4_to_3: double, velocity_d_growth_1_to_0: double, velocity_d_growth_2_to_1: double, v

In [35]:
# def all_output_columns(df,target_variable_name):
#     base_cols = df.columns
#     base_cols.remove(target_variable_name)
#     selectedCols = [target_variable_name, 'features'] + base_cols
#     return selectedCols

In [36]:
# def assemble_vectors(df, features_list, target_variable_name):
#     stages = []
#     # assemble vectors
#     assembler = VectorAssembler(inputCols=features_list, outputCol='features')
#     stages = [assembler]
#     # select all the columns + target + newly created 'features' column
#     selectedCols = all_output_columns(df,target_variable_name)
#     # use pipeline to process sequentially
#     pipeline = Pipeline(stages=stages)
#     # assemble model
#     assembleModel = pipeline.fit(df)
#     # apply assembler model on data
#     df = assembleModel.transform(df).select(selectedCols)
#     return df

In [37]:
# Получение уникальных комбинаций районов и округов
# geo_keys = data.select('ct', 'ca').distinct().collect()

In [38]:
# исключаемые поля для всех районов
# exc_cols = ['ct', 'ca', 'hour_start']

In [39]:
# # векторизация
# df_vec = None
# for row in tqdm(geo_keys):
    
#     ct = row['ct']
#     ca = row['ca']
#     # Фильтрация данных для текущего района и округа
#     geo_data = data.filter((f.col('ct') == ct) & (f.col('ca') == ca))
#     # print(row,geo_data.count())
#     excluded_fields_list = excluded_fields_tot[row] + exc_cols
#     features_list = [col for col in data.columns if col not in excluded_fields_list]
#     # print(features_list)
#     assembled_data = assemble_vectors(df=geo_data,
#                                 features_list=features_list,
                                
#                                 target_variable_name='trips_target')
#     # print(assembled_data.count())
#     # Если это первая итерация, инициализируем df_vec
#     if df_vec is None:
#         df_vec = assembled_data
#     else:
#         df_vec = df_vec.union(assembled_data)
#         # print(df_vec.count())

In [41]:
# Создание нового пустого DataFrame с той же схемой
# empty_df = spark.createDataFrame(spark.sparkContext.emptyRDD(), geo_data.schema)

In [40]:
# df_vec.show(3)
# df_vec.cache()
# print(pds.gvf(df_vec.count()))

In [41]:
# f_to_sel = [
#     'ct', 'ca', 'hour_start', 'trips_target', 'features',
# ]

# df_vec.select(*f_to_sel).filter(f.col('ct')==17031838200).show(3)

In [42]:
# if df_vec.is_cached:
#  print("DataFrame закэширован")
# else:
#  print("DataFrame не закэширован")

# if df_vec.storageLevel == StorageLevel.MEMORY_ONLY:
#  print("DataFrame закэширован в памяти")
# elif df_vec.storageLevel == StorageLevel.DISK_ONLY:
#  print("DataFrame закэширован на диске")
# else:
 # print("DataFrame закэширован в:",df_vec.storageLevel)

данные DataFrame были закэшированы с использованием уровня хранения Disk Memory Deserialized 1x Replicated. Это значит, что данные сначала сериализуются и сохраняются на диске, а затем десериализуются в память для обработки. Также данные были реплицированы один раз, что обеспечивает дополнительную надежность и доступность данных.

In [60]:
# %%time
# print(pds.gvf(df_vec.count()))

3'062'043
CPU times: user 38.3 ms, sys: 9.54 ms, total: 47.8 ms
Wall time: 2min 27s


In [61]:
# %%time
# print(pds.gvf(data.count()))

3'062'043
CPU times: user 10.7 ms, sys: 83 µs, total: 10.7 ms
Wall time: 19.9 s


На данный момент непонятна выгода от векторизации. При более медленном выполнении действий, нет пока понятного способа корректно сохранить-загрузить веторизированный ДФ. В csv нельзя сохранить тип vector, в parquet - можно, но после загрузки выдаются ошибки работы JVM объектов..

In [63]:
# df_vec.printSchema()

### Train-test split

Разделю ДФ на обучающую и тестовую выборки: тест - 2024 год, трэйн - 2021-2023 гг. 

In [43]:
# Разделение данных на тренировочный и тестовый наборы
train = data.filter(f.year(f.col("hour_start")) < 2024)
test = data.filter(f.year(f.col("hour_start")) == 2024)

In [45]:
print(pds.gvf(train.count()))
train.cache()
print(pds.gvf(test.count()))
test.cache()

2'686'518
375'525


DataFrame[ct: bigint, ca: int, hour_start: timestamp, time_p: int, miles_p: double, velocity_p: double, farem_p: double, tipsm_p: double, tollsm_p: double, extrasm_p: double, totalm_p: double, comp1_p: int, comp2_p: int, comp3_p: int, comp4_p: int, comp5_p: int, compless5_p: int, trips_d: int, time_d: int, miles_d: double, velocity_d: double, farem_d: double, tipsm_d: double, tollsm_d: double, extrasm_d: double, totalm_d: double, comp1_d: int, comp2_d: int, comp3_d: int, comp4_d: int, comp5_d: int, compless5_d: int, cumulative_balance: int, trips_p_growth_1_to_0: double, trips_p_growth_2_to_1: double, trips_p_growth_3_to_2: double, trips_p_growth_4_to_3: double, trips_d_growth_1_to_0: double, trips_d_growth_2_to_1: double, trips_d_growth_3_to_2: double, trips_d_growth_4_to_3: double, velocity_p_growth_1_to_0: double, velocity_p_growth_2_to_1: double, velocity_p_growth_3_to_2: double, velocity_p_growth_4_to_3: double, velocity_d_growth_1_to_0: double, velocity_d_growth_2_to_1: double, v

Масштабирование

In [60]:
# Получение уникальных комбинаций районов и округов
geo_keys = data.select('ct', 'ca').distinct().collect()

In [61]:
# исключаемые поля для всех районов
exc_cols = ['ct', 'ca', 'hour_start']

In [87]:
def fitted_scaler(df, featureCol, outputCol):
    
    stages = []

    scaler = StandardScaler(inputCol = featureCol,
                            outputCol=outputCol,
                            withStd=True, withMean=True
                           )
    
    stages = [scaler]
    
    # use pipeline to process sequentially
    pipeline = Pipeline(stages=stages)
    # assemble model
    scaledAssembleModel = pipeline.fit(df)

    return scaledAssembleModel

In [78]:
base_cols = data.columns
target_variable_name = 'trips_target'

In [79]:
test_point = (17031090200,6)
" ".join(excluded_fields_tot[test_point])

'time_p miles_p farem_p tipsm_p comp1_p comp3_p comp4_p trips_d time_d miles_d farem_d tipsm_d extrasm_d comp2_d comp3_d comp4_d extrasm_p comp5_p'

In [80]:
test_point[0]

17031090200

In [93]:
ct, ca = test_point

excluded_fields_list = excluded_fields_tot[(ct,ca)] + exc_cols
features_list = [col for col in data.columns if col not in excluded_fields_list]
scaled_features_list = [f"{col}_scaled" for col in features_list]

mean_and_std_cols=[c for col in features_list for c in 
    (f.mean(col).alias(f"{col}_mean"),f.stddev(col).alias(f"{col}_std"))]

# mean_and_std_cols

# mean_and_std = test_df.select(mean_and_std_cols).first()
scaled_cols=[((f.col(col) - mean_and_std[f"{col}_mean"])
    /mean_and_std[f"{col}_std"]).alias(f"{col}_scaled") for col in features_list]
# test_df = test_df.select(test_df.columns + scaled_cols)
scaled_cols

NameError: name 'mean_and_std' is not defined

In [98]:
# for row in tqdm(geo_keys):
    
#     ct = row['ct']
#     ca = row['ca']
    
def lin_regr_with_scale(train, test, point):

    ct = point[0]
    ca = point[1]
    geo_train = train.filter((f.col('ct') == ct) & (f.col('ca') == ca))
    geo_train.cache()
    geo_test = test.filter((f.col('ct') == ct) & (f.col('ca') == ca))
    geo_test.cache()

    
    excluded_fields_list = excluded_fields_tot[(ct,ca)] + exc_cols
    features_list = [col for col in data.columns if col not in excluded_fields_list]
    scaled_features_list = [f"{col}_scaled" for col in features_list]

    # масштабируем в лоб, поскольку работаем не с ветор-столбцом
    mean_and_std_cols=[c for col in features_list for c in 
    (f.mean(col).alias(f"{col}_mean"),f.stddev(col).alias(f"{col}_std"))]
    
    mean_and_std = geo_train.select(mean_and_std_cols).first()
    scaled_cols=[((f.col(col) - mean_and_std[f"{col}_mean"])
        /mean_and_std[f"{col}_std"]).alias(f"{col}_scaled") for col in features_list]
    scaled_geo_train = geo_train.select(geo_train.columns + scaled_cols)
    scaled_geo_test = geo_test.select(geo_test.columns + scaled_cols)
    
    selectedCols = exc_cols + [target_variable_name] + scaled_features_list

    # for feat in features_list:
    #     scaled_feat = f"{feat}_scaled"
    #     geo_scaler = fitted_scaler(geo_train, feat, scaled_feat)
    #     scaled_geo_train = geo_scaler.transform(geo_train)
    #     scaled_geo_test = geo_scaler.transform(geo_test)

    scaled_geo_train = scaled_geo_train.select(selectedCols)
    scaled_geo_test = scaled_geo_test.select(selectedCols)
    
    reg = LinearRegression(featuresCol = scaled_features_list, labelCol = target_variable_name)

    reg_model = reg.fit(scaled_geo_train)
    
    # Создание объекта RegressionEvaluator для оценки модели
    evaluator = RegressionEvaluator()

    # Настройка параметров оценки (RMSE и MAPE)
    evaluator.setMetricName("rmse")
    evaluator.setPredictionCol("prediction")
    evaluator.setLabelCol("label")

    # Оценка модели
    rmse = evaluator.evaluate(reg_model.transform(scaled_geo_train))
    
    # Аналогично для MAPE
    evaluator.setMetricName("mape")
    mape = evaluator.evaluate(reg_model.transform(scaled_geo_train))

    return reg_model, scaled_geo_train, rmse, mape

In [99]:
reg_model, scaled_geo_train, rmse, mape = lin_regr_with_scale(train, test, test_point)
# scaled_geo_train = lin_regr_with_scale(train, test, test_point)

TypeError: Invalid param value given for param "featuresCol". Could not convert <class 'list'> to string type

In [96]:
" ".join(scaled_geo_train.columns)

'ct ca hour_start time_p miles_p velocity_p farem_p tipsm_p tollsm_p extrasm_p totalm_p comp1_p comp2_p comp3_p comp4_p comp5_p compless5_p trips_d time_d miles_d velocity_d farem_d tipsm_d tollsm_d extrasm_d totalm_d comp1_d comp2_d comp3_d comp4_d comp5_d compless5_d cumulative_balance trips_p_growth_1_to_0 trips_p_growth_2_to_1 trips_p_growth_3_to_2 trips_p_growth_4_to_3 trips_d_growth_1_to_0 trips_d_growth_2_to_1 trips_d_growth_3_to_2 trips_d_growth_4_to_3 velocity_p_growth_1_to_0 velocity_p_growth_2_to_1 velocity_p_growth_3_to_2 velocity_p_growth_4_to_3 velocity_d_growth_1_to_0 velocity_d_growth_2_to_1 velocity_d_growth_3_to_2 velocity_d_growth_4_to_3 trips_sh_168 trips_sh_84 trips_sh_24 trips_sh_28 trips_sh_12 trips_sh_8 trips_ma_168 trips_ma_84 trips_ma_24 trips_ma_28 trips_ma_12 trips_ma_8 trips_sh_4 trips_ma_4 trips_sh_1 trips_ma_1 trips_ma_168_growth trips_ma_8_growth trips_ma_4_growth trips_target velocity_p_scaled tollsm_p_scaled totalm_p_scaled comp2_p_scaled compless5_p_s

In [None]:
# scaled_geo_test = geo_scaler.transform(geo_test).select(selectedCols)

#     # Если это первая итерация, инициализируем df_scaled_
#     if df_scaled_train is None:
#         df_scaled_train = scaled_geo_train
#     else:
#         df_scaled_train = df_scaled_train.union(scaled_geo_train)
        
#     if df_scaled_test is None:
#         df_scaled_test = scaled_geo_test
#     else:
#         df_scaled_test = df_scaled_test.union(scaled_geo_test)

In [55]:
# масштабирование
# df_scaled_train = None
# df_scaled_test = None
# for row in tqdm(geo_keys):
    
#     ct = row['ct']
#     ca = row['ca']
#     # Фильтрация данных для текущего района и округа
#     geo_train = train.filter((f.col('ct') == ct) & (f.col('ca') == ca))
#     geo_test = test.filter((f.col('ct') == ct) & (f.col('ca') == ca))
    
#     # print(features_list)
#      # select all the columns + target + newly created 'features' column
#     geo_scaler = fitted_scaler(geo_train, target_variable_name)
#     scaled_geo_train = geo_scaler.transform(geo_train).select(selectedCols)
#     scaled_geo_test = geo_scaler.transform(geo_test).select(selectedCols)
    
#     # print(assembled_data.count())
#     # Если это первая итерация, инициализируем df_scaled_
#     if df_scaled_train is None:
#         df_scaled_train = scaled_geo_train
#     else:
#         df_scaled_train = df_scaled_train.union(scaled_geo_train)
        
#     if df_scaled_test is None:
#         df_scaled_test = scaled_geo_test
#     else:
#         df_scaled_test = df_scaled_test.union(scaled_geo_test)

In [None]:
# df_scaled_test.cache()

In [None]:
# df_scaled_train.cache()

In [None]:
# df_scaled_test.count()

In [56]:
# df_scaled_train.count()