In [None]:
!pip install pyspark findspark catboost

In [1]:
import pyspark.sql.functions as F
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    DoubleType,
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Spark settings: fixed UI port, Kryo serialization, and dynamic allocation
# together with the shuffle service (the tracker used to give resources back).
conf = (
    SparkConf()
    .set('spark.ui.port', '4050')
    .set('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    .set('spark.dynamicAllocation.enabled', 'true')
    .set('spark.shuffle.service.enabled', 'true')
)

# Start the context with this configuration, then get or create a local session.
sc = SparkContext(conf=conf)
spark = SparkSession.builder.master('local[*]').getOrCreate()

Вспомним наши данные из предыдущего семинара

In [2]:
# Read the CSV with a header row and inferred column types, then replace
# missing Credit_Product values with 'No'.
data = (
    spark.read
    .csv('credit_card_data.csv', header=True, inferSchema=True)
    .fillna({'Credit_Product': 'No'})
)

In [None]:
# Print the column names and the types inferred while reading the CSV.
data.printSchema()

In [None]:
# Preview the first rows of the Spark DataFrame.
data.show()

Мы уже знакомы с классическими udf, которые работают на уровне одной строки

In [5]:
@F.udf(returnType=IntegerType())
def preprocess_gender(gender):
    """Encode gender as an integer flag: 1 for 'Male', 0 for any other value."""
    if gender == 'Male':
        return 1
    return 0

In [None]:
# Show the raw gender next to its integer encoding.
(
    data
    .withColumn('Gender_index', preprocess_gender(F.col('Gender')))
    .select('Gender', 'Gender_index')
    .show()
)

Можно аналогично работать с несколькими столбцами, но всё так же в рамках одной строки

In [7]:
@F.udf(returnType=IntegerType())
def preprocess_gender_age(gender, age):
    """Map a (gender, age) pair to a segment index.

    Returns 0 for 'Male' aged <= 30, 1 for 'Male' aged > 30,
    2 for 'Female' aged <= 30, and 3 for everything else.
    """
    if gender == 'Male':
        if age <= 30:
            return 0
        if age > 30:
            return 1
    elif gender == 'Female' and age <= 30:
        return 2
    # Any remaining combination falls into the last segment.
    return 3

In [None]:
# Show both inputs next to the segment index derived from them.
(
    data
    .withColumn('Gender_age_index', preprocess_gender_age(F.col('Gender'), F.col('Age')))
    .select('Gender', 'Age', 'Gender_age_index')
    .show()
)

Возникают ситуации, когда необходимо посчитать какие-либо величины не на уровне строки, а на уровне нескольких столбцов/строк, и это не реализовано в API PySpark

В таких ситуациях может помочь pandas_udf. Например, если хотим посчитать gini

In [9]:
from sklearn.metrics import roc_auc_score
import pandas as pd

@F.pandas_udf('float')
def gini(target: pd.Series, score: pd.Series) -> float:
    """Aggregate a target series and a score series into Gini = 2 * ROC AUC - 1.

    Missing scores are replaced with 0 before the AUC is computed.
    """
    filled_score = score.fillna(0)
    auc = roc_auc_score(target, filled_score)
    return 2 * auc - 1

# A simple hand-made score, used only as an example.
@F.udf(returnType=FloatType())
def get_base_score(gender, age):
    """Return a fixed score per (gender, age) segment.

    0.2 for 'Male' aged <= 30, 0.5 for 'Male' aged > 30,
    0.1 for 'Female' aged <= 30, and 0.4 for everything else.
    """
    if gender == 'Male':
        if age <= 30:
            return 0.2
        if age > 30:
            return 0.5
    elif gender == 'Female' and age <= 30:
        return 0.1
    # Any remaining combination gets the default score.
    return 0.4

In [None]:
# Attach the example score and aggregate it into a single Gini value.
(
    data
    .withColumn('base_score', get_base_score(F.col('Gender'), F.col('Age')))
    .select(gini(F.col('Is_Lead'), F.col('base_score')))
    .show()
)

Можно удобно работать с преобразованиями

In [11]:
@F.pandas_udf('string')
def upper_case(s: pd.Series) -> pd.Series:
    """Upper-case every string of the incoming series via the pandas `.str` accessor."""
    uppercased = s.str.upper()
    return uppercased

In [None]:
# Show the original occupation next to its upper-cased version.
(
    data
    .withColumn('Occupation_upper', upper_case(F.col('Occupation')))
    .select('Occupation', 'Occupation_upper')
    .show()
)

А можно ли сделать аналог pd.DataFrame.groupby(cols).apply(f)?

In [None]:
@F.pandas_udf(returnType=FloatType())
def gini(target: pd.Series, score: pd.Series) -> float:
    """Grouped-aggregate pandas UDF computing Gini = 2 * ROC AUC - 1.

    The Series -> scalar type hints select the aggregate UDF flavour, which
    replaces the deprecated `functionType=PandasUDFType.GROUPED_AGG` argument.

    Args:
        target: labels of the rows in the current group.
        score: model scores of the same rows; missing values are replaced with 0.

    Returns:
        The Gini coefficient of the group as a plain Python float.
    """
    auc = roc_auc_score(target, score.fillna(0))
    # Convert the numpy scalar returned by sklearn to a built-in float.
    return float(2 * auc - 1)

In [None]:
# Compute the Gini of the example score separately for every channel.
(
    data
    .withColumn('base_score', get_base_score(F.col('Gender'), F.col('Age')))
    .groupBy('Channel_Code')
    .agg(gini(F.col('Is_Lead'), F.col('base_score')))
    .show()
)

А что если мы хотим сделать стандартизацию данных в группах?

In [15]:
def normalize(df):
    """Standardize `Avg_Account_Balance` inside one group.

    Args:
        df: pandas DataFrame holding the rows of a single group; it must
            contain an `Avg_Account_Balance` column.

    Returns:
        A copy of `df` whose `Avg_Account_Balance` is `(x - mean) / std`.
        When the standard deviation is zero or undefined (constant or
        single-row group) only the mean is subtracted, so the group is not
        turned into NaN/inf values.
    """
    bal = df.Avg_Account_Balance
    centered = bal - bal.mean()
    std = bal.std()
    # `not std > 0` is true both for std == 0 and for a NaN std.
    if not std > 0:
        return df.assign(Avg_Account_Balance=centered)
    return df.assign(Avg_Account_Balance=centered / std)

In [16]:
# Cast the balance column to float; presumably so that the float output of the
# per-group standardization matches data.schema — TODO confirm.
data = data.withColumn('Avg_Account_Balance', F.col('Avg_Account_Balance').cast(FloatType()))

In [None]:
# Run `normalize` on every Channel_Code group as a pandas DataFrame; the output
# keeps the schema of the input frame.
(
    data
    .groupBy('Channel_Code')
    .applyInPandas(normalize, schema=data.schema)
    .show()
)

Давайте обучим простую модель без использования Spark

In [18]:
# Load the same CSV with pandas, fill missing Credit_Product with 'No'
# and drop the ID column.
data = (
    pd.read_csv('credit_card_data.csv')
    .fillna({'Credit_Product': 'No'})
    .drop('ID', axis=1)
)

In [None]:
# Display the pandas DataFrame as the cell output.
data

In [20]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows; the target and four categorical columns are
# excluded from the feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['Is_Lead', 'Gender', 'Region_Code', 'Occupation', 'Channel_Code'], axis=1),
    data['Is_Lead'],
    test_size=0.3,
    random_state=7,
)

# Small CatBoost classifier evaluated by AUC on the held-out part.
# NOTE(review): early_stopping_rounds (20) exceeds iterations (10), so early
# stopping likely never triggers — confirm.
model = (
    CatBoostClassifier(
        task_type='CPU',
        eval_metric='AUC',
        iterations=10,
        depth=5,
        min_data_in_leaf=100,
        random_seed=7,
        max_ctr_complexity=1,
    )
    .fit(
        X_train,
        y_train,
        cat_features=['Credit_Product', 'Is_Active'],
        early_stopping_rounds=20,
        eval_set=(X_test, y_test),
        verbose=2,
    )
)

In [22]:
def gini(df: pd.DataFrame) -> float:
    """Return Gini = 2 * ROC AUC - 1 for the `Is_Lead` and `score` columns of `df`."""
    auc = roc_auc_score(df['Is_Lead'], df['score'])
    return 2 * auc - 1

In [None]:
%%time

# Score every row with the trained model, keeping the class-1 probability.
data['score'] = model.predict_proba(data[model.feature_names_])[:, 1]

# Gini per channel, computed with plain pandas for comparison.
data.groupby('Channel_Code').apply(gini)

Предположим, что у нас есть модель и мы хотим сделать инференс модели на Spark. Как это сделать?

In [24]:
# Re-read the dataset as a Spark DataFrame (the name `data` was holding the
# pandas frame) and fill missing Credit_Product with 'No'.
data = (
    spark.read
    .csv('credit_card_data.csv', header=True, inferSchema=True)
    .fillna({'Credit_Product': 'No'})
)

Самое простое через RDD

In [None]:
# Look at the first row of the underlying RDD as a plain dict.
data.rdd.take(1)[0].asDict()

In [26]:
def score_by_rdd_map(row, scorer):
    """Score a single row by wrapping it into a one-row pandas DataFrame.

    Args:
        row: object exposing `asDict()`; the resulting dict must contain 'ID'
            and every name listed in `scorer.feature_names_`.
        scorer: model exposing `feature_names_` and `predict_proba`.

    Returns:
        Tuple `(ID, probability)` where the probability is taken from
        column 1 of the `predict_proba` output.
    """
    record = row.asDict()
    frame = pd.DataFrame([record])
    probabilities = scorer.predict_proba(frame[scorer.feature_names_])
    positive = probabilities[:, 1]
    return record['ID'], positive[0]

In [None]:
# Score rows one by one through RDD.map and fetch the first five results.
data.rdd.map(lambda x: score_by_rdd_map(x, model)).take(5)

In [None]:
# Total number of rows in the dataset.
data.count()

In [None]:
%%time

# Time row-by-row scoring (with a pandas frame per row) on 25000 rows.
a = data.rdd.map(lambda x: score_by_rdd_map(x, model)).take(25000)

Давайте уберем pandas

In [29]:
def score_by_rdd_map_no_pandas(row, scorer):
    """Score a single row without building a pandas DataFrame.

    Args:
        row: object exposing `asDict()`; the resulting dict must contain 'ID'
            and every name listed in `scorer.feature_names_`.
        scorer: model exposing `feature_names_` and `predict_proba`. It is
            used for the prediction instead of a global model object.

    Returns:
        Tuple `(ID, probability)` with the class-1 probability as a float,
        matching what the pandas-based per-row scorer returns.
    """
    features = row.asDict()
    idx = features['ID']
    feature_vector = [features[col] for col in scorer.feature_names_]
    # For a single object (flat feature list) predict_proba is expected to
    # return one probability per class, so index 1 is the positive class.
    class_probabilities = scorer.predict_proba(feature_vector)
    return idx, float(class_probabilities[1])

In [None]:
%%time

# Time row-by-row scoring without pandas on 25000 rows.
a = data.rdd.map(lambda x: score_by_rdd_map_no_pandas(x, model)).take(25000)

А теперь попробуем работать с партициями

In [32]:
def score_by_rdd_map_no_pandas(rows, scorer):
    """Score all rows of a partition with a single `predict_proba` call.

    Args:
        rows: iterable of objects exposing `asDict()`; each dict must contain
            'ID' and every name listed in `scorer.feature_names_`.
        scorer: model exposing `feature_names_` and `predict_proba`.

    Returns:
        List of `(ID, score)` tuples in input order, where the score is the
        column-1 probability as a Python float. An empty partition yields an
        empty list without calling the model.
    """
    feature_names = scorer.feature_names_
    list_idx = []
    x = []
    for row in rows:
        features = row.asDict()
        list_idx.append(features['ID'])
        x.append([features[col] for col in feature_names])
    # Nothing to score: skip the model call, which cannot index an empty batch.
    if not x:
        return []
    scores = scorer.predict_proba(x)[:, 1]
    # Convert numpy scalars to built-in floats before pairing them with the IDs.
    return [(idx, float(score)) for idx, score in zip(list_idx, scores)]

In [None]:
%%time

# Time partition-level scoring over the whole dataset.
a = data.rdd.mapPartitions(lambda x: score_by_rdd_map_no_pandas(x, model)).collect()

In [None]:
@F.pandas_udf(returnType=FloatType())
def gini(target: pd.Series, score: pd.Series) -> float:
    """Grouped-aggregate pandas UDF computing Gini = 2 * ROC AUC - 1.

    The Series -> scalar type hints select the aggregate UDF flavour, which
    replaces the deprecated `functionType=PandasUDFType.GROUPED_AGG` argument.

    Args:
        target: labels of the rows in the current group.
        score: model scores of the same rows; missing values are replaced with 0.

    Returns:
        The Gini coefficient of the group as a plain Python float.
    """
    auc = roc_auc_score(target, score.fillna(0))
    # Convert the numpy scalar returned by sklearn to a built-in float.
    return float(2 * auc - 1)

In [None]:
%%time

# Schema of the (ID, score) pairs produced by the partition scorer.
schema = StructType([
    StructField('ID', StringType(), False),
    StructField('score', DoubleType(), False),
])

# Score every partition and turn the resulting pairs into a DataFrame.
scores = (
    data.rdd
    .mapPartitions(lambda x: score_by_rdd_map_no_pandas(x, model))
    .toDF(schema)
)

# Attach the scores to the original rows by ID.
data = data.join(scores, how='inner', on='ID')

# Gini of the model score per channel.
(
    data
    .groupBy('Channel_Code')
    .agg(gini(F.col('Is_Lead'), F.col('score')))
    .show()
)

Есть альтернативный вариант через applyInPandas

In [37]:
# Start again from the raw CSV (without a score column) and fill missing
# Credit_Product with 'No'.
data = (
    spark.read
    .csv('credit_card_data.csv', header=True, inferSchema=True)
    .fillna({'Credit_Product': 'No'})
)

In [38]:
def score(df, scorer):
    """Add the positive-class probability to a pandas DataFrame.

    Args:
        df: pandas DataFrame containing every column listed in
            `scorer.feature_names_`.
        scorer: model exposing `feature_names_` and `predict_proba`.

    Returns:
        A copy of `df` whose `score` column holds column 1 of the
        `predict_proba` output (one value per row).
    """
    probabilities = scorer.predict_proba(df[scorer.feature_names_])
    # predict_proba yields one column per class; a single DataFrame column
    # needs a 1-D array, so keep only the class-1 probabilities.
    return df.assign(score=probabilities[:, 1])

In [None]:
%%time

# Add a placeholder score column first so that data.schema already describes
# the frames returned by the scoring function.
data = data.withColumn('score', F.lit(1).cast(DoubleType()))

# Score each Spark partition as one pandas DataFrame.
data = (
    data
    .groupBy(F.spark_partition_id())
    .applyInPandas(lambda pdf: score(pdf, model), schema=data.schema)
)

# Gini of the model score per channel.
(
    data
    .groupBy('Channel_Code')
    .agg(gini(F.col('Is_Lead'), F.col('score')))
    .show()
)