In [0]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import warnings
from sklearn.preprocessing import LabelEncoder

In [0]:
# Silence every warning category for the rest of the session
# (this covers pandas' SettingWithCopyWarning along with everything else).
warnings.filterwarnings(action="ignore")

In [0]:
# Let pandas render up to 1000 rows and 1000 columns before truncating output.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

In [0]:

def rfe_feature_importance(df, target, estimator, n_features):
    """
    Rank features with Recursive Feature Elimination (RFE).

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns plus the ``target`` column.
    target : str
        Name of the target column; every other column is used as a feature.
    estimator : estimator object
        Estimator handed to ``RFE``.
    n_features : int
        Forwarded to ``RFE`` as ``n_features_to_select``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Importance``. ``Importance`` holds
        ``RFE.ranking_`` (a rank, not a weight), sorted ascending, with a
        fresh 0..n-1 index.
    """
    features = df.drop(columns=[target])
    labels = df[target]

    selector = RFE(estimator=estimator, n_features_to_select=n_features)
    selector.fit(features, labels)

    ranking = pd.DataFrame({
        'Feature': features.columns,
        'Importance': selector.ranking_,
    })
    ranking = ranking.sort_values(by='Importance', ascending=True)
    return ranking.reset_index(drop=True)

def lasso_feature_importance(df, target, alpha=0.01):
    """
    Score features by the absolute value of their Lasso coefficients.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns plus the ``target`` column.
    target : str
        Name of the target column; every other column is used as a feature.
    alpha : float, default 0.01
        Forwarded to ``Lasso`` as ``alpha``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Importance`` (``|coef_|``), sorted from the
        largest to the smallest value, with a fresh 0..n-1 index.
    """
    features = df.drop(columns=[target])
    labels = df[target]

    # The model is fitted on standardized features, not on the raw columns.
    standardizer = StandardScaler()
    standardized = standardizer.fit_transform(features)

    model = Lasso(alpha=alpha)
    model.fit(standardized, labels)

    weights = pd.DataFrame({
        'Feature': features.columns,
        'Importance': np.abs(model.coef_),
    })
    weights = weights.sort_values(by='Importance', ascending=False)
    return weights.reset_index(drop=True)

def randomforest_feature_importance(df, target, n_estimators=100, random_state=42):
    """
    Score features with the ``feature_importances_`` of a Random Forest classifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns plus the ``target`` column.
    target : str
        Name of the target column; every other column is used as a feature.
    n_estimators : int, default 100
        Forwarded to ``RandomForestClassifier``.
    random_state : int, default 42
        Forwarded to ``RandomForestClassifier``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Importance``, sorted from the largest to the
        smallest importance, with a fresh 0..n-1 index.
    """
    features = df.drop(columns=[target])
    labels = df[target]

    forest = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    forest.fit(features, labels)

    scores = pd.DataFrame({
        'Feature': features.columns,
        'Importance': forest.feature_importances_,
    })
    scores = scores.sort_values(by='Importance', ascending=False)
    return scores.reset_index(drop=True)



In [0]:
# Load hive_metastore.default.ay_aggr_bur_target, register it as the temp view
# "ay_burr_agg" for SQL cells, and convert it to a pandas DataFrame.
BUR_AGG_TABLE = "hive_metastore.default.ay_aggr_bur_target"

ay_burr_agg = spark.table(BUR_AGG_TABLE)
ay_burr_agg.createOrReplaceTempView("ay_burr_agg")
deq_data = ay_burr_agg.toPandas()




In [0]:
# Preview the first rows of the loaded table.
deq_data.head()

In [0]:
# (rows, columns) of the loaded table.
deq_data.shape

In [0]:
# Count of missing values in each column.
deq_data.isnull().sum()

In [0]:
# Fill rate = percentage of non-null values in each column.
fill_rate = deq_data.notnull().mean() * 100

# Tabulate it as one row per column and print the table.
fill_rate_df = pd.DataFrame({
    'Column': fill_rate.index,
    'Fill Rate (%)': fill_rate.values,
})
print(fill_rate_df)

In [0]:

# Percentage of non-null values per column (recomputed here so this cell does
# not rely on the previous one).
fill_rate = deq_data.notnull().mean() * 100

# Keep only the columns that are at least 50% populated.
is_well_filled = fill_rate >= 50
data_1 = deq_data.loc[:, is_well_filled]


In [0]:
%python
# Drop columns starting with 'vin_credit_utilization_'
columns_to_drop = [col for col in data_1.columns if col.startswith('vin_credit_utilization_')]
data_1 = data_1.drop(columns=columns_to_drop)

In [0]:
# Preview the frame after the fill-rate filter and the prefix-based column drop.
data_1.head()

In [0]:
# data_1.drop(columns=["NAME_CONTRACT_TYPE"], inplace=True)

In [0]:
# # Function to get all categorical variables
# def get_categorical_variables(df):
#     return df.select_dtypes(include=['object', 'category']).columns.tolist()

# categorical_vars = get_categorical_variables(data_1)
# print("Categorical Variables:", categorical_vars)

In [0]:
# data_1['CODE_GENDER'] = data_1['CODE_GENDER'].map({'M': 1, 'F': 0})
# data_1['FLAG_OWN_CAR'] = data_1['FLAG_OWN_CAR'].map({'N': 0, 'Y': 1})
# data_1['FLAG_OWN_REALTY'] = data_1['FLAG_OWN_REALTY'].map({'N': 0, 'Y': 1})

In [0]:
# label_encoder = LabelEncoder()

# # Apply LabelEncoder to all columns
# data_2 = data_1.apply(lambda col: label_encoder.fit_transform(col) if col.dtypes == 'object' else col)

In [0]:
# # Drop rows where CODE_GENDER is null
# data_2 = data_2.dropna(subset=['CODE_GENDER'])

In [0]:
# Load hive_metastore.default.application_train and convert it to pandas.
# NOTE(review): the cells shown here only read SK_ID_CURR and TARGET from
# app_train_df — selecting just those columns before toPandas() would be
# cheaper; confirm nothing else needs the full frame.
APPLICATION_TRAIN_TABLE = "hive_metastore.default.application_train"

app_train = spark.table(APPLICATION_TRAIN_TABLE)
app_train_df = app_train.toPandas()

In [0]:
# Sanity check: show the Python type of the converted frame.
type(app_train_df)

In [0]:
# Attach TARGET from the application table. The inner join keeps only rows
# whose SK_ID_CURR is present in both frames.
# Any TARGET column already on data_1 is dropped first: without that, running
# this cell a second time yields TARGET_x / TARGET_y suffix columns and the
# later 'TARGET' lookups fail.
data_1 = (
    data_1
    .drop(columns=['TARGET'], errors='ignore')
    .merge(app_train_df[['SK_ID_CURR', 'TARGET']], on='SK_ID_CURR', how='inner')
)

In [0]:
# Drop the identifier and the residual_bur / pred_proba_bur columns so they are
# not ranked as features further down.
non_feature_columns = ['SK_ID_CURR', 'residual_bur', 'pred_proba_bur']
data_2 = data_1.drop(columns=non_feature_columns)

In [0]:
# Replace every null with a large negative sentinel; data_2 itself is left
# untouched because fillna returns a new frame.
# NOTE(review): this sentinel later reaches StandardScaler/Lasso and
# LogisticRegression through df_sample, where such an extreme value will
# dominate scaling — consider a proper imputation strategy for those models.
placeholder_value = -1e11 + 1  # same value as -10e10+1, i.e. -99999999999.0
data_cleaned = data_2.fillna(placeholder_value)

In [0]:
# Verify the fill: per-column count of remaining nulls.
data_cleaned.isnull().sum()

In [0]:
# Preview the frame after null replacement.
data_cleaned.head()

In [0]:
# Draw a random sample of at most `sample_size` rows from data_cleaned,
# stratified on the target column.
target_column = 'TARGET'  # column passed to `stratify`
sample_size = 30000

# Share of rows requested, capped at 1.0 (kept as a reference value).
fraction = min(1.0, sample_size / len(data_cleaned)) if len(data_cleaned) else 0.0

if len(data_cleaned) > sample_size:
    # An integer test_size is an absolute row count, so the sample has exactly
    # `sample_size` rows. A float fraction is rounded up by scikit-learn and
    # can overshoot by one row because of floating-point error.
    _unsampled, df_sample = train_test_split(
        data_cleaned,
        test_size=sample_size,
        stratify=data_cleaned[target_column],
        random_state=42,  # fixed seed for reproducibility
    )
else:
    # train_test_split rejects a fraction >= 1, so with this few rows the whole
    # frame is used as the sample instead of raising.
    df_sample = data_cleaned.copy()

In [0]:
# (rows, columns) of the stratified sample.
df_sample.shape

In [0]:

# RFE
# RFE eliminates features by the magnitude of the fitted coefficients, which is
# only comparable across features that share a scale. The features are
# therefore standardized before being handed to the logistic regression, and
# max_iter is raised above the default so the solver has room to converge
# (convergence warnings are silenced earlier in this notebook, so a failure
# would otherwise go unnoticed).
print("RFE Feature Importance:")
rfe_features = df_sample.drop(columns=['TARGET'])
rfe_input = pd.DataFrame(
    StandardScaler().fit_transform(rfe_features),
    columns=rfe_features.columns,
    index=rfe_features.index,
)
rfe_input['TARGET'] = df_sample['TARGET']
rfe_result = rfe_feature_importance(
    rfe_input,
    target='TARGET',
    estimator=LogisticRegression(max_iter=1000),
    n_features=5,
)
print(rfe_result)

# Lasso (the helper standardizes the features itself)
print("\nLasso Feature Importance:")
lasso_result = lasso_feature_importance(df_sample, target='TARGET', alpha=0.01)
print(lasso_result)

# Random Forest
print("\nRandom Forest Feature Importance:")
rf_result = randomforest_feature_importance(df_sample, target='TARGET')
print(rf_result)

Features carried forward into the final table (see `selected_columns` in the next cell):

- vin_total_overdue_amount_last_12_months_mean
- vin_days_since_first_loan_taken_max_sum
- vin_num_active_loans_sum
- vin_days_since_credit_update_min_max






In [0]:
# Identifier and residual/probability columns, followed by the four vintage
# features that are kept.
id_and_score_columns = ['SK_ID_CURR', 'residual_bur', 'pred_proba_bur']
vintage_feature_columns = [
    'vin_total_overdue_amount_last_12_months_mean',
    'vin_days_since_first_loan_taken_max_sum',
    'vin_num_active_loans_sum',
    'vin_days_since_credit_update_min_max',
]
selected_columns = id_and_score_columns + vintage_feature_columns

# Build the reduced frame from the originally loaded table (deq_data).
final_ay_bur_vintage = deq_data.loc[:, selected_columns]

# Render the reduced frame.
display(final_ay_bur_vintage)

In [0]:
# Convert the reduced pandas frame back into a Spark DataFrame.
pyspark_df = spark.createDataFrame(final_ay_bur_vintage)

In [0]:
# Register the Spark DataFrame as the temp view "pyspark_df" for SQL access.
pyspark_df.createOrReplaceTempView("pyspark_df")

# NOTE(review): ay_prev_agg is not referenced again in the cells shown here — confirm it is still needed.
ay_prev_agg = spark.table("hive_metastore.default.ay_aggr_prev_target")

In [0]:
# Row count of the Spark DataFrame.
pyspark_df.count()

In [0]:
%python
# Replace spaces with underscores in all column names
for col in pyspark_df.columns:
    new_col = col.replace(" ", "_")
    pyspark_df = pyspark_df.withColumnRenamed(col, new_col)

# Display the DataFrame to verify the changes
display(pyspark_df)

In [0]:
# Re-register the view so it points at the DataFrame with the renamed columns.
pyspark_df.createOrReplaceTempView("pyspark_df")

In [0]:
%sql
-- Persist the temp view pyspark_df as the table default.ay_vin_bur_pred.
-- CREATE OR REPLACE keeps the notebook re-runnable end to end: a plain
-- CREATE TABLE fails as soon as the table already exists.
create or replace table default.ay_vin_bur_pred as
select
  *
from
  pyspark_df;