In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msgn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import f_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import *

In [None]:
import pickle
# Load the pre-built dataset from the local pickle file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling —
# only run this against a file produced by a trusted source.
with open("all_df.pickle", mode="rb") as pickle_file:
    df = pickle.load(pickle_file)

In [None]:
# Preview only the first rows; rendering the whole frame bloats the notebook
# output and may expose more raw records than intended.
df.head()

In [None]:
# Target dtype for every column that is re-typed, grouped by dtype so the
# full mapping can be read at a glance.
INT32_COLUMNS = [
    'is_elite_user',
    'brand_id',
    'category_id',
    'diff_order_memdate',
    'age',
]

FLOAT32_COLUMNS = [
    'promotion_award_value',
    'coupon_discount',
    'ship_cost',
    'gender_id',
    'color_id',
    'returnRate',
    'returnRate_def',
    'total_claim',
    'unresolvedclaim_percentage',
    'unresolved_percentage',
    'paid_amount',
    'discount_ratio',
    'original_price',
    'discounted_price',
]

CATEGORY_COLUMNS = [
    'is_wallet_trx',
    'gender',
    'hour_interval',
    'order_weekday',
    'order_day',
    'order_week',
    'is_bday_close',
    'is_returned',
]

# Cast column by column, assigning back into the same frame (in place),
# so no second copy of the whole DataFrame is created.
for dtype, columns in (
    (np.int32, INT32_COLUMNS),
    (np.float32, FLOAT32_COLUMNS),
    ('category', CATEGORY_COLUMNS),
):
    for column in columns:
        df[column] = df[column].astype(dtype)

In [None]:
# Target column.
label = ["is_returned"]

# Categorical columns routed to the 'cat_str1' transformer
# (most-frequent imputation followed by one-hot encoding).
categorical_str = [
    "is_saved_card_trx",
    "gender",
    "zodiac",
]

# Categorical columns routed to the 'cat_int1' transformer
# (constant -99 imputation followed by one-hot encoding).
categorical_int = [
    "gender_id",
    "category_id",
    "color_id",
    "order_weekday",
    "order_week",
    "brand_id",
    "order_day",
    "hour_interval",
]

# Numeric columns routed to the 'num1' transformer
# (median imputation followed by standard scaling).
numerical = [
    "original_price",
    "discounted_price",
    "ship_cost",
    "coupon_discount",
    "age",
    "diff_order_memdate",
    "total_claim",
    "promotion_award_value",
    "discount_ratio",
    "paid_amount",
]

# Numeric columns routed to the 'num2' transformer
# (mean imputation only, no scaling).
numerical2 = [
    "returnRate",
    "returnRate_def",
    "unresolvedclaim_percentage",
    "unresolved_percentage",
    "item_return_rate_x",
    "item_return_rate_y",
    "shifted_return_rate",
]

# Columns not named in any transformer; they reach the model unchanged
# through the ColumnTransformer's remainder='passthrough'.
passthrough = [
    "is_elite_user",
    "is_wallet_trx",
    "is_bday_close",
]

# Columns left out of every feature group above.
# NOTE(review): this list is not referenced in the visible cells — confirm
# whether it is still needed.
drops = [
    "color_name",
    "supplier_color_name",
    "attributet_name",
    "gender_name",
    "category_name",
    "brand_name",
    "product_name",
    "coupon_id",
    "promotion_name",
    "order_date",
    "birth_date",
    "membership_date",
]

In [None]:
# Keep only the modelling columns: every feature group followed by the label.
merged_df = df.loc[
    :,
    [
        *categorical_str,
        *categorical_int,
        *numerical,
        *numerical2,
        *passthrough,
        *label,
    ],
]

In [None]:
# Only correlations stronger than this (in absolute value) are drawn.
CORR_THRESHOLD = 0.5

# Absolute pairwise correlations. merged_df also holds string and categorical
# columns, and DataFrame.corr() raises on those in pandas >= 2.0 (numeric_only
# now defaults to False), so select the numeric/bool columns explicitly.
matrix = merged_df.select_dtypes(include=["number", "bool"]).corr().abs()

# The matrix is symmetric: hide the upper triangle and the diagonal.
mask = np.triu(np.ones_like(matrix, dtype=bool))

fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(
    matrix[matrix > CORR_THRESHOLD],  # weaker correlations become NaN (blank cells)
    mask=mask,
    center=0,
    annot=True,
    fmt='.2f',
    square=True,
    cmap='crest',
    ax=ax,
)
ax.set_title(f"Absolute feature correlations above {CORR_THRESHOLD}")

plt.show()

In [None]:
def print_metrics(y_pred, y_test, model):
    """Print a classification report and plot the confusion matrix.

    Args:
        y_pred: Predicted labels for the evaluation set.
        y_test: True labels for the evaluation set.
        model: Fitted classifier, or a fitted pipeline whose last step is a
            classifier. Must expose ``classes_``.

    Note the argument order: predictions come first, true labels second.
    """
    # A pipeline exposes its stages through `steps`; report the final
    # estimator's class so the header names the actual model instead of
    # the generic wrapper ("Pipeline").
    steps = getattr(model, "steps", None)
    estimator = steps[-1][1] if steps else model

    print("Model results: ", type(estimator).__name__)
    print(classification_report(y_test, y_pred))
    print("Confusion matrix:")
    # Pass the label order explicitly so matrix rows/columns line up with
    # the display labels.
    cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    disp.plot()

In [None]:
def create_pipeline_rf():
    """Build the preprocessing + random-forest classification pipeline.

    Column groups are taken from the module-level lists ``numerical``,
    ``numerical2``, ``categorical_str`` and ``categorical_int``:

    * ``numerical``       -> median imputation, then standard scaling
    * ``numerical2``      -> mean imputation only
    * ``categorical_str`` -> most-frequent imputation, then one-hot encoding
    * ``categorical_int`` -> constant (-99) imputation, then one-hot encoding

    Any other input column is forwarded unchanged (``remainder='passthrough'``).

    Returns:
        An unfitted ``Pipeline`` with the steps ``preprocessor``,
        ``f_selector`` and ``classifier``.
    """
    scaled_numeric = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    unscaled_numeric = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
    ])

    # Categories unseen during fit are ignored at transform time rather
    # than raising an error.
    string_one_hot = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('one-hot', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Missing codes are filled with the constant -99 before encoding.
    integer_one_hot = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=-99)),
        ('one-hot', OneHotEncoder(handle_unknown='ignore')),
    ])

    column_steps = [
        ('num1', scaled_numeric, numerical),
        ('num2', unscaled_numeric, numerical2),
        ('cat_str1', string_one_hot, categorical_str),
        ('cat_int1', integer_one_hot, categorical_int),
    ]
    preprocessor = ColumnTransformer(transformers=column_steps, remainder='passthrough')

    forest = RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=0)

    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        # k='all' keeps every feature; the step is a placeholder for tuning k.
        ('f_selector', SelectKBest(f_classif, k='all')),
        ('classifier', forest),
    ])

When sampling the data, two strategies are possible:

1. Random
2. Time-based

In [None]:
# Build a class-balanced modelling set: the same number of returned and
# non-returned rows, drawn with a fixed seed so the notebook is reproducible.
N_PER_CLASS = 200000
SAMPLE_SEED = 0

pos_df = merged_df[merged_df['is_returned'] == 1].sample(N_PER_CLASS, random_state=SAMPLE_SEED)
neg_df = merged_df[merged_df['is_returned'] == 0].sample(N_PER_CLASS, random_state=SAMPLE_SEED)

# DataFrame.sample returns a new frame, so the shuffled result must be
# assigned; otherwise the rows stay ordered as all positives, then all negatives.
model_df = pd.concat([pos_df, neg_df], axis=0).sample(frac=1, random_state=SAMPLE_SEED)
model_df.head()

In [None]:
# Separate the feature matrix from the target.
feature_columns = categorical_str + categorical_int + numerical + numerical2 + passthrough
X = model_df[feature_columns]
y = model_df['is_returned']

# Stratified 80/20 split so both parts keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=1,
)

# Fit on the training part only, then predict the held-out part.
pipe1 = create_pipeline_rf().fit(X_train, y_train)
preds = pipe1.predict(X_test)

In [None]:
# TODO: compute return_ratio using only the training data, so that no information from the test set leaks into the features.

In [None]:
# Report held-out performance of the fitted pipeline.
print_metrics(y_pred=preds, y_test=y_test, model=pipe1)