# Library

In [1]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
import joblib
import src.util as util

# Load Data

In [6]:
def read_raw_data(config: dict) -> pd.DataFrame:
    """Load the raw dataset from the CSV source named in the configuration.

    Parameters
    ----------
    config : dict
        Configuration mapping; must contain the key ``"raw_dataset_dir"``,
        whose value is passed to ``pd.read_csv`` as the file location.

    Returns
    -------
    pd.DataFrame
        The parsed CSV contents.

    Raises
    ------
    KeyError
        If ``"raw_dataset_dir"`` is missing from ``config``.
    """
    # Despite the "_dir" suffix, the value is handed straight to pd.read_csv,
    # so it is read as one CSV source rather than scanned as a directory.
    raw_dataset_path = config["raw_dataset_dir"]
    return pd.read_csv(raw_dataset_path)


In [2]:
# read data: util.load_config() supplies the config whose "raw_dataset_dir"
# entry read_raw_data passes to pd.read_csv
config_data = util.load_config()
df = read_raw_data(config_data)

In [3]:
# display data
df

Unnamed: 0,Customer ID,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,Longitude,Number of Referrals,...,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status,Churn Category,Churn Reason
0,0002-ORFBO,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,...,Credit Card,65.60,593.30,0.00,0,381.51,974.81,Stayed,,
1,0003-MKNFE,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,...,Credit Card,-4.00,542.40,38.33,10,96.21,610.28,Stayed,,
2,0004-TLHLJ,Male,50,No,0,Costa Mesa,92627,33.645672,-117.922613,0,...,Bank Withdrawal,73.90,280.85,0.00,0,134.60,415.45,Churned,Competitor,Competitor had better devices
3,0011-IGKFF,Male,78,Yes,0,Martinez,94553,38.014457,-122.115432,1,...,Bank Withdrawal,98.00,1237.85,0.00,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction
4,0013-EXCHZ,Female,75,Yes,0,Camarillo,93010,34.227846,-119.079903,3,...,Credit Card,83.90,267.40,0.00,0,22.14,289.54,Churned,Dissatisfaction,Network reliability
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,9987-LUTYD,Female,20,No,0,La Mesa,91941,32.759327,-116.997260,0,...,Credit Card,55.15,742.90,0.00,0,606.84,1349.74,Stayed,,
7039,9992-RRAMN,Male,40,Yes,0,Riverbank,95367,37.734971,-120.954271,1,...,Bank Withdrawal,85.10,1873.70,0.00,0,356.40,2230.10,Churned,Dissatisfaction,Product dissatisfaction
7040,9992-UJOEL,Male,22,No,0,Elk,95432,39.108252,-123.645121,0,...,Credit Card,50.30,92.75,0.00,0,37.24,129.99,Joined,,
7041,9993-LHIEB,Male,21,Yes,0,Solana Beach,92075,33.001813,-117.263628,5,...,Credit Card,67.85,4627.65,0.00,0,142.04,4769.69,Stayed,,


In [4]:
# dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 38 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Customer ID                        7043 non-null   object 
 1   Gender                             7043 non-null   object 
 2   Age                                7043 non-null   int64  
 3   Married                            7043 non-null   object 
 4   Number of Dependents               7043 non-null   int64  
 5   City                               7043 non-null   object 
 6   Zip Code                           7043 non-null   int64  
 7   Latitude                           7043 non-null   float64
 8   Longitude                          7043 non-null   float64
 9   Number of Referrals                7043 non-null   int64  
 10  Tenure in Months                   7043 non-null   int64  
 11  Offer                              7043 non-null   objec

# Data Validation

In [5]:
# print all features
df.columns

Index(['Customer ID', 'Gender', 'Age', 'Married', 'Number of Dependents',
       'City', 'Zip Code', 'Latitude', 'Longitude', 'Number of Referrals',
       'Tenure in Months', 'Offer', 'Phone Service',
       'Avg Monthly Long Distance Charges', 'Multiple Lines',
       'Internet Service', 'Internet Type', 'Avg Monthly GB Download',
       'Online Security', 'Online Backup', 'Device Protection Plan',
       'Premium Tech Support', 'Streaming TV', 'Streaming Movies',
       'Streaming Music', 'Unlimited Data', 'Contract', 'Paperless Billing',
       'Payment Method', 'Monthly Charge', 'Total Charges', 'Total Refunds',
       'Total Extra Data Charges', 'Total Long Distance Charges',
       'Total Revenue', 'Customer Status', 'Churn Category', 'Churn Reason'],
      dtype='object')

In [6]:
# check data types
df.dtypes

Customer ID                           object
Gender                                object
Age                                    int64
Married                               object
Number of Dependents                   int64
City                                  object
Zip Code                               int64
Latitude                             float64
Longitude                            float64
Number of Referrals                    int64
Tenure in Months                       int64
Offer                                 object
Phone Service                         object
Avg Monthly Long Distance Charges    float64
Multiple Lines                        object
Internet Service                      object
Internet Type                         object
Avg Monthly GB Download              float64
Online Security                       object
Online Backup                         object
Device Protection Plan                object
Premium Tech Support                  object
Streaming 

In [7]:
# check number of unique data of each features
df.nunique()

Customer ID                          7043
Gender                                  2
Age                                    62
Married                                 2
Number of Dependents                   10
City                                 1106
Zip Code                             1626
Latitude                             1626
Longitude                            1625
Number of Referrals                    12
Tenure in Months                       72
Offer                                   6
Phone Service                           2
Avg Monthly Long Distance Charges    3583
Multiple Lines                          2
Internet Service                        2
Internet Type                           3
Avg Monthly GB Download                49
Online Security                         2
Online Backup                           2
Device Protection Plan                  2
Premium Tech Support                    2
Streaming TV                            2
Streaming Movies                  

In [8]:
# data range (numerical)
df.describe()

Unnamed: 0,Age,Number of Dependents,Zip Code,Latitude,Longitude,Number of Referrals,Tenure in Months,Avg Monthly Long Distance Charges,Avg Monthly GB Download,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,6361.0,5517.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,46.509726,0.468692,93486.070567,36.197455,-119.756684,1.951867,32.386767,25.420517,26.189958,63.596131,2280.381264,1.962182,6.860713,749.099262,3034.379056
std,16.750352,0.962802,1856.767505,2.468929,2.154425,3.001199,24.542061,14.200374,19.586585,31.204743,2266.220462,7.902614,25.104978,846.660055,2865.204542
min,19.0,0.0,90001.0,32.555828,-124.301372,0.0,1.0,1.01,2.0,-10.0,18.8,0.0,0.0,0.0,21.36
25%,32.0,0.0,92101.0,33.990646,-121.78809,0.0,9.0,13.05,13.0,30.4,400.15,0.0,0.0,70.545,605.61
50%,46.0,0.0,93518.0,36.205465,-119.595293,0.0,29.0,25.69,21.0,70.05,1394.55,0.0,0.0,401.44,2108.64
75%,60.0,0.0,95329.0,38.161321,-117.969795,3.0,55.0,37.68,30.0,89.75,3786.6,0.0,0.0,1191.1,4801.145
max,80.0,9.0,96150.0,41.962127,-114.192901,11.0,72.0,49.99,85.0,118.75,8684.8,49.79,150.0,3564.72,11979.34


In [9]:
# data range (categorical)
df.describe(include=['O']).transpose()

Unnamed: 0,count,unique,top,freq
Customer ID,7043,7043,0002-ORFBO,1
Gender,7043,2,Male,3555
Married,7043,2,No,3641
City,7043,1106,Los Angeles,293
Offer,7043,6,,3877
Phone Service,7043,2,Yes,6361
Multiple Lines,6361,2,No,3390
Internet Service,7043,2,Yes,5517
Internet Type,5517,3,Fiber Optic,3035
Online Security,5517,2,No,3498


In [10]:
# check unique values of the potential target
df['Customer Status'].unique()

array(['Stayed', 'Churned', 'Joined'], dtype=object)

> Early findings:

    - The target variable is 'Customer Status'. It has no null values and takes the values 'Stayed', 'Churned' and 'Joined'. Since we want to focus on churned and retained customers, we will drop the rows where Customer Status = 'Joined'.
    - 'Churn Reason' and 'Churn Category' contain many null values.
    - All the data types look appropriate.
    - Some features are directly related to the target, namely 'Churn Reason' and 'Churn Category'. These features will be dropped to prevent feature leakage.
    - Some features will not be useful for modeling, such as 'Customer ID', 'City', 'Zip Code', 'Latitude' and 'Longitude'.
    
> Decisions:

    - Drop data (Rows): Customer Status = 'Joined'
    - Drop features: 'Churn Reason', 'Churn Category', 'Customer ID', 'City', 'Zip Code', 'Latitude', 'Longitude'

# Remove Unnecessary Features and Rows 

In [11]:
# create list of unnecessary features
list_columns_to_drop = ['Churn Reason', 'Churn Category', 'Customer ID', 'City', 'Zip Code', 'Latitude', 'Longitude']

In [12]:
# drop features
df_clean = df.drop(columns=list_columns_to_drop)
df_clean

Unnamed: 0,Gender,Age,Married,Number of Dependents,Number of Referrals,Tenure in Months,Offer,Phone Service,Avg Monthly Long Distance Charges,Multiple Lines,...,Contract,Paperless Billing,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status
0,Female,37,Yes,0,2,9,,Yes,42.39,No,...,One Year,Yes,Credit Card,65.60,593.30,0.00,0,381.51,974.81,Stayed
1,Male,46,No,0,0,9,,Yes,10.69,Yes,...,Month-to-Month,No,Credit Card,-4.00,542.40,38.33,10,96.21,610.28,Stayed
2,Male,50,No,0,0,4,Offer E,Yes,33.65,No,...,Month-to-Month,Yes,Bank Withdrawal,73.90,280.85,0.00,0,134.60,415.45,Churned
3,Male,78,Yes,0,1,13,Offer D,Yes,27.82,No,...,Month-to-Month,Yes,Bank Withdrawal,98.00,1237.85,0.00,0,361.66,1599.51,Churned
4,Female,75,Yes,0,3,3,,Yes,7.38,No,...,Month-to-Month,Yes,Credit Card,83.90,267.40,0.00,0,22.14,289.54,Churned
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Female,20,No,0,0,13,Offer D,Yes,46.68,No,...,One Year,No,Credit Card,55.15,742.90,0.00,0,606.84,1349.74,Stayed
7039,Male,40,Yes,0,1,22,Offer D,Yes,16.20,Yes,...,Month-to-Month,Yes,Bank Withdrawal,85.10,1873.70,0.00,0,356.40,2230.10,Churned
7040,Male,22,No,0,0,2,Offer E,Yes,18.62,No,...,Month-to-Month,Yes,Credit Card,50.30,92.75,0.00,0,37.24,129.99,Joined
7041,Male,21,Yes,0,5,67,Offer A,Yes,2.12,No,...,Two Year,No,Credit Card,67.85,4627.65,0.00,0,142.04,4769.69,Stayed


In [13]:
# exclude Customer Status = 'Joined'
df_clean = df_clean[df_clean['Customer Status'] != 'Joined'].reset_index(drop=True)
df_clean

Unnamed: 0,Gender,Age,Married,Number of Dependents,Number of Referrals,Tenure in Months,Offer,Phone Service,Avg Monthly Long Distance Charges,Multiple Lines,...,Contract,Paperless Billing,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status
0,Female,37,Yes,0,2,9,,Yes,42.39,No,...,One Year,Yes,Credit Card,65.60,593.30,0.00,0,381.51,974.81,Stayed
1,Male,46,No,0,0,9,,Yes,10.69,Yes,...,Month-to-Month,No,Credit Card,-4.00,542.40,38.33,10,96.21,610.28,Stayed
2,Male,50,No,0,0,4,Offer E,Yes,33.65,No,...,Month-to-Month,Yes,Bank Withdrawal,73.90,280.85,0.00,0,134.60,415.45,Churned
3,Male,78,Yes,0,1,13,Offer D,Yes,27.82,No,...,Month-to-Month,Yes,Bank Withdrawal,98.00,1237.85,0.00,0,361.66,1599.51,Churned
4,Female,75,Yes,0,3,3,,Yes,7.38,No,...,Month-to-Month,Yes,Credit Card,83.90,267.40,0.00,0,22.14,289.54,Churned
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6584,Female,36,No,0,0,4,,Yes,2.01,No,...,Month-to-Month,No,Bank Withdrawal,20.95,85.50,0.00,0,8.04,93.54,Churned
6585,Female,20,No,0,0,13,Offer D,Yes,46.68,No,...,One Year,No,Credit Card,55.15,742.90,0.00,0,606.84,1349.74,Stayed
6586,Male,40,Yes,0,1,22,Offer D,Yes,16.20,Yes,...,Month-to-Month,Yes,Bank Withdrawal,85.10,1873.70,0.00,0,356.40,2230.10,Churned
6587,Male,21,Yes,0,5,67,Offer A,Yes,2.12,No,...,Two Year,No,Credit Card,67.85,4627.65,0.00,0,142.04,4769.69,Stayed


In [14]:
# check df_clean shape
df_clean.shape

(6589, 31)

# Change Values

In [15]:
# to make it easier for preprocessing, change 'Yes' 'No' to 'Y' 'N'
# replace() matches whole cell values in every column. A single pass with a
# mapping, rebinding the name instead of using inplace=True, keeps the cell
# safe to re-run and avoids mutating the frame behind earlier outputs.
df_clean = df_clean.replace({"Yes": "Y", "No": "N"})

# Standardize Column Name

In [16]:
# default snake case
def clean_headers(val):
    """Convert a column label to snake_case; non-string labels pass through.

    Characters other than letters, digits, spaces and underscores are dropped,
    leading/trailing spaces are trimmed, the text is lower-cased and every
    remaining space becomes an underscore.
    """
    if not isinstance(val, str):
        return val

    allowed_extras = (" ", "_")
    # keep alphanumerics plus the separators we still need for snake case
    kept_chars = [ch for ch in val if ch.isalnum() or ch in allowed_extras]
    trimmed = "".join(kept_chars).strip()
    return trimmed.lower().replace(" ", "_")
    
df_clean = df_clean.rename(columns=clean_headers)
df_clean

Unnamed: 0,gender,age,married,number_of_dependents,number_of_referrals,tenure_in_months,offer,phone_service,avg_monthly_long_distance_charges,multiple_lines,...,contract,paperless_billing,payment_method,monthly_charge,total_charges,total_refunds,total_extra_data_charges,total_long_distance_charges,total_revenue,customer_status
0,Female,37,Y,0,2,9,,Y,42.39,N,...,One Year,Y,Credit Card,65.60,593.30,0.00,0,381.51,974.81,Stayed
1,Male,46,N,0,0,9,,Y,10.69,Y,...,Month-to-Month,N,Credit Card,-4.00,542.40,38.33,10,96.21,610.28,Stayed
2,Male,50,N,0,0,4,Offer E,Y,33.65,N,...,Month-to-Month,Y,Bank Withdrawal,73.90,280.85,0.00,0,134.60,415.45,Churned
3,Male,78,Y,0,1,13,Offer D,Y,27.82,N,...,Month-to-Month,Y,Bank Withdrawal,98.00,1237.85,0.00,0,361.66,1599.51,Churned
4,Female,75,Y,0,3,3,,Y,7.38,N,...,Month-to-Month,Y,Credit Card,83.90,267.40,0.00,0,22.14,289.54,Churned
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6584,Female,36,N,0,0,4,,Y,2.01,N,...,Month-to-Month,N,Bank Withdrawal,20.95,85.50,0.00,0,8.04,93.54,Churned
6585,Female,20,N,0,0,13,Offer D,Y,46.68,N,...,One Year,N,Credit Card,55.15,742.90,0.00,0,606.84,1349.74,Stayed
6586,Male,40,Y,0,1,22,Offer D,Y,16.20,Y,...,Month-to-Month,Y,Bank Withdrawal,85.10,1873.70,0.00,0,356.40,2230.10,Churned
6587,Male,21,Y,0,5,67,Offer A,Y,2.12,N,...,Two Year,N,Credit Card,67.85,4627.65,0.00,0,142.04,4769.69,Stayed


# Data Checking After Some Changes

In [17]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6589 entries, 0 to 6588
Data columns (total 31 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   gender                             6589 non-null   object 
 1   age                                6589 non-null   int64  
 2   married                            6589 non-null   object 
 3   number_of_dependents               6589 non-null   int64  
 4   number_of_referrals                6589 non-null   int64  
 5   tenure_in_months                   6589 non-null   int64  
 6   offer                              6589 non-null   object 
 7   phone_service                      6589 non-null   object 
 8   avg_monthly_long_distance_charges  5945 non-null   float64
 9   multiple_lines                     5945 non-null   object 
 10  internet_service                   6589 non-null   object 
 11  internet_type                      5245 non-null   objec

In [18]:
df_clean.isna().sum()

gender                                  0
age                                     0
married                                 0
number_of_dependents                    0
number_of_referrals                     0
tenure_in_months                        0
offer                                   0
phone_service                           0
avg_monthly_long_distance_charges     644
multiple_lines                        644
internet_service                        0
internet_type                        1344
avg_monthly_gb_download              1344
online_security                      1344
online_backup                        1344
device_protection_plan               1344
premium_tech_support                 1344
streaming_tv                         1344
streaming_movies                     1344
streaming_music                      1344
unlimited_data                       1344
contract                                0
paperless_billing                       0
payment_method                    

In [19]:
# integer variables
# np.integer matches every integer width; the 'int' alias resolves to the
# platform default integer (e.g. int32 on Windows with NumPy 1.x), which
# can silently skip int64 columns
for int_var in df_clean.select_dtypes(include=np.integer):
    print('-', int_var)

- age
- number_of_dependents
- number_of_referrals
- tenure_in_months
- total_extra_data_charges


In [20]:
# object variables
for obj_var in df_clean.select_dtypes('O'):
    print('-', obj_var)

- gender
- married
- offer
- phone_service
- multiple_lines
- internet_service
- internet_type
- online_security
- online_backup
- device_protection_plan
- premium_tech_support
- streaming_tv
- streaming_movies
- streaming_music
- unlimited_data
- contract
- paperless_billing
- payment_method
- customer_status


In [21]:
# float variables
for float_var in df_clean.select_dtypes('float'):
    print('-', float_var)

- avg_monthly_long_distance_charges
- avg_monthly_gb_download
- monthly_charge
- total_charges
- total_refunds
- total_long_distance_charges
- total_revenue


In [22]:
# all variables
for all_var in df_clean:
    print('-', all_var)

- gender
- age
- married
- number_of_dependents
- number_of_referrals
- tenure_in_months
- offer
- phone_service
- avg_monthly_long_distance_charges
- multiple_lines
- internet_service
- internet_type
- avg_monthly_gb_download
- online_security
- online_backup
- device_protection_plan
- premium_tech_support
- streaming_tv
- streaming_movies
- streaming_music
- unlimited_data
- contract
- paperless_billing
- payment_method
- monthly_charge
- total_charges
- total_refunds
- total_extra_data_charges
- total_long_distance_charges
- total_revenue
- customer_status


In [23]:
# integer variables range (min, then max, per column)
# np.integer matches every integer width; the 'int' alias resolves to the
# platform default integer (e.g. int32 on Windows with NumPy 1.x), which
# can silently skip int64 columns
for int_var in df_clean.select_dtypes(include=np.integer):
    print(f'range_{int_var}:')
    print('-', df_clean[int_var].min())
    print('-', df_clean[int_var].max())

range_age:
- 19
- 80
range_number_of_dependents:
- 0
- 9
range_number_of_referrals:
- 0
- 11
range_tenure_in_months:
- 1
- 72
range_total_extra_data_charges:
- 0
- 150


In [24]:
# float variables range (min, then max, per column)
for float_var in df_clean.select_dtypes('float'):
    print(f'range_{float_var}:')
    print('-', df_clean[float_var].min())
    print('-', df_clean[float_var].max())

range_avg_monthly_long_distance_charges:
- 1.01
- 49.99
range_avg_monthly_gb_download:
- 2.0
- 85.0
range_monthly_charge:
- -10.0
- 118.75
range_total_charges:
- 18.85
- 8684.8
range_total_refunds:
- 0.0
- 49.79
range_total_long_distance_charges:
- 0.0
- 3564.72
range_total_revenue:
- 21.61
- 11979.34


In [25]:
# object variables range
for obj_var in df_clean.select_dtypes('O'):
    print(f'range_{obj_var}:')
    for uniq in df_clean[obj_var].unique():
        print('-', uniq)

range_gender:
- Female
- Male
range_married:
- Y
- N
range_offer:
- None
- Offer E
- Offer D
- Offer A
- Offer B
- Offer C
range_phone_service:
- Y
- N
range_multiple_lines:
- N
- Y
- nan
range_internet_service:
- Y
- N
range_internet_type:
- Cable
- Fiber Optic
- DSL
- nan
range_online_security:
- N
- Y
- nan
range_online_backup:
- Y
- N
- nan
range_device_protection_plan:
- N
- Y
- nan
range_premium_tech_support:
- Y
- N
- nan
range_streaming_tv:
- Y
- N
- nan
range_streaming_movies:
- N
- Y
- nan
range_streaming_music:
- N
- Y
- nan
range_unlimited_data:
- Y
- N
- nan
range_contract:
- One Year
- Month-to-Month
- Two Year
range_paperless_billing:
- Y
- N
range_payment_method:
- Credit Card
- Bank Withdrawal
- Mailed Check
range_customer_status:
- Stayed
- Churned


In [26]:
# create ohe pattern for yaml
for obj_var in df_clean.select_dtypes('O'):
    print(f'ohe_{obj_var}_path: models/ohe_{obj_var}.pkl')


ohe_gender_path: models/ohe_gender.pkl
ohe_married_path: models/ohe_married.pkl
ohe_offer_path: models/ohe_offer.pkl
ohe_phone_service_path: models/ohe_phone_service.pkl
ohe_multiple_lines_path: models/ohe_multiple_lines.pkl
ohe_internet_service_path: models/ohe_internet_service.pkl
ohe_internet_type_path: models/ohe_internet_type.pkl
ohe_online_security_path: models/ohe_online_security.pkl
ohe_online_backup_path: models/ohe_online_backup.pkl
ohe_device_protection_plan_path: models/ohe_device_protection_plan.pkl
ohe_premium_tech_support_path: models/ohe_premium_tech_support.pkl
ohe_streaming_tv_path: models/ohe_streaming_tv.pkl
ohe_streaming_movies_path: models/ohe_streaming_movies.pkl
ohe_streaming_music_path: models/ohe_streaming_music.pkl
ohe_unlimited_data_path: models/ohe_unlimited_data.pkl
ohe_contract_path: models/ohe_contract.pkl
ohe_paperless_billing_path: models/ohe_paperless_billing.pkl
ohe_payment_method_path: models/ohe_payment_method.pkl
ohe_customer_status_path: models/o

# Handling Missing Value for Splitting

Fill missing values in categorical variables with 'Unknown' and in float variables with -999.

In [27]:
# fill missing values in categorical variables with 'Unknown' and float variables with -999
categorical_features = df_clean.select_dtypes('O').columns
df_clean[categorical_features] = df_clean[categorical_features].fillna('Unknown')
# Fill through the frame rather than `df_clean.<col>.fillna(..., inplace=True)`:
# the chained in-place form operates on an intermediate Series, raises a
# FutureWarning in pandas 2.x and leaves df_clean unchanged under Copy-on-Write.
df_clean = df_clean.fillna({
    'avg_monthly_long_distance_charges': -999,
    'avg_monthly_gb_download': -999,
})
df_clean.isna().sum()

gender                               0
age                                  0
married                              0
number_of_dependents                 0
number_of_referrals                  0
tenure_in_months                     0
offer                                0
phone_service                        0
avg_monthly_long_distance_charges    0
multiple_lines                       0
internet_service                     0
internet_type                        0
avg_monthly_gb_download              0
online_security                      0
online_backup                        0
device_protection_plan               0
premium_tech_support                 0
streaming_tv                         0
streaming_movies                     0
streaming_music                      0
unlimited_data                       0
contract                             0
paperless_billing                    0
payment_method                       0
monthly_charge                       0
total_charges            

In [28]:
# persist the cleaned frame (taken before the train/valid/test split) as a pickle
joblib.dump(df_clean, "../data/processed/df_clean.pkl")

['../data/processed/df_clean.pkl']

# Split Data

In [29]:
# separate the predictors (x) from the target column (y)
x = df_clean.drop(columns="customer_status")
y = df_clean["customer_status"].copy()

In [30]:
# keep 70% of the rows for training; the remaining 30% hold-out is stratified
# on the target and seeded so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42, stratify = y)

In [31]:
# halve the hold-out (stratified, seeded) into validation and test sets;
# x_test / y_test are rebound to the test half
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, test_size = 0.5, random_state = 42, stratify = y_test)

In [32]:
# persist each split as its own pickle under ../data/processed/
joblib.dump(x_train, "../data/processed/x_train.pkl")
joblib.dump(y_train, "../data/processed/y_train.pkl")
joblib.dump(x_valid, "../data/processed/x_valid.pkl")
joblib.dump(y_valid, "../data/processed/y_valid.pkl")
joblib.dump(x_test, "../data/processed/x_test.pkl")
joblib.dump(y_test, "../data/processed/y_test.pkl")

['../data/processed/y_test.pkl']