In [1]:
import os

In [2]:
# Show the kernel's current working directory before changing it.
%pwd

'd:\\Bappy\\YouTube\\End-to-end-Machine-Learning-Project-with-MLflow\\research'

In [3]:
# Step up one directory level (the recorded outputs show `research/` -> project root).
# NOTE: the path is relative, so re-running this cell climbs one more level each time.
os.chdir("../")

In [4]:
# Show the working directory again to confirm the result of the chdir.
%pwd

'd:\\Bappy\\YouTube\\End-to-end-Machine-Learning-Project-with-MLflow'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of settings for the data-ingestion step.

    Built by ``ConfigurationManager.get_data_ingestion_config`` and consumed
    by ``DataIngestion``. ``frozen=True`` makes instances read-only after
    construction.
    """

    # Directory that get_data_ingestion_config creates before building this object.
    root_dir: Path
    # Address passed to urlretrieve as the download source.
    source_URL: str
    # Local file the download is written to and the zip archive is read from.
    local_data_file: Path
    # Directory the archive contents are extracted into.
    unzip_dir: Path

In [6]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema YAML files.

        Also creates the directory named by ``artifacts_root`` in the config
        file, so constructing a manager has a filesystem side effect.

        Args:
            config_filepath: YAML file holding the stage configuration.
            params_filepath: YAML file holding parameters.
            schema_filepath: YAML file holding the schema.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A frozen ``DataIngestionConfig`` whose path fields are ``Path``
            objects, matching the dataclass annotations.
        """
        section = self.config.data_ingestion

        create_directories([section.root_dir])

        # The dataclass declares these three fields as Path, but the raw
        # config values were being passed through untouched (download_file
        # even re-wraps local_data_file in Path). Coerce them here so the
        # object honours its own annotations; Path() is a no-op on a Path.
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_URL=section.source_URL,
            local_data_file=Path(section.local_data_file),
            unzip_dir=Path(section.unzip_dir),
        )

In [8]:
import os
import urllib.request as request
import zipfile
from mlProject import logger
from mlProject.utils.common import get_size

In [9]:
class DataIngestion:
    """Downloads the source archive and unpacks it according to a config."""

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion settings; no I/O happens here."""
        self.config = config

    def download_file(self):
        """Download ``source_URL`` to ``local_data_file`` unless it already exists.

        The payload is first written to a sibling ``<local_data_file>.part``
        file and only moved into place once the transfer finished. Writing
        directly to the final path meant an interrupted download left a
        truncated file behind, which every later run would accept as
        "already exists" and then fail to unzip.

        Raises:
            Whatever ``urllib.request.urlretrieve`` raises on a failed
            transfer; the partial file is removed before the error propagates.
        """
        target = self.config.local_data_file

        if os.path.exists(target):
            logger.info(f"File already exists of size: {get_size(Path(target))}")
            return

        partial = f"{target}.part"
        try:
            _, headers = request.urlretrieve(
                url = self.config.source_URL,
                filename = partial
            )
            # Same-directory rename: the final path appears only when complete.
            os.replace(partial, target)
        finally:
            # Still present only if the transfer or the rename failed.
            if os.path.exists(partial):
                os.remove(partial)

        logger.info(f"{target} download! with following info: \n{headers}")

    def extract_zip_file(self):
        """Extract the downloaded archive into ``unzip_dir``.

        Takes no arguments: the archive path (``local_data_file``) and the
        destination (``unzip_dir``) both come from ``self.config``. The
        destination directory is created if it does not exist.

        Returns:
            None
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)
  

In [11]:
# Run the ingestion stage end to end: load the configuration, download the
# archive, then unpack it.
# The former `try: ... except Exception as e: raise e` wrapper only re-raised
# the same exception, so it is dropped; errors still propagate to the notebook.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_file()
data_ingestion.extract_zip_file()

[2023-07-11 09:48:52,013: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-07-11 09:48:52,015: INFO: common: yaml file: params.yaml loaded successfully]
[2023-07-11 09:48:52,017: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-07-11 09:48:52,020: INFO: common: created directory at: artifacts]
[2023-07-11 09:48:52,021: INFO: common: created directory at: artifacts/data_ingestion]
[2023-07-11 09:48:53,478: INFO: 2366278714: artifacts/data_ingestion/data.zip download! with following info: 
Connection: close
Content-Length: 23329
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "c69888a4ae59bc5a893392785a938ccd4937981c06ba8a9d6a21aa52b4ab5b6e"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Request-Id: 3E94:6864:6F310:85DCD:64ACD122
Accept-Ranges: bytes
Date: Tue, 11 Jul

In [None]:
# Single stratified train/test split of the credit-card data (pandas).
#
# Prerequisite: `credit_data_frame` must already hold the loaded pandas
# DataFrame when this cell runs.
#
# Fixes relative to the pasted original:
#   * the body was indented as if still inside a method, so the cell could
#     not even be parsed;
#   * `withColumnRenamed` is a PySpark method, while the rest of the cell
#     (`.loc`, StratifiedShuffleSplit) treats the frame as pandas;
#   * `logging` and `StratifiedShuffleSplit` were not in scope here;
#   * rows were selected with label-based `.loc` although `split()` yields
#     positional indices.
import logging

from sklearn.model_selection import StratifiedShuffleSplit

# Replace the dotted column name with a snake_case one. With pandas' default
# settings this is a no-op if the column was already renamed.
credit_data_frame = credit_data_frame.rename(
    columns={"default.payment.next.month": "default_payment_next_month"}
)

# No binning step is needed: the target column itself is passed to `split()`
# as the stratification label, so the train and test sets keep its class
# proportions.
logging.info("Splitting data into train and test")

strat_train_set = None
strat_test_set = None

spliter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in spliter.split(
    credit_data_frame, credit_data_frame["default_payment_next_month"]
):
    # Positional selection matches what `split()` returns, whatever the index.
    strat_train_set = credit_data_frame.iloc[train_index]
    strat_test_set = credit_data_frame.iloc[test_index]


In [1]:
import pandas as pd

In [3]:
import os

# Location of the raw credit-card CSV. The default is the original author's
# machine-specific path; set the CREDIT_CARD_CSV environment variable to load
# the file from anywhere else without editing the notebook.
CREDIT_CARD_CSV = os.environ.get(
    "CREDIT_CARD_CSV",
    r"C:\data science\Internship projects\credit card defaulters\Credit_card_default_prediction_with_mlflow\dataset\credit_card.csv",
)
df = pd.read_csv(CREDIT_CARD_CSV)

In [7]:

# Derive a new frame whose target column has a snake_case name; `df` itself
# is left untouched because `rename` returns a copy.
credit_data_frame = df.rename(
    {"default.payment.next.month": "default_payment_next_month"},
    axis="columns",
)


In [8]:
from sklearn.model_selection import StratifiedShuffleSplit

In [23]:
# Draw three stratified splits (20% test each) and print the row positions of
# every split. The two frames are overwritten on each pass, so after the loop
# they hold the LAST split only.
spliter = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
for train_index, test_index in spliter.split(
    credit_data_frame, credit_data_frame["default_payment_next_month"]
):
    # `split()` yields positional indices, so select with `.iloc`. The former
    # label-based `.loc` only agreed with it while the frame kept its default
    # 0..n-1 index.
    strat_train_set = credit_data_frame.iloc[train_index]
    strat_test_set = credit_data_frame.iloc[test_index]
    print(train_index, test_index)
  
                
                


[22788 29006 16950 ...  3794 27565 27126] [ 6907 24575 26766 ... 27309 29583 24399]
[  954 13480 25746 ...   660  3846 10070] [22733 16746 19484 ... 16549 21966  8794]
[22989   958  2999 ... 23212 10213  8786] [ 4096 26571 24117 ... 17655 21361  5514]


In [24]:
train_index.shape

(24000,)

In [25]:
test_index.shape

(6000,)

In [26]:
strat_train_set

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default_payment_next_month
22989,22990,30000.0,2,3,1,44,0,0,0,0,...,28780.0,28756.0,29566.0,1441.0,1597.0,1634.0,1000.0,1124.0,1223.0,0
958,959,350000.0,1,1,2,35,-1,-1,-1,0,...,16426.0,10833.0,2261.0,33891.0,16267.0,4027.0,234.0,1565.0,26346.0,0
2999,3000,50000.0,2,2,3,52,0,0,0,0,...,18443.0,20065.0,20296.0,1844.0,1292.0,804.0,5717.0,823.0,1299.0,0
19912,19913,30000.0,2,2,2,32,0,0,0,0,...,28885.0,29287.0,23012.0,1431.0,1579.0,1402.0,804.0,732.0,1820.0,1
28529,28530,20000.0,2,3,1,36,-1,-1,-1,2,...,626.0,626.0,776.0,626.0,1252.0,0.0,626.0,776.0,626.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16364,16365,230000.0,2,2,1,34,-1,-1,-1,-1,...,1390.0,1072.0,849.0,10998.0,1509.0,1390.0,0.0,849.0,3720.0,0
8407,8408,50000.0,2,1,2,28,2,2,2,2,...,32482.0,33807.0,34398.0,1500.0,1500.0,600.0,2000.0,1300.0,1500.0,0
23212,23213,500000.0,2,1,1,44,-1,-1,-1,-1,...,48858.0,66050.0,43769.0,26963.0,33169.0,48858.0,66050.0,43769.0,37509.0,0
10213,10214,220000.0,2,1,2,30,1,2,-1,-1,...,207.0,207.0,12890.0,0.0,339.0,500.0,0.0,12999.0,0.0,0


In [27]:
strat_test_set

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default_payment_next_month
4096,4097,200000.0,1,2,1,30,1,-2,-2,-2,...,736.0,736.0,316.0,736.0,736.0,736.0,736.0,316.0,1156.0,0
26571,26572,250000.0,2,3,1,53,1,-1,-1,-1,...,0.0,1762.0,0.0,3836.0,839.0,0.0,1762.0,0.0,0.0,0
24117,24118,500000.0,1,1,2,28,0,0,0,0,...,251573.0,300336.0,269964.0,7036.0,20042.0,150122.0,75182.0,11486.0,11162.0,0
19971,19972,130000.0,2,2,1,41,0,0,0,0,...,11036.0,10552.0,9934.0,1161.0,1187.0,1439.0,345.0,346.0,487.0,0
23469,23470,70000.0,2,3,1,31,0,0,0,2,...,15525.0,14772.0,15380.0,2000.0,3000.0,2000.0,0.0,1000.0,1000.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18364,18365,90000.0,2,1,2,23,0,0,0,0,...,25692.0,27364.0,24521.0,5000.0,1454.0,2000.0,2000.0,1500.0,680.0,0
2249,2250,550000.0,2,2,1,32,0,0,0,0,...,530672.0,155083.0,165975.0,22863.0,167622.0,14000.0,4300.0,158064.0,28840.0,1
17655,17656,130000.0,2,1,1,40,-1,-1,-1,-1,...,989.0,990.0,702.0,991.0,991.0,991.0,992.0,703.0,992.0,0
21361,21362,180000.0,1,2,1,70,0,0,0,0,...,16232.0,16958.0,17685.0,1724.0,2000.0,1300.0,1000.0,1000.0,792.0,0
