In [1]:
import kaggle
import os
import pandas as pd

In [2]:
# Show the kernel's starting directory; the relative paths used in later cells
# are resolved against the current working directory.
os.getcwd()

'/home/jatin/Projects/customer_churn_prediction/resarch'

In [3]:
# Step up to the project root only while the kernel is still inside the
# notebook folder. Without this guard, re-running the cell climbs one
# directory higher each time and silently breaks every relative path below.
# config/config.yaml is used as the project-root marker because the
# configuration loader resolves it relative to the working directory.
if not os.path.exists("config/config.yaml"):
    os.chdir("../")

In [4]:
# Confirm the working directory after the os.chdir() call in the previous cell.
os.getcwd()

'/home/jatin/Projects/customer_churn_prediction'

In [5]:
# Load the churn CSV from the data/ folder (the path is relative to the
# current working directory) and preview the first rows.
df = pd.read_csv('data/telco_customer_churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
# Data Ingestion Entity

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """
    Immutable (frozen) container for the data ingestion settings.

    Instances are built by ConfigurationManager.get_data_ingestion_config()
    from the ``data_ingestion`` section of the loaded configuration and are
    read by DataIngestion.download_file().
    """
    # Directory that get_data_ingestion_config() passes to create_directory().
    root_dir: Path
    # First argument given to kaggle.api.dataset_download_file().
    # Presumably the "owner/dataset" identifier - TODO confirm against config.yaml.
    kaggle_dataset: str
    # Second argument given to kaggle.api.dataset_download_file()
    # (the file requested from that dataset).
    file: str
    # Path checked with os.path.exists() to decide whether a download is needed.
    local_data_file: str
    # Third argument given to kaggle.api.dataset_download_file()
    # (the directory the download is written to).
    data_dir: Path

In [None]:
# Configuration manager

from customer_churn_prediction.constants import CONFIG_FILE_PATH, SCHEMA_FILE_PATH, PARAMS_FILE_PATH
from customer_churn_prediction.utils.common import read_yaml, create_directory

class ConfigurationManager:
    """
    Loads the project's config, schema and params YAML files and builds
    the typed configuration objects used by the pipeline stages.
    """
    def __init__(
            self,
            config_path=CONFIG_FILE_PATH,
            schema_path=SCHEMA_FILE_PATH,
            params_path=PARAMS_FILE_PATH):
        """
        Read the three YAML files and create the artifacts root directory.

        Args:
            config_path: Location of the main configuration YAML.
            schema_path: Location of the schema YAML.
            params_path: Location of the params YAML.
        """
        # Keep this read order: each read_yaml call logs as it loads.
        self.config = read_yaml(config_path)
        self.schema = read_yaml(schema_path)
        self.params = read_yaml(params_path)

        artifacts_root = self.config.artifacts_root
        create_directory([artifacts_root])

    def get_data_ingestion_config(self)-> DataIngestionConfig:
        """
        Build the DataIngestionConfig from the ``data_ingestion`` section.

        The section's root directory is created as a side effect before
        the configuration object is returned.

        Returns:
            DataIngestionConfig populated from the loaded configuration.
        """
        ingestion_cfg = self.config.data_ingestion
        create_directory([ingestion_cfg.root_dir])

        return DataIngestionConfig(
            root_dir=ingestion_cfg.root_dir,
            kaggle_dataset=ingestion_cfg.kaggle_dataset,
            file=ingestion_cfg.file,
            local_data_file=ingestion_cfg.local_data_file,
            data_dir=ingestion_cfg.data_dir,
        )


In [9]:
# Define the component

from customer_churn_prediction import logger
from customer_churn_prediction.utils.common import get_size

class DataIngestion:
    """
    Data ingestion component: fetches the configured dataset file through
    the Kaggle API when it is not already present locally.

    Attributes:
        config: DataIngestionConfig supplying the dataset identifier, the
            file name, the download directory and the local file path.
    """
    def __init__(self, config: DataIngestionConfig):
        self.config = config

    def download_file(self):
        """
        Download the configured file unless ``local_data_file`` already exists.

        When the file exists, its size is logged and nothing is downloaded.
        Otherwise ``kaggle.api.dataset_download_file`` is called with the
        configured dataset, file name and target directory, and the outcome
        is logged.

        Returns:
            None.

        Raises:
            Exception: Any error raised during the check or the download is
                logged with its traceback and re-raised unchanged.
        """
        local_file = self.config.local_data_file
        try:
            # NOTE(review): existence is checked at local_data_file while the
            # download is written under data_dir - confirm that both settings
            # point at the same file, otherwise this guard never short-circuits.
            if os.path.exists(local_file):
                logger.info(
                    f"File: {local_file} "
                    f"already exists of size {get_size(local_file)}"
                )
                return

            res = kaggle.api.dataset_download_file(
                self.config.kaggle_dataset,
                self.config.file,
                self.config.data_dir
            )
            if res:
                logger.info(
                    f"File: {local_file} downloaded successfully")
            else:
                # A falsy result means no file was fetched; report it above
                # INFO so it is not mistaken for normal progress output.
                logger.warning(
                    f"File: {local_file} failed to download")
        except Exception:
            # logger.exception records the active traceback itself, so the
            # exception does not need to be bound to a name.
            logger.exception(
                f"Exception occurred while downloading the file: "
                f"{local_file}"
            )
            raise


In [10]:
# Create the pipeline

from customer_churn_prediction import logger

# Build the configuration, then run the download step. Any failure is logged
# with its traceback and re-raised so the cell still stops on error.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(data_ingestion_config)
    data_ingestion.download_file()
except Exception:
    # Plain string: the message has no placeholders, and logger.exception
    # already attaches the active traceback.
    logger.exception("Exception occurred while running the data ingestion pipeline")
    raise

[2025-09-27 12:26:03,956]:INFO:common.py:Yaml file: config/config.yaml is loaded successfully
[2025-09-27 12:26:03,958]:INFO:common.py:Yaml file: schema.yaml is loaded successfully
[2025-09-27 12:26:03,960]:INFO:common.py:Yaml file: params.yaml is loaded successfully
[2025-09-27 12:26:03,960]:INFO:common.py:Directory created at: artifacts
[2025-09-27 12:26:03,961]:INFO:common.py:Directory created at: artifacts/data_ingstion
Dataset URL: https://www.kaggle.com/datasets/palashfendarkar/wa-fnusec-telcocustomerchurn
[2025-09-27 12:26:08,891]:INFO:465592512.py:File: tcblastchar/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv downloaded successfully
