In [1]:
import os

In [2]:
%pwd

'/workspaces/-renci_Perfonmans_Tahminlemesi_MLOps_Deneme2/research'

In [3]:
# The notebook starts inside research/; step up to the project root only while we are
# still there, so re-running this cell does not keep climbing the directory tree.
if os.path.basename(os.getcwd()) == "research":
    os.chdir('../')

In [4]:
%pwd

'/workspaces/-renci_Perfonmans_Tahminlemesi_MLOps_Deneme2'

# Entity

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformConfig:
    """Immutable settings for the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the loaded configuration.
    """

    # Directory created for this stage before the config object is built.
    root_dir: Path
    # Destination handed to ``save_object`` for the fitted preprocessor.
    preprocessor_file: Path
    # CSV file that ``pd.read_csv`` loads as the raw dataset.
    data_path: Path

# Config

In [6]:
from src.ÖğrenciTahminleme.constants import * # constanst icerisinde degiskenleri import ettik
from src.ÖğrenciTahminleme.utils.common import read_yaml, create_directories # common.py icerisinde read_yaml ve create_directories methodlarini import ettik

In [7]:
class ConfigurationManager:
    """Load the project's YAML settings and build typed per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformConfig:
        """Return the settings of the ``data_transformation`` section.

        The stage's ``root_dir`` is passed to ``create_directories`` first.
        Every path-like value is converted to ``Path`` so the returned object
        matches its declared field types.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])
        return DataTransformConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
            preprocessor_file=Path(section.preprocessor_file),
        )

# Components

In [8]:
import os
import urllib.request as request
import zipfile
from src.ÖğrenciTahminleme import logger
from src.ÖğrenciTahminleme.utils.common import get_size

In [9]:
from src.ÖğrenciTahminleme.utils.common import save_object

In [10]:
import sys
from dataclasses import dataclass

import numpy as np 
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split

import os



In [11]:
class DataTransfrom:
    """Split the raw dataset, fit the preprocessing pipeline and persist it."""

    def __init__(self, config: DataTransformConfig):
        self.config = config

    def get_data_transformer_object(self):
        """Build the (unfitted) preprocessing ``ColumnTransformer``.

        Numerical columns are median-imputed and standard-scaled; categorical
        columns are imputed with the most frequent value, one-hot encoded and
        scaled without centering.

        Returns:
            ColumnTransformer: the unfitted preprocessor.
        """
        numerical_columns = ["writing_score", "reading_score"]
        categorical_columns = [
            "gender",
            "race_ethnicity",
            "parental_level_of_education",
            "lunch",
            "test_preparation_course",
        ]

        num_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ]
        )

        cat_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                # NOTE(review): the default encoder raises on categories unseen during
                # fit; consider handle_unknown="ignore" if that can happen at inference.
                ("one_hot_encoder", OneHotEncoder()),
                # Centering is disabled because the one-hot output is sparse by default.
                ("scaler", StandardScaler(with_mean=False)),
            ]
        )

        logger.info(f"Categorical columns: {categorical_columns}")
        logger.info(f"Numerical columns: {numerical_columns}")

        return ColumnTransformer(
            [
                ("num_pipeline", num_pipeline, numerical_columns),
                ("cat_pipelines", cat_pipeline, categorical_columns),
            ]
        )

    def initiate_data_transformation(self, test_size=0.25, random_state=42):
        """Split the data, fit the preprocessor on the training part and save it.

        Args:
            test_size: Share of rows held out for testing (0.25 is the
                ``train_test_split`` default, i.e. the previous behaviour).
            random_state: Seed for the split so repeated runs produce the same
                train/test partition. Pass ``None`` for an unseeded split.

        Returns:
            tuple: ``(train_arr, test_arr, preprocessor_file)`` where each array
            holds the transformed features with ``math_score`` appended as the
            last column.
        """
        data = pd.read_csv(self.config.data_path)
        train_df, test_df = train_test_split(
            data, test_size=test_size, random_state=random_state
        )

        logger.info("Read train and test data completed")

        logger.info("Obtaining preprocessing object")

        preprocessing_obj = self.get_data_transformer_object()

        target_column_name = "math_score"

        input_feature_train_df = train_df.drop(columns=[target_column_name])
        target_feature_train_df = train_df[target_column_name]

        input_feature_test_df = test_df.drop(columns=[target_column_name])
        target_feature_test_df = test_df[target_column_name]

        logger.info(
            "Applying preprocessing object on training dataframe and testing dataframe."
        )

        # Fit on the training split only; the test split is merely transformed.
        input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
        input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

        train_arr = np.c_[
            input_feature_train_arr, np.array(target_feature_train_df)
        ]
        test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

        save_object(
            file_path=self.config.preprocessor_file,
            obj=preprocessing_obj,
        )

        # Logged only after save_object returned, so the message is truthful.
        logger.info("Saved preprocessing object.")

        return (
            train_arr,
            test_arr,
            self.config.preprocessor_file,
        )

# Pipeline


In [12]:
# Run the data-transformation stage end to end. No try/except wrapper: re-raising the
# same exception added nothing, so failures simply propagate.
config = ConfigurationManager()  # holds the values read from the configuration files
data_transformation_config = config.get_data_transformation_config()  # settings of the data_transformation section
data_transformation = DataTransfrom(config=data_transformation_config)
# Bind the returned path to a real name; assigning to `_` would shadow IPython's
# last-output variable.
train_arr, test_arr, preprocessor_path = data_transformation.initiate_data_transformation()

[2025-01-17 22:26:11,511: INFO: 323158990: Read train and test data completed]
[2025-01-17 22:26:11,511: INFO: 323158990: Obtaining preprocessing object]
[2025-01-17 22:26:11,513: INFO: 323158990: Categorical columns: ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']]
[2025-01-17 22:26:11,514: INFO: 323158990: Numerical columns: ['writing_score', 'reading_score']]
[2025-01-17 22:26:11,516: INFO: 323158990: Applying preprocessing object on training dataframe and testing dataframe.]


[2025-01-17 22:26:11,540: INFO: 323158990: Saved preprocessing object.]


In [20]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings for the model-training stage.

    Frozen like ``DataTransformConfig`` so a stage cannot silently alter its
    own configuration after it has been built.
    """

    # Directory created for this stage before the config object is built.
    root_dir: Path
    # Taken from ``prep_data_path`` of the ``model_trainer`` config section.
    prep_data_path: Path
    # NOTE(review): ModelTrainer joins this with "best_model.pkl", i.e. treats it as a
    # directory despite the name; confirm against the YAML value.
    model_file: Path

In [14]:
from src.ÖğrenciTahminleme.constants import * # constanst icerisinde degiskenleri import ettik
from src.ÖğrenciTahminleme.utils.common import read_yaml, create_directories # common.py icerisinde read_yaml ve create_directories methodlarini import ettik

In [30]:
class ConfigurationManager:
    """Load the project's YAML settings and build typed per-stage config objects.

    This cell redefines the class declared earlier in the notebook. The
    data-transformation getter is kept here as well, so the earlier pipeline
    cell can still be re-run after this definition has replaced the first one.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> "DataTransformConfig":
        """Return the settings of the ``data_transformation`` section."""
        section = self.config.data_transformation
        create_directories([section.root_dir])
        return DataTransformConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
            preprocessor_file=Path(section.preprocessor_file),
        )

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Return the settings of the ``model_trainer`` section.

        The stage's ``root_dir`` is passed to ``create_directories`` first.
        Every path-like value is converted to ``Path`` so the returned object
        matches its declared field types.
        """
        section = self.config.model_trainer
        create_directories([section.root_dir])
        return ModelTrainerConfig(
            root_dir=Path(section.root_dir),
            prep_data_path=Path(section.prep_data_path),
            model_file=Path(section.model_path),
        )

In [31]:
import os

import sys
from catboost import CatBoostRegressor
from sklearn.ensemble import (
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor
)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score # Yüzdelik olarak modelin ne kadar doğru çalıştığını gösterir
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from src.ÖğrenciTahminleme import logger


from dataclasses import dataclass

 
from src.ÖğrenciTahminleme.utils.common import get_size,save_object,evaluate_model

In [40]:
import yaml
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score
from src.ÖğrenciTahminleme.utils.common import read_yaml  # Mevcut read_yaml fonksiyonunu kullanıyoruz

class ModelTrainer:
    """Fit several candidate regressors, keep the best one by test-set R2 and save it.

    The parameters file is looked up by model display name: a non-empty entry is
    used as the search space for ``RandomizedSearchCV``, an empty entry means the
    default estimator is fitted as-is, and a missing entry skips the model.
    """

    # Lowest test-set R2 the winning model may have before training counts as failed.
    MIN_ACCEPTABLE_R2 = 0.6

    def __init__(self, config: ModelTrainerConfig,params_filepath = PARAMS_FILE_PATH):
        """Store the stage config and load the hyper-parameter search spaces.

        Args:
            config: Settings for this stage; ``model_file`` is where the model is saved.
            params_filepath: YAML file with the per-model search spaces.
        """
        self.config = config
        # Read the path that was actually passed in, not the module-level default.
        self.params = read_yaml(params_filepath)

    def initiate_model_trainer(self, train_array, test_array):
        """Train all candidates, select the best and persist it.

        Args:
            train_array: 2-D array whose last column is the target.
            test_array: 2-D array with the same layout, used for scoring.

        Returns:
            tuple: ``(best_model, r2_square)`` - the fitted winning estimator and
            its R2 on the test split.

        Raises:
            ValueError: if no candidate model has an entry in the parameters file.
            Exception: if the best R2 is below ``MIN_ACCEPTABLE_R2``.
        """
        try:
            logger.info("Train ve test verisiyle model eğitimi başlatılıyor.")

            X_train, y_train = train_array[:, :-1], train_array[:, -1]
            X_test, y_test = test_array[:, :-1], test_array[:, -1]

            models = {
                "Random Forest": RandomForestRegressor(),
                "Decision Tree": DecisionTreeRegressor(),
                "Gradient Boosting": GradientBoostingRegressor(),
                "Linear Regression": LinearRegression(),
                "CatBoosting Regressor": CatBoostRegressor(verbose=False),
                "AdaBoost Regressor": AdaBoostRegressor(),
            }

            # Search spaces come from the parameters file loaded in __init__.
            params = self.params

            model_report = {}
            # RandomizedSearchCV fits clones, so the prototypes in `models` stay
            # unfitted; remember the estimator that was actually trained.
            fitted_models = {}
            for model_name, model in models.items():
                if model_name not in params:
                    logger.warning(f"{model_name} için params.yaml'de parametre bulunamadı.")
                    continue

                param_grid = params[model_name]
                if param_grid:  # a search space is defined for this model
                    logger.info(f"{model_name} için RandomizedSearchCV başlatılıyor.")
                    search = RandomizedSearchCV(
                        estimator=model,
                        param_distributions=param_grid,
                        scoring="r2",
                        n_iter=10,  # evaluate 10 random combinations
                        cv=3,  # 3-fold cross-validation
                        random_state=42,
                        n_jobs=-1
                    )
                    search.fit(X_train, y_train)
                    fitted = search.best_estimator_
                else:
                    logger.info(f"{model_name} için parametre yok, varsayılan model kullanılıyor.")
                    model.fit(X_train, y_train)
                    fitted = model

                fitted_models[model_name] = fitted
                model_report[model_name] = fitted.score(X_test, y_test)

            if not model_report:
                # Same exception type max() would raise on an empty report, but explicit.
                raise ValueError(
                    "Eğitilecek model bulunamadı: params.yaml hiçbir model adıyla eşleşmiyor."
                )

            # Pick the winner by test-set R2.
            best_model_name = max(model_report, key=model_report.get)
            best_model_score = model_report[best_model_name]
            best_model = fitted_models[best_model_name]

            if best_model_score < self.MIN_ACCEPTABLE_R2:
                raise Exception("Hiçbir model yeterli performansı sağlayamadı.")

            logger.info(f"En iyi model: {best_model_name} (R2: {best_model_score})")

            # Persist the fitted winner.
            save_object(
                file_path=self.config.model_file / "best_model.pkl",
                obj=best_model
            )

            predicted = best_model.predict(X_test)
            r2_square = r2_score(y_test, predicted)
            return best_model, r2_square

        except Exception as e:
            logger.error(f"Model eğitimi sırasında hata oluştu: {e}")
            raise

# Pipeline

In [41]:
# Run the model-training stage on the arrays produced by the data-transformation cell.
# No try/except wrapper: re-raising the same exception added nothing.
config = ConfigurationManager()  # holds the values read from the configuration files
model_trainer_config = config.get_model_trainer_config()  # settings of the model_trainer section
model_trainer = ModelTrainer(config=model_trainer_config)
# Keep the result in named variables so it is neither discarded nor echoed by the cell.
best_model, r2_square = model_trainer.initiate_model_trainer(train_arr, test_arr)

[2025-01-17 23:10:59,020: INFO: 3420584700: Train ve test verisiyle model eğitimi başlatılıyor.]
[2025-01-17 23:10:59,020: INFO: 3420584700: Random Forest için RandomizedSearchCV başlatılıyor.]




[2025-01-17 23:11:02,569: INFO: 3420584700: Decision Tree için RandomizedSearchCV başlatılıyor.]
[2025-01-17 23:11:02,743: INFO: 3420584700: Gradient Boosting için RandomizedSearchCV başlatılıyor.]




[2025-01-17 23:11:04,665: INFO: 3420584700: Linear Regression için parametre yok, varsayılan model kullanılıyor.]
[2025-01-17 23:11:04,667: INFO: 3420584700: CatBoosting Regressor için RandomizedSearchCV başlatılıyor.]
[2025-01-17 23:11:07,400: INFO: 3420584700: AdaBoost Regressor için RandomizedSearchCV başlatılıyor.]
[2025-01-17 23:11:11,267: INFO: 3420584700: En iyi model: Linear Regression (R2: 0.88122153283977)]
