In [1]:
import pandas as pd

In [2]:
import os

# Remember where the kernel started so that re-running this cell does not
# climb one more directory level on every execution.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()

# Relative paths used below (e.g. "artifacts/...") are resolved from the
# parent of the directory the kernel was started in.
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

In [3]:
from sklearn.linear_model._base import LinearRegression

In [4]:
# Forward slashes work on every OS and avoid the invalid "\d" / "\H" escape
# sequences that a backslash-separated, non-raw string literal contains.
data = pd.read_csv("artifacts/data_ingestion/Housing.csv")

In [5]:
# Preview the first four rows to check the column names and raw values
data.head(4)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished


In [6]:
from sklearn.preprocessing import LabelEncoder

# String-valued columns (yes/no flags plus the furnishing level) that must be
# turned into integer codes before fitting a model
categorical_columns = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
    'furnishingstatus',
]

# Fit one encoder per column and keep it, so the code -> label mapping
# remains available later on
label_encoders = {
    name: LabelEncoder().fit(data[name]) for name in categorical_columns
}

# Replace each column's strings with the integer codes of its encoder
for name, encoder in label_encoders.items():
    data[name] = encoder.transform(data[name])


In [7]:
# One-hot encode furnishingstatus. The column was already label-encoded in the
# previous cell, so the dummy columns are named after the integer codes
# (furnishingstatus_1, furnishingstatus_2); drop_first=True omits code 0.
# NOTE: this replaces the source column, so the cell cannot simply be re-run.
data = pd.get_dummies(data, columns=['furnishingstatus'], drop_first=True)


In [8]:
# Feature matrix: every column except the target column "price"
X=data.drop(columns=["price"])

In [9]:
# Target vector: the "price" column
y=data["price"]

In [10]:
# Display the feature matrix (pandas truncates the middle rows)
X

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_1,furnishingstatus_2
0,7420,4,2,3,1,0,0,0,1,2,1,False,False
1,8960,4,4,4,1,0,0,0,1,3,0,False,False
2,9960,3,2,2,1,0,1,0,0,2,1,True,False
3,7500,4,2,2,1,0,1,0,1,3,1,False,False
4,7420,4,1,2,1,1,1,0,1,2,0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,3000,2,1,1,1,0,1,0,0,2,0,False,True
541,2400,3,1,1,0,0,0,0,0,0,0,True,False
542,3620,2,1,1,1,0,0,0,0,0,0,False,True
543,2910,3,1,1,0,0,0,0,0,0,0,False,False


In [11]:
# Display the target series (pandas truncates the middle rows)
y

0      13300000
1      12250000
2      12250000
3      12215000
4      11410000
         ...   
540     1820000
541     1767150
542     1750000
543     1750000
544     1750000
Name: price, Length: 545, dtype: int64

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training portion
reg = LinearRegression()
reg.fit(X_train, y_train)

# Report R^2 on the data the model saw and on the held-out rows
train_r2 = reg.score(X_train, y_train)
test_r2 = reg.score(X_test, y_test)
print("Training R^2 Score:", train_r2)
print("Testing R^2 Score:", test_r2)


Training R^2 Score: 0.6859438988560158
Testing R^2 Score: 0.6529242642153184


In [13]:
import pickle

In [32]:
# Serialize the fitted regressor to model.pkl in the current working directory
with open('model.pkl', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [19]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class PrepareTrainModelConfig:
    """Immutable settings for the model-training stage.

    Instances are frozen: attributes cannot be reassigned after construction.
    """

    # Working directory of this stage (created by ConfigurationManager)
    root_dir: Path
    # Destination file the trained model is pickled to
    base_model_path: Path


In [17]:
from src.house_reg.constant import *
from src.house_reg._utils.common import read_yaml,create_directories

In [31]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(self, config_file_path=CONFIG_PATH, params_file_path=PARAMS_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_file_path: Location of the main configuration YAML.
            params_file_path: Location of the parameters YAML.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_train_model_config(self) -> PrepareTrainModelConfig:
        """Return the training-stage settings from the ``prepare_base_model`` section.

        The stage's root directory is created if it does not exist yet.
        """
        section = self.config.prepare_base_model
        stage_root = section.root_dir

        # Make sure the stage directory is present before anything writes to it
        os.makedirs(stage_root, exist_ok=True)

        return PrepareTrainModelConfig(
            root_dir=Path(stage_root),
            base_model_path=Path(section.base_model_path),
        )


In [32]:
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model._base import LinearRegression
import pandas as pd
from sklearn.model_selection import train_test_split


In [33]:
class PrepareTrainModel:
    """Prepares the housing data, fits a linear regression and pickles it.

    Intended call order: ``data_set()`` first, then ``train_model()``.
    """

    # String-valued columns of the CSV; each one is label-encoded to integer
    # codes (furnishingstatus is label-encoded here, not one-hot encoded).
    CATEGORICAL_COLUMNS = [
        'mainroad', 'guestroom', 'basement', 'hotwaterheating',
        'airconditioning', 'prefarea', 'furnishingstatus',
    ]

    def __init__(self, config: PrepareTrainModelConfig = PrepareTrainModelConfig):
        """Store the stage configuration.

        Args:
            config: A ``PrepareTrainModelConfig`` instance. The default value
                is the config *class* itself and is kept only so the signature
                stays backward compatible; ``train_model`` needs a real
                instance because it reads ``config.base_model_path``.
        """
        self.config = config

        # Filled in by data_set()
        self.X = None
        self.y = None
        self.label_encoders = {}

    def data_set(self, data_path="artifacts/data_ingestion/Housing.csv"):
        """Load the housing CSV and build the feature matrix and target.

        Sets ``self.X`` (all columns except ``price``), ``self.y`` (the
        ``price`` column) and ``self.label_encoders`` (one fitted
        ``LabelEncoder`` per categorical column).

        Args:
            data_path: CSV file to read. The default is relative to the current
                working directory and uses forward slashes so it works on
                every OS.
        """
        data = pd.read_csv(data_path)

        encoders = {}
        for col in self.CATEGORICAL_COLUMNS:
            le = LabelEncoder()
            data[col] = le.fit_transform(data[col])
            encoders[col] = le  # keep the fitted encoder for later reuse

        self.label_encoders = encoders
        self.X = data.drop(columns=["price"])
        self.y = data["price"]

    def train_model(self):
        """Fit a LinearRegression and pickle it to ``config.base_model_path``.

        The model is trained on the 80% training part of a fixed-seed split of
        the data prepared by ``data_set()``; the held-out 20% is not evaluated
        here.

        Raises:
            RuntimeError: If ``data_set()`` has not been called yet.
        """
        if getattr(self, "X", None) is None or getattr(self, "y", None) is None:
            raise RuntimeError("data_set() must be called before train_model()")

        # Use the data prepared by this instance, not notebook-level globals
        X_train, _, y_train, _ = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42
        )

        reg = LinearRegression().fit(X_train, y_train)

        with open(str(self.config.base_model_path), 'wb') as f:
            pickle.dump(reg, f)



#### Pipeline


In [34]:
# Run the training stage end to end: load configuration, prepare the data,
# then fit and persist the model. No try/except here: any failure should
# surface directly with its original traceback.
config=ConfigurationManager()
prepare_train_config=config.get_train_model_config()
prepare_train=PrepareTrainModel(config=prepare_train_config)
prepare_train.data_set()
prepare_train.train_model()

[2024-12-27 02:10:04,055: INFO: common: yaml file:config\config.yaml loaded successfully]
[2024-12-27 02:10:04,058: INFO: common: yaml file:params.yaml loaded successfully]
[2024-12-27 02:10:04,059: INFO: common: created directory:artifacts]
