In [16]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
# # src/utils/constants.py

embedding_dict = {
    'mainroad': {'yes': 1, 'no': 0},
    'guestroom': {'yes': 1, 'no': 0},
    'basement': {'yes': 1, 'no': 0},
    'hotwaterheating': {'yes': 1, 'no': 0},
    'airconditioning': {'yes': 1, 'no': 0},
    'prefarea': {'yes': 1, 'no': 0},
    'furnishingstatus': {'furnished': 2, 'semi-furnished': 1, 'unfurnished': 0},
    # 'parking': {'yes': 1, 'no': 0}  # Add this line
}

def handle_missing_values(data):
    """
    Handle missing values in the dataset.

    Parameters:
    - data: DataFrame, input dataset.

    Returns:
    - DataFrame, dataset after handling missing values.
    """

    return data.dropna()


def encode_categorical_variables(data, categorical_columns):
    """
    Encode categorical variables in the dataset using the provided embedding dictionary.

    Parameters:
    - data: DataFrame, input dataset.
    - categorical_columns: list, names of categorical columns.
    - embedding_dict: dict, embedding dictionary for categorical values.

    Returns:
    - DataFrame, dataset after encoding.
    """
    # Replace categorical values with numerical representations from embedding_dict
    for column in categorical_columns:
        if column in embedding_dict:
            data[column] = data[column].map(embedding_dict[column])
    return data


def split_dataset(data, target_column, test_size=0.2, random_state=42):
    """
    Split the dataset into training and testing sets.

    Parameters:
    - data: DataFrame, input dataset.
    - target_column: str, the name of the target column.
    - test_size: float, the proportion of the dataset to include in the test split.
    - random_state: int, seed for random number generation.

    Returns:
    - tuple, (X_train, X_test, y_train, y_test).
    """
    X = data.drop(columns=[target_column])
    y = data[target_column]
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

def save_processed_data(data, file_path='data/processed_data.csv'):
    """
    Save the processed dataset to a CSV file.

    Parameters:
    - data: DataFrame, processed dataset.
    - file_path: str, path to the output CSV file.

    Returns:
    - None
    """
    data.to_csv(file_path, index=False)

raw_data = pd.read_csv('../data/raw_data.csv')
raw_data
processed_data = handle_missing_values(raw_data)
processed_data = encode_categorical_variables(processed_data, categorical_columns=['mainroad', 'guestroom','basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus'])

processed_data
X_train, X_test, y_train, y_test = split_dataset(processed_data, target_column='price')
save_processed_data(processed_data, file_path='../  data/processed_data.csv')



OSError: Cannot save file into a non-existent directory: '../  data'

In [20]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the processed data from data/processed_data.csv
processed_data_path = '../data/processed_data.csv'
df = pd.read_csv(processed_data_path)

# Perform feature engineering tasks
# Example: Standard Scaling on numerical features
numerical_features = [
    'price', 'area', 'bedrooms'	,'bathrooms'	,'stories', 'mainroad','guestroom', 'basement',
    'hotwaterheating', 'airconditioning','parking' , 'prefarea','furnishingstatus']  # Replace with your actual feature names
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# You can add more feature engineering steps as needed

# Save the updated dataframe to data/processed_data.csv
df.to_csv(processed_data_path, index=False)
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,4.566365,1.046726,1.403419,1.421812,1.378217,0.405623,-0.465315,-0.734539,-0.219265,1.472618,1.517692,1.804941,1.406286
1,4.004484,1.757010,1.403419,5.405809,2.532024,0.405623,-0.465315,-0.734539,-0.219265,1.472618,2.679409,-0.554035,1.406286
2,4.004484,2.218232,0.047278,1.421812,0.224410,0.405623,-0.465315,1.361397,-0.219265,-0.679063,1.517692,1.804941,0.091662
3,3.985755,1.083624,1.403419,1.421812,0.224410,0.405623,-0.465315,1.361397,-0.219265,1.472618,2.679409,1.804941,1.406286
4,3.554979,1.046726,1.403419,-0.570187,0.224410,0.405623,2.149083,1.361397,-0.219265,1.472618,1.517692,-0.554035,1.406286
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,-1.576868,-0.991879,-1.308863,-0.570187,-0.929397,0.405623,-0.465315,1.361397,-0.219265,-0.679063,1.517692,-0.554035,-1.222962
541,-1.605149,-1.268613,0.047278,-0.570187,-0.929397,-2.465344,-0.465315,-0.734539,-0.219265,-0.679063,-0.805741,-0.554035,0.091662
542,-1.614327,-0.705921,-1.308863,-0.570187,-0.929397,0.405623,-0.465315,-0.734539,-0.219265,-0.679063,-0.805741,-0.554035,-1.222962
543,-1.614327,-1.033389,0.047278,-0.570187,-0.929397,-2.465344,-0.465315,-0.734539,-0.219265,-0.679063,-0.805741,-0.554035,1.406286
