In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
import joblib
import os

In [2]:
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

# Importing the data 

In [3]:
housing_df_train = pd.read_csv(r"C:\Users\SADEK COMPUTER\Desktop\Epita\01 - Semester 2\Data Science Production\Github Assignment\dsp-jimy-salem\data\train.csv.xls")

In [4]:
housing_df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
unique_salecondition = housing_df_train["SaleCondition"].unique()
print (unique_salecondition)

['Normal' 'Abnorml' 'Partial' 'AdjLand' 'Alloca' 'Family']


In [6]:
housing_df_test = pd.read_csv(r"C:\Users\SADEK COMPUTER\Desktop\Epita\01 - Semester 2\Data Science Production\Github Assignment\dsp-jimy-salem\data\test.csv.xls")

print (f"The housing test shape is {housing_df_test.shape}")
print(f"The housing train shape is {housing_df_train.shape}")

The housing test shape is (1459, 80)
The housing train shape is (1460, 81)


In [7]:
housing_df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [8]:
unique_street = housing_df_test["Street"].unique()
unique_street

array(['Pave', 'Grvl'], dtype=object)

In [9]:
column_names_test = housing_df_test.columns
column_names_train = housing_df_train.columns
#Train has more columns
print(column_names_train)
print()
diff_columns = [col for col in housing_df_train.columns if col not in housing_df_test.columns]
print(f"The train has a difference  of {diff_columns} from the test data")
# the indication here is that we need to predict the sale price using modeling for the train data and then compare it with the y  

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

# Checking Training Data 

To goal is to train the model on the training data to evaluate the prediction for the testing data and do the RMSE

In [10]:
#The housing train shape is (1460, 81)
null_count = housing_df_train.isnull().sum()
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
print (null_count)

#Avoid Alley, PoolQC, Fence, MiscFeature 

Id                  0
MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType        872
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
BsmtCond           37
BsmtExposure       38
BsmtFinType1       37
BsmtFinSF1          0
BsmtFinType2       38
BsmtFinSF2          0
BsmtUnfSF           0
TotalBsmtSF         0
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
1stFlrSF            0
2ndFlrSF            0
LowQualFin

In [11]:
X_trial = housing_df_train.copy()

y = X_trial["SalePrice"]
X_trial = X_trial.drop (["Id", "PoolQC", "Fence", "MiscFeature", "Alley", "FireplaceQu", "SalePrice"], axis=1)

X_trial = X_trial [["LotArea", "Neighborhood", "TotalBsmtSF", "GrLivArea","BldgType","GarageArea"]]
X_trial.head()

print (X_trial.shape)

(1460, 6)


In [12]:
X = X_trial.copy() #,"TotalBsmtSF", "SaleCondition"]]

neighborhood_values = X["Neighborhood"].unique()
print(neighborhood_values)
print()
building_type = X['BldgType'].unique()
print (X.columns)

['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']

Index(['LotArea', 'Neighborhood', 'TotalBsmtSF', 'GrLivArea', 'BldgType',
       'GarageArea'],
      dtype='object')


In [13]:
#sanity checking
zero_value = y [y<=0]
print(zero_value)
y.head()

Series([], Name: SalePrice, dtype: int64)


0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

# Model Building

## Model Training

In [14]:
path = r'C:\Users\SADEK COMPUTER\Desktop\Epita\01 - Semester 2\Data Science Production\Github Assignment\dsp-jimy-salem\models'

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

categorical_features = ['Neighborhood', 'BldgType']
onehot_encoder = OneHotEncoder(drop='first', sparse=False)

X_train_categorical = onehot_encoder.fit_transform(X_train[categorical_features])
joblib.dump (onehot_encoder, os.path.join (path, "one_hot_encoder.joblib"))

X_test_categorical = onehot_encoder.transform(X_test[categorical_features])

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(categorical_features, axis=1))
X_test_scaled = scaler.transform(X_test.drop(categorical_features, axis=1))
joblib.dump (scaler, os.path.join (path,"scaler.joblib"))

X_train_final = np.hstack((X_train_scaled, X_train_categorical))
X_test_final = np.hstack((X_test_scaled, X_test_categorical))

print(X_train_final.shape)

model = LinearRegression()
model.fit(X_train_final, y_train)

#first joblib
joblib.dump (model, os.path.join (path,'model.joblib'))

y_pred = model.predict(X_test_final)

y_pred = np.clip(y_pred, 0, None)


(1168, 32)




In [16]:
print (compute_rmsle(y_test,y_pred))

0.2


In [17]:
y_df = pd.DataFrame(y, columns=['SalePrice'])
y_pred_df = pd.DataFrame(y_pred, columns=['PredictedSalePrice'])
result_df = pd.concat([y_df, y_pred_df], axis=1)
X_train_df = pd.DataFrame(X_train_final)
X_test_df = pd.DataFrame(X_test_final)
combined_df = pd.concat([X_train_df, X_test_df])

combined_df.reset_index(drop=True, inplace=True)
result_df.reset_index(drop=True, inplace=True)

#It didn't work elseway
combined_df.columns = combined_df.columns.astype(str)
result_df.columns = result_df.columns.astype(str)

processed_df = pd.concat([combined_df, result_df], axis =1)
processed_df.to_parquet('C:/Users/SADEK COMPUTER/Desktop/Epita/01 - Semester 2/Data Science Production/Github Assignment/dsp-jimy-salem/notebooks/processed_df.parquet', index=False)

In [18]:
correctly_processed_df = pd.read_parquet('processed_df.parquet')
#None means the assertion was successful
print (pd.testing.assert_frame_equal(processed_df, correctly_processed_df))

FileNotFoundError: [Errno 2] No such file or directory: 'processed_df.parquet'

## Model Evaluation/Model Inference

Due to the fact my code is not well structured from the begining, I will add both ideas and proceed with models folder and joblib

In [None]:
print (len(y))

1460


In [None]:
#"LotArea", "Neighborhood", "TotalBsmtSF", "GrLivArea","BldgType", "TotalBsmtSF","GarageArea"
feature_chosen_test = housing_df_test[["LotArea" , "Neighborhood", "TotalBsmtSF", "GrLivArea","BldgType","GarageArea"]]

In [None]:
null_counts = feature_chosen_test.isnull().sum()
print(null_counts)

LotArea         0
Neighborhood    0
TotalBsmtSF     1
GrLivArea       0
BldgType        0
GarageArea      1
dtype: int64


In [None]:
feature_chosen_test = feature_chosen_test.dropna()

In [None]:
recount_null = feature_chosen_test.isnull().sum()
print(recount_null)

print (feature_chosen_test.shape)

LotArea         0
Neighborhood    0
TotalBsmtSF     0
GrLivArea       0
BldgType        0
GarageArea      0
dtype: int64
(1457, 6)


In [None]:
columns_to_encode = ['Neighborhood', 'BldgType']
encoder_unloaded = joblib.load(os.path.join (path, 'one_hot_encoder.joblib'))

encoded_features = encoder_unloaded.fit_transform(feature_chosen_test[columns_to_encode])
categories = encoder_unloaded.categories_

encoded_columns = [f"{column}_{category}" for column, category_list in zip(columns_to_encode, categories) for category in category_list[1:]]

encoded_df = pd.DataFrame(encoded_features, columns=encoded_columns)

encoded_df.head()
features = feature_chosen_test.copy()

feature_chosen_test.drop(columns=columns_to_encode, inplace=True)
feature_chosen_test = pd.concat([feature_chosen_test, encoded_df], axis=1)

In [None]:
model_unload = joblib.load(os.path.join (path, 'model.joblib'))# for the sake of joblib
feature_chosen_test.dropna(inplace=True)

scaler_unloaded = joblib.load(os.path.join(path, 'scaler.joblib'))

print (feature_chosen_test.shape)
y_pred_test = model_unload.predict(scaler_unloaded.fit_transform(feature_chosen_test))#scaler_unloaded.fit_transform(feature_chosen_test))

y_pred_test = np.abs(y_pred_test)

print(len(y_pred_test))


(1455, 32)
1455


In [None]:
#5 were removed, if they were critical points it will afect 
y_truncated = y[:len(y_pred_test)]

rmsle = compute_rmsle(y_truncated, y_pred_test)
print(rmsle)

1.01


# Code Refactoring

## Preprocessing Function

Idea behind the code:
1. Feature_selection will get the whole original data and return:
    
    train data: the dataset will remove all the null values containing the features chosen
    
    test data: Only the needed features without null values 

Now this will clearly provide a dataset with only the features to continue the whole other part 

2. Train_split_func will be passed only the needed data after training and splitting:

    Only Conducted for the training Data: train test split after choosing the y (Target variable)

3. Hot Encoding Function: this will hot encode categorical features after being provided the dataframe containing only the function 

    Train Data: train, test split will be done followed by hot encoding 

    Test Data: After having the selected features we will be doing a hot encoding over categorical variables 

4. Scaling: Similar to hot encoding step but with scaling

For Train Data: we can simply only do the encoding and testing 



In [None]:
def feature_selection (data: pd.DataFrame, is_test: bool, features: list, y_out: list):
    #in our case features gonna be ["LotArea", "Neighborhood", "TotalBsmtSF", "GrLivArea", "BldgType", "GarageArea"]
    #y is gonna be ["SalePrice"]

    if is_test == False: #this is the train data with sales price
        feature_selection = data[features + y_out]
        feature_selection.dropna(subset=features, inplace=True)
        X = feature_selection [features]
        y = feature_selection [y_out]
        return pd.concat ([X, y], axis = 1)
    else:
        feature_selection = data[features]
        feature_selection.dropna(subset=features, inplace=True)
    return feature_selection

def train_split_func(data: pd.DataFrame, is_test: bool, y_out: list):
    if is_test == False:
        y = data[y_out]
        X = data.drop(columns=y_out, axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        return X_train, X_test, y_train, y_test
    else:
        pass

def hot_encoding_func(data: pd.DataFrame, is_test: bool, categorical_features: list, y_out: list):
    onehot_encoder = joblib.load(os.path.join(path, 'one_hot_encoder.joblib'))
    if is_test == False:
        X_train, X_test, y_train, y_test = train_split_func(data, is_test, y_out)
        X_train_categorical = onehot_encoder.fit_transform(X_train[categorical_features])
        X_test_categorical = onehot_encoder.transform(X_test[categorical_features])
        return X_train_categorical, X_test_categorical, y_train, y_test
    else:
        if categorical_features:
            encoded_features = onehot_encoder.transform(data[categorical_features])
            categories = onehot_encoder.categories_
            encoded_columns = [f"{column}_{category}" for column, category_list in zip(categorical_features, categories) for category in category_list[1:]]
            encoded_df = pd.DataFrame(encoded_features, columns=encoded_columns)
            data.drop(columns=categorical_features, inplace=True)
            return pd.concat([data, encoded_df], axis=1)
        else:
            return data

def scaling_func (data: pd.DataFrame, is_test: bool, categorical_features: list, y_out: list):
    scaler = joblib.load(os.path.join (path, 'scaler.joblib'))
    #categorical_features = ['Neighborhood', 'BldgType']
    if is_test == False:
        X_train, X_test, y_train, y_test = train_split_func (data, is_test, y_out)
        X_train_scaled = scaler.fit_transform(X_train.drop(categorical_features, axis=1))
        X_test_scaled = scaler.transform(X_test.drop(categorical_features, axis=1))
        return X_train_scaled, X_test_scaled, y_train, y_test
    else:
        data.dropna(inplace=True)
        data = scaler.fit_transform(data)
        return data



# Before Using the Preprocessing Functions

## Building Model

In this step, it is obvious that the train training set is given

In [None]:
def build_model(data: pd.DataFrame) -> dict[str, str]:
    result = {}
    #this step is crucial to drop all the NA 
    feature_selection = data[["LotArea", "Neighborhood", "TotalBsmtSF", "GrLivArea", "BldgType", "GarageArea", "SalePrice"]]
    feature_selection.dropna(subset=["LotArea", "Neighborhood", "TotalBsmtSF", "GrLivArea", "BldgType", "GarageArea"], inplace=True)

    X = feature_selection [["LotArea", "Neighborhood", "TotalBsmtSF", "GrLivArea","BldgType","GarageArea"]]
    y = feature_selection [["SalePrice"]]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    #encoding 
    onehot_encoder = joblib.load(os.path.join (path, 'one_hot_encoder.joblib'))
    categorical_features = ['Neighborhood', 'BldgType']
    X_train_categorical = onehot_encoder.fit_transform(X_train[categorical_features])
    X_test_categorical = onehot_encoder.transform(X_test[categorical_features])
    
    #scaling 
    scaler = joblib.load(os.path.join (path, 'scaler.joblib'))
    X_train_scaled = scaler.fit_transform(X_train.drop(categorical_features, axis=1))
    X_test_scaled = scaler.transform(X_test.drop(categorical_features, axis=1))

    #finalizing, fitting and predicting  
    X_train_final = np.hstack((X_train_scaled, X_train_categorical))
    X_test_final = np.hstack((X_test_scaled, X_test_categorical))

    model = joblib.load(os.path.join (path, 'model.joblib'))
    model.fit(X_train_final, y_train)
    y_pred = model.predict(X_test_final)
    y_pred = np.clip(y_pred, 0, None)

    result['rmse'] = compute_rmsle(y_test, y_pred)

    return result


In [None]:
#let's try it
print (build_model (housing_df_train))

{'rmse': 0.2}


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_selection.dropna(subset=["LotArea", "Neighborhood", "TotalBsmtSF", "GrLivArea", "BldgType", "GarageArea"], inplace=True)


## make_prediction

Here the test data should go in

In [None]:
def make_prediction(input_data: pd.DataFrame) -> np.ndarray:
    #Choosing features
    feature_chosen_test = housing_df_test[["LotArea" , "Neighborhood", "TotalBsmtSF", "GrLivArea","BldgType","GarageArea"]]
    feature_chosen_test = feature_chosen_test.dropna()

    #encoding
    columns_to_encode = ['Neighborhood', 'BldgType']
    encoder_unloaded = joblib.load(os.path.join (path, 'one_hot_encoder.joblib'))
    encoded_features = encoder_unloaded.fit_transform(feature_chosen_test[columns_to_encode])
    categories = encoder_unloaded.categories_
    encoded_columns = [f"{column}_{category}" for column, category_list in zip(columns_to_encode, categories) for category in category_list[1:]]
    encoded_df = pd.DataFrame(encoded_features, columns=encoded_columns)
    feature_chosen_test.drop(columns=columns_to_encode, inplace=True)
    feature_chosen_test = pd.concat([feature_chosen_test, encoded_df], axis=1)

    #Scaling and Model fitting
    model_unload = joblib.load(os.path.join (path, 'model.joblib'))

    feature_chosen_test.dropna(inplace=True)
    scaler_unloaded = joblib.load(os.path.join(path, 'scaler.joblib'))
    feature_chosen_test_transform = scaler_unloaded.fit_transform(feature_chosen_test)
    y_pred_test = model_unload.predict(feature_chosen_test_transform)#scaler_unloaded.fit_transform(feature_chosen_test))
    y_pred_test = np.abs(y_pred_test)
    
    return y_pred_test


In [None]:
#Let's try it 
print (make_prediction(housing_df_test)[0:11])

[188350.1717458  210449.72746717 281556.7130024  276820.2725978
 744162.50142964 274250.47099252 253917.21574973 259991.04374094
 272361.17691418 176165.34572331 125994.7914119 ]


# After Using the preprocessing Functions

## Build Model

In [None]:
def build_model_refactored(data: pd.DataFrame) -> dict[str, str]:
    result = {}
    features_list = ["LotArea", "Neighborhood", "TotalBsmtSF", "GrLivArea", "BldgType", "GarageArea", "SalePrice"]
    y = ["SalePrice"]
    categorical_features = ["Neighborhood", "BldgType"]
    is_test = False
    new_data = feature_selection(data, is_test, features_list, y)
    X_train_categorical, X_test_categorical, y_train, y_test = hot_encoding_func(new_data, is_test, categorical_features, y)
    X_train_scaled, X_test_scaled, _, _ = scaling_func(new_data, is_test, categorical_features, y)

    X_train_final = np.hstack((X_train_scaled, X_train_categorical))
    X_test_final = np.hstack((X_test_scaled, X_test_categorical))

    model = joblib.load(os.path.join(path, 'model.joblib'))
    model.fit(X_train_final, y_train)
    y_pred = model.predict(X_test_final)
    y_pred = np.clip(y_pred, 0, None)

    result['rmse'] = compute_rmsle(y_test, y_pred)

    return result

In [None]:
print (build_model_refactored (housing_df_train))

{'rmse': 0.2}


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_selection.dropna(subset=features, inplace=True)


## Making Prediction

In [None]:
def make_prediction_refactored(input_data: pd.DataFrame) -> np.ndarray:
    features_list = ["LotArea", "Neighborhood", "TotalBsmtSF", "GrLivArea", "BldgType", "GarageArea"]
    categorical_features = ["Neighborhood", "BldgType"]
    is_test = True
 
    # Feature selection
    new_data = feature_selection(input_data, is_test, features_list, [])
    # Encoding
    new_data = hot_encoding_func(new_data, is_test, categorical_features, [])
    # Scaling
    new_data = scaling_func(new_data, is_test, categorical_features, [])
 
    model_unload = joblib.load(os.path.join(path, 'model.joblib'))
    y_pred_test = model_unload.predict(new_data)
    y_pred_test = np.abs(y_pred_test)
 
    return y_pred_test

    

In [None]:
print (make_prediction_refactored(housing_df_test)[0:11])

[188350.1717458  210449.72746717 281556.7130024  276820.2725978
 744162.50142964 274250.47099252 253917.21574973 259991.04374094
 272361.17691418 176165.34572331 125994.7914119 ]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_selection.dropna(subset=features, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=categorical_features, inplace=True)
