In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

# Importing the data 

In [19]:
housing_df_train = pd.read_csv(r"C:\Users\SADEK COMPUTER\Desktop\Epita\01 - Semester 2\Data Science Production\Github Assignment\dsp-jimy-salem\Data - pw2\train.csv.xls")

In [8]:
housing_df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [9]:
unique_salecondition = housing_df_train["SaleCondition"].unique()
print (unique_salecondition)

['Normal' 'Abnorml' 'Partial' 'AdjLand' 'Alloca' 'Family']


In [11]:
housing_df_test = pd.read_csv(r"C:\Users\SADEK COMPUTER\Desktop\Epita\01 - Semester 2\Data Science Production\Github Assignment\dsp-jimy-salem\Data - pw2\test.csv.xls")

print (f"The housing test shape is {housing_df_test.shape}")
print(f"The housing train shape is {housing_df_train.shape}")

The housing test shape is (1459, 80)
The housing train shape is (1460, 81)


In [12]:
housing_df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [13]:
unique_street = housing_df_test["Street"].unique()
unique_street

array(['Pave', 'Grvl'], dtype=object)

In [14]:
column_names_test = housing_df_test.columns
column_names_train = housing_df_train.columns
#Train has more columns
print(column_names_train)
print()
diff_columns = [col for col in housing_df_train.columns if col not in housing_df_test.columns]
print(f"The train has a difference  of {diff_columns} from the test data")
# the indication here is that we need to predict the sale price using modeling for the train data and then compare it with the y  

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

# Checking Training Data 

To goal is to train the model on the training data to evaluate the prediction for the testing data and do the RMSE

In [17]:
#The housing train shape is (1460, 81)
null_count = housing_df_train.isnull().sum()
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
print (null_count)

#Avoid Alley, PoolQC, Fence, MiscFeature 

Id                  0
MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType        872
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
BsmtCond           37
BsmtExposure       38
BsmtFinType1       37
BsmtFinSF1          0
BsmtFinType2       38
BsmtFinSF2          0
BsmtUnfSF           0
TotalBsmtSF         0
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
1stFlrSF            0
2ndFlrSF            0
LowQualFin

In [25]:
X_trial = housing_df_train.copy()
# correlation_matrix = X_trial.corr()

y = X_trial["SalePrice"]
X_trial = X_trial.drop (["Id", "PoolQC", "Fence", "MiscFeature", "Alley", "FireplaceQu", "SalePrice"], axis=1)

X_trial = X_trial [["LotArea", "Neighborhood", "TotalBsmtSF", "GrLivArea","BldgType", "TotalBsmtSF","GarageArea" ]]
X_trial.head()

Unnamed: 0,LotArea,Neighborhood,TotalBsmtSF,GrLivArea,BldgType,TotalBsmtSF.1,GarageArea
0,8450,CollgCr,856,1710,1Fam,856,548
1,9600,Veenker,1262,1262,1Fam,1262,460
2,11250,CollgCr,920,1786,1Fam,920,608
3,9550,Crawfor,756,1717,1Fam,756,642
4,14260,NoRidge,1145,2198,1Fam,1145,836


In [26]:
X = X_trial.copy() #,"TotalBsmtSF", "SaleCondition"]]

neighborhood_values = X["Neighborhood"].unique()
print(neighborhood_values)
print()
building_type = X['BldgType'].unique()
print (X.columns)

['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']

Index(['LotArea', 'Neighborhood', 'TotalBsmtSF', 'GrLivArea', 'BldgType',
       'TotalBsmtSF', 'GarageArea'],
      dtype='object')


In [27]:
#sanity checking
zero_value = y [y<=0]
print(zero_value)
y.head()

Series([], Name: SalePrice, dtype: int64)


0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

# Train Test Split, OneHotEncoding,  Scaling & RMSLE Computing

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

categorical_features = ['Neighborhood', 'BldgType']
onehot_encoder = OneHotEncoder(drop='first', sparse=False)

X_train_categorical = onehot_encoder.fit_transform(X_train[categorical_features])
X_test_categorical = onehot_encoder.transform(X_test[categorical_features])

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(categorical_features, axis=1))
X_test_scaled = scaler.transform(X_test.drop(categorical_features, axis=1))

X_train_final = np.hstack((X_train_scaled, X_train_categorical))
X_test_final = np.hstack((X_test_scaled, X_test_categorical))

model = LinearRegression()
model.fit(X_train_final, y_train)

y_pred = model.predict(X_test_final)

y_pred = np.clip(y_pred, 0, None)




In [29]:
print (compute_rmsle(y_test,y_pred))

0.2


In [30]:
y_df = pd.DataFrame(y, columns=['SalePrice'])
y_pred_df = pd.DataFrame(y_pred, columns=['PredictedSalePrice'])
result_df = pd.concat([y_df, y_pred_df], axis=1)
X_train_df = pd.DataFrame(X_train_final)
X_test_df = pd.DataFrame(X_test_final)
combined_df = pd.concat([X_train_df, X_test_df])

combined_df.reset_index(drop=True, inplace=True)
result_df.reset_index(drop=True, inplace=True)

#It didn't work elseway
combined_df.columns = combined_df.columns.astype(str)
result_df.columns = result_df.columns.astype(str)

processed_df = pd.concat([combined_df, result_df], axis =1)
processed_df.to_parquet('C:/Users/SADEK COMPUTER/Desktop/Epita/01 - Semester 2/Data Science Production/Github Assignment/dsp-jimy-salem/notebooks/processed_df.parquet', index=False)

In [31]:
correctly_processed_df = pd.read_parquet('processed_df.parquet')
#None means the assertion was successful
print (pd.testing.assert_frame_equal(processed_df, correctly_processed_df))

None
