In [12]:
# modules we'll use
import random
import pandas as pd
import numpy as np
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd
from sklearn.ensemble import RandomForestRegressor  # for regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error


# read in all our data
HPP_data = pd.read_csv("../train.csv")

# set seed for reproducibility
np.random.seed(0) 

# FIND MISSING VALUE-PERCENTAGE BEFORE CLEANING
# get the number of missing data points per column
missing_values_count = HPP_data.isnull().sum()
# how many total missing values do we have?
total_cells = np.product(HPP_data.shape)
total_missing = missing_values_count.sum()
# percent of data that is missing
percent_missing = (total_missing/total_cells) * 100
print(percent_missing)

# CLEANING
# Mean
HPP_data["MasVnrArea"].fillna(HPP_data["MasVnrArea"].mean(),inplace=True)

# Mode
HPP_data['MasVnrType'] = HPP_data['MasVnrType'].fillna(HPP_data['MasVnrType'].mode()[random.randint(0, len(pd.Series(HPP_data['MasVnrType']).mode()) - 1)])
HPP_data['Electrical'] = HPP_data['Electrical'].fillna(HPP_data['Electrical'].mode()[random.randint(0, len(pd.Series(HPP_data['Electrical']).mode()) - 1)])

# Impute
HPP_data['LotFrontage'] = HPP_data['LotFrontage'].fillna('0')
HPP_data['Alley'] = HPP_data['Alley'].fillna('NoAlley')
HPP_data['BsmtQual'] = HPP_data['BsmtQual'].fillna('NoBasement')
HPP_data['BsmtCond'] = HPP_data['BsmtCond'].fillna('NoBasement')
HPP_data['BsmtExposure'] = HPP_data['BsmtExposure'].fillna('NoBasement')
HPP_data['BsmtFinType1'] = HPP_data['BsmtFinType1'].fillna('NoBasement')
HPP_data['BsmtFinType2'] = HPP_data['BsmtFinType2'].fillna('NoBasement')
HPP_data['FireplaceQu'] = HPP_data['FireplaceQu'].fillna('NoFireplace')
HPP_data['GarageType'] = HPP_data['GarageType'].fillna('NoGarage')
HPP_data['GarageYrBlt'] = HPP_data['GarageYrBlt'].fillna(-1)
HPP_data['GarageFinish'] = HPP_data['GarageFinish'].fillna('NoGarage')
HPP_data['GarageQual'] = HPP_data['GarageQual'].fillna('NoGarage')
HPP_data['GarageCond'] = HPP_data['GarageCond'].fillna('NoGarage')
HPP_data['PoolQC'] = HPP_data['PoolQC'].fillna('NoPool')
HPP_data['Fence'] = HPP_data['Fence'].fillna('NoFence')
HPP_data['MiscFeature'] = HPP_data['MiscFeature'].fillna('NoMiscFeature')

# FIND THE COLUMNS WITH MISSING VALUES
# Now, how many total missing values do we have?
total_cells = np.product(HPP_data.shape)
missing_values_count = HPP_data.isnull().sum()
total_missing = missing_values_count.sum()
# Now, we find the percent of data that is missing
percent_missing = (total_missing/total_cells) * 100
print(percent_missing)

# Set the width and height of the figure
# sns.scatterplot(x=HPP_data['SalePrice'], y=HPP_data['GrLivArea'], hue= HPP_data['YearBuilt'])

# Identify categorical columns
categorical_columns = HPP_data.select_dtypes(include=['object']).columns

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# TODO make sure every value is an int64 
for column_name, data_type in HPP_data.dtypes.items():
    if data_type == "float64":
        HPP_data[column_name] = HPP_data[column_name].astype('int64')
    elif data_type == "object":
        HPP_data[column_name] = label_encoder.fit_transform(HPP_data[column_name].astype(str))


# TODO check that it worked


# # Load the testing data
# HPP_test_data = pd.read_csv("../test.csv")    



# # Apply the same preprocessing and encoding steps to the test data
# # This includes handling missing values and label encoding
# # (You can reuse the same code you used for the training data)

# # Assuming you have a function called preprocess_data for the test data:
# # HPP_test_data = preprocess_data(HPP_test_data)



# # Split your training dataset into features (X) and target (y)
# X = HPP_data.drop('SalePrice', axis=1)
# y = HPP_data['SalePrice']

# # Create and train the Random Forest Regressor
# regressor = RandomForestRegressor(n_estimators=100, random_state=0)
# regressor.fit(X, y)

# # Use the trained model to make predictions on the test data
# X_test = HPP_test_data
# y_pred = regressor.predict(X_test)

# # Display the predicted values
# print("Predicted Sale Prices:", y_pred)

# # Save the predictions to a CSV file if needed
# HPP_test_data['Predicted_SalePrice'] = y_pred
# HPP_test_data.to_csv("predicted_test_results.csv", index=False)




5.889565364451209
0.0


In [98]:
# modules we'll use
import random
import pandas as pd
import numpy as np
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd
from sklearn.ensemble import RandomForestRegressor  # for regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

# #TODO make a preprocessing function
def preprocess_data(filename, is_train_data, label_encoder):
    # Read in all our data
    HPP_data = pd.read_csv(filename)

    # Set seed for reproducibility
    np.random.seed(0) 

    # FIND MISSING VALUE-PERCENTAGE BEFORE CLEANING
    # Get the number of missing data points per column
    missing_values_count = HPP_data.isnull().sum()
    # How many total missing values do we have?
    total_cells = np.product(HPP_data.shape)
    total_missing = missing_values_count.sum()
    # Percent of data that is missing
    percent_missing = (total_missing / total_cells) * 100
    print(percent_missing)

    # CLEANING
    # Mean
    HPP_data["MasVnrArea"].fillna(HPP_data["MasVnrArea"].mean(), inplace=True)

    # Mode
    HPP_data['MasVnrType'] = HPP_data['MasVnrType'].fillna(HPP_data['MasVnrType'].mode()[random.randint(0, len(pd.Series(HPP_data['MasVnrType']).mode()) - 1)])
    HPP_data['Electrical'] = HPP_data['Electrical'].fillna(HPP_data['Electrical'].mode()[random.randint(0, len(pd.Series(HPP_data['Electrical']).mode()) - 1)])

    # Impute
    HPP_data['LotFrontage'] = HPP_data['LotFrontage'].fillna('0')
    HPP_data['Alley'] = HPP_data['Alley'].fillna('NoAlley')
    HPP_data['BsmtQual'] = HPP_data['BsmtQual'].fillna('NoBasement')
    HPP_data['BsmtCond'] = HPP_data['BsmtCond'].fillna('NoBasement')
    HPP_data['BsmtExposure'] = HPP_data['BsmtExposure'].fillna('NoBasement')
    HPP_data['BsmtFinType1'] = HPP_data['BsmtFinType1'].fillna('NoBasement')
    HPP_data['BsmtFinType2'] = HPP_data['BsmtFinType2'].fillna('NoBasement')
    HPP_data['FireplaceQu'] = HPP_data['FireplaceQu'].fillna('NoFireplace')
    HPP_data['GarageType'] = HPP_data['GarageType'].fillna('NoGarage')
    HPP_data['GarageYrBlt'] = HPP_data['GarageYrBlt'].fillna(-1)
    HPP_data['GarageFinish'] = HPP_data['GarageFinish'].fillna('NoGarage')
    HPP_data['GarageQual'] = HPP_data['GarageQual'].fillna('NoGarage')
    HPP_data['GarageCond'] = HPP_data['GarageCond'].fillna('NoGarage')
    HPP_data['PoolQC'] = HPP_data['PoolQC'].fillna('NoPool')
    HPP_data['Fence'] = HPP_data['Fence'].fillna('NoFence')
    HPP_data['MiscFeature'] = HPP_data['MiscFeature'].fillna('NoMiscFeature')
    # (Rest of the imputations...)

    # FIND THE COLUMNS WITH MISSING VALUES
    # Now, how many total missing values do we have?
    total_cells = np.product(HPP_data.shape)
    missing_values_count = HPP_data.isnull().sum()
    total_missing = missing_values_count.sum()
    # Now, we find the percent of data that is missing
    percent_missing = (total_missing / total_cells) * 100
    print(percent_missing)

    if not is_train_data:
        # For non-training data, replace "NA" with 0
        missing_values_count = HPP_data.isnull().sum()
        missing_columns = missing_values_count[missing_values_count > 0]
        for column_name in missing_columns.index:
            HPP_data[column_name] = HPP_data[column_name].fillna(-1)

    # TODO make sure every value is an int64 
    for column_name, data_type in HPP_data.dtypes.items():
        if data_type == "float64":
            HPP_data[column_name] = label_encoder.fit_transform(HPP_data[column_name].astype('int64'))
        elif data_type == "object":
            HPP_data[column_name] = label_encoder.fit_transform(HPP_data[column_name].astype(str))
        else:
            HPP_data[column_name] = label_encoder.fit_transform(HPP_data[column_name])
        
    return HPP_data


label_encoder = LabelEncoder()

    

In [93]:
filename = "../train.csv"
HPP_data = preprocess_data(filename, True, label_encoder)
HPP_data

5.889565364451209
0.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0,5,3,76,327,1,1,3,3,0,...,0,3,4,1,0,1,2,8,4,412
1,1,0,3,91,498,1,1,3,3,0,...,0,3,4,1,0,4,1,8,4,339
2,2,5,3,79,702,1,1,0,3,0,...,0,3,4,1,0,8,2,8,4,442
3,3,6,3,71,489,1,1,0,3,0,...,0,3,4,1,0,1,0,8,0,194
4,4,5,3,95,925,1,1,0,3,0,...,0,3,4,1,0,11,2,8,4,494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1455,5,3,73,267,1,1,3,3,0,...,0,3,4,1,0,7,1,8,4,314
1456,1456,0,3,96,866,1,1,3,3,0,...,0,3,2,1,0,1,4,8,4,415
1457,1457,6,3,77,415,1,1,3,3,0,...,0,3,0,3,17,4,4,8,4,527
1458,1458,0,3,79,505,1,1,3,3,0,...,0,3,4,1,0,3,4,8,4,199


In [97]:
# # Apply the same preprocessing and encoding steps to the test data
filenameTest = "../test.csv"
HPP_test_data = preprocess_data(filenameTest, False, label_encoder)    
HPP_test_data

# # Split your training dataset into features (X) and target (y)
X = HPP_data.drop('SalePrice', axis=1)
y = HPP_data['SalePrice']

# # Create and train the Random Forest Regressor
regressor = RandomForestRegressor(n_estimators=100, random_state=0)
regressor.fit(X, y)

# # Use the trained model to make predictions on the test data
X_test = HPP_test_data
y_pred = regressor.predict(X_test)

# # Display the predicted values
print("Predicted Sale Prices:", y_pred)
for i in y_pred:
    print(i)

print("Training data shape:", HPP_data.shape)
print("Test data shape:", HPP_test_data.shape)



5.9972583961617545
0.018848526387936944
Predicted Sale Prices: [152.21 247.01 317.47 ... 241.96 124.03 458.78]
152.21
247.01
317.47
325.7
397.88
306.37
290.9
289.83
352.46
141.01
394.75
76.59
95.4
243.0
220.19
612.89
501.96
563.81
542.39
578.55
585.28
422.9
295.71
332.73
323.82
385.62
578.95
468.59
409.1
378.03
376.79
71.88
311.21
575.53
572.38
461.51
391.86
264.67
258.38
230.95
309.1
308.59
534.41
486.44
437.67
319.82
394.93
364.77
282.53
224.04
221.46
270.53
242.61
277.95
285.52
262.23
256.42
148.74
448.04
187.73
204.84
254.57
112.88
145.91
154.24
158.57
94.41
184.38
187.42
284.22
185.02
102.69
201.61
129.96
262.12
117.99
47.08
327.46
479.13
128.19
255.24
227.9
388.8
56.0
113.95
149.04
210.59
240.28
113.16
228.07
126.58
211.82
195.15
132.81
317.2
99.36
191.68
96.56
150.62
230.99
238.92
157.33
144.15
287.15
247.36
438.08
45.53
407.63
259.15
153.54
212.88
212.91
474.77
167.15
413.96
370.19
387.57
232.41
216.67
376.49
205.65
160.36
550.6
459.14
200.21
37.07
88.18
240.93
77.8
181.05
87.0