<h1 style="color: #00205B;"> 0. Importing Libraries and creating Cleaning Functions </h1>

In [10]:
# 📚 Basic libraries
import pandas as pd # data manipulation
import numpy as np # numerical operations
import matplotlib.pyplot as plt # 2D visualizations
import seaborn as sns # high-resolution visualization
import warnings # warning messages management
import datetime # to play with dates
import os

# 🤖 Machine Learning
from scipy.stats import skew # data distribution assymmetry
from sklearn.preprocessing import MinMaxScaler # data normalization
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# ⚙️ Settings
pd.set_option('display.max_columns', None) # display all columns
warnings.filterwarnings('ignore') # ignore warnings

In [11]:
def snake_columns(data): # snake_case columns
    data.columns = [column.lower().replace(' ', '_') for column in data.columns]
    return data.sample(0)

def open_data(data): # returns shape, data types & shows a small sample
    print(f"Data shape is {data.shape}.")
    print()
    print(data.dtypes)
    print()
    print("Data row sample and full columns:")
    return data.sample(5)

def explore_data(data): # sum & returns duplicates, NaN & empty spaces
    duplicate_rows = data.duplicated().sum()
    nan_values = data.isna().sum()
    empty_spaces = data.eq(' ').sum()
    import pandas as pd
    exploration = pd.DataFrame({"NaN": nan_values, "EmptySpaces": empty_spaces}) # New dataframe with the results
    print(f"There are {data.duplicated().sum()} duplicate rows. Also;")
    return exploration

def get_house_lifetime(row): # Define the get_house_lifetime function
    today = datetime.datetime.today().year
    if row['yr_renovated'] != 0:
        return today - row['yr_renovated']
    else:
        return today - row['yr_built']
    
def get_house_soldtime(row): # Define the get_house_soldtime function
    today = datetime.datetime.today().year
    return today - row['year']

    


<h1 style="color: #00BFFF;">01 | Data Extraction</h1>

In [13]:
file_path = os.path.join("C:/Users/apisi/01. IronData/01. GitHub/03. Projects/05_patern_pending/00_data", "cleaned.csv")
data = pd.read_csv(file_path, index_col=0) # to deal with an error `Unnamed: 0` column
data = data.reset_index()

In [14]:
datac = data.copy()

In [15]:
snake_columns(datac)


Unnamed: 0,index,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,sqft_living15,sqft_lot15,price,year,month,house_soldtime


In [16]:
datac.zipcode.value_counts()

AttributeError: 'DataFrame' object has no attribute 'zipcode'

In [17]:
open_data(datac)

Data shape is (21597, 18).

index               int64
bedrooms            int64
bathrooms         float64
sqft_living         int64
sqft_lot            int64
floors            float64
waterfront          int64
view                int64
condition           int64
grade               int64
sqft_above          int64
sqft_basement       int64
sqft_living15       int64
sqft_lot15          int64
price               int64
year                int64
month              object
house_soldtime      int64
dtype: object

Data row sample and full columns:


Unnamed: 0,index,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,sqft_living15,sqft_lot15,price,year,month,house_soldtime
4722,4722,4,2.5,2730,36183,2.0,0,0,3,9,2730,0,2710,5964,760000,2014,Jun,9
14901,14901,7,2.5,2580,5750,1.0,0,0,4,7,1880,700,2280,5750,599000,2014,Jul,9
909,909,3,1.75,1420,8738,1.0,0,0,4,7,1420,0,1660,8738,234000,2014,Nov,9
2113,2113,4,2.75,2100,7236,1.0,0,0,3,8,1400,700,1900,7519,491500,2014,Jul,9
2971,2971,1,1.0,820,1060,1.0,0,0,3,8,760,60,1770,1924,219500,2014,Sep,9


In [18]:
explore_data(datac)

There are 0 duplicate rows. Also;


Unnamed: 0,NaN,EmptySpaces
index,0,0
bedrooms,0,0
bathrooms,0,0
sqft_living,0,0
sqft_lot,0,0
floors,0,0
waterfront,0,0
view,0,0
condition,0,0
grade,0,0


<h1 style="color: #00BFFF;">02 | Data cleaning</h1>

<h2 style="color: #008080;">Dropping unnecessary features</h2>

In [19]:
datac.sample(5)

Unnamed: 0,index,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,sqft_living15,sqft_lot15,price,year,month,house_soldtime
1050,1050,4,3.25,3450,6500,2.0,0,0,3,8,2450,1000,1750,6500,850000,2014,Oct,9
9255,9255,4,1.5,2180,22870,1.0,0,0,4,6,1280,900,2420,22614,273000,2014,May,9
13162,13162,3,1.0,1710,6195,1.0,0,0,3,7,1410,300,1610,6195,670000,2014,Dec,9
14157,14157,3,1.75,950,6000,1.0,0,0,3,6,790,160,1360,6000,230000,2014,May,9
13545,13545,3,2.75,1730,4131,2.0,0,2,3,7,1480,250,1570,4120,260000,2015,Apr,8


In [20]:
##datac = datac.drop('date', axis=1) # safe date but separately in YEAR, MONTH, DAY
datac = datac.drop('id', axis=1)

KeyError: "['id'] not found in axis"

In [21]:
datac2 = datac.copy()

<h2 style="color: #008080;">Encoding Categoricals</h2>

In [22]:
#instead of giving them a number, we make them categoricals, and then we will encode them
datac2['year'] = datac2['date'].dt.year
datac2['month'] = datac2['date'].dt.month_name().str.slice(stop=3)
datac2 = datac2.drop(columns=['date'], axis=1)

KeyError: 'date'

In [23]:
datac2.head()

Unnamed: 0,index,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,sqft_living15,sqft_lot15,price,year,month,house_soldtime
0,0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1340,5650,221900,2014,Oct,9
1,1,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1690,7639,538000,2014,Dec,9
2,2,2,1.0,770,10000,1.0,0,0,3,6,770,0,2720,8062,180000,2015,Feb,8
3,3,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1360,5000,604000,2014,Dec,9
4,4,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1800,7503,510000,2015,Feb,8


### What can we do with zipcodes?

In [24]:


# Group the data by zip code and calculate the mean price for each group
mean_price_by_zip = datac2.groupby('zipcode')['price'].mean()

# Plot the mean price by zip code
mean_price_by_zip.plot(kind='bar', figsize=(10, 5))
plt.title('Mean Price by Zip Code')
plt.xlabel('Zip Code')
plt.ylabel('Mean Price')
plt.show()


KeyError: 'zipcode'

In [25]:

# Group the data by latitude and longitude and calculate the mean price for each group
mean_price_by_location = datac2.groupby(['lat', 'long'])['price'].mean().reset_index()

# Set up the scatter plot
fig, ax = plt.subplots(figsize=(10, 8))
sc = ax.scatter(mean_price_by_location['long'], mean_price_by_location['lat'], s=50, c=mean_price_by_location['price'], cmap='coolwarm', alpha=0.8)

# Add a color bar
cbar = plt.colorbar(sc)
cbar.ax.set_ylabel('Mean Price')

# Set the axis labels and title
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Mean Price by Location')

# Show the plot
plt.show()



KeyError: 'lat'

In [30]:
datac2 = pd.get_dummies(datac2, columns=['month'])

datac2 = pd.get_dummies(datac2, columns=['zipcode','month'])


In [32]:
datac2['house_soldtime'] = datac2.apply(get_house_soldtime, axis=1)

In [33]:
datac2.head()

Unnamed: 0,index,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,sqft_living15,sqft_lot15,price,year,house_soldtime,month_Apr,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep
0,0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1340,5650,221900,2014,9.0,0,0,0,0,0,0,0,0,0,0,1,0
1,1,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1690,7639,538000,2014,9.0,0,0,1,0,0,0,0,0,0,0,0,0
2,2,2,1.0,770,10000,1.0,0,0,3,6,770,0,2720,8062,180000,2015,8.0,0,0,0,1,0,0,0,0,0,0,0,0
3,3,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1360,5000,604000,2014,9.0,0,0,1,0,0,0,0,0,0,0,0,0
4,4,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1800,7503,510000,2015,8.0,0,0,0,1,0,0,0,0,0,0,0,0


<h2 style="color: #008080;">Feature Engineering</h2>

In [34]:
# Apply the function to create a new column 'house_lifetime'
datac2['house_lifetime'] = datac2.apply(get_house_lifetime, axis=1)
datac2 = datac2.drop('yr_built', axis=1)
datac2 = datac2.drop('yr_renovated', axis=1)
datac2 = datac2.drop('year', axis=1)
datac2 = datac2.drop('lat', axis=1)
datac2 = datac2.drop('long', axis=1)
# Print the updated DataFrame
datac2.head(10)

KeyError: 'yr_renovated'

In [None]:
import seaborn as sns

# Assuming you have a dataframe named `df` with columns `col1`, `col2`, and `col3`
sns.boxplot(data=datac2['sqft_lot'])



## Looking for outliers

In [35]:
def remove_outliers_iqr(df, columns, threshold=1.5):
    df_outliers_removed = df.copy()
    for column in columns:
        q1, q3 = np.percentile(df[column], [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - threshold * iqr
        upper_bound = q3 + threshold * iqr
        df_outliers_removed = df_outliers_removed[(df_outliers_removed[column] >= lower_bound) & (df_outliers_removed[column] <= upper_bound)]
    return df_outliers_removed


# Apply the remove_outliers_iqr function to the 'column1' and 'column2' columns with a threshold of 1.5
datac3 = remove_outliers_iqr(datac2, ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'sqft_living15', 'sqft_lot15','house_soldtime','house_lifetime'], threshold=3.3)

# Print the results
print('Original data:\n', datac2.shape)
print('Data without outliers:\n', datac3.shape)


Original data:
 (21597, 29)
Data without outliers:
 (17800, 29)


In [36]:
datac3.isna().sum()

NameError: name 'datac3' is not defined

In [None]:
# Split the data into input and target variables
X = datac3.drop("price", axis=1)
y = datac3["price"]

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [None]:
X_scaled.sample(10)

In [None]:
# Select the columns you want to use

# Fit a linear regression model to the data
model = LinearRegression()

model.fit(X_scaled, y)


# Get the feature importances for all columns
importances = np.abs(model.coef_)

# Create a boolean mask to select columns that do not start with 'zip' or 'month'
exclude_mask = ~X.columns.str.startswith(('zip', 'month'))

# Apply the mask to get the names and importances of the selected columns
selected_names = X.columns[exclude_mask]
selected_importances = importances[exclude_mask]

# Sort the selected features by importance
indices = np.argsort(selected_importances)[::-1]

# Limit to the top 15 features
top_indices = indices[:15]
top_importances = selected_importances[top_indices]
top_feature_names = selected_names[top_indices]

# Create a bar plot of feature importances
plt.figure(figsize=(10, 6))
plt.bar(range(15), top_importances)
plt.xticks(range(15), top_feature_names, rotation=90)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("Top 15 Feature Importances (excluding 'zip' and 'month')")
plt.show()


<h1 style="color: #00BFFF;">03 | EDA</h1>

<h2 style="color: #008080;">Dealing with Duplicates</h2>

## Standard Scaler Results

In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42) 

In [None]:

# make predictions using your model
y_pred = model.predict(X_test)

# calculate mean absolute error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

# calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# calculate R^2 score
r2 = r2_score(y_test, y_pred)
print("R^2 Score:", r2)

In [None]:
# plot the data
plt.scatter(y_test, y_pred)

# add labels and title
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.title('Comparison between y_test and y_pred')

# add a line representing perfect prediction
min_val = min(np.min(y_test), np.min(y_pred))
max_val = max(np.max(y_test), np.max(y_pred))
plt.plot([min_val, max_val], [min_val, max_val], 'k--')

# show the plot
plt.show()

## How can we improve this??

Testinf KNN

In [None]:
scores = []
scores2 = []# We will store here the R2 values for each of our K-NN models with different K-values.
for i in range(10,40): 
    knn_model = KNeighborsRegressor(n_neighbors=i)
    knn_model2 = KNeighborsRegressor(n_neighbors=i, weights='distance')
    knn_model.fit(X_train, y_train)
    knn_model2.fit(X_train, y_train)
    scores.append(knn_model.score(X_test, y_test))
    scores2.append(knn_model2.score(X_test, y_test))

In [None]:
plt.figure(figsize=(10,6))
plt.plot(range(10,40), scores, color = 'indigo', linestyle='-',
         marker='o', markerfacecolor='cornflowerblue', markersize=7, label='without distance', alpha=0.9)
plt.plot(range(10,40), scores2, color = 'cornflowerblue', linestyle='-',
         marker='o', markerfacecolor='lavender', markersize=7, label='with distance', alpha=0.9)
#Add vertical line in k=17
plt.axvline(x=19, color='darkgreen', linestyle='dashed')
# Add text to the vertical line
plt.text(19.1, 0.778, "K=19", color='darkgreen', fontsize=12)

plt.title('R2 vs. K Value')
plt.xlabel('K')
plt.ylabel('R2')
plt.legend()
plt.show()

k=19 is the best argument for KNN

In [None]:
def testing(models_list):
    for i in models_list:
        model = i
        model.fit(X_train,y_train)
        predictions=model.predict(X_test)
        MAE = mean_absolute_error(y_test,predictions)
        MSE = mean_squared_error(y_test,predictions)
        RMSE = np.sqrt(MSE)
        R2 = r2_score(y_test,predictions)
        R2_adj = 1 - (1-R2)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)
        print("-------------------------")
        print('\033[1m' + "Using the model: " + str(model) + '\033[0m')
        print("The mean absolute error is: %6.3f" % (MAE))
        print("The mean squared error is: %6.3f" % (MSE))
        print("The root mean squared error is: %6.3f" % (RMSE))
        print("The R2 is: %6.3f" % (R2))
        print("The Adjusted R2 is: %6.3f" % (R2_adj))

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

models=[
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=19, weights = "distance"),
    ElasticNet(alpha=0.01, l1_ratio=0.5),
    MLPRegressor(hidden_layer_sizes=(10, 5, 5), activation='relu', solver='adam', max_iter=2000),
    ##MLPRegressor(hidden_layer_sizes=(10, 5), activation='relu', solver='adam', max_iter=1000),
    ##DecisionTreeRegressor(),
    RandomForestRegressor(),
]

In [None]:
testing(models)

In [None]:
datac3.price.min()

In [None]:
datac3.price.max()

In [None]:
datac3.price.median()

In [None]:
datac3.price.mean()

In [None]:
print("The percentage of the error is: ", round(57354.907/(datac3.price.median())*100, 2), "%")

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid to search over
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}

# Create an instance of the ElasticNet class
enet = ElasticNet()

# Create a GridSearchCV object to search over the hyperparameter grid
grid_search = GridSearchCV(enet, param_grid=param_grid, cv=5, scoring='r2')

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

## Lets see how good is the MLPRegressor Model

In [None]:
ModelMLP = MLPRegressor(hidden_layer_sizes=(10, 5, 5), activation='relu', solver='adam', max_iter=2000)
ModelMLP.fit(X_train, y_train)
MLPpredictions=ModelMLP.predict(X_test)

In [None]:
sns.set_style('whitegrid')
# plot the predictions
sns.scatterplot(x=y_test, y=MLPpredictions, s=50, alpha=0.5, color='cornflowerblue')

# plot the ideal predictions line
plt.plot(y_test, y_test, color='red', linestyle='-')

# set the title and labels
plt.title('MLP Regressor Predictions')
plt.xlabel('True Values')
plt.ylabel('Predictions')

plt.show()

MAE = mean_absolute_error(y_test,MLPpredictions)
MSE = mean_squared_error(y_test,MLPpredictions)
RMSE = np.sqrt(MSE)
R2 = r2_score(y_test,MLPpredictions)
R2_adj = 1 - (1-R2)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)
print("-------------------------")
print('\033[1m' + "Using the model: " + str(ModelMLP) + '\033[0m')
print("The mean absolute error is: %6.3f" % (MAE))
print("The mean squared error is: %6.3f" % (MSE))
print("The root mean squared error is: %6.3f" % (RMSE))
print("The R2 is: %6.3f" % (R2))
print("The Adjusted R2 is: %6.3f" % (R2_adj))

In [4]:
# Create a new column in the test data frame that contains the residuals
datac2['residuals'] = y_test - MLPpredictions
# Create a new column with the rank of the 'price' column
datac2['price_rank'] = datac2['price'].rank(ascending=False)

# Sort the data frame by the absolute value of the residuals
sorted_data = datac2.reindex(datac2['residuals'].abs().sort_values(ascending=False).index)

# Print the first 10 rows of the sorted data frame
sorted_data[['price','residuals','price_rank']].head(50)

NameError: name 'y_test' is not defined

In [5]:
datac2.price_rank.max()

NameError: name 'datac2' is not defined

In [6]:
datac2.price_rank.min()

NameError: name 'datac2' is not defined