In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# function to laod a dataset    
def load_data(data_path):
    return pd.read_csv(data_path)

In [3]:
data_path = '/home/jeffs/mach_learn/USA_Housing_Dataset.csv'
df_1 = load_data(data_path)


In [4]:
df_1.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-09 00:00:00,376000.0,3.0,2.0,1340,1384,3.0,0,0,3,1340,0,2008,0,9245-9249 Fremont Ave N,Seattle,WA 98103,USA
1,2014-05-09 00:00:00,800000.0,4.0,3.25,3540,159430,2.0,0,0,3,3540,0,2007,0,33001 NE 24th St,Carnation,WA 98014,USA
2,2014-05-09 00:00:00,2238888.0,5.0,6.5,7270,130017,2.0,0,0,3,6420,850,2010,0,7070 270th Pl SE,Issaquah,WA 98029,USA
3,2014-05-09 00:00:00,324000.0,3.0,2.25,998,904,2.0,0,0,3,798,200,2007,0,820 NW 95th St,Seattle,WA 98117,USA
4,2014-05-10 00:00:00,549900.0,5.0,2.75,3060,7015,1.0,0,0,5,1600,1460,1979,0,10834 31st Ave SW,Seattle,WA 98146,USA


In [5]:
df_1.columns

Index(['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'street', 'city',
       'statezip', 'country'],
      dtype='object')

In [7]:
#function to split the data into training and testing sets
def split_data(data):
    X = data[['bedrooms', 'bathrooms','condition','yr_built', 'yr_renovated']]
    y = data['price']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

In [8]:
X_train, X_test, y_train, y_test = split_data(df_1)

In [9]:
print(X_train)

      bedrooms  bathrooms  condition  yr_built  yr_renovated
3949       4.0       1.00          3      1945          2010
3930       3.0       2.00          4      1977             0
178        2.0       1.00          5      1952          1998
668        3.0       2.50          3      1992             0
3330       4.0       2.75          4      1962             0
...        ...        ...        ...       ...           ...
3444       3.0       2.50          5      1945             0
466        5.0       1.75          4      1966             0
3092       3.0       1.75          3      1998          2006
3772       4.0       2.50          4      1978          2000
860        3.0       1.00          3      1972          2002

[3312 rows x 5 columns]


In [10]:
print(X_test)

      bedrooms  bathrooms  condition  yr_built  yr_renovated
3487       5.0       2.25          4      1975             0
1964       2.0       1.00          4      1947          1988
1582       3.0       2.50          3      2001             0
296        3.0       2.00          3      1948          1994
149        4.0       1.00          3      1925          2002
...        ...        ...        ...       ...           ...
838        2.0       2.50          3      2006             0
2466       2.0       2.50          3      2006             0
3837       2.0       1.00          5      1942             0
2409       5.0       3.75          3      2014             0
1545       4.0       5.25          3      1989             0

[828 rows x 5 columns]


In [13]:
# print(y_train)
# print y_train with two decimal places and commas for thousands
print(y_train.apply(lambda x: '{:,.2f}'.format(x)))
                

3949      266,066.67
3930      540,833.33
178       155,000.00
668     1,550,000.00
3330      505,000.00
            ...     
3444      850,000.00
466       462,000.00
3092      235,000.00
3772      270,000.00
860       172,500.00
Name: price, Length: 3312, dtype: object


In [14]:
# print y_test with two decimal places and commas for thousands
print(y_test.apply(lambda x: '{:,.2f}'.format(x)))

3487      600,000.00
1964      370,000.00
1582      471,000.00
296       240,000.00
149       413,000.00
            ...     
838       375,000.00
2466      754,800.00
3837      279,000.00
2409      540,500.00
1545    1,415,000.00
Name: price, Length: 828, dtype: object


In [15]:
# train a linear regression model
def train_model(X_train, y_train):
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model

In [16]:
model = train_model(X_train, y_train)

In [17]:
print(model)

LinearRegression()


In [18]:
# make predictions using the trained model
def predict(model, X_test):
    return model.predict(X_test)

In [19]:

predictions = predict(model, X_test)

In [21]:
# print predictions with two decimal places and commas for thousands
print(pd.Series(predictions).apply(lambda x: '{:,.2f}'.format(x)))

0        574,269.53
1        288,406.78
2        557,694.03
3        569,419.35
4        330,143.31
           ...     
823      543,093.67
824      543,093.67
825      301,571.00
826      902,995.96
827    1,422,765.82
Length: 828, dtype: object


In [22]:
# convert the ouput to a dataframe
def predictions_to_df(predictions):
    return pd.DataFrame(predictions)

In [39]:
df_predictions = predictions_to_df(predictions)

df_predictions.head()
# convert the number to a string with two decimal places and commas for thousands
df_predictions = df_predictions[0].map(lambda x: '{:,.2f}'.format(x))
print(df_predictions)



0        574,269.53
1        288,406.78
2        557,694.03
3        569,419.35
4        330,143.31
           ...     
823      543,093.67
824      543,093.67
825      301,571.00
826      902,995.96
827    1,422,765.82
Name: 0, Length: 828, dtype: object


In [43]:
import numpy as np

def round_predictions(predictions):
    return np.round(predictions, 2)

# Example usage
pred_round = round_predictions(predictions)
# print(pred_round)

# calculate the mean squared error
def calculate_mse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)

mse = calculate_mse(y_test, predictions)
print(mse)

83297328002.39993


In [45]:
# convert the output to a dollar value with two decimal places
def convert_to_dollars(predictions):
    return predictions.round(2)

In [46]:
predictions_dollars = convert_to_dollars(df_predictions)

In [47]:
print(predictions_dollars)

0        574,269.53
1        288,406.78
2        557,694.03
3        569,419.35
4        330,143.31
           ...     
823      543,093.67
824      543,093.67
825      301,571.00
826      902,995.96
827    1,422,765.82
Name: 0, Length: 828, dtype: object


In [48]:
# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

Mean Squared Error: 83297328002.39993


In [49]:
def convert_to_dollars(amount):
    return "${:,.2f}".format(amount)

# MSE value
mse_value = 83297328002.39993

# Convert MSE to a dollar format
mse_dollars = convert_to_dollars(mse_value)

print(f"Mean Squared Error in dollars: {mse_dollars}")

Mean Squared Error in dollars: $83,297,328,002.40


In [50]:
# function to calculate the root mean squared error
def calculate_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [51]:
#calculate the root mean squared error
rmse = calculate_rmse(y_test, predictions)
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 288612.7647946292


In [52]:
# compare the predicted values with the actual values
def compare_predictions(predictions, y_test):
    return pd.DataFrame({'Actual': y_test, 'Predicted': predictions})

In [53]:
table = compare_predictions(predictions_dollars[0], y_test)

In [54]:
print(table)

         Actual   Predicted
3487   600000.0  574,269.53
1964   370000.0  574,269.53
1582   471000.0  574,269.53
296    240000.0  574,269.53
149    413000.0  574,269.53
...         ...         ...
838    375000.0  574,269.53
2466   754800.0  574,269.53
3837   279000.0  574,269.53
2409   540500.0  574,269.53
1545  1415000.0  574,269.53

[828 rows x 2 columns]


In [55]:
# calculate  the difference between the actual and predicted values in dollars
def calculate_diff(predictions, y_test):
    return predictions - y_test

diff = calculate_diff(predictions, y_test)
print(diff)

3487    -25730.470967
1964    -81593.216785
1582     86694.027535
296     329419.351270
149     -82856.688514
            ...      
838     168093.667981
2466   -211706.332019
3837     22570.995446
2409    362495.962056
1545      7765.821056
Name: price, Length: 828, dtype: float64


In [56]:
# calculate the percentage difference between the actual and predicted values
def calculate_percentage_diff(predictions, y_test):
    return ((predictions - y_test) / y_test) * 100
diff_percentage = calculate_percentage_diff(predictions, y_test)
print(diff_percentage)

3487     -4.288412
1964    -22.052221
1582     18.406375
296     137.258063
149     -20.062152
           ...    
838      44.824978
2466    -28.048004
3837      8.089963
2409     67.066783
1545      0.548821
Name: price, Length: 828, dtype: float64


In [65]:
# print data type of the predictions and y_test
# print(type(predictions))    
# print(type(y_test))
# convert the predictions to a dataframe
def predictions_to_df(predictions):
    return pd.DataFrame(predictions)
predictions = predictions_to_df(predictions)
print(type(predictions))


<class 'pandas.core.frame.DataFrame'>
