In [25]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [26]:
data = pd.read_csv('archive/train.csv')

In [27]:
data.head()

Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price
0,3,2.5,2590.0,sqft,6000.0,sqft,98144,795000.0
1,4,2.0,2240.0,sqft,0.31,acre,98106,915000.0
2,4,3.0,2040.0,sqft,3783.0,sqft,98107,950000.0
3,4,3.0,3800.0,sqft,5175.0,sqft,98199,1950000.0
4,2,2.0,1042.0,sqft,,,98102,950000.0


In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   beds            2016 non-null   int64  
 1   baths           2016 non-null   float64
 2   size            2016 non-null   float64
 3   size_units      2016 non-null   object 
 4   lot_size        1669 non-null   float64
 5   lot_size_units  1669 non-null   object 
 6   zip_code        2016 non-null   int64  
 7   price           2016 non-null   float64
dtypes: float64(4), int64(2), object(2)
memory usage: 126.1+ KB


In [29]:
data.isnull().values.any()

True

In [30]:
data.head()

Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price
0,3,2.5,2590.0,sqft,6000.0,sqft,98144,795000.0
1,4,2.0,2240.0,sqft,0.31,acre,98106,915000.0
2,4,3.0,2040.0,sqft,3783.0,sqft,98107,950000.0
3,4,3.0,3800.0,sqft,5175.0,sqft,98199,1950000.0
4,2,2.0,1042.0,sqft,,,98102,950000.0


In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   beds            2016 non-null   int64  
 1   baths           2016 non-null   float64
 2   size            2016 non-null   float64
 3   size_units      2016 non-null   object 
 4   lot_size        1669 non-null   float64
 5   lot_size_units  1669 non-null   object 
 6   zip_code        2016 non-null   int64  
 7   price           2016 non-null   float64
dtypes: float64(4), int64(2), object(2)
memory usage: 126.1+ KB


In [32]:
data['size_units'].unique()

array(['sqft'], dtype=object)

In [33]:
data['lot_size_units'].unique()

array(['sqft', 'acre', nan], dtype=object)

In [34]:
for i in range(len(data['lot_size_units'])):
    if  data['lot_size_units'][i] == 'acre':
        data['lot_size'][i] *= 43560

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  data['lot_size'][i] *= 43560
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lot_size'][i] *= 43560
You a

In [35]:
data.head()

Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price
0,3,2.5,2590.0,sqft,6000.0,sqft,98144,795000.0
1,4,2.0,2240.0,sqft,13503.6,acre,98106,915000.0
2,4,3.0,2040.0,sqft,3783.0,sqft,98107,950000.0
3,4,3.0,3800.0,sqft,5175.0,sqft,98199,1950000.0
4,2,2.0,1042.0,sqft,,,98102,950000.0


In [36]:
lot_size_mean = np.mean(data['lot_size'])
data.fillna(lot_size_mean, inplace=True)

In [37]:
data.head()

Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price
0,3,2.5,2590.0,sqft,6000.0,sqft,98144,795000.0
1,4,2.0,2240.0,sqft,13503.6,acre,98106,915000.0
2,4,3.0,2040.0,sqft,3783.0,sqft,98107,950000.0
3,4,3.0,3800.0,sqft,5175.0,sqft,98199,1950000.0
4,2,2.0,1042.0,sqft,18789.951947,18789.951947,98102,950000.0


In [40]:
data.columns

Index(['beds', 'baths', 'size', 'size_units', 'lot_size', 'lot_size_units',
       'zip_code', 'price'],
      dtype='object')

In [42]:
data = data.drop(columns = ['size_units', 'lot_size_units'])

In [43]:
data.head()

Unnamed: 0,beds,baths,size,lot_size,zip_code,price
0,3,2.5,2590.0,6000.0,98144,795000.0
1,4,2.0,2240.0,13503.6,98106,915000.0
2,4,3.0,2040.0,3783.0,98107,950000.0
3,4,3.0,3800.0,5175.0,98199,1950000.0
4,2,2.0,1042.0,18789.951947,98102,950000.0


In [44]:
data.isnull().values.any()

False

In [45]:
data.corr()

Unnamed: 0,beds,baths,size,lot_size,zip_code,price
beds,1.0,0.652853,0.771929,-0.059688,0.077811,0.293516
baths,0.652853,1.0,0.667655,-0.044108,-0.002679,0.317325
size,0.771929,0.667655,1.0,-0.054722,0.070557,0.44414
lot_size,-0.059688,-0.044108,-0.054722,1.0,-0.025355,-0.028256
zip_code,0.077811,-0.002679,0.070557,-0.025355,1.0,-0.047189
price,0.293516,0.317325,0.44414,-0.028256,-0.047189,1.0


In [46]:
x = data.drop(columns=['price'])
y = data['price']

In [47]:
lr = LinearRegression()

In [48]:
model = lr.fit(x, y)

In [60]:
test_data = pd.read_csv('archive/test.csv')

In [61]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   beds            505 non-null    int64  
 1   baths           505 non-null    float64
 2   size            505 non-null    float64
 3   size_units      505 non-null    object 
 4   lot_size        428 non-null    float64
 5   lot_size_units  428 non-null    object 
 6   zip_code        505 non-null    int64  
 7   price           505 non-null    float64
dtypes: float64(4), int64(2), object(2)
memory usage: 31.7+ KB


In [62]:
test_data = test_data.drop(columns = ['size_units', 'lot_size_units'])
test_data = test_data.dropna()

In [63]:
test_x = test_data.drop(columns = ['price'])
test_y = test_data.price

In [64]:
y_predicted = lr.predict(test_x)

In [65]:
mse = mean_squared_error(test_y, y_predicted)
mae = np.mean(np.abs(test_y - y_predicted))
r2 = r2_score(test_y, y_predicted)

In [66]:
print(f'MSE: {mse:.2f} \nMAE: {mae:.2f} \nR-squared: {r2:.2f}')

MSE: 192320099947.99 
MAE: 277066.93 
R-squared: 0.50
