In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('houses_train.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      5001 non-null   int64  
 1   price           5001 non-null   float64
 2   condition       5001 non-null   object 
 3   district        5001 non-null   object 
 4   max_floor       5001 non-null   int64  
 5   street          5001 non-null   object 
 6   num_rooms       5001 non-null   int64  
 7   region          5001 non-null   object 
 8   area            5001 non-null   float64
 9   url             5001 non-null   object 
 10  num_bathrooms   5001 non-null   int64  
 11  building_type   5001 non-null   object 
 12  floor           5001 non-null   int64  
 13  ceiling_height  5001 non-null   float64
dtypes: float64(3), int64(5), object(6)
memory usage: 547.1+ KB


In [4]:
data_ex_obj = df.select_dtypes(exclude=['object'])

In [5]:
data_ex_obj.head()

Unnamed: 0.1,Unnamed: 0,price,max_floor,num_rooms,area,num_bathrooms,floor,ceiling_height
0,4598,100000.0,6,3,96.0,1,4,3.0
1,5940,52000.0,14,3,78.0,1,10,2.8
2,2302,52000.0,9,3,97.0,1,1,2.8
3,5628,130000.0,4,3,80.0,1,2,3.2
4,760,81600.0,9,3,107.0,1,9,3.0


In [6]:
data_ex_obj.corr()

Unnamed: 0.1,Unnamed: 0,price,max_floor,num_rooms,area,num_bathrooms,floor,ceiling_height
Unnamed: 0,1.0,0.012869,0.01861,0.054708,0.023779,0.040317,0.003506,0.035008
price,0.012869,1.0,0.105915,0.343041,0.626492,0.422173,0.068959,0.369952
max_floor,0.01861,0.105915,1.0,0.045562,0.162761,0.112671,0.637155,-0.063595
num_rooms,0.054708,0.343041,0.045562,1.0,0.74693,0.267053,0.05518,0.069899
area,0.023779,0.626492,0.162761,0.74693,1.0,0.414221,0.109706,0.201462
num_bathrooms,0.040317,0.422173,0.112671,0.267053,0.414221,1.0,0.062784,0.217549
floor,0.003506,0.068959,0.637155,0.05518,0.109706,0.062784,1.0,-0.045235
ceiling_height,0.035008,0.369952,-0.063595,0.069899,0.201462,0.217549,-0.045235,1.0


In [7]:
X = data_ex_obj.drop(['price'], axis = 1)
y = data_ex_obj.loc[:, data_ex_obj.columns == "price"]
y

Unnamed: 0,price
0,100000.0
1,52000.0
2,52000.0
3,130000.0
4,81600.0
...,...
4996,70000.0
4997,77000.0
4998,46000.0
4999,99000.0


In [8]:
X

Unnamed: 0.1,Unnamed: 0,max_floor,num_rooms,area,num_bathrooms,floor,ceiling_height
0,4598,6,3,96.0,1,4,3.0
1,5940,14,3,78.0,1,10,2.8
2,2302,9,3,97.0,1,1,2.8
3,5628,4,3,80.0,1,2,3.2
4,760,9,3,107.0,1,9,3.0
...,...,...,...,...,...,...,...
4996,3585,5,3,97.0,1,4,2.8
4997,3291,4,3,71.0,1,4,2.8
4998,5959,5,1,40.0,1,2,3.0
4999,542,14,4,118.0,2,14,3.0


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1234 )

In [38]:
norm = MinMaxScaler().fit(X_train)
X_train_norm = pd.DataFrame(columns = X_train.columns, data = norm.transform(X_train))
X_test_norm = pd.DataFrame(columns = X_test.columns, data = norm.transform(X_test))

X_train_norm = X_train
X_test_norm = X_test

X_test_norm

Unnamed: 0.1,Unnamed: 0,max_floor,num_rooms,area,num_bathrooms,floor,ceiling_height
2706,6471,10,1,41.0,1,4,3.0
2436,2137,9,5,133.0,3,1,2.8
1201,1902,5,3,90.0,2,4,3.2
1486,40,10,4,104.0,1,2,2.8
4286,1770,5,1,42.0,1,3,2.8
...,...,...,...,...,...,...,...
2724,4550,5,4,80.0,1,4,2.8
4892,2402,12,2,63.0,1,4,2.8
4746,1591,13,4,127.0,1,4,2.8
4416,4719,6,3,90.0,1,6,2.8


In [39]:
lr = linear_model.LinearRegression()
lr.fit(X_train_norm, y_train)
print('b0:', lr.intercept_[0])
print('b1:', lr.coef_)

b0: -236516.11702874108
b1: [[-1.57378284e-01 -1.37975724e+01 -1.34263017e+04  1.38515972e+03
   1.99035658e+04  1.23217942e+02  7.65012723e+04]]


In [40]:
y_train_pred = lr.predict(X_train_norm)

In [41]:
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print('rmse_train', rmse_train)

rmse_train 35929.66480989277


In [42]:
y_test_pred = lr.predict(X_test_norm)

In [43]:
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print('rmse_test', rmse_test)

rmse_test 40995.73660875415


In [44]:
deg = 3
poly = PolynomialFeatures(degree = deg, include_bias=False)
X_train_Poly = poly.fit_transform(X_train_norm)
X_test_Poly = poly.fit_transform(X_test_norm)


In [None]:
linear = LinearRegression()
linear.fit(X_train_Poly, y_train)

In [45]:
y_train_pred = linear.predict(X_train_Poly)

In [46]:
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print('rmse_train', rmse_train)

rmse_train 32952.41198262792


In [47]:
y_test_pred = linear.predict(X_test_Poly)

In [48]:
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print('rmse_test', rmse_test)

rmse_test 38455.26860263028


In [49]:
elNet = ElasticNet( alpha=0.001, l1_ratio=0.01)

In [50]:

elNet.fit(X_train_Poly, y_train)

  model = cd_fast.enet_coordinate_descent(


In [51]:
y_train_pred = elNet.predict(X_train_Poly)

In [52]:
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print('rmse_train', rmse_train)

rmse_train 33613.11716925846


In [53]:
y_test_pred = elNet.predict(X_test_Poly)

In [54]:
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print('rmse_test', rmse_test)

rmse_test 39155.74340040925


In [61]:
elastic=ElasticNet()
params={'alpha':[1e-5,1e-4,1e-3,1e-2,1e-1,1,2,3,4,5,10,20,30,40,50,100,],'l1_ratio':[1e-6,1e-5,1e-4,1e-3,1e-2,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]}
Regressor=GridSearchCV(elastic,params,scoring='neg_mean_squared_error',cv=10)
Regressor.fit(X_train,y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [62]:
y_train_pred = Regressor.predict(X_train)

In [63]:
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print('rmse_train', rmse_train)

rmse_train 35929.705675911704


In [64]:
y_test_pred = Regressor.predict(X_test)

In [65]:
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print('rmse_test', rmse_test)

rmse_test 40996.51639962285


In [21]:
model = ...

In [22]:
def final_predict(final_test_df):
    1. preprocessing of final_test_df (scaling, one hot encoding ...)
    2. make sure that columns and their order in train and test are the same
    3. return predictions

SyntaxError: invalid syntax (2686721630.py, line 2)

In [None]:
df = pd.read_csv('houses_test.csv')
final_predict(df)