In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('houses_train.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,price,condition,district,max_floor,street,num_rooms,region,area,url,num_bathrooms,building_type,floor,ceiling_height
0,4598,100000.0,newly repaired,Arabkir,6,Kievyan St,3,Yerevan,96.0,http://www.myrealty.am/en/item/26229/3-senyaka...,1,stone,4,3.0
1,5940,52000.0,good,Arabkir,14,Mamikoniants St,3,Yerevan,78.0,http://www.myrealty.am/en/item/32897/3-senyaka...,1,panel,10,2.8
2,2302,52000.0,newly repaired,Qanaqer-Zeytun,9,M. Melikyan St,3,Yerevan,97.0,http://www.myrealty.am/en/item/1459/apartment-...,1,panel,1,2.8
3,5628,130000.0,good,Center,4,Spendiaryan St,3,Yerevan,80.0,http://www.myrealty.am/en/item/2099/3-senyakan...,1,stone,2,3.2
4,760,81600.0,zero condition,Center,9,Ler. Kamsar St,3,Yerevan,107.0,http://www.myrealty.am/en/item/22722/3-senyaka...,1,monolit,9,3.0


In [4]:
df['condition'].unique().shape #1

(3,)

In [5]:
df['district'].unique().shape #3

(13,)

In [6]:
df['street'].unique().shape #4

(350,)

In [7]:
df['region'].unique().shape

(1,)

In [8]:
df['url'].unique().shape

(5001,)

In [9]:
df['building_type'].unique().shape #2

(4,)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      5001 non-null   int64  
 1   price           5001 non-null   float64
 2   condition       5001 non-null   object 
 3   district        5001 non-null   object 
 4   max_floor       5001 non-null   int64  
 5   street          5001 non-null   object 
 6   num_rooms       5001 non-null   int64  
 7   region          5001 non-null   object 
 8   area            5001 non-null   float64
 9   url             5001 non-null   object 
 10  num_bathrooms   5001 non-null   int64  
 11  building_type   5001 non-null   object 
 12  floor           5001 non-null   int64  
 13  ceiling_height  5001 non-null   float64
dtypes: float64(3), int64(5), object(6)
memory usage: 547.1+ KB


In [11]:
data_ex_obj = df.select_dtypes(exclude=['object'])

In [12]:
data_ex_obj.head()

Unnamed: 0.1,Unnamed: 0,price,max_floor,num_rooms,area,num_bathrooms,floor,ceiling_height
0,4598,100000.0,6,3,96.0,1,4,3.0
1,5940,52000.0,14,3,78.0,1,10,2.8
2,2302,52000.0,9,3,97.0,1,1,2.8
3,5628,130000.0,4,3,80.0,1,2,3.2
4,760,81600.0,9,3,107.0,1,9,3.0


In [13]:
data_ex_obj.corr()

Unnamed: 0.1,Unnamed: 0,price,max_floor,num_rooms,area,num_bathrooms,floor,ceiling_height
Unnamed: 0,1.0,0.012869,0.01861,0.054708,0.023779,0.040317,0.003506,0.035008
price,0.012869,1.0,0.105915,0.343041,0.626492,0.422173,0.068959,0.369952
max_floor,0.01861,0.105915,1.0,0.045562,0.162761,0.112671,0.637155,-0.063595
num_rooms,0.054708,0.343041,0.045562,1.0,0.74693,0.267053,0.05518,0.069899
area,0.023779,0.626492,0.162761,0.74693,1.0,0.414221,0.109706,0.201462
num_bathrooms,0.040317,0.422173,0.112671,0.267053,0.414221,1.0,0.062784,0.217549
floor,0.003506,0.068959,0.637155,0.05518,0.109706,0.062784,1.0,-0.045235
ceiling_height,0.035008,0.369952,-0.063595,0.069899,0.201462,0.217549,-0.045235,1.0


In [14]:
X = data_ex_obj.drop(['price'], axis = 1)
y = data_ex_obj.loc[:, data_ex_obj.columns == "price"]
y

Unnamed: 0,price
0,100000.0
1,52000.0
2,52000.0
3,130000.0
4,81600.0
...,...
4996,70000.0
4997,77000.0
4998,46000.0
4999,99000.0


In [15]:
X['condition'] = df['condition']
X['building_type'] = df['building_type']
X['district'] = df['district']
X['street'] = df['street']

X.columns

Index(['Unnamed: 0', 'max_floor', 'num_rooms', 'area', 'num_bathrooms',
       'floor', 'ceiling_height', 'condition', 'building_type', 'district',
       'street'],
      dtype='object')

In [16]:
X = pd.get_dummies(X[['Unnamed: 0', 'max_floor', 'num_rooms', 'area', 'num_bathrooms',
       'floor', 'ceiling_height', 'condition', 'building_type', 'district', 'street']])
X.head()

Unnamed: 0.1,Unnamed: 0,max_floor,num_rooms,area,num_bathrooms,floor,ceiling_height,condition_good,condition_newly repaired,condition_zero condition,...,street_Vratsakan 4 alley,street_Vratsakan St,street_Vratsyan St,street_Yekmalyan St,street_Yerznkyan St,street_Z. Sarkavag 3 dead end,street_Z. Sarkavag St,street_Z.Andranik St,street_Zakyan St,street_Zavaryan St
0,4598,6,3,96.0,1,4,3.0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,5940,14,3,78.0,1,10,2.8,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2302,9,3,97.0,1,1,2.8,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,5628,4,3,80.0,1,2,3.2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,760,9,3,107.0,1,9,3.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1234 )

In [18]:
norm = MinMaxScaler().fit(X_train)
X_train_norm = pd.DataFrame(columns = X_train.columns, data = norm.transform(X_train))
X_test_norm = pd.DataFrame(columns = X_test.columns, data = norm.transform(X_test))

X_train_norm = X_train
X_test_norm = X_test

X_test_norm

Unnamed: 0.1,Unnamed: 0,max_floor,num_rooms,area,num_bathrooms,floor,ceiling_height,condition_good,condition_newly repaired,condition_zero condition,...,street_Vratsakan 4 alley,street_Vratsakan St,street_Vratsyan St,street_Yekmalyan St,street_Yerznkyan St,street_Z. Sarkavag 3 dead end,street_Z. Sarkavag St,street_Z.Andranik St,street_Zakyan St,street_Zavaryan St
2706,6471,10,1,41.0,1,4,3.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2436,2137,9,5,133.0,3,1,2.8,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1201,1902,5,3,90.0,2,4,3.2,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1486,40,10,4,104.0,1,2,2.8,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4286,1770,5,1,42.0,1,3,2.8,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2724,4550,5,4,80.0,1,4,2.8,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4892,2402,12,2,63.0,1,4,2.8,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4746,1591,13,4,127.0,1,4,2.8,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4416,4719,6,3,90.0,1,6,2.8,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
lr = linear_model.LinearRegression()
lr.fit(X_train_norm, y_train)
print('b0:', lr.intercept_[0])
print('b1:', lr.coef_)

b0: -8272977263337.675
b1: [[-2.31370645e-01  6.33868933e+02 -2.84189114e+03  1.09098888e+03
   9.89534213e+03 -6.41703024e+02  1.72094811e+04  1.19264620e+13
   1.19264620e+13  1.19264620e+13 -7.61012482e+12 -7.61012483e+12
  -7.61012484e+12 -7.61012483e+12  3.85807475e+12  4.19705535e+12
   4.35787174e+12  3.78402364e+12  3.87641114e+12  3.64364700e+12
   4.07710638e+12  3.87641114e+12  2.65778794e+12  1.05976112e+12
   4.08195095e+12  3.78402363e+12  1.55348931e+12 -1.25310917e+11
   9.85652550e+10  9.85652545e+10  9.85652643e+10 -2.40415316e+11
  -1.20466364e+11  8.02288685e+10 -2.40415309e+11 -1.25310914e+11
  -1.25310923e+11  1.72616420e+11  1.72616437e+11 -2.40415334e+11
   9.85652755e+10  1.72616454e+11 -4.01231727e+11 -2.40415321e+11
   1.72616406e+11 -1.25310909e+11  1.72616439e+11 -2.40415329e+11
   9.85652868e+10 -4.01231724e+11  1.72616449e+11  1.72616426e+11
  -2.40415346e+11 -2.40415313e+11 -2.40415322e+11 -2.40415293e+11
  -2.40415133e+11 -6.48037453e+09 -2.40415321e+11

In [20]:
y_train_pred = lr.predict(X_train_norm)

In [21]:
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print('rmse_train', rmse_train)

rmse_train 22331.812910698834


In [22]:
y_test_pred = lr.predict(X_test_norm)

In [23]:
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print('rmse_test', rmse_test)

rmse_test 95878606342.84462


In [24]:
deg = 2
poly = PolynomialFeatures(degree = deg, include_bias=False)
X_train_Poly = poly.fit_transform(X_train_norm)
X_test_Poly = poly.fit_transform(X_test_norm)


In [25]:
linear = LinearRegression()
linear.fit(X_train_Poly, y_train)

In [26]:
y_train_pred = linear.predict(X_train_Poly)

In [27]:
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print('rmse_train', rmse_train)

rmse_train 13839.414529831407


In [28]:
y_test_pred = linear.predict(X_test_Poly)

In [29]:
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print('rmse_test', rmse_test)

rmse_test 44673520643.37354


In [30]:
elNet = ElasticNet( alpha=0.001, l1_ratio=0.01)

In [31]:

elNet.fit(X_train_Poly, y_train)

  model = cd_fast.enet_coordinate_descent(


In [32]:
y_train_pred = elNet.predict(X_train_Poly)

In [33]:
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print('rmse_train', rmse_train)

rmse_train 15409.063313687906


In [34]:
y_test_pred = elNet.predict(X_test_Poly)

In [35]:
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print('rmse_test', rmse_test)

rmse_test 27341.769812027385


In [36]:
# al = np.linspace(0.001, 0.2, 30)
# l1 = np.linspace(0.001, 0.02, 30)
al = np.linspace(0.0001, 0.01, 30)
l1 = np.linspace(0.05, 0.1, 30)
print('al',al)
print('l1',l1)

al [0.0001     0.00044138 0.00078276 0.00112414 0.00146552 0.0018069
 0.00214828 0.00248966 0.00283103 0.00317241 0.00351379 0.00385517
 0.00419655 0.00453793 0.00487931 0.00522069 0.00556207 0.00590345
 0.00624483 0.00658621 0.00692759 0.00726897 0.00761034 0.00795172
 0.0082931  0.00863448 0.00897586 0.00931724 0.00965862 0.01      ]
l1 [0.05       0.05172414 0.05344828 0.05517241 0.05689655 0.05862069
 0.06034483 0.06206897 0.0637931  0.06551724 0.06724138 0.06896552
 0.07068966 0.07241379 0.07413793 0.07586207 0.07758621 0.07931034
 0.08103448 0.08275862 0.08448276 0.0862069  0.08793103 0.08965517
 0.09137931 0.09310345 0.09482759 0.09655172 0.09827586 0.1       ]


In [37]:
elastic=ElasticNet()

params={'alpha':al,'l1_ratio':l1}
Regressor=GridSearchCV(elastic,params,scoring='neg_mean_squared_error',cv=5)
Regressor.fit(X_train,y_train)

  model = cd_fast.enet_coordinate_descent(


In [38]:
y_train_pred = Regressor.predict(X_train)

In [39]:
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print('rmse_train', rmse_train)

rmse_train 22633.500208278976


In [40]:
y_test_pred = Regressor.predict(X_test)

In [41]:
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print('rmse_test', rmse_test)

rmse_test 25183.476968832547


In [42]:
print('best parameter: ', Regressor.best_params_)

best parameter:  {'alpha': 0.0004413793103448276, 'l1_ratio': 0.05}


In [43]:
model = ...

In [44]:
# def final_predict(final_test_df):
#     1. preprocessing of final_test_df (scaling, one hot encoding ...)
#     2. make sure that columns and their order in train and test are the same
#     3. return predictions

In [45]:
# df = pd.read_csv('houses_test.csv')
# final_predict(df)