In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('houses_train_new.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,price,condition,district,max_floor,street,num_rooms,region,area,url,num_bathrooms,building_type,floor,ceiling_height
0,4598,100000.0,newly repaired,Arabkir,6,Kievyan St,3,Yerevan,96.0,http://www.myrealty.am/en/item/26229/3-senyaka...,1,stone,4,3.0
1,5940,52000.0,good,Arabkir,14,Mamikoniants St,3,Yerevan,78.0,http://www.myrealty.am/en/item/32897/3-senyaka...,1,panel,10,2.8
2,2302,52000.0,newly repaired,Qanaqer-Zeytun,9,M. Melikyan St,3,Yerevan,97.0,http://www.myrealty.am/en/item/1459/apartment-...,1,panel,1,2.8
3,5628,130000.0,good,Center,4,Spendiaryan St,3,Yerevan,80.0,http://www.myrealty.am/en/item/2099/3-senyakan...,1,stone,2,3.2
4,760,81600.0,zero condition,Center,9,Ler. Kamsar St,3,Yerevan,107.0,http://www.myrealty.am/en/item/22722/3-senyaka...,1,monolit,9,3.0


In [4]:
df['condition'].unique().shape #1

(3,)

In [5]:
df['district'].unique().shape #3

(13,)

In [6]:
df['street'].unique().shape #4

(350,)

In [7]:
df['region'].unique().shape

(1,)

In [8]:
df['url'].unique().shape

(5001,)

In [9]:
df['building_type'].unique().shape #2

(4,)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      5001 non-null   int64  
 1   price           5001 non-null   float64
 2   condition       5001 non-null   object 
 3   district        5001 non-null   object 
 4   max_floor       5001 non-null   int64  
 5   street          5001 non-null   object 
 6   num_rooms       5001 non-null   int64  
 7   region          5001 non-null   object 
 8   area            5001 non-null   float64
 9   url             5001 non-null   object 
 10  num_bathrooms   5001 non-null   int64  
 11  building_type   5001 non-null   object 
 12  floor           5001 non-null   int64  
 13  ceiling_height  5001 non-null   float64
dtypes: float64(3), int64(5), object(6)
memory usage: 547.1+ KB


In [11]:
X = df.drop(['price'], axis = 1)
y = df.loc[:, df.columns == "price"]
X

Unnamed: 0.1,Unnamed: 0,condition,district,max_floor,street,num_rooms,region,area,url,num_bathrooms,building_type,floor,ceiling_height
0,4598,newly repaired,Arabkir,6,Kievyan St,3,Yerevan,96.0,http://www.myrealty.am/en/item/26229/3-senyaka...,1,stone,4,3.0
1,5940,good,Arabkir,14,Mamikoniants St,3,Yerevan,78.0,http://www.myrealty.am/en/item/32897/3-senyaka...,1,panel,10,2.8
2,2302,newly repaired,Qanaqer-Zeytun,9,M. Melikyan St,3,Yerevan,97.0,http://www.myrealty.am/en/item/1459/apartment-...,1,panel,1,2.8
3,5628,good,Center,4,Spendiaryan St,3,Yerevan,80.0,http://www.myrealty.am/en/item/2099/3-senyakan...,1,stone,2,3.2
4,760,zero condition,Center,9,Ler. Kamsar St,3,Yerevan,107.0,http://www.myrealty.am/en/item/22722/3-senyaka...,1,monolit,9,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,3585,newly repaired,Arabkir,5,Griboedov St,3,Yerevan,97.0,http://www.myrealty.am/en/item/36852/3-senyaka...,1,stone,4,2.8
4997,3291,newly repaired,Arabkir,4,Orbeli Yeghbayrner St,3,Yerevan,71.0,http://www.myrealty.am/en/item/13933/Apartment...,1,stone,4,2.8
4998,5959,zero condition,Center,5,Mashtots Ave,1,Yerevan,40.0,http://www.myrealty.am/en/item/31190/1-senyaka...,1,stone,2,3.0
4999,542,newly repaired,Center,14,Argishti St,4,Yerevan,118.0,http://www.myrealty.am/en/item/25905/4-senyaka...,2,monolit,14,3.0


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1234 )

In [13]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,condition,district,max_floor,street,num_rooms,region,area,url,num_bathrooms,building_type,floor,ceiling_height
894,5482,good,Arabkir,5,V.Papazyan St,3,Yerevan,80.0,http://www.myrealty.am/en/item/34245/3-senyaka...,1,stone,4,2.8
2322,6565,good,Center,4,Leo St,3,Yerevan,97.0,http://www.myrealty.am/en/item/21554/3-senyaka...,2,stone,3,3.2
4592,3592,newly repaired,Arabkir,12,Komitas Ave,2,Yerevan,80.0,http://www.myrealty.am/en/item/36588/2-senyaka...,1,panel,2,2.8
3711,1841,newly repaired,Center,5,Tumanyan St,2,Yerevan,65.0,http://www.myrealty.am/en/item/34959/2-senyaka...,1,stone,2,3.0
4130,3146,newly repaired,Qanaqer-Zeytun,9,Lepsus St,3,Yerevan,78.0,http://www.myrealty.am/en/item/19048/3-senyaka...,1,panel,6,3.0


In [14]:
X_train_num = X_train.select_dtypes(exclude=['object'])
X_test_num = X_test.select_dtypes(exclude=['object'])
X_train_num.head()

Unnamed: 0.1,Unnamed: 0,max_floor,num_rooms,area,num_bathrooms,floor,ceiling_height
894,5482,5,3,80.0,1,4,2.8
2322,6565,4,3,97.0,2,3,3.2
4592,3592,12,2,80.0,1,2,2.8
3711,1841,5,2,65.0,1,2,3.0
4130,3146,9,3,78.0,1,6,3.0


In [15]:
X_train_num.corr()

Unnamed: 0.1,Unnamed: 0,max_floor,num_rooms,area,num_bathrooms,floor,ceiling_height
Unnamed: 0,1.0,0.010424,0.06061,0.023548,0.042754,-0.008888,0.036236
max_floor,0.010424,1.0,0.037436,0.158072,0.103681,0.631435,-0.057251
num_rooms,0.06061,0.037436,1.0,0.748989,0.270376,0.053953,0.062985
area,0.023548,0.158072,0.748989,1.0,0.415477,0.106946,0.190076
num_bathrooms,0.042754,0.103681,0.270376,0.415477,1.0,0.05659,0.207212
floor,-0.008888,0.631435,0.053953,0.106946,0.05659,1.0,-0.045996
ceiling_height,0.036236,-0.057251,0.062985,0.190076,0.207212,-0.045996,1.0


In [16]:
deg = 3
poly = PolynomialFeatures(degree = deg, include_bias=False)
X_train_Poly = poly.fit_transform(X_train_num)
X_test_Poly = poly.fit_transform(X_test_num)


In [17]:
target_feature_names_train = ['x'.join(['{}^{}'.format(pair[0],pair[1]) for pair in tuple if pair[1]!=0]) for tuple in [zip(X_train_num.columns,p) for p in poly.powers_]]
print('n of columns of target_feature_names_train', len(target_feature_names_train))
X_train_Poly = pd.DataFrame(X_train_Poly, columns = target_feature_names_train)

n of columns of target_feature_names_train 170543


In [18]:
target_feature_names_test = ['x'.join(['{}^{}'.format(pair[0],pair[1]) for pair in tuple if pair[1]!=0]) for tuple in [zip(X_test_num.columns,p) for p in poly.powers_]]
print('n of columns of target_feature_names_test', len(target_feature_names_test))
X_test_Poly = pd.DataFrame(X_test_Poly, columns = target_feature_names_test)

n of columns of target_feature_names_test 170543


In [19]:
X_train_Poly.head()

Unnamed: 0,Unnamed: 0^1,max_floor^1,num_rooms^1,area^1,num_bathrooms^1,floor^1,ceiling_height^1,Unnamed: 0^2,Unnamed: 0^1xmax_floor^1,Unnamed: 0^1xnum_rooms^1,...,floor^9xceiling_height^6,floor^8xceiling_height^7,floor^7xceiling_height^8,floor^6xceiling_height^9,floor^5xceiling_height^10,floor^4xceiling_height^11,floor^3xceiling_height^12,floor^2xceiling_height^13,floor^1xceiling_height^14,ceiling_height^15
0,5482.0,5.0,3.0,80.0,1.0,4.0,2.8,30052324.0,27410.0,16446.0,...,126324700.0,88427260.0,61899080.0,43329360.0,30330550.0,21231380.0,14861970.0,10403380.0,7282365.0,5097655.0
1,6565.0,4.0,3.0,97.0,2.0,3.0,3.2,43099225.0,26260.0,19695.0,...,21134460.0,22543420.0,24046320.0,25649410.0,27359370.0,29183330.0,31128880.0,33204140.0,35417750.0,37778930.0
2,3592.0,12.0,2.0,80.0,1.0,2.0,2.8,12902464.0,43104.0,7184.0,...,246727.8,345419.0,483586.6,677021.2,947829.7,1326962.0,1857746.0,2600845.0,3641182.0,5097655.0
3,1841.0,5.0,2.0,65.0,1.0,2.0,3.0,3389281.0,9205.0,3682.0,...,373248.0,559872.0,839808.0,1259712.0,1889568.0,2834352.0,4251528.0,6377292.0,9565938.0,14348910.0
4,3146.0,9.0,3.0,78.0,1.0,6.0,3.0,9897316.0,28314.0,9438.0,...,7346640000.0,3673320000.0,1836660000.0,918330000.0,459165000.0,229582500.0,114791300.0,57395630.0,28697810.0,14348910.0


In [20]:
X_train_Poly['condition'] = X_train['condition']
X_train_Poly['building_type'] = X_train['building_type']
X_train_Poly['district'] = X_train['district']
X_train_Poly['street'] = X_train['street']

# X.columns

In [21]:
X_train_poly_ohe = pd.get_dummies(X_train_Poly, prefix=['condition', 'building_type', 'district', 'street'], columns=['condition', 'building_type', 'district', 'street'])
X_train_poly_ohe.head()

Unnamed: 0,Unnamed: 0^1,max_floor^1,num_rooms^1,area^1,num_bathrooms^1,floor^1,ceiling_height^1,Unnamed: 0^2,Unnamed: 0^1xmax_floor^1,Unnamed: 0^1xnum_rooms^1,...,district_Avan,district_Center,district_Davtashen,district_Erebuni,district_Malatia-Sebastia,district_Nor Norq,district_Norq Marash,district_Qanaqer-Zeytun,district_Shengavit,district_Vahagni district
0,5482.0,5.0,3.0,80.0,1.0,4.0,2.8,30052324.0,27410.0,16446.0,...,0,0,0,0,0,0,0,0,0,0
1,6565.0,4.0,3.0,97.0,2.0,3.0,3.2,43099225.0,26260.0,19695.0,...,0,0,0,0,0,0,0,0,0,0
2,3592.0,12.0,2.0,80.0,1.0,2.0,2.8,12902464.0,43104.0,7184.0,...,0,0,0,0,0,0,0,1,0,0
3,1841.0,5.0,2.0,65.0,1.0,2.0,3.0,3389281.0,9205.0,3682.0,...,0,1,0,0,0,0,0,0,0,0
4,3146.0,9.0,3.0,78.0,1.0,6.0,3.0,9897316.0,28314.0,9438.0,...,0,1,0,0,0,0,0,0,0,0


In [22]:
X_test_Poly['condition'] = X_test['condition']
X_test_Poly['building_type'] = X_test['building_type']
X_test_Poly['district'] = X_test['district']
X_test_Poly['street'] = df['street']

# X.columns

In [23]:
X_test_poly_ohe = pd.get_dummies(X_test_Poly, prefix=['condition', 'building_type', 'district', 'street'], columns=['condition', 'building_type', 'district', 'street'])
X_test_poly_ohe.head()

Unnamed: 0,Unnamed: 0^1,max_floor^1,num_rooms^1,area^1,num_bathrooms^1,floor^1,ceiling_height^1,Unnamed: 0^2,Unnamed: 0^1xmax_floor^1,Unnamed: 0^1xnum_rooms^1,...,district_Arabkir,district_Avan,district_Center,district_Davtashen,district_Erebuni,district_Malatia-Sebastia,district_Nor Norq,district_Norq Marash,district_Qanaqer-Zeytun,district_Shengavit
0,6471.0,10.0,1.0,41.0,1.0,4.0,3.0,41873841.0,64710.0,6471.0,...,0,0,0,0,0,0,0,0,0,0
1,2137.0,9.0,5.0,133.0,3.0,1.0,2.8,4566769.0,19233.0,10685.0,...,0,0,0,0,0,0,0,0,0,0
2,1902.0,5.0,3.0,90.0,2.0,4.0,3.2,3617604.0,9510.0,5706.0,...,0,0,0,0,0,0,0,0,0,0
3,40.0,10.0,4.0,104.0,1.0,2.0,2.8,1600.0,400.0,160.0,...,0,0,0,0,0,0,0,0,0,0
4,1770.0,5.0,1.0,42.0,1.0,3.0,2.8,3132900.0,8850.0,1770.0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
miss_col_in_test = list(set(X_train_poly_ohe.columns)- set(X_test_poly_ohe.columns))
print(len(miss_col_in_test))
miss_col_in_train = list(set(X_test_poly_ohe.columns)- set(X_train_poly_ohe.columns))
print(len(miss_col_in_train))
print('miss_col_in_train', miss_col_in_train)

1
0
miss_col_in_train []


In [25]:
X_test_poly_ohe[miss_col_in_test] = 0
X_test_poly_ohe = X_test_poly_ohe.drop(miss_col_in_train, axis = 1)
X_test_poly_ohe.head()
X_test_poly_ohe =X_test_poly_ohe.reindex(columns=X_train_poly_ohe.columns)

In [26]:
linear = LinearRegression()
linear.fit(X_train_poly_ohe, y_train)

In [27]:
y_train_pred = linear.predict(X_train_poly_ohe)

In [28]:
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print('rmse_train', rmse_train)

In [29]:
y_test_pred = linear.predict(X_test_poly_ohe)

In [30]:
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print('rmse_test', rmse_test)

In [31]:
elNet = ElasticNet( alpha=0.001, l1_ratio=0.01)

In [32]:

elNet.fit(X_train_poly_ohe, y_train)

In [33]:
y_train_pred = elNet.predict(X_train_poly_ohe)

In [34]:
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print('rmse_train', rmse_train)

In [35]:
y_test_pred = elNet.predict(X_test_poly_ohe)

In [36]:
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print('rmse_test', rmse_test)

In [37]:
# al = np.linspace(0.001, 0.2, 30)
# l1 = np.linspace(0.001, 0.02, 30)
al = np.linspace(0.00001, 0.001, 10)
l1 = np.linspace(0.001, 0.2, 10)
print('al',al)
print('l1',l1)

In [None]:
elastic=ElasticNet()

params={'alpha':al,'l1_ratio':l1}
Regressor=GridSearchCV(elastic,params,scoring='neg_mean_squared_error',cv=5)
Regressor.fit(X_train_poly_ohe,y_train)

In [None]:
y_train_pred = Regressor.predict(X_train_poly_ohe)

In [None]:
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print('rmse_train', rmse_train)

In [None]:
y_test_pred = Regressor.predict(X_test_poly_ohe)

In [None]:
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print('rmse_test', rmse_test)

In [None]:
print('best parameter: ', Regressor.best_params_)

In [None]:
elNet = ElasticNet( alpha=0.001, l1_ratio=0.001)

In [None]:

elNet.fit(X_train_poly_ohe, y_train)

In [None]:
y_train_pred = elNet.predict(X_train_poly_ohe)

In [None]:
mse_train = mean_squared_error(X_train_poly_ohe, y_train_pred)
rmse_train = np.sqrt(mse_train)
print('rmse_train', rmse_train)

In [None]:
y_test_pred = elNet.predict(X_test)

In [None]:
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print('rmse_test', rmse_test)

In [None]:
model = ...

In [None]:
# def final_predict(final_test_df):
#     1. preprocessing of final_test_df (scaling, one hot encoding ...)
#     2. make sure that columns and their order in train and test are the same
#     3. return predictions

In [None]:
# df = pd.read_csv('houses_test.csv')
# final_predict(df)