In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
import pickle

In [2]:
df = pd.read_csv('houses_train.csv')
train, test = train_test_split(df, test_size=0.2, shuffle=True, random_state=1234 )
test.to_csv('houses_test.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,price,condition,district,max_floor,street,num_rooms,region,area,url,num_bathrooms,building_type,floor,ceiling_height
0,4598,100000.0,newly repaired,Arabkir,6,Kievyan St,3,Yerevan,96.0,http://www.myrealty.am/en/item/26229/3-senyaka...,1,stone,4,3.0
1,5940,52000.0,good,Arabkir,14,Mamikoniants St,3,Yerevan,78.0,http://www.myrealty.am/en/item/32897/3-senyaka...,1,panel,10,2.8
2,2302,52000.0,newly repaired,Qanaqer-Zeytun,9,M. Melikyan St,3,Yerevan,97.0,http://www.myrealty.am/en/item/1459/apartment-...,1,panel,1,2.8
3,5628,130000.0,good,Center,4,Spendiaryan St,3,Yerevan,80.0,http://www.myrealty.am/en/item/2099/3-senyakan...,1,stone,2,3.2
4,760,81600.0,zero condition,Center,9,Ler. Kamsar St,3,Yerevan,107.0,http://www.myrealty.am/en/item/22722/3-senyaka...,1,monolit,9,3.0


In [4]:
df['condition'].unique().shape #1

(3,)

In [5]:
df['district'].unique().shape #3

(13,)

In [6]:
df['street'].unique().shape #4

(350,)

In [7]:
df['region'].unique().shape

(1,)

In [8]:
df['url'].unique().shape

(5001,)

In [9]:
df['building_type'].unique().shape #2

(4,)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      5001 non-null   int64  
 1   price           5001 non-null   float64
 2   condition       5001 non-null   object 
 3   district        5001 non-null   object 
 4   max_floor       5001 non-null   int64  
 5   street          5001 non-null   object 
 6   num_rooms       5001 non-null   int64  
 7   region          5001 non-null   object 
 8   area            5001 non-null   float64
 9   url             5001 non-null   object 
 10  num_bathrooms   5001 non-null   int64  
 11  building_type   5001 non-null   object 
 12  floor           5001 non-null   int64  
 13  ceiling_height  5001 non-null   float64
dtypes: float64(3), int64(5), object(6)
memory usage: 547.1+ KB


In [11]:
X = df.drop(['price', 'Unnamed: 0',], axis = 1)
y = df.loc[:, df.columns == "price"]
X

Unnamed: 0,condition,district,max_floor,street,num_rooms,region,area,url,num_bathrooms,building_type,floor,ceiling_height
0,newly repaired,Arabkir,6,Kievyan St,3,Yerevan,96.0,http://www.myrealty.am/en/item/26229/3-senyaka...,1,stone,4,3.0
1,good,Arabkir,14,Mamikoniants St,3,Yerevan,78.0,http://www.myrealty.am/en/item/32897/3-senyaka...,1,panel,10,2.8
2,newly repaired,Qanaqer-Zeytun,9,M. Melikyan St,3,Yerevan,97.0,http://www.myrealty.am/en/item/1459/apartment-...,1,panel,1,2.8
3,good,Center,4,Spendiaryan St,3,Yerevan,80.0,http://www.myrealty.am/en/item/2099/3-senyakan...,1,stone,2,3.2
4,zero condition,Center,9,Ler. Kamsar St,3,Yerevan,107.0,http://www.myrealty.am/en/item/22722/3-senyaka...,1,monolit,9,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4996,newly repaired,Arabkir,5,Griboedov St,3,Yerevan,97.0,http://www.myrealty.am/en/item/36852/3-senyaka...,1,stone,4,2.8
4997,newly repaired,Arabkir,4,Orbeli Yeghbayrner St,3,Yerevan,71.0,http://www.myrealty.am/en/item/13933/Apartment...,1,stone,4,2.8
4998,zero condition,Center,5,Mashtots Ave,1,Yerevan,40.0,http://www.myrealty.am/en/item/31190/1-senyaka...,1,stone,2,3.0
4999,newly repaired,Center,14,Argishti St,4,Yerevan,118.0,http://www.myrealty.am/en/item/25905/4-senyaka...,2,monolit,14,3.0


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1234 )

In [13]:
X_train.head()

Unnamed: 0,condition,district,max_floor,street,num_rooms,region,area,url,num_bathrooms,building_type,floor,ceiling_height
894,good,Arabkir,5,V.Papazyan St,3,Yerevan,80.0,http://www.myrealty.am/en/item/34245/3-senyaka...,1,stone,4,2.8
2322,good,Center,4,Leo St,3,Yerevan,97.0,http://www.myrealty.am/en/item/21554/3-senyaka...,2,stone,3,3.2
4592,newly repaired,Arabkir,12,Komitas Ave,2,Yerevan,80.0,http://www.myrealty.am/en/item/36588/2-senyaka...,1,panel,2,2.8
3711,newly repaired,Center,5,Tumanyan St,2,Yerevan,65.0,http://www.myrealty.am/en/item/34959/2-senyaka...,1,stone,2,3.0
4130,newly repaired,Qanaqer-Zeytun,9,Lepsus St,3,Yerevan,78.0,http://www.myrealty.am/en/item/19048/3-senyaka...,1,panel,6,3.0


In [14]:
X_train_num = X_train.select_dtypes(exclude=['object'])
X_test_num = X_test.select_dtypes(exclude=['object'])
X_test_num['street'] = X_test['street']
X_train_num.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_num['street'] = X_test['street']


Unnamed: 0,max_floor,num_rooms,area,num_bathrooms,floor,ceiling_height
894,5,3,80.0,1,4,2.8
2322,4,3,97.0,2,3,3.2
4592,12,2,80.0,1,2,2.8
3711,5,2,65.0,1,2,3.0
4130,9,3,78.0,1,6,3.0


In [15]:
X_train_num.corr()

Unnamed: 0,max_floor,num_rooms,area,num_bathrooms,floor,ceiling_height
max_floor,1.0,0.037436,0.158072,0.103681,0.631435,-0.057251
num_rooms,0.037436,1.0,0.748989,0.270376,0.053953,0.062985
area,0.158072,0.748989,1.0,0.415477,0.106946,0.190076
num_bathrooms,0.103681,0.270376,0.415477,1.0,0.05659,0.207212
floor,0.631435,0.053953,0.106946,0.05659,1.0,-0.045996
ceiling_height,-0.057251,0.062985,0.190076,0.207212,-0.045996,1.0


In [16]:
X_train_num['condition'] = X_train['condition']
X_train_num['building_type'] = X_train['building_type']
X_train_num['district'] = X_train['district']
X_train_num['street'] = X_train['street']


# X.columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_num['condition'] = X_train['condition']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_num['building_type'] = X_train['building_type']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_num['district'] = X_train['district']
A value is trying to be set on a copy of a slice from a

In [17]:
X_train_ohe = pd.get_dummies(X_train_num, prefix=['condition', 'building_type', 'district','street'], columns=['condition', 'building_type', 'district','street'])
X_train_ohe.head()

Unnamed: 0,max_floor,num_rooms,area,num_bathrooms,floor,ceiling_height,condition_good,condition_newly repaired,condition_zero condition,building_type_monolit,...,street_Vratsakan 4 alley,street_Vratsakan St,street_Vratsyan St,street_Yekmalyan St,street_Yerznkyan St,street_Z. Sarkavag 3 dead end,street_Z. Sarkavag St,street_Z.Andranik St,street_Zakyan St,street_Zavaryan St
894,5,3,80.0,1,4,2.8,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2322,4,3,97.0,2,3,3.2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4592,12,2,80.0,1,2,2.8,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3711,5,2,65.0,1,2,3.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4130,9,3,78.0,1,6,3.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
X_test_num['condition'] = X_test['condition']
X_test_num['building_type'] = X_test['building_type']
X_test_num['district'] = X_test['district']
X_test_num['street'] = X_test['street']
# X.columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_num['condition'] = X_test['condition']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_num['building_type'] = X_test['building_type']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_num['district'] = X_test['district']
A value is trying to be set on a copy of a slice from a DataF

In [19]:
X_test_ohe = pd.get_dummies(X_test_num, prefix=['condition', 'building_type', 'district', 'street'], columns=['condition', 'building_type', 'district', 'street'])
X_test_ohe.head()

Unnamed: 0,max_floor,num_rooms,area,num_bathrooms,floor,ceiling_height,condition_good,condition_newly repaired,condition_zero condition,building_type_monolit,...,street_Varuյan district,street_Verin Antarayin,street_Verin Shengavit,street_Vilnyus St,street_Vratsakan St,street_Yekmalyan St,street_Yerznkyan St,street_Z. Sarkavag St,street_Z.Andranik St,street_Zakyan St
2706,10,1,41.0,1,4,3.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2436,9,5,133.0,3,1,2.8,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1201,5,3,90.0,2,4,3.2,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1486,10,4,104.0,1,2,2.8,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4286,5,1,42.0,1,3,2.8,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
X_train_ohe.columns.to_list()

['max_floor',
 'num_rooms',
 'area',
 'num_bathrooms',
 'floor',
 'ceiling_height',
 'condition_good',
 'condition_newly repaired',
 'condition_zero condition',
 'building_type_monolit',
 'building_type_other',
 'building_type_panel',
 'building_type_stone',
 'district_Achapnyak',
 'district_Arabkir',
 'district_Avan',
 'district_Center',
 'district_Davtashen',
 'district_Erebuni',
 'district_Malatia-Sebastia',
 'district_Nor Norq',
 'district_Norq Marash',
 'district_Qanaqer-Zeytun',
 'district_Shengavit',
 'district_Vahagni district',
 'street_ Z. Kanaqertsi',
 'street_16 District',
 'street_16 District 1 chain',
 'street_17 District',
 'street_A. Avetisyan St',
 'street_A. Babajanyan St',
 'street_A. Hovhannisyan St',
 'street_A. Khachatrian St',
 'street_A. Tigranyan St',
 'street_A.Aharonyan St',
 'street_A.Manukyan St',
 'street_A.Petrosyan St',
 'street_A.Tigranyan St',
 'street_Abelyan St',
 'street_Abovyan St',
 'street_Acharyan St',
 'street_Adonts St',
 'street_Agatangeghos 

In [21]:
['max_floor',
 'num_rooms',
 'area',
 'num_bathrooms',
 'floor',
 'ceiling_height',
 'condition_good',
 'condition_newly repaired',
 'condition_zero condition',
 'building_type_monolit',
 'building_type_other',
 'building_type_panel',
 'building_type_stone',
 'district_Achapnyak',
 'district_Arabkir',
 'district_Avan',
 'district_Center',
 'district_Davtashen',
 'district_Erebuni',
 'district_Malatia-Sebastia',
 'district_Nor Norq',
 'district_Norq Marash',
 'district_Qanaqer-Zeytun',
 'district_Shengavit',
 'district_Vahagni district',
 'street_ Z. Kanaqertsi',
 'street_16 District',
 'street_16 District 1 chain',
 'street_17 District',
 'street_A. Avetisyan St',
 'street_A. Babajanyan St',
 'street_A. Hovhannisyan St',
 'street_A. Khachatrian St',
 'street_A. Tigranyan St',
 'street_A.Aharonyan St',
 'street_A.Manukyan St',
 'street_A.Petrosyan St',
 'street_A.Tigranyan St',
 'street_Abelyan St',
 'street_Abovyan St',
 'street_Acharyan St',
 'street_Adonts St',
 'street_Agatangeghos St',
 'street_Aghasi St',
 'street_Aghayan St',
 'street_Aghbyur Serob St',
 'street_Alixanyan exbayrner',
 'street_Alma-ata St',
 'street_Amiryan St',
 'street_Antarayin St',
 'street_Arabkir 17 St',
 'street_Arabkir 19 St',
 'street_Arabkir 21 St',
 'street_Arabkir 25 St',
 'street_Arabkir 27 St',
 'street_Arabkir 39 St',
 'street_Arabkir 43 St',
 'street_Arabkir 51 St',
 'street_Aragats St',
 'street_Arakelian St',
 'street_Aram St',
 'street_Araratyan 1 blok',
 'street_Araratyan St',
 'street_Argishti St',
 'street_Arinj',
 'street_Armenak Armenakyan St',
 'street_Arshakunyats Ave',
 'street_Artashisyan St',
 'street_Artsakh ave',
 'street_Artsakh bl',
 'street_Artsruni St',
 'street_Arzumanyan St',
 'street_Atoyan st',
 'street_Avan 4 St',
 'street_Avan-Arinj 1 micro',
 'street_Avan-Arinj 2 micro',
 'street_Avanesov St',
 'street_Ayas St',
 'street_Aygedzor St',
 'street_Aygedzor alley',
 'street_Aygektsi St',
 'street_Aygestan 10 St',
 'street_Aygestan 11 dead end',
 'street_Aygestan 9 St',
 'street_Aygestan St',
 'street_Azatamartikner St',
 'street_Azatutyan Ave',
 'street_Azatutyan alley',
 'street_Azatutyan avenue',
 'street_Babayan St',
 'street_Badal Muradyan St',
 'street_Baghramyan 2 dead end',
 'street_Baghramyan Ave  1 blind end',
 'street_Baghramyan Ave (Kentron)',
 'street_Baghramyan av(Arabkir)',
 'street_Bagratuniats 1 alley',
 'street_Bagratuniats 4 alley',
 'street_Bagratuniats Ave',
 'street_Bagrevand district',
 'street_Bakunts St',
 'street_Barbyus St',
 'street_Bashinjaxyan 1 alley',
 'street_Bashinjaxyan 2 alley',
 'street_Bashinjaxyan St',
 'street_Baxyan St',
 'street_Bayron St',
 'street_Beknazaryan St',
 'street_Bryusov St',
 'street_Bryusov dis.',
 'street_Buzand St',
 'street_Charents St',
 'street_Charents disrtict',
 'street_Chexov St',
 'street_D. Demirchyan St',
 'street_D. Malyan St',
 'street_Davit Anhaxt St',
 'street_Davit Bek',
 'street_Davtashen 1 St',
 'street_Davtashen 1 district',
 'street_Davtashen 2 St',
 'street_Davtashen 2 district',
 'street_Davtashen 3 district',
 'street_Davtashen 4 St',
 'street_Davtashen 4 district',
 'street_Deghatan St',
 'street_Droi St',
 'street_Duryan disrtict',
 'street_Dushamb St',
 'street_Dzor 1 district',
 'street_Dzorap St',
 'street_E. Qochar St',
 'street_Emineski St',
 'street_Erebuni Masiv 1 street',
 'street_Erebuni St',
 'street_Estonakan St',
 'street_Exbayrutyan St',
 'street_Frik St',
 'street_Frunze St',
 'street_Fuchik 2 alley',
 'street_Fuchik St',
 'street_G. Hakobyanc St',
 'street_G. Qochar St',
 'street_G.Lusavorich St',
 'street_Galshoyan St',
 'street_Garegin Nzhdeh Ave',
 'street_Gayi Ave',
 'street_Ghapantsian St',
 'street_Ghazar Parpetsi St',
 'street_Glinka St',
 'street_Glinkayi St',
 'street_Gogol St',
 'street_Griboedov St',
 'street_Gulakyan St',
 'street_Gusan Sheram St',
 'street_Gyulbenkyan St',
 'street_Gyuliqevxyan St',
 'street_Gyurjyan St',
 'street_H. Avetisyan St',
 'street_H. Shahinyan St',
 'street_Hakobyan St',
 'street_Halabyan St',
 'street_Hambardzumyan St',
 'street_Hanrapetutyan St',
 'street_Hasratyan St',
 'street_Hasratyan st.',
 'street_Heratsi St',
 'street_Hin yerevantsi St',
 'street_Hovhannisyan distrct',
 'street_Hovsep Emin St',
 'street_Hr. Kochar 1 alley',
 'street_Hr.Kochar St',
 'street_Isahakyan St',
 'street_Isahakyan district',
 'street_Isakov Ave',
 'street_Isakov Ave (Mal.-Seb.)',
 'street_Israelyan St',
 'street_Janibekyan St',
 'street_Jivan St',
 'street_Jrvej district',
 'street_Juxai St',
 'street_K. Ulnetsi St',
 'street_Kajaznuni St',
 'street_Kalents St',
 'street_Kanaker 1 St',
 'street_Kanaker 1 alley',
 'street_Kanaker 12 passage',
 'street_Kanaker 14 St',
 'street_Kanaker 9 St',
 'street_Karaxanyan St',
 'street_Kasyan St',
 'street_Keri St',
 'street_Khaghagh Don St',
 'street_Khanjyan St',
 'street_Khnko-Aper St',
 'street_Khorenatsi 2 alley',
 'street_Khudyakov St',
 'street_Kievyan St',
 'street_Kilikia district',
 'street_Koghbatsi St',
 'street_Komitas Ave',
 'street_Koryun St',
 'street_Koryun alley',
 'street_Kristapor 3 alley',
 'street_Kurghinyan St',
 'street_Kurghinyan alley',
 'street_Lalayants st',
 'street_Leningradyan St',
 'street_Leo St',
 'street_Lepsus St',
 'street_Ler. Kamsar St',
 'street_Lisinyan st.',
 'street_Lukashin St',
 'street_Lvovyan St',
 'street_Lyuqsemburg 1 alley',
 'street_M. Avetisyan St',
 'street_M. Melikyan St',
 'street_M. Meliqyan St',
 'street_M.Khorenatsi St',
 'street_Malatia St',
 'street_Mamikoniants St',
 'street_Manandyan St',
 'street_Mantashyan St.',
 'street_Mantashyan alley',
 'street_Manukian St',
 'street_Manushyan St',
 'street_Mar St',
 'street_Mar alley',
 'street_Margaryan  2 alley',
 'street_Margaryan  alley',
 'street_Margaryan St',
 'street_Mashtots Ave',
 'street_Mayak district',
 'street_Mayisi 9 St',
 'street_Mayisyan St',
 'street_Mazmanyan St',
 'street_Mher Mkrtchyan St',
 'street_Miasnikyan Ave',
 'street_Mikayelyan str',
 'street_Mikoyan St',
 'street_Minsk St',
 'street_Moldovakan St',
 'street_Moskovyan St',
 'street_Muradyan St',
 'street_Muratsan St',
 'street_N. Tigranyan St',
 'street_N. Zaryan St',
 'street_Nalbandyan St',
 'street_Nansen St',
 'street_Nar Dos 1 alley',
 'street_NarDos St',
 'street_Narekatsi district',
 'street_Nazarbekyan district',
 'street_Nazaret Surenyan St',
 'street_Nersisyan St',
 'street_Nikol Duman St',
 'street_Nizami St',
 'street_Nor Aresh 14 St',
 'street_Nor Aresh 15 St',
 'street_Nor Aresh 42 St',
 'street_Norashen district',
 'street_Norq 10 St',
 'street_Norq 4 p',
 'street_Norq 5 St',
 'street_Norq 7 p',
 'street_Norq 8 p',
 'street_Norq 9 p',
 'street_Northern Ave',
 'street_Oganov St',
 'street_Orbeli Yeghbayrner   St',
 'street_Paronyan St',
 'street_Petrosyan St',
 'street_Proshyan 1 alley',
 'street_Pushkin St',
 'street_Quchak district',
 'street_Raffi St',
 'street_Raynis St',
 'street_Riga 1 alley',
 'street_Riga St',
 'street_Rostom St',
 'street_Rostovyan St',
 'street_Rubinyants St',
 'street_S. Qocharyan St',
 'street_Sari Tagh 4th st',
 'street_Sari Tagh 6 st',
 'street_Sari Tagh 7th st',
 'street_Sarmen St',
 'street_Saryan St',
 'street_Sasna tsrer St',
 'street_Sayat Nova Ave',
 'street_Sayat Nova district',
 'street_Sebastia St',
 'street_Sebastia str',
 'street_Sevak St',
 'street_Sevqareci Saqo St',
 'street_Shahumyan 16str',
 'street_Sharur St',
 'street_Shevchenko St',
 'street_Shinararner St',
 'street_Shirak St',
 'street_Shiraz St',
 'street_Shirvanzade St',
 'street_Shopron 3 alley',
 'street_Shopron 4 alley',
 'street_Sisakyan St',
 'street_Sose St',
 'street_Spendiaryan St',
 'street_Stepanyan St',
 'street_Sundukyan St',
 'street_Surenyants St',
 'street_Svachyan str',
 'street_Tadevosyan St',
 'street_Tairov St',
 'street_Tamanyan St',
 'street_Taronctsu St',
 'street_Tbilisian hightway (Arabkir)',
 'street_Tbilisian hightway (Qanaqer-Zeytun)',
 'street_Teryan St',
 'street_Tichina str',
 'street_Tigran Mets Ave',
 'street_Tigran Mets Ave 2 alley',
 'street_Tigran Mets Ավե (Erebuni)',
 'street_Totovents St',
 'street_Toybughin St',
 'street_Tpagrichner St',
 'street_Ts.Isakov St',
 'street_Tsarav Aghbyur St',
 'street_Tserenc St',
 'street_Tsitsernakaberd Highway',
 'street_Tsovakal Isakov Ave',
 'street_Tumanyan 1 passage',
 'street_Tumanyan St',
 'street_Tumanyan district',
 'street_V.Antarayin St',
 'street_V.Papazyan St',
 'street_V.Vagharshyan St',
 'street_Vahakni District',
 'street_Vardanants St',
 'street_Vardanants alley',
 'street_Vardashen district',
 'street_Varuյan district',
 'street_Vaxtangov St',
 'street_Verin Antarayin',
 'street_Verin Shengavit 2th street',
 'street_Vilnyus St',
 'street_Vratsakan 2 alley',
 'street_Vratsakan 4 alley',
 'street_Vratsakan St',
 'street_Vratsyan St',
 'street_Yekmalyan St',
 'street_Yerznkyan St',
 'street_Z. Sarkavag 3 dead end',
 'street_Z. Sarkavag St',
 'street_Z.Andranik St',
 'street_Zakyan St',
 'street_Zavaryan St']

['max_floor',
 'num_rooms',
 'area',
 'num_bathrooms',
 'floor',
 'ceiling_height',
 'condition_good',
 'condition_newly repaired',
 'condition_zero condition',
 'building_type_monolit',
 'building_type_other',
 'building_type_panel',
 'building_type_stone',
 'district_Achapnyak',
 'district_Arabkir',
 'district_Avan',
 'district_Center',
 'district_Davtashen',
 'district_Erebuni',
 'district_Malatia-Sebastia',
 'district_Nor Norq',
 'district_Norq Marash',
 'district_Qanaqer-Zeytun',
 'district_Shengavit',
 'district_Vahagni district',
 'street_ Z. Kanaqertsi',
 'street_16 District',
 'street_16 District 1 chain',
 'street_17 District',
 'street_A. Avetisyan St',
 'street_A. Babajanyan St',
 'street_A. Hovhannisyan St',
 'street_A. Khachatrian St',
 'street_A. Tigranyan St',
 'street_A.Aharonyan St',
 'street_A.Manukyan St',
 'street_A.Petrosyan St',
 'street_A.Tigranyan St',
 'street_Abelyan St',
 'street_Abovyan St',
 'street_Acharyan St',
 'street_Adonts St',
 'street_Agatangeghos 

In [22]:
miss_col_in_test = list(set(X_train_ohe.columns)- set(X_test_ohe.columns))
print(len(miss_col_in_test))
miss_col_in_train = list(set(X_test_ohe.columns)- set(X_train_ohe.columns))
print(len(miss_col_in_train))
print('miss_col_in_train', miss_col_in_train)

137
21
miss_col_in_train ['street_Shahsuvaryan St', 'street_Avetisyan 4 St', 'street_Norashxarhyan St', 'street_Exbayrutyan alley', 'street_Tsaturian St', 'street_Baghramyan 2 alley', 'street_Nor Aresh 12 St', 'street_Aygestan 11 St', 'street_Davtashen 3 St', 'street_Lepsus 1 St', 'street_N. Shengavit 2 St', 'district_Nubarashen', 'street_Saxarov St', 'street_Arabkir 29 dead end', 'street_S. Safaryan St', 'street_Verin Shengavit', 'street_Hanrapetutyan dead end', 'street_Nubarashen 5 St', 'street_Nor Aresh 11 St', 'street_Shengavit', 'street_Tadevosyan alley']


In [23]:
X_test_ohe[miss_col_in_test] = 0
X_test_ohe = X_test_ohe.drop(miss_col_in_train, axis = 1)
X_test_ohe.head()
X_test_ohe =X_test_ohe.reindex(columns=X_train_ohe.columns)

In [24]:
deg = 2
poly = PolynomialFeatures(degree = deg, include_bias=False, interaction_only=True)
X_train_ohe_Poly = poly.fit_transform(X_train_ohe)
X_test_ohe_Poly = poly.fit_transform(X_test_ohe)


In [25]:
target_feature_names_train = ['x'.join(['{}^{}'.format(pair[0],pair[1]) for pair in tuple if pair[1]!=0]) for tuple in [zip(X_train_ohe.columns,p) for p in poly.powers_]]
print('n of columns of target_feature_names_train', len(target_feature_names_train))
X_train_ohe_Poly = pd.DataFrame(X_train_ohe_Poly, columns = target_feature_names_train)

n of columns of target_feature_names_train 63190


In [26]:
target_feature_names_test = ['x'.join(['{}^{}'.format(pair[0],pair[1]) for pair in tuple if pair[1]!=0]) for tuple in [zip(X_test_ohe.columns,p) for p in poly.powers_]]
print('n of columns of target_feature_names_test', len(target_feature_names_test))
X_test_ohe_Poly = pd.DataFrame(X_test_ohe_Poly, columns = target_feature_names_test)

n of columns of target_feature_names_test 63190


In [27]:
X_train_ohe_Poly.head()

Unnamed: 0,max_floor^1,num_rooms^1,area^1,num_bathrooms^1,floor^1,ceiling_height^1,condition_good^1,condition_newly repaired^1,condition_zero condition^1,building_type_monolit^1,...,street_Z. Sarkavag 3 dead end^1xstreet_Z. Sarkavag St^1,street_Z. Sarkavag 3 dead end^1xstreet_Z.Andranik St^1,street_Z. Sarkavag 3 dead end^1xstreet_Zakyan St^1,street_Z. Sarkavag 3 dead end^1xstreet_Zavaryan St^1,street_Z. Sarkavag St^1xstreet_Z.Andranik St^1,street_Z. Sarkavag St^1xstreet_Zakyan St^1,street_Z. Sarkavag St^1xstreet_Zavaryan St^1,street_Z.Andranik St^1xstreet_Zakyan St^1,street_Z.Andranik St^1xstreet_Zavaryan St^1,street_Zakyan St^1xstreet_Zavaryan St^1
0,5.0,3.0,80.0,1.0,4.0,2.8,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,3.0,97.0,2.0,3.0,3.2,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12.0,2.0,80.0,1.0,2.0,2.8,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.0,2.0,65.0,1.0,2.0,3.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9.0,3.0,78.0,1.0,6.0,3.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
linear = LinearRegression()
linear.fit(X_train_ohe_Poly, y_train)

In [29]:
y_train_pred = linear.predict(X_train_ohe_Poly)

In [30]:
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print('rmse_train', rmse_train)

rmse_train 14409.120846668393


In [31]:
y_test_pred = linear.predict(X_test_ohe_Poly)

In [32]:
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print('rmse_test', rmse_test)

rmse_test 5241839261.946664


In [33]:
# elNet = ElasticNet( alpha=0.001, l1_ratio=0.01)
elNet = ElasticNet( alpha=200, l1_ratio=1)

In [None]:

elNet.fit(X_train_ohe_Poly, y_train)

In [None]:
y_train_pred = elNet.predict(X_train_ohe_Poly)

In [None]:
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print('rmse_train', rmse_train)

In [None]:
y_test_pred = elNet.predict(X_test_ohe_Poly)

In [None]:
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print('rmse_test', rmse_test)

In [None]:
# grid_r = dict()

# grid_r['alpha'] = [1e-5,1e-4,1e-3,1e-2,1e-1,1]
# grid_l['l1_ratio'] = [1e-6,1e-5,1e-4,1e-3,1e-2,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

In [None]:
ridge=Ridge(alpha=200)

#Regressor = GridSearchCV(ridge, grid_r, scoring='neg_mean_squared_error',cv=5)
ridge.fit(X_train_ohe_Poly, y_train)

# print('best parameter: ', Regressor.best_params_)
# print('best score: ', -Regressor.best_score_)

In [None]:
y_train_pred = ridge.predict(X_train_ohe_Poly)

In [None]:
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print('rmse_train', rmse_train)

In [None]:
y_test_pred = ridge.predict(X_test_ohe_Poly)

In [None]:
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print('rmse_test', rmse_test)

In [None]:
# grid_l = dict()
# #grid_l['alpha'] = [1e-6,1e-5,1e-4,1e-3,1e-2,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
# grid_l['alpha'] = [20, 50, 60, 70, 100, 120, 150, 170, 200]

# lasso=Lasso()

# Regressor=GridSearchCV(lasso,grid_l,scoring='neg_mean_squared_error',cv=5)
# Regressor.fit(X_train_ohe_Poly_norm, y_train)



In [None]:
# print('best parameter: ', Regressor.best_params_)
# print('best score: ', -Regressor.best_score_)

In [None]:
# y_train_pred = Regressor.predict(X_train_ohe_Poly_norm)

In [None]:
# mse_train = mean_squared_error(y_train, y_train_pred)
# rmse_train = np.sqrt(mse_train)
# print('rmse_train', rmse_train)

In [None]:
# y_test_pred = Regressor.predict(X_test_ohe_Poly_norm)

In [None]:
# mse_test = mean_squared_error(y_test, y_test_pred)
# rmse_test = np.sqrt(mse_test)
# print('rmse_test', rmse_test)

In [None]:
# elastic=ElasticNet()
# grid_r = {}
# grid_r['alpha'] = [200, 210, 220, 230, 240, 250]
# grid_r['l1_ratio'] = [1]

# Regressor=GridSearchCV(elastic,grid_r,scoring='neg_mean_squared_error',cv=5)
# Regressor.fit(X_train_ohe_Poly_norm,y_train)

In [None]:
# print('best parameter: ', Regressor.best_params_)

In [None]:
# elNet = ElasticNet( alpha=170, l1_ratio=1)

In [None]:
# 
# elNet.fit(X_train_ohe_Poly_norm, y_train)

In [None]:
# y_train_pred = elNet.predict(X_train_ohe_Poly_norm)


In [None]:
# mse_train = mean_squared_error(y_train, y_train_pred)
# rmse_train = np.sqrt(mse_train)
# print('rmse_train', rmse_train)

In [None]:
# y_test_pred = elNet.predict(X_test_ohe_Poly_norm)

In [None]:
# mse_test = mean_squared_error(y_test, y_test_pred)
# rmse_test = np.sqrt(mse_test)
# print('rmse_test', rmse_test)

In [None]:
filename = 'Levon_Sargsyan_model.sav'
pickle.dump(ridge, open(filename, 'wb'))

In [None]:
model = ...

In [None]:
def final_predict(model_path):
    
    X = df.drop(['price', 'Unnamed: 0',], axis = 1)
    y = df.loc[:, df.columns == "price"]
    
    X_test_num = X.select_dtypes(exclude=['object'])
    
    X_test_num['condition'] = X['condition']
    X_test_num['building_type'] = X['building_type']
    X_test_num['district'] = X['district']
    X_test_num['street'] = X['street']
    
    X_test_ohe = pd.get_dummies(X_test_num, prefix=['condition', 'building_type', 'district','street'], columns=['condition', 'building_type', 'district','street'])

    train_columns = ['max_floor','num_rooms','area','num_bathrooms','floor','ceiling_height','condition_good','condition_newly repaired','condition_zero condition','building_type_monolit','building_type_other',
                     'building_type_panel','building_type_stone','district_Achapnyak','district_Arabkir','district_Avan','district_Center','district_Davtashen','district_Erebuni','district_Malatia-Sebastia',
                     'district_Nor Norq','district_Norq Marash','district_Qanaqer-Zeytun','district_Shengavit','district_Vahagni district','street_ Z. Kanaqertsi','street_16 District','street_16 District 1 chain',
                     'street_17 District','street_A. Avetisyan St','street_A. Babajanyan St','street_A. Hovhannisyan St','street_A. Khachatrian St','street_A. Tigranyan St','street_A.Aharonyan St','street_A.Manukyan St',
                     'street_A.Petrosyan St','street_A.Tigranyan St','street_Abelyan St','street_Abovyan St','street_Acharyan St','street_Adonts St','street_Agatangeghos St','street_Aghasi St','street_Aghayan St',
                     'street_Aghbyur Serob St','street_Alixanyan exbayrner','street_Alma-ata St','street_Amiryan St','street_Antarayin St','street_Arabkir 17 St','street_Arabkir 19 St','street_Arabkir 21 St',
                     'street_Arabkir 25 St','street_Arabkir 27 St','street_Arabkir 39 St','street_Arabkir 43 St','street_Arabkir 51 St','street_Aragats St','street_Arakelian St','street_Aram St','street_Araratyan 1 blok',
                     'street_Araratyan St','street_Argishti St','street_Arinj','street_Armenak Armenakyan St','street_Arshakunyats Ave','street_Artashisyan St','street_Artsakh ave','street_Artsakh bl','street_Artsruni St',
                     'street_Arzumanyan St','street_Atoyan st','street_Avan 4 St','street_Avan-Arinj 1 micro','street_Avan-Arinj 2 micro','street_Avanesov St','street_Ayas St','street_Aygedzor St','street_Aygedzor alley',
                     'street_Aygektsi St','street_Aygestan 10 St','street_Aygestan 11 dead end','street_Aygestan 9 St','street_Aygestan St','street_Azatamartikner St','street_Azatutyan Ave','street_Azatutyan alley',
                     'street_Azatutyan avenue','street_Babayan St','street_Badal Muradyan St','street_Baghramyan 2 dead end','street_Baghramyan Ave  1 blind end','street_Baghramyan Ave (Kentron)',
                     'street_Baghramyan av(Arabkir)','street_Bagratuniats 1 alley','street_Bagratuniats 4 alley','street_Bagratuniats Ave','street_Bagrevand district','street_Bakunts St','street_Barbyus St',
                     'street_Bashinjaxyan 1 alley','street_Bashinjaxyan 2 alley','street_Bashinjaxyan St','street_Baxyan St','street_Bayron St','street_Beknazaryan St','street_Bryusov St','street_Bryusov dis.',
                     'street_Buzand St','street_Charents St','street_Charents disrtict','street_Chexov St','street_D. Demirchyan St','street_D. Malyan St','street_Davit Anhaxt St','street_Davit Bek',
                     'street_Davtashen 1 St','street_Davtashen 1 district','street_Davtashen 2 St','street_Davtashen 2 district','street_Davtashen 3 district','street_Davtashen 4 St','street_Davtashen 4 district',
                     'street_Deghatan St','street_Droi St','street_Duryan disrtict','street_Dushamb St','street_Dzor 1 district','street_Dzorap St','street_E. Qochar St','street_Emineski St',
                     'street_Erebuni Masiv 1 street','street_Erebuni St','street_Estonakan St','street_Exbayrutyan St','street_Frik St','street_Frunze St','street_Fuchik 2 alley','street_Fuchik St',
                     'street_G. Hakobyanc St','street_G. Qochar St','street_G.Lusavorich St','street_Galshoyan St','street_Garegin Nzhdeh Ave','street_Gayi Ave','street_Ghapantsian St','street_Ghazar Parpetsi St',
                     'street_Glinka St','street_Glinkayi St','street_Gogol St','street_Griboedov St','street_Gulakyan St','street_Gusan Sheram St','street_Gyulbenkyan St','street_Gyuliqevxyan St','street_Gyurjyan St',
                     'street_H. Avetisyan St','street_H. Shahinyan St','street_Hakobyan St','street_Halabyan St','street_Hambardzumyan St','street_Hanrapetutyan St','street_Hasratyan St','street_Hasratyan st.',
                     'street_Heratsi St','street_Hin yerevantsi St','street_Hovhannisyan distrct','street_Hovsep Emin St','street_Hr. Kochar 1 alley','street_Hr.Kochar St','street_Isahakyan St','street_Isahakyan district',
                     'street_Isakov Ave','street_Isakov Ave (Mal.-Seb.)','street_Israelyan St','street_Janibekyan St','street_Jivan St','street_Jrvej district','street_Juxai St','street_K. Ulnetsi St','street_Kajaznuni St',
                     'street_Kalents St','street_Kanaker 1 St','street_Kanaker 1 alley','street_Kanaker 12 passage','street_Kanaker 14 St','street_Kanaker 9 St','street_Karaxanyan St','street_Kasyan St','street_Keri St',
                     'street_Khaghagh Don St','street_Khanjyan St','street_Khnko-Aper St','street_Khorenatsi 2 alley','street_Khudyakov St','street_Kievyan St','street_Kilikia district','street_Koghbatsi St',
                     'street_Komitas Ave','street_Koryun St','street_Koryun alley','street_Kristapor 3 alley','street_Kurghinyan St','street_Kurghinyan alley','street_Lalayants st','street_Leningradyan St',
                     'street_Leo St','street_Lepsus St','street_Ler. Kamsar St','street_Lisinyan st.','street_Lukashin St','street_Lvovyan St','street_Lyuqsemburg 1 alley','street_M. Avetisyan St','street_M. Melikyan St',
                     'street_M. Meliqyan St','street_M.Khorenatsi St','street_Malatia St','street_Mamikoniants St','street_Manandyan St','street_Mantashyan St.','street_Mantashyan alley','street_Manukian St',
                     'street_Manushyan St','street_Mar St','street_Mar alley','street_Margaryan  2 alley','street_Margaryan  alley','street_Margaryan St','street_Mashtots Ave','street_Mayak district','street_Mayisi 9 St',
                     'street_Mayisyan St','street_Mazmanyan St','street_Mher Mkrtchyan St','street_Miasnikyan Ave','street_Mikayelyan str','street_Mikoyan St','street_Minsk St','street_Moldovakan St','street_Moskovyan St',
                     'street_Muradyan St','street_Muratsan St','street_N. Tigranyan St','street_N. Zaryan St','street_Nalbandyan St','street_Nansen St','street_Nar Dos 1 alley','street_NarDos St','street_Narekatsi district',
                     'street_Nazarbekyan district','street_Nazaret Surenyan St','street_Nersisyan St','street_Nikol Duman St','street_Nizami St','street_Nor Aresh 14 St','street_Nor Aresh 15 St','street_Nor Aresh 42 St',
                     'street_Norashen district','street_Norq 10 St','street_Norq 4 p','street_Norq 5 St','street_Norq 7 p','street_Norq 8 p','street_Norq 9 p','street_Northern Ave','street_Oganov St',
                     'street_Orbeli Yeghbayrner   St','street_Paronyan St','street_Petrosyan St','street_Proshyan 1 alley','street_Pushkin St','street_Quchak district','street_Raffi St','street_Raynis St',
                     'street_Riga 1 alley','street_Riga St','street_Rostom St','street_Rostovyan St','street_Rubinyants St','street_S. Qocharyan St','street_Sari Tagh 4th st','street_Sari Tagh 6 st','street_Sari Tagh 7th st',
                     'street_Sarmen St','street_Saryan St','street_Sasna tsrer St','street_Sayat Nova Ave','street_Sayat Nova district','street_Sebastia St','street_Sebastia str','street_Sevak St','street_Sevqareci Saqo St',
                     'street_Shahumyan 16str','street_Sharur St','street_Shevchenko St','street_Shinararner St','street_Shirak St','street_Shiraz St','street_Shirvanzade St','street_Shopron 3 alley','street_Shopron 4 alley',
                     'street_Sisakyan St','street_Sose St','street_Spendiaryan St','street_Stepanyan St','street_Sundukyan St','street_Surenyants St','street_Svachyan str','street_Tadevosyan St','street_Tairov St',
                     'street_Tamanyan St','street_Taronctsu St','street_Tbilisian hightway (Arabkir)','street_Tbilisian hightway (Qanaqer-Zeytun)','street_Teryan St','street_Tichina str','street_Tigran Mets Ave',
                     'street_Tigran Mets Ave 2 alley','street_Tigran Mets Ավե (Erebuni)','street_Totovents St','street_Toybughin St','street_Tpagrichner St','street_Ts.Isakov St','street_Tsarav Aghbyur St','street_Tserenc St',
                     'street_Tsitsernakaberd Highway','street_Tsovakal Isakov Ave','street_Tumanyan 1 passage','street_Tumanyan St','street_Tumanyan district','street_V.Antarayin St','street_V.Papazyan St',
                     'street_V.Vagharshyan St','street_Vahakni District','street_Vardanants St','street_Vardanants alley','street_Vardashen district','street_Varuյan district','street_Vaxtangov St','street_Verin Antarayin',
                     'street_Verin Shengavit 2th street','street_Vilnyus St','street_Vratsakan 2 alley','street_Vratsakan 4 alley','street_Vratsakan St','street_Vratsyan St','street_Yekmalyan St','street_Yerznkyan St',
                     'street_Z. Sarkavag 3 dead end','street_Z. Sarkavag St','street_Z.Andranik St','street_Zakyan St','street_Zavaryan St']
    miss_col_in_test = list(set(train_columns)- set(X_test_ohe.columns))
    miss_col_in_train = list(set(X_test_ohe.columns)- set(train_columns))
    X_test_ohe[miss_col_in_test] = 0
    X_test_ohe = X_test_ohe.drop(miss_col_in_train, axis = 1)
    X_test_ohe =X_test_ohe.reindex(columns=train_columns)
    
    deg = 2
    poly = PolynomialFeatures(degree = deg, include_bias=False, interaction_only=True)
    X_test_ohe_Poly = poly.fit_transform(X_test_ohe)
    
    target_feature_names_test = ['x'.join(['{}^{}'.format(pair[0],pair[1]) for pair in tuple if pair[1]!=0]) for tuple in [zip(X_test_ohe.columns,p) for p in poly.powers_]]
    X_test_ohe_Poly = pd.DataFrame(X_test_ohe_Poly, columns = target_feature_names_test)
    
    loaded_model = pickle.load(open('Levon_Sargsyan_model.sav', 'rb'))
    
    y_test_pred = loaded_model.predict(X_test_ohe_Poly)
    mse_test = mean_squared_error(y, y_test_pred)
    rmse_test = np.sqrt(mse_test)
    print('rmse_test', rmse_test)
    
    # 1. preprocessing of final_test_df (scaling, one hot encoding ...)
    # 2. make sure that columns and their order in train and test are the same
    # 3. return predictions

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import pickle

df = pd.read_csv('houses_test.csv')
final_predict(df)