In [71]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [43]:
# Read the training set and the held-out test set from parquet files
# located in the current working directory.
train, fpt = (pd.read_parquet(ruta) for ruta in ('train.parquet', 'test.parquet'))


In [44]:
# Display the column labels of the training frame.
train.columns

Index(['id', 'url', 'region', 'region_url', 'price', 'type', 'sqfeet', 'beds',
       'baths', 'cats_allowed', 'dogs_allowed', 'smoking_allowed',
       'wheelchair_access', 'electric_vehicle_charge', 'comes_furnished',
       'laundry_options', 'parking_options', 'image_url', 'description', 'lat',
       'long', 'state'],
      dtype='object')

In [45]:
# Columns removed from the working copy of the training frame.
columnas_descartadas = ['url', 'region', 'region_url', 'image_url', 'description', 'state']
train_d = train.drop(columns=columnas_descartadas)

# Count how many times each remaining column label occurs.
train_d.columns.value_counts()


id                         1
price                      1
type                       1
sqfeet                     1
beds                       1
baths                      1
cats_allowed               1
dogs_allowed               1
smoking_allowed            1
wheelchair_access          1
electric_vehicle_charge    1
comes_furnished            1
laundry_options            1
parking_options            1
lat                        1
long                       1
dtype: int64

# Es necesario normalizar el nombre de las columnas para poder trabajar mejor sobre las mismas. Luego de esto aplicamos los cambios requeridos por la función antes mencionada. Ahora tenemos la columna target, que es nuestro objetivo para predecir.

In [46]:
# Bucket the rental price into three classes and store both the text label
# (`category_price`) and its numeric code (`pred`, the modelling target):
#   price <  1000          -> 'low'    / 0
#   1000 <= price < 2000   -> 'medium' / 1
#   price >= 2000          -> 'high'   / 2
#
# This is vectorised instead of looping with `iterrows`, and it addresses the
# columns by name, so it does not depend on column positions or on the index
# being 0..n-1. Rows that match neither threshold (including a missing price)
# fall into class 0 / 'low', so `category_price` only ever holds strings.
etiquetas_precio = np.array(['low', 'medium', 'high'], dtype=object)
precio = train_d['price']

# np.select takes the first matching condition, so test the higher threshold first.
codigo_precio = np.select([precio >= 2000, precio >= 1000], [2, 1], default=0)

train_d['category_price'] = etiquetas_precio[codigo_precio]
train_d['pred'] = codigo_precio


In [51]:
# Display the working frame, including the category_price and pred columns.
train_d

Unnamed: 0,id,price,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,lat,long,category_price,pred
0,7048013474,1350,house,1200,2,2.0,1,1,1,0,0,0,w/d in unit,detached garage,43.5851,-116.2250,medium,1
1,7043931179,1115,apartment,694,1,1.0,1,1,1,0,0,0,w/d in unit,carport,38.9137,-104.7800,medium,1
2,7048254516,1129,apartment,900,2,2.0,0,0,1,0,0,0,w/d hookups,off-street parking,36.7922,-76.1643,medium,1
3,7041032577,1580,house,1469,3,2.0,1,1,1,0,0,0,w/d in unit,,33.5623,-112.0560,medium,1
4,7048588701,995,apartment,700,1,1.0,1,1,1,0,0,0,w/d in unit,carport,36.0595,-86.6592,low,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346474,7050982281,672,apartment,660,1,1.0,1,1,1,0,0,0,laundry on site,,37.9591,-87.5293,low,0
346475,7049418251,2122,apartment,1099,2,2.0,1,1,1,0,0,0,w/d in unit,off-street parking,32.6279,-117.0370,high,2
346476,7048268235,1014,apartment,1104,2,2.0,1,1,0,0,0,0,w/d hookups,off-street parking,33.9659,-80.9355,medium,1
346477,7026721229,935,apartment,1050,2,2.0,0,0,0,0,0,0,w/d in unit,detached garage,48.1995,-101.2800,low,0


#Este dataframe tiene variables categóricas; se seleccionan y se les aplica el método OneHotEncoder, ya que un tratamiento **ORDINAL** habría introducido ruido en los datos (un orden artificial entre categorías), dando resultados poco satisfactorios.

In [77]:
# One-hot encode the three categorical columns. The encoder's category values
# (concatenated across the three columns) are used directly as column labels.
columnas_categoricas = ['laundry_options', 'parking_options', 'type']
depar_encoder = OneHotEncoder()
matriz_1hot = depar_encoder.fit_transform(train_d[columnas_categoricas]).toarray()
etiquetas_1hot = np.concatenate(depar_encoder.categories_)

# Append the flag columns and the numeric target to the encoded frame in one
# step; assignment aligns on the row index.
datos_house_1hot = pd.DataFrame(matriz_1hot, columns=etiquetas_1hot).assign(
    dogs_allowed=train_d['dogs_allowed'],
    smoking_allowed=train_d['smoking_allowed'],
    wheelchair_access=train_d['wheelchair_access'],
    electric_vehicle_charge=train_d['electric_vehicle_charge'],
    comes_furnished=train_d['comes_furnished'],
    cats_allowed=train_d['cats_allowed'],
    target=train_d["pred"],
)
datos_house_1hot

Unnamed: 0,laundry in bldg,laundry on site,no laundry on site,w/d hookups,w/d in unit,None,attached garage,carport,detached garage,no parking,...,loft,manufactured,townhouse,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,cats_allowed,target
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1,1,0,0,0,1,1
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1,1,0,0,0,1,1
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,1,0,0,0,0,1
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1,1,0,0,0,1,1
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346474,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1,1,0,0,0,1,0
346475,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1,1,0,0,0,1,2
346476,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1,0,0,0,0,1,1
346477,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


## Para crear nuestro modelo se escoge la regresión logística, ya que resulta útil para los casos en los que se desea predecir la presencia o ausencia de una característica o resultado. Aquí se aplica a un objetivo de tres clases (0 = low, 1 = medium, 2 = high).

In [98]:
# Fixed seed so the train/validation split (and therefore the fitted model
# and the reported score) is the same on every run of the notebook.
SEMILLA = 42

# Feature columns fed to the model: the one-hot columns for laundry, parking
# and housing type, followed by the flag columns. The order here is the
# feature order the model is fitted with.
columnas_modelo = [
    'laundry in bldg', 'laundry on site',
    'no laundry on site', 'w/d hookups',
    'w/d in unit',
    'attached garage', 'carport',
    'detached garage', 'no parking',
    'off-street parking', 'street parking',
    'valet parking',
    'apartment',
    'condo', 'cottage/cabin',
    'duplex', 'flat',
    'house', 'in-law',
    'loft',
    'manufactured', 'townhouse',
    'dogs_allowed', 'smoking_allowed',
    'wheelchair_access', 'electric_vehicle_charge',
    'comes_furnished', 'cats_allowed',
]

log_reg = LogisticRegression(solver='lbfgs', max_iter=5000)

# Hold out 30% of the rows for evaluation.
datos_entrena, datos_prueba, clase_entrena, clase_prueba = train_test_split(
    datos_house_1hot[columnas_modelo],
    datos_house_1hot["target"],
    test_size=0.30,
    random_state=SEMILLA,
)

modelo = log_reg.fit(datos_entrena, clase_entrena)

# Print probabilities in plain decimal notation instead of scientific notation.
np.set_printoptions(suppress=True)
print("Columna prediccion: ",modelo.predict(datos_prueba))
print("Información referente a la probabilidad de",modelo.predict_proba(datos_prueba))
# `score` is the mean accuracy on the held-out rows, shown as a percentage.
print("Probabilidad de aciertos: ",modelo.score(datos_prueba,clase_prueba)*100)

Columna prediccion:  [1 0 0 ... 1 0 0]
Información referente a la probabilidad de [[0.28408456 0.52796266 0.18795278]
 [0.63892911 0.31232499 0.04874591]
 [0.63600986 0.31174677 0.05224337]
 ...
 [0.43562054 0.51121856 0.05316089]
 [0.57832957 0.40822876 0.01344167]
 [0.66752244 0.30192879 0.03054877]]
Probabilidad de aciertos:  60.245901639344254


## Ahora probamos el modelo con el archivo de prueba: lo cargamos y normalizamos las columnas.

In [100]:
# Encode the test set with the encoder that was fitted on the training data
# (`depar_encoder`), not a freshly fitted one. This guarantees the test frame
# has the same one-hot columns as the training frame even when a category is
# absent from the test file. With the encoder's default settings, a category
# never seen during training is expected to raise instead of silently
# shifting columns.
depar_encoder_test = depar_encoder  # name kept in case later cells refer to it

columnas_categoricas_test = ['laundry_options', 'parking_options', 'type']
columnas_binarias_test = [
    'dogs_allowed', 'smoking_allowed', 'wheelchair_access',
    'electric_vehicle_charge', 'comes_furnished', 'cats_allowed',
]

datos_house_1hot_test = pd.DataFrame(
    depar_encoder_test.transform(fpt[columnas_categoricas_test]).toarray(),
    columns=np.concatenate(depar_encoder_test.categories_),
    index=fpt.index,
).assign(**{columna: fpt[columna] for columna in columnas_binarias_test})

# Keep exactly the columns the model was fitted on, in the same order. This
# also leaves out the `None` (missing-value) indicator columns, which were
# not used for training.
datos_house_1hot_test = datos_house_1hot_test[list(datos_entrena.columns)]

modelo_test = modelo.predict(datos_house_1hot_test)
df_test = pd.DataFrame(modelo_test,columns=["pred"])
# Write the predictions as a one-column CSV without the index.
df_test.to_csv("juankarlos1983.csv",index=False)
print('Columna con la prediccion : ',df_test)

Columna con la prediccion :         pred
0         0
1         0
2         1
3         1
4         0
...     ...
38493     1
38494     0
38495     1
38496     1
38497     0

[38498 rows x 1 columns]
