In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import common.common_machine_learning as common
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import json
from difflib import SequenceMatcher
import heapq

In [2]:
train = pd.read_csv('sets_de_datos/train.csv', index_col = 0)
test = pd.read_csv('sets_de_datos/test.csv', index_col = 0)

In [3]:
stop_words_sp = set(stopwords.words('spanish'))
stop_words_en = set(stopwords.words('english'))
stopwords = stop_words_sp | stop_words_en
stopwords.add('para')
spanish_stemmer = SnowballStemmer('spanish')
stopwords = set(map(spanish_stemmer.stem, stopwords))

In [4]:
df_desc = train[['descripcion']].fillna("")

In [5]:
with open('palabras_importantes.json') as f:
    frecuencias = json.loads(f.read())

In [6]:
palabras_filtradas = dict(filter(lambda k:spanish_stemmer.stem(k[0].lower()) not in stopwords, frecuencias.items()))

In [7]:
palabras_top = heapq.nlargest(20, palabras_filtradas.items(), key=lambda i: i[1])
palabras_top_aux = list(map(lambda x:x[0],palabras_top))
palabras_top_stem = list(map(spanish_stemmer.stem, palabras_top_aux))

In [8]:
test

Unnamed: 0_level_0,titulo,descripcion,tipodepropiedad,direccion,ciudad,provincia,antiguedad,habitaciones,garages,banos,...,metrostotales,idzona,lat,lng,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4941,"casa en venta en miguel hidalgo, distrito federal",<p>excelente casa estilo moderno.</p>,Casa,Bosque de Cedros,Miguel Hidalgo,Distrito Federal,29.0,3.0,,4.0,...,,,19.408668,-99.246767,2013-07-20 00:00:00,0.0,0.0,0.0,0.0,0.0
51775,departamentos en venta en montebello,<p>departamento una recamara:\n</p><p>departam...,Apartamento,,Mérida,Yucatán,,1.0,1.0,1.0,...,67.0,113851.0,21.032480,-89.592424,2015-10-24 00:00:00,0.0,0.0,0.0,0.0,0.0
115253,departamento nuevo delegación coyoacán de 87 m...,"departamento nuevo de 87.06 m2, 1 cajón de est...",Apartamento,"Pueblo de los Reyes, Coyoacán, Mexico D.F.",Coyoacán,Distrito Federal,0.0,2.0,1.0,2.0,...,100.0,23620.0,19.332829,-99.152913,2015-05-30 00:00:00,0.0,0.0,0.0,0.0,1.0
299321,departamento en venta en acapulco,<p> raíces dv001 precioso departamento tipo k...,Apartamento,,Acapulco de Juárez,Guerrero,2.0,2.0,2.0,2.0,...,86.0,129347.0,16.860487,-99.878383,2015-04-02 00:00:00,0.0,0.0,0.0,0.0,0.0
173570,bonita casa sola equipada de dos niveles en lo...,"<p>casa sola, bonita de dos rec&aacute;maras u...",Casa,CEDROS,Tultitlán,Edo. de México,10.0,2.0,1.0,1.0,...,76.0,57125.0,19.640482,-99.127273,2013-08-15 00:00:00,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75094,oportunidad!! se vende amplia casa en col. moc...,oportunidad!! ideal para oficina o casa habita...,Casa,Oriente 172 # 265,Venustiano Carranza,Distrito Federal,20.0,4.0,3.0,3.0,...,,275741.0,19.434821,-99.092517,2015-09-19 00:00:00,0.0,0.0,0.0,1.0,1.0
171847,colinas de ecatepec,"casa, sala comedor, patio de servicio, buenas ...",Casa,colinas,Ecatepec de Morelos,Edo. de México,10.0,3.0,1.0,2.0,...,87.0,57474.0,,,2016-10-19 00:00:00,0.0,0.0,0.0,1.0,1.0
138313,estrene hermosa casa en sierra morena,hermosa casa lista para habitarse ubicada en f...,Casa,s/calle,Guadalupe,Nuevo León,5.0,3.0,2.0,2.0,...,,72224.0,,,2014-12-02 00:00:00,0.0,0.0,0.0,1.0,1.0
271268,zen house i venta de linda casa con acabados ...,hermosa casa con acabados de lujo en fracciona...,Casa,Zen House l,Querétaro,Querétaro,0.0,2.0,1.0,2.0,...,144.0,83960.0,20.591773,-100.327615,2016-10-21 00:00:00,0.0,0.0,0.0,1.0,1.0


In [9]:
palabras_top

[('baño', 256482),
 ('recamaras', 213645),
 ('casa', 164959),
 ('cocina', 155812),
 ('sala', 137112),
 ('comedor', 108288),
 ('servicio', 98687),
 ('cuenta', 94218),
 ('completo', 90281),
 ('amplia', 89055),
 ('planta', 85709),
 ('pp', 85627),
 ('cuarto', 77275),
 ('dos', 76364),
 ('estacionamiento', 74024),
 ('excelente', 72034),
 ('autos', 70873),
 ('jardín', 68982),
 ('principal', 67324),
 ('ubicación', 64108)]

In [10]:
def get_distancia(descripcion, palabras_top, palabras_top_stem):
    '''Recibe una descipción y devuelve un diccionario con las distancias a sus palabras top'''
    distancias = dict(map(lambda x: (x[0],0),palabras_top))
    descipcion = str(descripcion)
    for palabra in descripcion.split():
        palabra_stem = spanish_stemmer.stem(palabra)
        if palabra_stem in palabras_top_stem:
            palabra_top = palabras_top[palabras_top_stem.index(palabra_stem)][0]
            distancia = SequenceMatcher(None, palabra, palabra_top).ratio()
            distancias[palabra_top] = max(distancias[palabra_top], distancia)
    return distancias

In [11]:
palabras_top_solas = list(map(lambda x: x[0],palabras_top)) 

# Deje guardado un csv con las 20 palabras mas utilizadas y su distancia a ellas para cada descripcion

# Tratado de NaN's para latitud y longitud

In [12]:
def tratar_lat_long(df,ciudades,provincias):
    df["ciudad"] = df["ciudad"].fillna("")
    df["provincia"] = df["provincia"].fillna("")
    df["lat"] = df.apply(lambda x: x["lat"] if not np.isnan(x["lat"]) else ciudades["Latitude"]\
                           .get(x["ciudad"],provincias["Latitude"].get(x["provincia"],23.062283)), axis=1)
    df["lng"] = df.apply(lambda x: x["lng"] if not np.isnan(x["lng"]) else ciudades["Longitude"]\
                           .get(x["ciudad"],provincias["Longitude"].get(x["provincia"],-109.699951)), axis=1)
    return df

def parser_ciudades(df):
    df["Latitude"] = df["Latitude"].map(lambda x: float(".".join(str(x).split("°"))[:-2]))
    df["Longitude"] = df["Longitude"].map(lambda x: -1*abs(float(".".join(str(x).split("°"))[:-2])))
    return df.to_dict()
    

In [13]:
lat_long_ciudades = pd.read_csv("sets_de_datos/lat_lon.csv", index_col = 0) # https://www.mapsofworld.com/lat_long/mexico-lat-long.html
lat_long_provincias = pd.read_csv("sets_de_datos/provincias.csv", index_col = 0) # https://www.distancelatlong.com/country/mexico

In [14]:
ciudades = parser_ciudades(lat_long_ciudades)
train = tratar_lat_long(train, ciudades, lat_long_provincias.to_dict())

#### Se utilizaron datasets encontrados en internet para generalizar las latitudes y longitudes de diferentes ciudades. En caso de no contar con dicha información se recurrio a un dataset de provincias. En caso de faltar la información de provincia se recurrio a la latitud y longitud Mexicana segun Google

In [15]:
train_final = train[["lat","lng","precio"]]
train_final["año"] = pd.to_datetime(train["fecha"]).dt.year
train_final["mes"] = pd.to_datetime(train["fecha"]).dt.month

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [16]:
train_final.to_csv("sets_de_datos/train_final1.csv")