In [29]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
np.random.seed(500)

In [3]:
df = pd.read_csv("sets_de_datos/train.csv", usecols = ['descripcion','precio'])

In [4]:
df

Unnamed: 0,descripcion,precio
0,"depto. interior de 80.15m2, consta de sala com...",2273000.0
1,"<p>entre sonora y guerrero, atr&aacute;s del h...",3600000.0
2,descripcion \nla mejor ubicacion residencial e...,1200000.0
3,casa en privada con caseta de vigilancia casas...,650000.0
4,bonito departamento en excelentes condiciones ...,1150000.0
...,...,...
239995,vendo casa en bosques de ica residencial a 10 ...,650000.0
239996,"casa con un jardin amplio, un cuarto de servic...",1940000.0
239997,"departamento con excelente ubicación, muy cerc...",3400000.0
239998,"casa sola, dividida en cuatro departamentos de...",2890000.0


In [5]:
import re

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [6]:
stop_words_sp = set(stopwords.words('spanish'))
stop_words_en = set(stopwords.words('english'))
stop_words = stop_words_sp | stop_words_en

In [7]:
df['descripcion'] = df['descripcion'].fillna('e')\
    .map(lambda x: remove_tags(str(x)))\
    .map(lambda x: "".join([letra.lower() for letra in x if letra.isalpha() or letra == ' ']))\
    .map(lambda x: " ".join([palabra for palabra in x.split() if palabra not in stop_words_sp and len(palabra)>2]))

In [8]:
df['precio']/=10000

In [9]:
df

Unnamed: 0,descripcion,precio
0,depto interior consta sala comedor cocina equi...,227.3
1,sonora guerrero atraacutes hospital aacutengel...,360.0
2,descripcion mejor ubicacion residencial orient...,120.0
3,casa privada caseta vigilancia casas dos tres ...,65.0
4,bonito departamento excelentes condiciones bue...,115.0
...,...,...
239995,vendo casa bosques ica residencial minutos cen...,65.0
239996,casa jardin amplio cuarto servicio baño comple...,194.0
239997,departamento excelente ubicación cerca univers...,340.0
239998,casa sola dividida cuatro departamentos recama...,289.0


In [10]:
def categorizar_precio(precio):
    categorias = [100,200,300,400,500]
    for i in range(len(categorias)):
        if precio < categorias[i]:
            return i
    return 4

In [11]:
df['categoria'] = df['precio'].transform(categorizar_precio)

In [35]:
train_X, test_X, train_Y, test_Y = model_selection.train_test_split(df['descripcion'],df['categoria'],test_size=0.2)

In [44]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=1000, ngram_range=(1, 2))

In [46]:
features = tfidf.fit_transform(df.descripcion).toarray()
labels = df.categoria
features.shape

(240000, 1630)

In [47]:
from sklearn.feature_selection import chi2
import numpy as np

In [49]:
N = 4
for i in range(5):
    features_chi2 = chi2(features, labels == i)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(i))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# '0':
  . Most correlated unigrams:
. cuarto
. protecciones
. infonavit
. auto
  . Most correlated bigrams:
. credito infonavit
. patio servicio
. cuarto servicio
. cochera auto
# '1':
  . Most correlated unigrams:
. servicio
. salón
. cuarto
. family
  . Most correlated bigrams:
. family room
. lavado cuarto
. servicio baño
. cuarto servicio
# '2':
  . Most correlated unigrams:
. cuarto
. granito
. protecciones
. auto
  . Most correlated bigrams:
. principal baño
. cuarto servicio
. principal vestidor
. cochera auto
# '3':
  . Most correlated unigrams:
. terraza
. vestidor
. auto
. cuarto
  . Most correlated bigrams:
. cochera auto
. patio servicio
. servicio baño
. cuarto servicio
# '4':
  . Most correlated unigrams:
. room
. cuarto
. salón
. family
  . Most correlated bigrams:
. salón juegos
. pisos madera
. family room
. cuarto servicio


In [51]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
X_train, X_test, y_train, y_test = train_test_split(df['descripcion'], df['categoria'], random_state = 0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [54]:
print(clf.predict(count_vect.transform(["'depto interior consta sala comedor cocina equipada área lavado baño completo recamara principal closet baño completo recamara closet'"])))

[1]


In [61]:
from sklearn.model_selection import cross_val_score
import seaborn as sns
sns.boxplot(x='x',y='acc', data=pd.DataFrame(['model',cross_val_score(clf, features, labels, scoring='accuracy', cv=5)], columns = ['acc']))

ValueError: Could not interpret input 'x'