In [43]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# paths used by later cells (helper modules, data sets) resolve.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import sys

# Folder on the mounted Drive that is added to the import path; presumably this
# is where common_machine_learning.py and my_pipeline.py live - TODO confirm.
COLAB_NOTEBOOKS_DIR = '/content/drive/My Drive/Colab Notebooks'

# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if COLAB_NOTEBOOKS_DIR not in sys.path:
    sys.path.append(COLAB_NOTEBOOKS_DIR)

In [0]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.base import BaseEstimator, TransformerMixin


In [0]:
import common_machine_learning as common
import my_pipeline as my_pipe
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp, STATUS_OK
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
import time
from sklearn.model_selection import train_test_split

In [0]:
class ClasificarDescripcionMultinomialNB(BaseEstimator, TransformerMixin):
    """
    pipeline step that replaces the free text in the 'descripcion' column with the
    class predicted by a Multinomial Naive Bayes text classifier.

    the text is vectorised with CountVectorizer followed by TfidfTransformer; the
    classifier is trained against the 'categoria' column of the frame given to fit.
    """
    def __init__(self):
        """
        constructor - creates the (still unfitted) classifier, vectoriser and tf-idf transformer
        """
        self.clasificador = MultinomialNB()
        self.count_vec = CountVectorizer()
        self.tf_idf_transformer = TfidfTransformer()

    def fit(self, X, y=None, **kwargs):
        """
        fit the vectoriser, the tf-idf transformer and the classifier on the training frame
        :param X: frame holding the text column 'descripcion' and the label column 'categoria' - Dataframe
        :param y: not used; the label is read from X['categoria'] - Series
        :param kwargs: free parameters (not used) - dictionary
        :return: self: the fitted step - Transformer
        """
        X_descripcion_counts = self.count_vec.fit_transform(X['descripcion'])
        X_descripcion_tfidf = self.tf_idf_transformer.fit_transform(X_descripcion_counts)
        self.clasificador.fit(X_descripcion_tfidf, X['categoria'])
        return self

    def transform(self, X, y=None, **kwargs):
        """
        replace the text in 'descripcion' with the class predicted by the fitted classifier
        :param X: frame holding the text column 'descripcion' - Dataframe
        :param y: not used - Series
        :param kwargs: free parameters (not used) - dictionary
        :return: a copy of X whose 'descripcion' column holds the predictions - Dataframe
        """
        # Reuse the vocabulary and idf weights learnt in fit; nothing is refitted here.
        X_descripcion_counts = self.count_vec.transform(X['descripcion'])
        X_descripcion_tfidf = self.tf_idf_transformer.transform(X_descripcion_counts)
        # Write into a copy: overwriting the caller's frame would destroy the raw
        # text, so a second fit/transform on the same split would receive
        # predictions instead of descriptions (and assigning into a split slice
        # can raise SettingWithCopyWarning).
        X = X.copy()
        X['descripcion'] = self.clasificador.predict(X_descripcion_tfidf)
        return X

    def fit_transform(self, X, y=None, **kwargs):
        """
        perform fit and then transform over the same data
        :param X: frame holding 'descripcion' and 'categoria' - Dataframe
        :param y: not used - Series
        :param kwargs: free parameters (not used) - dictionary
        :return: a copy of X whose 'descripcion' column holds the predictions - Dataframe
        """
        return self.fit(X, y).transform(X, y)

In [0]:
class DropColumns(BaseEstimator, TransformerMixin):
    """
    pipeline step that removes a configurable list of columns from a Dataframe;
    listed columns that are not present in the frame are skipped silently.
    """
    def __init__(self, lista_nombre_col):
        """
        constructor
        :param lista_nombre_col: labels of the columns to remove - iterable of column labels
        """
        self.lista_nombre_col = lista_nombre_col

    def fit(self, X, y=None, **kwargs):
        """
        nothing is learnt by this step; present only to satisfy the transformer interface
        :param X: features (not used) - Dataframe
        :param y: target vector (not used) - Series
        :param kwargs: free parameters (not used) - dictionary
        :return: self: the class object - Transformer
        """
        return self

    def transform(self, X, y=None, **kwargs):
        """
        drop every configured column that exists in X
        :param X: features - Dataframe
        :param y: target vector (not used) - Series
        :param kwargs: free parameters (not used) - dictionary
        :return: X without the configured columns; X itself when none of them is present - Dataframe
        """
        presentes = [col for col in self.lista_nombre_col if col in X.columns]
        if not presentes:
            return X
        # One drop call for all columns instead of one full-frame copy per column.
        return X.drop(presentes, axis=1)

    def fit_transform(self, X, y=None, **kwargs):
        """
        perform fit and transform over the data
        :param X: features - Dataframe
        :param y: target vector (not used) - Series
        :param kwargs: free parameters (not used) - dictionary
        :return: X without the configured columns - Dataframe
        """
        return self.fit(X, y).transform(X, y)

In [0]:
def categorizar_precio(precio):
    """
    Map a price value onto an ordinal bucket between 0 and 4.

    The result is the position of the first upper bound in
    (100, 200, 300, 400, 500) that is strictly greater than ``precio``.
    Values that stay below no bound (500 and above, or NaN, for which every
    comparison is False) fall back to 4, so [400, 500) and [500, ...) share
    bucket 4.
    :param precio: value comparable with ``<`` against ints - number
    :return: bucket index in 0..4 - int
    """
    limites_superiores = (100, 200, 300, 400, 500)
    candidatos = (
        posicion
        for posicion, tope in enumerate(limites_superiores)
        if precio < tope
    )
    return next(candidatos, 4)

In [0]:
import re

# Matches anything shaped like a markup tag: '<', one or more characters other
# than '>', then '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """
    Delete every tag-like substring from ``text``.

    Matches of TAG_RE are removed without inserting anything in their place, so
    words separated only by a tag end up joined ('casa<br/>grande' -> 'casagrande').
    :param text: text that may contain markup - str
    :return: the text with all TAG_RE matches removed - str
    """
    sin_etiquetas = re.sub(TAG_RE, '', text)
    return sin_etiquetas

In [51]:
import nltk
# The stop-word corpus must be downloaded before stopwords.words() can read it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Spanish and English stop-word lists as sets, for fast membership tests.
stop_words_sp = set(stopwords.words('spanish'))
stop_words_en = set(stopwords.words('english'))
# Union of both languages.
# NOTE(review): in the visible cells only stop_words_sp is used; stop_words_en
# and stop_words are never referenced - confirm which set the cleaning step
# is meant to use.
stop_words = stop_words_sp | stop_words_en

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
# Load the training CSV from Drive through the project helper.
# NOTE(review): judging by its name the helper presumably applies
# memory-optimised dtypes - confirm in common_machine_learning.
train = common.cargar_set_optimizado('/content/drive/My Drive/Colab Notebooks/sets_de_datos/train.csv')

In [0]:
# Rescale the price by 1/10000; categorizar_precio is later applied to this
# column, so its cut points (100 ... 500) are expressed in these rescaled units.
train['precio_categorico'] = train['precio'] / 10000

In [0]:
# Keep only the columns this notebook works with. The explicit .copy() makes
# `train` an independent frame, so the column assignments in the following
# cells cannot raise SettingWithCopyWarning or write into a temporary selection.
train = train[['descripcion', 'precio_categorico', 'precio']].copy()

In [0]:
def _sin_etiquetas(valor):
    """Cast the value to str and strip tag-like substrings with remove_tags."""
    return remove_tags(str(valor))


def _solo_letras(texto):
    """Lower-case the text, keeping only alphabetic characters and plain spaces."""
    # NOTE(review): every other character (digits, punctuation, newlines, tabs)
    # is deleted rather than replaced by a space, so words separated only by
    # such characters are merged - confirm this is intended.
    return "".join(
        caracter.lower()
        for caracter in texto
        if caracter == ' ' or caracter.isalpha()
    )


def _palabras_utiles(texto):
    """Drop Spanish stop words and tokens of one or two characters."""
    # NOTE(review): only stop_words_sp is consulted; the combined stop_words
    # set built earlier is not used - confirm.
    return " ".join(
        termino
        for termino in texto.split()
        if len(termino) > 2 and termino not in stop_words_sp
    )


# Missing descriptions are filled with 'e', which the length filter in the last
# stage removes again, so they end up as empty strings.
train['descripcion'] = (
    train['descripcion']
    .fillna('e')
    .map(_sin_etiquetas)
    .map(_solo_letras)
    .map(_palabras_utiles)
)

In [0]:
# Bucket the rescaled price into the ordinal label (0-4) used as the classifier
# target; the function is passed directly instead of through a lambda wrapper.
train['categoria'] = train['precio_categorico'].transform(categorizar_precio)

In [57]:
# Display the column labels currently present in `train`.
train.columns

Index(['descripcion', 'precio_categorico', 'precio', 'categoria'], dtype='object')

In [0]:
# Hold-out split with a fixed seed; no test_size is given, so scikit-learn's
# default proportion applies. The target is the raw 'precio'.
# NOTE(review): 'precio_categorico' and 'categoria' are both derived from the
# target. The pipeline below drops the former, but 'categoria' remains in its
# output - confirm it is never fed to a downstream price model as a feature.
X_train, X_test, y_train, y_test = train_test_split(train[['descripcion', 'precio_categorico', 'categoria']], train['precio'], random_state = 0)

In [0]:
# Two-step preprocessing pipeline:
#   1. replace the text in 'descripcion' with the class predicted by the
#      Naive Bayes step;
#   2. drop the helper column 'precio_categorico'.
# The step name 'custom_trasnform' (sic) is kept verbatim because step names
# are lookup keys of the pipeline.
pipe = Pipeline(steps=[
    ('custom_trasnform', ClasificarDescripcionMultinomialNB()),
    ('drop_columns', DropColumns(['precio_categorico'])),
])

In [60]:
# Fit every pipeline step on the training split and display the transformed
# training frame.
pipe.fit_transform(X_train, y_train)

Unnamed: 0,descripcion,categoria
160453,1,3
162315,2,2
161571,0,0
166440,1,2
98957,1,1
...,...,...
176963,1,3
117952,0,0
173685,0,1
43567,0,0


In [61]:
# Apply the already fitted steps to the held-out split (nothing is refitted)
# and display the result.
pipe.transform(X_test)

Unnamed: 0,descripcion,categoria
45155,1,1
238446,1,1
121013,4,3
176784,0,0
31780,1,1
...,...,...
22035,1,0
117847,0,0
155247,1,1
148035,4,4
