# Modify Client and Product data

Client features inspired by: https://www.kaggle.com/abbysobh/grupo-bimbo-inventory-demand/classifying-client-type-using-client-names

Product features based on: https://www.kaggle.com/vykhand/grupo-bimbo-inventory-demand/exploring-products

### Import Necessary Packages

In [18]:
import os
import numpy as np
import pandas as pd
import re
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
stemmer = SnowballStemmer('spanish')
from sklearn.feature_extraction.text import TfidfVectorizer

### Load the data

In [19]:
# Default location of the raw Kaggle CSV files.
DEFAULT_DATA_DIR = 'D:/OneDrive/Documents/Kaggle/Grupo Bimbo Inventory Demand/data'


def load_data(data_dir=DEFAULT_DATA_DIR):
    """ Loads and returns client and product data.

        data_dir: directory holding "cliente_tabla.csv" and
                  "producto_tabla.csv". Defaults to DEFAULT_DATA_DIR, so
                  calling load_data() with no argument behaves as before.

        Side effect: the process working directory is changed to data_dir.
        This is kept deliberately, because any relative path used after this
        call resolves against data_dir.

        Returns a tuple (clients, products) of DataFrames.
    """
    # Set work directory
    os.chdir(data_dir)

    # Load data (comma-separated, which is the read_csv default)
    clients = pd.read_csv("cliente_tabla.csv")
    products = pd.read_csv("producto_tabla.csv")

    # Return data
    return clients, products

In [20]:
# Step 1: read the raw client and product tables.
# A parenthesised single-argument print produces the same output whether it
# is parsed as a Python 2 statement or a Python 3 function call.
print("1. Loading client and product data...")
clients, products = load_data()
print("Complete!")

1. Loading client and product data...
Complete!


### Build function to remove duplicate ids from client data 

In [21]:
def remove_duplicate_ids(clients):
    """ Drops rows whose 'Cliente_ID' has already appeared earlier in the
        data, so each id is kept once (its first occurrence).
        Returns the filtered data; the original row index is preserved.
    """
    # True for every row that repeats an id seen in an earlier row
    is_repeat = clients.duplicated(subset='Cliente_ID')
    return clients[~is_repeat]

### Create Client_Type feature

In [22]:
# Ordered (regex patterns, label) rules used to bucket upper-cased client
# names. Order matters: once a name has been replaced by a mixed-case label,
# none of the later (upper-case) patterns can match it, so the first rule
# that hits decides the type.
_CLIENT_TYPE_RULES = [
    # Known Large Company / Special Group Types
    (['.*REMISION.*'], 'Consignment'),
    (['.*WAL MART.*', '.*SAMS CLUB.*'], 'Walmart'),
    (['.*OXXO.*'], 'Oxxo Store'),
    (['.*CONASUPO.*'], 'Govt Store'),
    (['.*BIMBO.*'], 'Bimbo Store'),

    # Term search for assortment of words picked from looking at their
    # frequencies
    (['.*COLEG.*', '.*UNIV.*', '.*ESCU.*', '.*INSTI.*', '.*PREPAR.*'],
     'School'),
    (['.*PUESTO.*'], 'Post'),
    (['.*FARMA.*', '.*HOSPITAL.*', '.*CLINI.*'], 'Hospital/Pharmacy'),
    # NOTE(review): '.*ERIA.*' also catches names such as PAPELERIA and
    # CARNICERIA, so the matching patterns in the 'General Market/Mart' rule
    # below can never fire. Kept as-is to preserve the generated feature;
    # confirm whether that is intended.
    (['.*CAFE.*', '.*CREMERIA.*', '.*DULCERIA.*',
      '.*REST.*', '.*BURGER.*', '.*TACO.*', '.*TORTA.*',
      '.*TAQUER.*', '.*HOT DOG.*',
      '.*COMEDOR.*', '.*ERIA.*', '.*BURGU.*'], 'Eatery'),
    (['.*SUPER.*'], 'Supermarket'),
    (['.*COMERCIAL.*', '.*BODEGA.*', '.*DEPOSITO.*',
      '.*ABARROTES.*', '.*MERCADO.*', '.*CAMBIO.*',
      '.*MARKET.*', '.*MART .*', '.*MINI .*',
      '.*PLAZA.*', '.*MISC.*', '.*ELEVEN.*', '.*EXP.*',
      '.*SNACK.*', '.*PAPELERIA.*', '.*CARNICERIA.*',
      '.*LOCAL.*', '.*COMODIN.*', '.*PROVIDENCIA.*'], 'General Market/Mart'),
    (['.*VERDU.*', '.*FRUT.*'], 'Fresh Market'),
    (['.*HOTEL.*', '.*MOTEL.*'], 'Hotel'),

    # Names containing common articles / prepositions / short tokens, or
    # any digit
    (['.*LA .*', '.*EL .*', '.*DE .*', '.*LOS .*', '.*DEL .*', '.*Y .*',
      '.*SAN .*', '.*SANTA .*', '.*AG .*', '.*LAS .*', '.*MI .*', '.*MA .*',
      '.*II.*', '.*[0-9]+.*'], 'Small Franchise'),
]


def create_client_features(clients):
    """ Takes clients data as input.
        Creates new variable 'Client_Type' by categorizing NombreCliente.
        Returns clients data.

        Steps:
          1. Rows with a repeated 'Cliente_ID' are dropped
             (remove_duplicate_ids) and the result is copied, so the frame
             passed in is not modified.
          2. NombreCliente is upper-cased and run through
             _CLIENT_TYPE_RULES in order; a name matching any pattern of a
             rule is replaced by that rule's label.
          3. Names still fully upper-case afterwards (i.e. matched by no
             rule) become 'Individual', except "NO IDENTIFICADO", which is
             left unchanged. Missing names (NaN) are passed through as NaN.
    """
    # Remove duplicate ids
    clients = remove_duplicate_ids(clients).copy()

    # Start from the upper-cased client name
    client_type = clients['NombreCliente'].str.upper()

    # Series.replace(..., regex=True) is used for every rule (rather than
    # .str.replace) so pattern matching is always regex-based, independent of
    # the pandas version's default for the str.replace `regex` argument.
    for patterns, label in _CLIENT_TYPE_RULES:
        client_type = client_type.replace(patterns, label, regex=True)

    # Everything else bucketed into 'Individual'
    def bucket_remaining(value):
        # Missing names have no string methods; keep them as missing
        if pd.isnull(value):
            return value
        # Labels assigned above are mixed-case, so isupper() is only true for
        # names that no rule has touched
        if value.isupper() and value != "NO IDENTIFICADO":
            return 'Individual'
        return value

    clients['Client_Type'] = client_type.map(bucket_remaining)

    return clients

In [23]:
# Step 2: derive the Client_Type column and show the first rows.
# A parenthesised single-argument print produces the same output whether it
# is parsed as a Python 2 statement or a Python 3 function call.
print("2. Generating new client features...")
clients = create_client_features(clients)
print("Complete!")
print(clients.head())

2. Generating new client features...
Complete!
   Cliente_ID                            NombreCliente      Client_Type
0           0                               SIN NOMBRE       Individual
1           1                         OXXO XINANTECATL       Oxxo Store
2           2                               SIN NOMBRE       Individual
3           3                                EL MORENO  Small Franchise
4           4  SDN SER  DE ALIM  CUERPO SA CIA  DE INT  Small Franchise


### Create new product features: short_product_name, brand, weight, pieces, weight_per_piece

In [24]:
def create_product_features(products):
    """ Takes products data as input and builds new features.
        Returns modified products data.

        New columns, all parsed from 'NombreProducto':
          brand              - non-digit text directly before the trailing
                               number of the name
          weight             - first "<digits>Kg" or "<digits>g" token,
                               expressed in grams (Kg values are * 1000)
          pieces             - number from the first "<digits>p " token
          weight_per_piece   - weight / pieces
          short_product_name - leading non-digit part of the name,
                               lower-cased, Spanish stopwords removed and
                               each remaining word stemmed

        Values are NaN where the corresponding pattern is not found.
        'NombreProducto' is dropped from the result. The frame passed in is
        left untouched; a new frame is returned.
    """
    # Work on a copy so the caller's frame does not gain extra columns
    products = products.copy()
    names = products['NombreProducto']

    # Split NombreProducto and create new columns
    # (raw strings keep \D, \s and \d as regex escapes, not string escapes)
    short_name = names.str.extract(r'^(\D*)', expand=False)
    products['brand'] = names.str.extract(r'^.+\s(\D+) \d+$', expand=False)
    w = names.str.extract(r'(\d+)(Kg|g)', expand=True)
    products['weight'] = w[0].astype('float') * w[1].map({'Kg': 1000, 'g': 1})
    products['pieces'] = names.str.extract(r'(\d+)p ', expand=False).astype('float')
    products['weight_per_piece'] = products['weight'] / products['pieces']

    # Fetch the stopword list once instead of once per token, and use a set
    # for constant-time membership tests
    spanish_stopwords = set(stopwords.words("spanish"))

    def normalize(name):
        # Lower-case, drop stopwords, stem what is left
        return " ".join(stemmer.stem(token)
                        for token in name.lower().split()
                        if token not in spanish_stopwords)

    products['short_product_name'] = short_name.map(normalize)

    # Drop unnecessary variables
    products = products.drop(['NombreProducto'], axis=1)

    return products

In [25]:
# Step 3: derive the product features and show the first rows.
# A parenthesised single-argument print produces the same output whether it
# is parsed as a Python 2 statement or a Python 3 function call.
print("3. Generating new product features...")
products = create_product_features(products)
print("Complete!")
print(products.head())

3. Generating new product features...
Complete!
   Producto_ID         brand  weight  pieces  weight_per_piece  \
0            0  IDENTIFICADO     NaN     NaN               NaN   
1            9           NES   750.0     NaN               NaN   
2           41           BIM   480.0     6.0             80.00   
3           53           LON   170.0     NaN               NaN   
4           72            TR    45.0     4.0             11.25   

      short_product_name  
0              identific  
1           capuccin mok  
2  bimboll ext sajonjoli  
3           burrit sincr  
4   div tir mini doradit  


### Write modified dataframes to CSV files

In [26]:
# Step 4: persist both modified tables. The paths are relative, so the files
# are written to the current working directory; the row index is not written.
# A parenthesised single-argument print produces the same output whether it
# is parsed as a Python 2 statement or a Python 3 function call.
print("4. Writing to CSV...")
clients.to_csv("cliente_tabla_modified.csv", index=False, header=True)
products.to_csv("producto_tabla_modified.csv", index=False, header=True)
print("Complete!")

4. Writing to CSV...
Complete!


### Process complete!