# Import packages 

In [1]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

# Get Data

In [2]:
df = pd.read_csv('final_data.csv', dtype = {'code_dept' : object})
df.head(2)

Unnamed: 0,code,type,Year,commune,code_dep,Surface,Code_post,Prixm2,price,Avg_sqm,transactions,lat,lon,No_rooms
0,1370,Appartement,2016,Saint-Laurent-sur-Saône,1,9,1750,1770,95397,63,37,46.31,4.843,2.7
1,1364,Maison,2016,Saint-Jean-sur-Reyssouze,1,1714,1560,1272,140964,118,18,46.422,5.086,4.3


In [3]:
df.columns

Index(['code', 'type', 'Year', 'commune', 'code_dep', 'Surface', 'Code_post',
       'Prixm2', 'price', 'Avg_sqm', 'transactions', 'lat', 'lon', 'No_rooms'],
      dtype='object')

In [4]:
df.shape

(199987, 14)

In [6]:
df2 = df.head(50) # To make first checking the Encoder processing, we only use the first 50 columns

# Investigating columns

In [7]:
df2.dtypes

code              int64
type             object
Year              int64
commune          object
code_dep          int64
Surface           int64
Code_post         int64
Prixm2            int64
price             int64
Avg_sqm           int64
transactions      int64
lat             float64
lon             float64
No_rooms        float64
dtype: object

## check if there are '_' separator in the columns "specially categorical ones "

In [8]:
def get_columns_with_separator(df, separator):
    cols_list = []
    for col in df.columns:
        X = np.where(separator in col) == np.zeros(1)
        if X : #& (df[col].dtype == 'object'):
            cols_list.append(col)
    return cols_list 


get_columns_with_separator(df= df2, separator= '_')

  if X : #& (df[col].dtype == 'object'):


['code_dep', 'Code_post', 'Avg_sqm', 'No_rooms']

## Replace separator '_' in columns dataframe by '-'

In [18]:
def modify_separator_of_columns_dataframe(df, old_separator, new_separator):
    cols_list = []
    for col in df.columns:
        X = np.where(old_separator in col) == np.zeros(1)
        if X : #& (df[col].dtype == 'object'):
            col = col.replace(old_separator, new_separator)
            cols_list.append(col)
        else :
            cols_list.append(col)
            
    df.columns = cols_list
    return df, cols_list


df2, cols_list = modify_separator_of_columns_dataframe(df= df2, old_separator= '_', new_separator = '-')
df2.head(2)

  if X : #& (df[col].dtype == 'object'):


Unnamed: 0,code,type,Year,commune,code-dep,Surface,Code-post,Prixm2,price,Avg-sqm,transactions,lat,lon,No-rooms
0,1370,Appartement,2016,Saint-Laurent-sur-Saône,1,9,1750,1770,95397,63,37,46.31,4.843,2.7
1,1364,Maison,2016,Saint-Jean-sur-Reyssouze,1,1714,1560,1272,140964,118,18,46.422,5.086,4.3


In [19]:
np.asarray(cols_list)

array(['code', 'type', 'Year', 'commune', 'code-dep', 'Surface',
       'Code-post', 'Prixm2', 'price', 'Avg-sqm', 'transactions', 'lat',
       'lon', 'No-rooms'], dtype='<U12')

## OneHotEncoding using pandas.get_dummies

In [20]:
one_hot_encoded_df = pd.get_dummies(df2, columns = ['commune', 'type'])
one_hot_encoded_df.head(2)

Unnamed: 0,code,Year,code-dep,Surface,Code-post,Prixm2,price,Avg-sqm,transactions,lat,...,commune_Saint-Laurent-sur-Saône,commune_Saint-Martin-du-Mont,commune_Saint-Étienne-du-Bois,commune_Tossiat,commune_Val-Revermont,commune_Vernoux,commune_Villereversure,commune_Viriat,type_Appartement,type_Maison
0,1370,2016,1,9,1750,1770,95397,63,37,46.31,...,1,0,0,0,0,0,0,0,1,0
1,1364,2016,1,1714,1560,1272,140964,118,18,46.422,...,0,0,0,0,0,0,0,0,0,1


In [23]:
def transform_columns_dataframe(string, separator, cols):
    L = string.split(separator)
    return L[1:][0] if L[0] in cols else string

In [25]:
new_names = []

for col in one_hot_encoded_df.columns:
    
    col = transform_columns_dataframe(string = col, separator = '_', cols = ['commune', 'type'])
    new_names.append(col)
    

one_hot_encoded_df.columns = new_names
one_hot_encoded_df.shape

(50, 60)

In [26]:
one_hot_encoded_df.head(2)

Unnamed: 0,code,Year,code-dep,Surface,Code-post,Prixm2,price,Avg-sqm,transactions,lat,...,Saint-Laurent-sur-Saône,Saint-Martin-du-Mont,Saint-Étienne-du-Bois,Tossiat,Val-Revermont,Vernoux,Villereversure,Viriat,Appartement,Maison
0,1370,2016,1,9,1750,1770,95397,63,37,46.31,...,1,0,0,0,0,0,0,0,1,0
1,1364,2016,1,1714,1560,1272,140964,118,18,46.422,...,0,0,0,0,0,0,0,0,0,1


# Build Function to Rename and Encode DataFrame columns 

## ALL useful elements in one function

In [41]:
# Get Data
df = pd.read_csv('final_data.csv', dtype = {'code_dept' : object})
df2 = df.head(50) # To make first checking the Encoder processing, we only use the first 50 columns

def global_encoding_dataframe(df, cols_to_encod, old_separator = '_', new_separator='-'):
    
    cols_list = [] # cols_list is a list of columns having a separator '_'
    all_cols = [] # all_cols is all dataframe columns, after transforming columns names
    new_names = [] # new_names is dataframe columns, after encoding
    
    for col in df.columns:
        X = np.where(old_separator in col) == np.zeros(1)
        if X : 
            cols_list.append(col)  # cols_list is a list of columns having a separator '_'
            col = col.replace(old_separator, new_separator)
            all_cols.append(col)
        else :
            all_cols.append(col) # all_cols is all dataframe columns, after transforming columns names
            
    df.columns = all_cols # rename columns dataframe with new strings with no '_' separator
                        # we need that to prevent conflits in columns names when encoding categorical ones 

    one_hot_encoded_df = pd.get_dummies(df, columns = cols_to_encod)
    
    for col in one_hot_encoded_df.columns:
        
        L = col.split('_')
        y = L[1:][0] if L[0] in cols_to_encod else col
        new_names.append(y)        
        
    one_hot_encoded_df.columns = new_names
    
    return one_hot_encoded_df

        
       
encoded_df = global_encoding_dataframe(df = df2 , cols_to_encod=['commune', 'type'],
                          old_separator = '_', new_separator='-')

encoded_df.head(2)

  if X :


Unnamed: 0,code,Year,code-dep,Surface,Code-post,Prixm2,price,Avg-sqm,transactions,lat,...,Saint-Laurent-sur-Saône,Saint-Martin-du-Mont,Saint-Étienne-du-Bois,Tossiat,Val-Revermont,Vernoux,Villereversure,Viriat,Appartement,Maison
0,1370,2016,1,9,1750,1770,95397,63,37,46.31,...,1,0,0,0,0,0,0,0,1,0
1,1364,2016,1,1714,1560,1272,140964,118,18,46.422,...,0,0,0,0,0,0,0,0,0,1


## Multiple functions are assembled and called in main function 

In [48]:
# Get Data
df = pd.read_csv('final_data.csv', dtype = {'code_dept' : object})
df2 = df.head(50) # To make first checking the Encoder processing, we only use the first 50 columns

def modify_separator_of_columns_dataframe(df, old_separator, new_separator):
    cols_list = [] # cols_list is a list of columns having a separator '_'
    all_cols = [] # all_cols is all dataframe columns, after transforming columns names 
    for col in df.columns:
        X = np.where(old_separator in col) == np.zeros(1)
        if X : 
            col = col.replace(old_separator, new_separator)
            cols_list.append(col)  # cols_list is a list of columns having a separator '_'
            all_cols.append(col)
        else :
            all_cols.append(col) # all_cols is all dataframe columns, after transforming columns names
            
    df.columns = all_cols # rename columns dataframe with new strings with no '_' separator
                        # we need that to prevent conflits in columns names when encoding categorical ones 
            
    return df, cols_list

def transform_columns_dataframe(string, separator, cols):
    L = string.split(separator)
    return L[1:][0] if L[0] in cols else string


def global_encoding_dataframe(df, cols_to_encod, old_separator = '_', new_separator='-'):
    
   
    new_names = [] # new_names is dataframe columns, after encoding
    df, cols_list = modify_separator_of_columns_dataframe(df, old_separator, new_separator)
    print("cols_list  ", cols_list)
  
    one_hot_encoded_df = pd.get_dummies(df, columns = cols_to_encod)   
    for col in one_hot_encoded_df.columns:
        
        if col in cols_list: #
            col = col.replace(new_separator, old_separator)
            new_names.append(col)
        else:
            col = transform_columns_dataframe(string = col, separator = '_', cols = cols_to_encod)
            new_names.append(col)

    one_hot_encoded_df.columns = new_names
    
    return one_hot_encoded_df

        
       
encoded_df = global_encoding_dataframe(df = df2 , cols_to_encod=['commune', 'type'],
                          old_separator = '_', new_separator='-')

encoded_df.head(2)

cols_list   ['code-dep', 'Code-post', 'Avg-sqm', 'No-rooms']


  if X :


Unnamed: 0,code,Year,code_dep,Surface,Code_post,Prixm2,price,Avg_sqm,transactions,lat,...,Saint-Laurent-sur-Saône,Saint-Martin-du-Mont,Saint-Étienne-du-Bois,Tossiat,Val-Revermont,Vernoux,Villereversure,Viriat,Appartement,Maison
0,1370,2016,1,9,1750,1770,95397,63,37,46.31,...,1,0,0,0,0,0,0,0,1,0
1,1364,2016,1,1714,1560,1272,140964,118,18,46.422,...,0,0,0,0,0,0,0,0,0,1


# Class

In [55]:
class Encoder():
    
    """ Initialize dataframe
    """
    def __init__(self, df):

        self.df = df
        
    def modify_separator_of_columns_dataframe(self, df, old_separator, new_separator):
        cols_list = [] # cols_list is a list of columns having a separator '_'
        all_cols = [] # all_cols is all dataframe columns, after transforming columns names 
        for col in self.df.columns:
            X = np.where(old_separator in col) == np.zeros(1)
            if X : 
                col = col.replace(old_separator, new_separator)
                cols_list.append(col)  # cols_list is a list of columns having a separator '_'
                all_cols.append(col)
            else :
                all_cols.append(col) # all_cols is all dataframe columns, after transforming columns names

        self.df.columns = all_cols # rename columns dataframe with new strings with no '_' separator
                            # we need that to prevent conflits in columns names when encoding categorical ones 

        return self.df, cols_list
    
    
    def transform_columns_dataframe(self, string, separator, cols):
        L = string.split(separator)
        return L[1:][0] if L[0] in cols else string
    
    def run(self, df, cols_to_encod, old_separator = '_', new_separator='-'):
    
   
        new_names = [] # new_names is dataframe columns, after encoding
        df, cols_list = self.modify_separator_of_columns_dataframe(self.df, old_separator, new_separator)
        #print("cols_list  ", cols_list)

        one_hot_encoded_df = pd.get_dummies(df, columns = cols_to_encod)   
        for col in one_hot_encoded_df.columns:

            if col in cols_list: #
                col = col.replace(new_separator, old_separator)
                new_names.append(col)
            else:
                col = self.transform_columns_dataframe(string = col, separator = '_', cols = cols_to_encod)
                new_names.append(col)

        one_hot_encoded_df.columns = new_names

        return one_hot_encoded_df      

In [58]:
# Get Data
data = pd.read_csv('final_data.csv', dtype = {'code_dept' : object})
df_test = data.head(50) # To make first checking the Encoder processing, we only use the first 50 columns


encoder = Encoder(df = df_test)
result = encoder.run(df = df_test, cols_to_encod = ['commune', 'type'] )
result.shape

  if X :


(50, 60)

In [59]:
result.head(2)

Unnamed: 0,code,Year,code_dep,Surface,Code_post,Prixm2,price,Avg_sqm,transactions,lat,...,Saint-Laurent-sur-Saône,Saint-Martin-du-Mont,Saint-Étienne-du-Bois,Tossiat,Val-Revermont,Vernoux,Villereversure,Viriat,Appartement,Maison
0,1370,2016,1,9,1750,1770,95397,63,37,46.31,...,1,0,0,0,0,0,0,0,1,0
1,1364,2016,1,1714,1560,1272,140964,118,18,46.422,...,0,0,0,0,0,0,0,0,0,1
