# Rename DataFrame columns 

In [131]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

In [11]:
df = pd.read_csv('final_data.csv', dtype = {'code_dept' : object})
df.head(2)

Unnamed: 0,code,type,Year,commune,code_dep,Surface,Code_post,Prixm2,price,Avg_sqm,transactions,lat,lon,No_rooms
0,1370,Appartement,2016,Saint-Laurent-sur-Saône,1,9,1750,1770,95397,63,37,46.31,4.843,2.7
1,1364,Maison,2016,Saint-Jean-sur-Reyssouze,1,1714,1560,1272,140964,118,18,46.422,5.086,4.3


In [12]:
df.columns

Index(['code', 'type', 'Year', 'commune', 'code_dep', 'Surface', 'Code_post',
       'Prixm2', 'price', 'Avg_sqm', 'transactions', 'lat', 'lon', 'No_rooms'],
      dtype='object')

In [13]:
df.shape

(199987, 14)

In [14]:
df2 = df.head(50) # To make first checking the Encoder class, we only use the first 50 columns
df2.shape

(50, 14)

In [15]:
df2['type'].unique()

array(['Appartement', 'Maison'], dtype=object)

In [16]:
class Encoder():
    
    """ Initialize dataframe
    """
    def __init__(self, df):

        self.df = df
        
    def execute(self, col_name):
        
        L = list(self.df[col_name].unique())
        if '' in L:
            self.df[col_name].replace("", "NoValue", inplace=True) #Replace NaN by "NoCodeNature"

        ohe = OneHotEncoder(sparse = False) # Instanciate encoder
        ohe.fit(self.df[[col_name]]) # Fit encoder  ---> OneHotEncoder(sparse=False)

        col_encoded = ohe.transform(self.df[[col_name]]) # Encode

        dicts_col = {}
        keys = list(ohe.categories_[0])
        values = col_encoded.T.astype(int)

        for i,j in enumerate(keys):
            dicts_col[j] = values[i,:]

        result = pd.DataFrame.from_dict(dicts_col)

        self.df = self.df.reset_index(drop=True)

        #Concat df and result dataframes
        data_res = pd.concat([self.df, result], axis = 1)

        if 'NoValue' in list(data_res.columns):
            data_res = data_res.drop(columns= ['NoValue',col_name] )
        else:
            data_res = data_res.drop(columns= col_name)

        return data_res        

In [22]:
print(df2.shape, len(df2.commune.unique()), "*****", len(df2.type.unique()))
encoder = Encoder(df = df2)
res = encoder.execute(col_name = 'commune')
print("First encodeing", res.shape)

encoder2 = Encoder(df = res)
res_2 = encoder2.execute(col_name = 'type')
print("Second encodeing", res_2.shape)

(50, 14) 46 ***** 2
First encodeing (50, 59)
Second encodeing (50, 60)


In [23]:
def encoding_categ_column(df, col_name):
    
    L = list(df[col_name].unique())
    if '' in L:
        df[col_name].replace("", "NoValue", inplace=True) #Replace NaN by "NoCodeNature"
    
    ohe = OneHotEncoder(sparse = False) # Instanciate encoder
    ohe.fit(df[[col_name]]) # Fit encoder  ---> OneHotEncoder(sparse=False)
    
    col_encoded = ohe.transform(df[[col_name]]) # Encode

    dicts_col = {}
    keys = list(ohe.categories_[0])
    values = col_encoded.T.astype(int)
    
    for i,j in enumerate(keys):
        dicts_col[j] = values[i,:]

    result = pd.DataFrame.from_dict(dicts_col)
    
    df = df.reset_index(drop=True)
    
    #Concat df and result dataframes
    data_res = pd.concat([df, result], axis = 1)
    
    if 'NoValue' in list(data_res.columns):
        data_res = data_res.drop(columns= ['NoValue',col_name] )
    else:
        data_res = data_res.drop(columns= col_name)
        
    return data_res 

In [24]:
data_t = encoding_categ_column(df = df2, col_name = 'commune')

data_t3 = encoding_categ_column(df = data_t, col_name = 'type')
data_t3.shape

(50, 60)

In [25]:
def encoding_categ_column_list(df, col_name):
    
    ''' Here, col_name is a list of columns'''
    
    for col in col_name:
        df = encoding_categ_column(df = df, col_name = col)  
        
    return df

In [26]:
data = encoding_categ_column_list(df = df2, col_name = ['commune', 'type'])

data.shape

(50, 60)

In [123]:
class Encoder():
    
    """ Initialize dataframe
    """
    def __init__(self, df):

        self.df = df
        
    def execute_one_column(self, df, col_name):
        
        L = list(self.df[col_name].unique())
        if '' in L:
            self.df[col_name].replace("", "NoValue", inplace=True) #Replace NaN by "NoCodeNature"

        ohe = OneHotEncoder(sparse = False) # Instanciate encoder
        ohe.fit(self.df[[col_name]]) # Fit encoder  ---> OneHotEncoder(sparse=False)

        col_encoded = ohe.transform(self.df[[col_name]]) # Encode

        dicts_col = {}
        keys = list(ohe.categories_[0])
        values = col_encoded.T.astype(int)

        for i,j in enumerate(keys):
            dicts_col[j] = values[i,:]

        result = pd.DataFrame.from_dict(dicts_col)

        self.df = self.df.reset_index(drop=True)

        #Concat df and result dataframes
        data_res = pd.concat([self.df, result], axis = 1)

        if 'NoValue' in list(data_res.columns):
            data_res = data_res.drop(columns= ['NoValue',col_name] )
        else:
            data_res = data_res.drop(columns= col_name)

        return data_res        
    
    def execute(self, col_name):
        
        ''' Here, col_name is a list of columns'''
        
        df_i= self.execute_one_column(self.df, col_name = col_name[0])  
        print(df_i.shape)
    
        for col in col_name[1:]:
            print(col, df_i.shape)
            df_i= self.execute_one_column(df_i, col_name = col)  
            print(df_i.shape)

        return df_i

In [124]:
encoder = Encoder(df = df2)
res = encoder.execute(col_name = ['commune', 'type'])
print("First encodeing", res.shape)


(50, 59)
type (50, 59)
(50, 15)
First encodeing (50, 15)


In [102]:
col_name = ['commune', 'type']
col_name[1:]

['type']

In [152]:
df = pd.read_csv('final_data.csv', dtype = {'code_dept' : object})
df.head(2)

Unnamed: 0,code,type,Year,commune,code_dep,Surface,Code_post,Prixm2,price,Avg_sqm,transactions,lat,lon,No_rooms
0,1370,Appartement,2016,Saint-Laurent-sur-Saône,1,9,1750,1770,95397,63,37,46.31,4.843,2.7
1,1364,Maison,2016,Saint-Jean-sur-Reyssouze,1,1714,1560,1272,140964,118,18,46.422,5.086,4.3


In [153]:
df2 = df.head(50) # To make first checking the Encoder class, we only use the first 50 columns
df2.shape

(50, 14)

In [155]:
df2.head(2)

Unnamed: 0,code,type,Year,commune,code_dep,Surface,Code_post,Prixm2,price,Avg_sqm,transactions,lat,lon,No_rooms
0,1370,Appartement,2016,Saint-Laurent-sur-Saône,1,9,1750,1770,95397,63,37,46.31,4.843,2.7
1,1364,Maison,2016,Saint-Jean-sur-Reyssouze,1,1714,1560,1272,140964,118,18,46.422,5.086,4.3


In [156]:
class Encoder():
    
    """ Initialize dataframe
    """
    def __init__(self, df):

        self.df = df
        
    def execute_list(self, col_name):
        
        for col in col_name[1:]:
            
            L = list(self.df[col].unique())
            if '' in L:
                self.df[col].replace("", "NoValue", inplace=True) #Replace NaN by "NoCodeNature"

            ohe = OneHotEncoder(sparse = False) # Instanciate encoder
            ohe.fit(self.df[[col]]) # Fit encoder  ---> OneHotEncoder(sparse=False)

            col_encoded = ohe.transform(self.df[[col]]) # Encode

            dicts_col = {}
            keys = list(ohe.categories_[0])
            values = col_encoded.T.astype(int)

            for i,j in enumerate(keys):
                dicts_col[j] = values[i,:]

            result = pd.DataFrame.from_dict(dicts_col)

            self.df = self.df.reset_index(drop=True)

            #Concat df and result dataframes
            data_res = pd.concat([self.df, result], axis = 1)

            if 'NoValue' in list(data_res.columns):
                data_res = data_res.drop(columns= ['NoValue',col] )
            else:
                data_res = data_res.drop(columns= col)

        return data_res        
    

In [157]:
encoder = Encoder(df = df2)
res = encoder.execute_list(col_name = ['commune', 'type'])
print("First encodeing", res.shape)


First encodeing (50, 15)


In [159]:
res.head(2)

Unnamed: 0,code,Year,commune,code_dep,Surface,Code_post,Prixm2,price,Avg_sqm,transactions,lat,lon,No_rooms,Appartement,Maison
0,1370,2016,Saint-Laurent-sur-Saône,1,9,1750,1770,95397,63,37,46.31,4.843,2.7,1,0
1,1364,2016,Saint-Jean-sur-Reyssouze,1,1714,1560,1272,140964,118,18,46.422,5.086,4.3,0,1


In [144]:
df2.head(2)

Unnamed: 0,code,type,Year,commune,code_dep,Surface,Code_post,Prixm2,price,Avg_sqm,transactions,lat,lon,No_rooms
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [161]:
one_hot_encoded_data = pd.get_dummies(df2, columns = ['commune', 'type'])
one_hot_encoded_data.head(2)

Unnamed: 0,code,Year,code_dep,Surface,Code_post,Prixm2,price,Avg_sqm,transactions,lat,...,commune_Saint-Laurent-sur-Saône,commune_Saint-Martin-du-Mont,commune_Saint-Étienne-du-Bois,commune_Tossiat,commune_Val-Revermont,commune_Vernoux,commune_Villereversure,commune_Viriat,type_Appartement,type_Maison
0,1370,2016,1,9,1750,1770,95397,63,37,46.31,...,1,0,0,0,0,0,0,0,1,0
1,1364,2016,1,1714,1560,1272,140964,118,18,46.422,...,0,0,0,0,0,0,0,0,0,1


In [162]:
one_hot_encoded_data.shape

(50, 60)

In [175]:
def transform_string(string, separator):
    L = string.split(separator)
    return L[1:][0] if L[0] in ['commune', 'type'] else string

new_names = []

for col in one_hot_encoded_data.columns:
    #one_hot_encoded_data.col = one_hot_encoded_data.apply(lambda x: transform_string(x,'_'))
    
    col = transform_string(string = col, separator = '_')
    new_names.append(col)
    

one_hot_encoded_data.columns = new_names
one_hot_encoded_data.shape

(50, 60)

In [176]:
one_hot_encoded_data.head(2)

Unnamed: 0,code,Year,code_dep,Surface,Code_post,Prixm2,price,Avg_sqm,transactions,lat,...,Saint-Laurent-sur-Saône,Saint-Martin-du-Mont,Saint-Étienne-du-Bois,Tossiat,Val-Revermont,Vernoux,Villereversure,Viriat,Appartement,Maison
0,1370,2016,1,9,1750,1770,95397,63,37,46.31,...,1,0,0,0,0,0,0,0,1,0
1,1364,2016,1,1714,1560,1272,140964,118,18,46.422,...,0,0,0,0,0,0,0,0,0,1


# check if there are '_' separator in the columns "specially catgorical ones "

In [177]:
df = pd.read_csv('final_data.csv', dtype = {'code_dept' : object})
df.head(2)

Unnamed: 0,code,type,Year,commune,code_dep,Surface,Code_post,Prixm2,price,Avg_sqm,transactions,lat,lon,No_rooms
0,1370,Appartement,2016,Saint-Laurent-sur-Saône,1,9,1750,1770,95397,63,37,46.31,4.843,2.7
1,1364,Maison,2016,Saint-Jean-sur-Reyssouze,1,1714,1560,1272,140964,118,18,46.422,5.086,4.3


In [198]:
for col in df.columns:
    X = np.where('_' in col) == np.zeros(1)
    if X : #& (df[col].dtype == 'object'):
        print("********   ", col, df[col].dtype)

  if X & (df[col].dtype == 'object'):


In [190]:
df2.dtypes

code              int64
type             object
Year              int64
commune          object
code_dep          int64
Surface           int64
Code_post         int64
Prixm2            int64
price             int64
Avg_sqm           int64
transactions      int64
lat             float64
lon             float64
No_rooms        float64
dtype: object

In [199]:
def get_columns_with_separator(df, separator):
    cols_list = []
    for col in df.columns:
        X = np.where(separator in col) == np.zeros(1)
        if X : #& (df[col].dtype == 'object'):
            cols_list.append(col)
    return cols_list 


get_columns_with_separator(df= df2, separator= '_')

  if X : #& (df[col].dtype == 'object'):


['code_dep', 'Code_post', 'Avg_sqm', 'No_rooms']

In [203]:
def modify_separator_of_columns_dataframe(df, old_separator, new_separator):
    cols_list = []
    for col in df.columns:
        X = np.where(old_separator in col) == np.zeros(1)
        if X : #& (df[col].dtype == 'object'):
            col = col.replace(old_separator, new_separator)
            cols_list.append(col)
    return cols_list 


modify_separator_of_columns_dataframe(df= df2, old_separator= '_', new_separator = '-')

  if X : #& (df[col].dtype == 'object'):


['code-dep', 'Code-post', 'Avg-sqm', 'No-rooms']

In [209]:
df = pd.read_csv('final_data.csv', dtype = {'code_dept' : object})
df2 = df.head(50) # To make first checking the Encoder class, we only use the first 50 columns

def modify_separator_of_columns_dataframe(df, old_separator, new_separator):
    cols_list = []
    for col in df.columns:
        X = np.where(old_separator in col) == np.zeros(1)
        if X : #& (df[col].dtype == 'object'):
            col = col.replace(old_separator, new_separator)
            cols_list.append(col)
    return cols_list 


new_df = modify_separator_of_columns_dataframe(df= one_hot_encoded_df, old_separator= '_', new_separator = '-')


one_hot_encoded_df = pd.get_dummies(df2, columns = ['commune', 'type'])
one_hot_encoded_df.head(2)

  if X : #& (df[col].dtype == 'object'):


Unnamed: 0,code,Year,code_dep,Surface,Code_post,Prixm2,price,Avg_sqm,transactions,lat,...,commune_Saint-Laurent-sur-Saône,commune_Saint-Martin-du-Mont,commune_Saint-Étienne-du-Bois,commune_Tossiat,commune_Val-Revermont,commune_Vernoux,commune_Villereversure,commune_Viriat,type_Appartement,type_Maison
0,1370,2016,1,9,1750,1770,95397,63,37,46.31,...,1,0,0,0,0,0,0,0,1,0
1,1364,2016,1,1714,1560,1272,140964,118,18,46.422,...,0,0,0,0,0,0,0,0,0,1


In [None]:
def transform_columns_dataframe(string, separator):
    L = string.split(separator)
    return L[1:][0] if L[0] in ['commune', 'type'] else string

In [211]:
new_names = []

for col in one_hot_encoded_df.columns:
    #one_hot_encoded_data.col = one_hot_encoded_data.apply(lambda x: transform_string(x,'_'))
    
    col = transform_string(string = col, separator = '_')
    new_names.append(col)
    

one_hot_encoded_df.columns = new_names
one_hot_encoded_df.shape

(50, 60)

In [212]:
one_hot_encoded_df.head(2)

Unnamed: 0,code,Year,code_dep,Surface,Code_post,Prixm2,price,Avg_sqm,transactions,lat,...,Saint-Laurent-sur-Saône,Saint-Martin-du-Mont,Saint-Étienne-du-Bois,Tossiat,Val-Revermont,Vernoux,Villereversure,Viriat,Appartement,Maison
0,1370,2016,1,9,1750,1770,95397,63,37,46.31,...,1,0,0,0,0,0,0,0,1,0
1,1364,2016,1,1714,1560,1272,140964,118,18,46.422,...,0,0,0,0,0,0,0,0,0,1
