In [81]:
import pandas as pds
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
class DataHandler:

    def __init__(self):
        print("Initialization")
        self.df_listings_final = None
        self.df_price_availability = None
        self.df_merge = None

    def get_data(self):
        print("Get data from bucket")
        self.df_listings_final = pds.read_csv("https://storage.googleapis.com/h3-data/listings_final.csv", sep=";")
        self.df_price_availability = pds.read_csv("https://storage.googleapis.com/h3-data/price_availability.csv", sep=";")

    def group_data(self):
        # merge
        print("Data merged") 
        self.df_merge = pds.merge(self.df_price_availability.groupby('listing_id')['local_price'].mean(), self.df_listings_final, on='listing_id')

    def get_process_data(self):
        self.get_data()
        self.group_data()

In [3]:
d = DataHandler()


Initialization


In [4]:
d.get_process_data()
%time

Get data from bucket
Data merged
CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.48 µs


In [106]:
d.df_merge.head()

Unnamed: 0,listing_id,local_price,name,type,city,neighborhood,latitude,longitude,person_capacity,beds,bedrooms,bathrooms,is_rebookable,is_new_listing,is_fully_refundable,is_host_highly_rated,is_business_travel_ready,pricing_weekly_factor,pricing_monthly_factor
0,56093,170.0,Beau duplex dans le Marais,entire_home,Paris,3e arrondissement,48.867284,2.358431,4,2,1,1.0,False,False,True,True,False,0.88,1.0
1,57207,49.952756,Belle Chambre pour court,private_room,Paris,Vaugirard,48.846184,2.304455,2,1,1,1.0,False,False,True,False,False,0.87,1.0
2,114543,107.374026,Charming 1bdr 55m² - Eiffel Tower,entire_home,Paris,,48.84953,2.290219,2,1,1,1.0,False,False,True,True,False,0.9,0.9
3,149534,169.0,GREAT WARM FULL APT LE HAUT MARAIS,entire_home,Paris,,48.86636,2.361844,4,2,1,1.0,False,False,True,True,False,1.0,0.4
4,164255,75.876209,Perfect place in Le Marais - Paris,entire_home,Paris,3e arrondissement,48.861398,2.364299,4,2,1,1.0,False,False,True,False,False,1.0,1.0


In [107]:
d.df_merge.tail()

Unnamed: 0,listing_id,local_price,name,type,city,neighborhood,latitude,longitude,person_capacity,beds,bedrooms,bathrooms,is_rebookable,is_new_listing,is_fully_refundable,is_host_highly_rated,is_business_travel_ready,pricing_weekly_factor,pricing_monthly_factor
994,28684174,725.175781,Chambre familiale vue jardin avec petit-déjeun...,private_room,Paris,Ternes,48.879223,2.292382,5,0,1,1.0,False,True,True,False,False,1.0,1.0
995,28709644,475.0,LORD BYRON-SPACE& STYLE IN 8TH EME,entire_home,Paris,Champs-Elysées,48.872202,2.298349,4,2,1,1.0,False,True,True,False,False,1.0,1.0
996,28751412,117.0,Malesherbes Monceau Monsen,entire_home,Paris,Monceau,48.880923,2.314568,2,1,0,1.0,False,True,True,False,False,1.0,1.0
997,28774896,156.397468,5 min to invalides and 10 min to eiffel tower,entire_home,Paris,Invalides - Ecole Militaire,48.852915,2.314519,2,1,1,1.0,False,True,True,False,False,1.0,1.0
998,28792796,49.184211,Appartement 3 chambres madeleine.,entire_home,Paris,Madeleine - Vendôme,48.870109,2.321475,6,4,2,1.5,False,True,True,False,False,1.0,1.0


In [108]:
d.df_merge.shape

(999, 19)

In [143]:
class FeatureRecipe:
    
    def __init__(self, data: pds.DataFrame):
        print("FeatureRecipe starts...")
        self.df = data
        self.cate = []
        self.floa = []
        self.intt = []
        print("End of FeatureRecipe initialisation\n")
    
    def separate_variable_types(self) -> None:
        print("Separate variable types starts...")
        for col in self.df.columns:
            if self.df[col].dtypes == int:
                self.intt.append(self.df[col])
            elif self.df[col].dtypes == float:
                self.floa.append(self.df[col])
            else:
                self.cate.append(self.df[col])
        print("Separate variable types end...")
        print ("Dataset number of columns : {} \nnumber of discreet values : {} \nnumber of continuous values : {} \nnumber of others : {} \ntotal size : {}\n".format(len(self.df.columns),
        len(self.intt),len(self.floa),len(self.cate),len(self.intt)+len(self.floa)+len(self.cate) ))
        
    def drop_uselessf(self):
        print("Drop useless feature start...")
        
        if "Unnamed: 0" in self.df.columns:
            self.df.drop("Unnamed: 0", axis=1, inplace=True)
            
        for col in self.df.columns:
            if self.df[col].isna().sum() == len(self.df[col]):
                self.df.drop([col], axis=1, inplace=True)
                
        print("Drop useless feature end...")
        print("Number columns remaining {}\n".format(len(self.df.columns)))
        
    def deal_duplicate(self):
        print("Deal duplicate start...")
        dropped_duplicates = self.df.drop_duplicates(inplace=True)
        print("Dropped duplicates : {}".format(dropped_duplicates))
        print("Deal duplicate end...")
    
    def drop_nanp(self, threshold: float):
        dropped = 0
        print("Drop columns with {} percentage of NAN".format(threshold))
        self.get_duplicates()
              
        for col in self.df.columns:
            if self.df[col].isna().sum() / self.df.shape[0] >= threshold:
                self.df.drop([col], axis=1, inplace=True)
                dropped+=1
              
        print("Number of columns dropped : {}\n".format(dropped))
    
    def get_duplicates(self):
        print("Get duplicates")
        drop_col = []
        for col_index in range(self.df.shape[1]-1):
            for second_col_index in range(col_index+1,self.df.shape[1]-1):
                if np.where(sum(self.df.iloc[:,col_index] == self.df.iloc[:,second_col_index]) == 1, True, False):
                    drop_col.append(self.df.iloc[:,second_col_index])
                    
        print("Drop col : {}".format(drop_col))
        return drop_col
              
    def deal_dtime(self):
        pass
    
    def prepare_data(self, threshold: float):
        self.drop_uselessf()
        self.separate_variable_types()
        self.deal_duplicate()
        self.drop_nanp(threshold)
        self.deal_dtime()


In [144]:
frecipe = FeatureRecipe(d.df_merge)

FeatureRecipe starts...
End of FeatureRecipe initialisation



In [145]:
frecipe.prepare_data(3)

Drop useless feature start...
Drop useless feature end...
Number columns remaining 19

Separate variable types starts...
Separate variable types end...
Dataset number of columns : 19 
number of discreet values : 4 
number of continuous values : 6 
number of others : 9 
total size : 19

Deal duplicate start...
Dropped duplicates : None
Deal duplicate end...
Drop columns with 3 percentage of NAN
Get duplicates
Condition : 0      False
1      False
2      False
3      False
4      False
       ...  
994    False
995    False
996    False
997    False
998    False
Length: 999, dtype: bool test
Condition : 0      False
1      False
2      False
3      False
4      False
       ...  
994    False
995    False
996    False
997    False
998    False
Length: 999, dtype: bool test
Condition : 0      False
1      False
2      False
3      False
4      False
       ...  
994    False
995    False
996    False
997    False
998    False
Length: 999, dtype: bool test
Condition : 0      False
1      F

In [118]:
##### class FeatureExtractor:
    """
        Feature Extractor class
    """
    
    def __init__(self, data : pds.DataFrame, flist: list):
        """
            Input : pandas.DataFrame, feature list to drop
            Output : X_train, X_test, y_train, y_test according to sklearn.model_selection.train_test_split
        """
        self.x_train, self.x_test, self.y_train, self.y_test = None, None, None, None
        self.df = data
        self.flist = flist
        
    def split(size: float, random_state :int = 42):
        pass
    
    def train():
        pass
    

IndentationError: unexpected indent (<ipython-input-118-05fd4cfe185e>, line 2)