In [1]:
import wandb
import pandas as pd
import pandas_profiling

In [2]:
class EDA:
    """Exploratory-data-analysis helper tied to a single Weights & Biases run.

    Creating an instance starts a W&B run, fetches the requested artifact
    through that run, and reads the downloaded file into ``self.df`` with
    ``pandas.read_csv``.

    Attributes:
        min_price: Lower bound used by ``remove_outliers``.
        max_price: Upper bound used by ``remove_outliers``.
        run: Object returned by ``wandb.init``.
        local_path: Value returned by the artifact's ``file()`` call.
        df: DataFrame loaded from ``local_path``; replaced by
            ``remove_outliers`` and updated by ``post_process``.
        profile: ``None`` until ``create_profile`` has been called, then the
            generated ``pandas_profiling.ProfileReport``.
    """

    def __init__(
            self,
            project="nyc_airbnb",
            group="eda",
            save_code=True,
            artifact="sample.csv:latest",
            min_price=10,
            max_price=350
        ):
        """Start the W&B run and load the artifact into a DataFrame.

        Args:
            project: Forwarded to ``wandb.init`` as ``project``.
            group: Forwarded to ``wandb.init`` as ``group``.
            save_code: Forwarded to ``wandb.init`` as ``save_code``.
            artifact: Artifact reference passed to ``use_artifact``.
            min_price: Lower price bound kept by ``remove_outliers``.
            max_price: Upper price bound kept by ``remove_outliers``.
        """
        self.min_price = min_price
        self.max_price = max_price
        # Define the attribute up front so it always exists on the instance,
        # even before create_profile() has been called.
        self.profile = None

        self.run = wandb.init(project=project, group=group, save_code=save_code)
        # Resolve the artifact through the run created above rather than the
        # module-level helper, so the dependency on this run is explicit.
        self.local_path = self.run.use_artifact(artifact).file()
        self.df = pd.read_csv(self.local_path)

    def create_profile(self):
        """Build a profiling report for ``self.df`` and render it as widgets.

        The report is stored on ``self.profile``. The "Matrix" and
        "Dendrogram" missing-value diagrams are switched off.
        """
        self.profile = pandas_profiling.ProfileReport(
            self.df,
            missing_diagrams={"Matrix": False, "Dendrogram": False},
        )
        self.profile.to_widgets()

    def remove_outliers(self):
        """Keep only rows whose ``price`` is between ``min_price`` and ``max_price``.

        ``self.df`` is replaced by a copy of the filtered frame; the original
        row labels are kept (the index is not reset).
        """
        in_range = self.df['price'].between(self.min_price, self.max_price)
        self.df = self.df[in_range].copy()

    def post_process(self):
        """Drop price outliers, then convert ``last_review`` to datetime."""
        self.remove_outliers()
        self.df['last_review'] = pd.to_datetime(self.df['last_review'])

    def finish(self):
        """Finish the W&B run that was started in ``__init__``."""
        self.run.finish()

    

In [3]:
# Instantiate with the default arguments: this starts a W&B run, fetches the
# "sample.csv:latest" artifact and loads it into eda.df.
eda = EDA()

[34m[1mwandb[0m: Currently logged in as: [33mhieutrungdao[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Build the pandas-profiling report for eda.df and render it as widgets.
eda.create_profile()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [5]:
# Filter rows to the configured price range and convert last_review to datetime.
eda.post_process()

In [6]:
# Check column dtypes and non-null counts after post-processing.
eda.df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19001 entries, 0 to 19999
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              19001 non-null  int64         
 1   name                            18994 non-null  object        
 2   host_id                         19001 non-null  int64         
 3   host_name                       18993 non-null  object        
 4   neighbourhood_group             19001 non-null  object        
 5   neighbourhood                   19001 non-null  object        
 6   latitude                        19001 non-null  float64       
 7   longitude                       19001 non-null  float64       
 8   room_type                       19001 non-null  object        
 9   price                           19001 non-null  int64         
 10  minimum_nights                  19001 non-null  int64         
 11  nu

In [7]:
# Finish the W&B run that was started when `eda` was constructed.
eda.finish()

VBox(children=(Label(value='0.078 MB of 0.078 MB uploaded (0.009 MB deduped)\r'), FloatProgress(value=1.0, max…