In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
pd.options.display.max_columns = None
from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
from datetime import date, datetime
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from statsmodels.stats.proportion import proportions_ztest
get_ipython().run_line_magic('matplotlib', 'inline')

In [2]:
def reader_csv(file):
    """Read a CSV source into a pandas DataFrame.

    Parameters
    ----------
    file : str, path object or file-like
        Anything accepted by ``pandas.read_csv`` as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, using ``read_csv`` defaults.
    """
    return pd.read_csv(file)


def print_dim(names, df):
    """Print a one-line summary of a dataset's dimensions.

    Parameters
    ----------
    names : object
        Label inserted into the message to identify the dataset.
    df : pandas.DataFrame
        Object whose ``shape[0]`` (rows) and ``shape[1]`` (columns) are reported.
    """
    dims = df.shape
    message = "The dataset {} has {} rows and {} columns".format(names, dims[0], dims[1])
    print(message)


def drop_duplicates_items(df):
    """Report how many duplicated rows ``df`` has, then remove them in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to de-duplicate. It is modified in place, so callers that
        ignore the return value still see the duplicates removed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now without duplicated rows.
    """
    print("The number of duplicates is {}".format(df.duplicated().sum()))
    # drop_duplicates(inplace=True) returns None; do not rebind df to its
    # result, otherwise the function would hand None back to the caller.
    df.drop_duplicates(inplace=True)
    return df


def get_data_info(df):
    """Print the ``DataFrame.info()`` summary of ``df`` under a heading.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column/dtype/non-null summary is printed.
    """
    import io

    # DataFrame.info() writes to stdout and returns None, so formatting its
    # return value prints the summary first and then "None" after the heading.
    # Capture the text in a buffer so it can be placed inside the message.
    buffer = io.StringIO()
    df.info(buf=buffer)
    print("Data Information:\n {}".format(buffer.getvalue().rstrip("\n")))

    
def data_nan_count(df):
    """Print the number of missing values found in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect for missing values.
    """
    missing_per_column = pd.isna(df).sum()
    print("Amount of Nan per columns:\n {}".format(missing_per_column))

def data_description(df):
    """Print the ``describe()`` summary statistics of ``df`` under a heading.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    """
    summary = df.describe()
    report = "Descriptive Statistics:\n {}".format(summary)
    print(report)
    

def host_to_year(df, col, col_new):
    """Replace a date column with the elapsed time since that date, in years.

    The elapsed time is the whole number of days between ``datetime.now()``
    and each value of ``df[col]``, divided by 365.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    col : hashable
        Name of the date column. Values are passed through
        ``pandas.to_datetime``, so datetime values and parseable date
        strings are both accepted.
    col_new : hashable
        Name of the column that receives the elapsed years.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``col`` and with ``col_new`` set. Missing
        dates yield NaN.
    """
    elapsed = datetime.now() - pd.to_datetime(df[col])
    # Vectorised accessor instead of a row-wise apply; NaT becomes NaN.
    years = elapsed.dt.days / 365
    # drop() returns a new frame, so no scratch column is written into the
    # caller's DataFrame and an existing 'tmp' column is left untouched.
    result = df.drop(columns=[col])
    result[col_new] = years
    return result


def replace_chars(input_string, chars, modified_string):
    """Replace every occurrence of each item in ``chars`` within ``input_string``.

    Parameters
    ----------
    input_string : object
        Usually a string. Values that do not support ``in`` / ``.replace``
        (for example NaN or None) are returned unchanged.
    chars : iterable
        Substrings to look for, processed in order.
    modified_string : str
        Replacement text for each match.

    Returns
    -------
    object
        The string after all replacements, or the original value when no
        replacement could be applied.
    """
    for char in chars:
        try:
            if char in input_string:
                input_string = input_string.replace(char, modified_string)
        except (TypeError, AttributeError):
            # Best-effort: skip inputs/items of an unsupported type, but let
            # anything else (e.g. KeyboardInterrupt) propagate instead of
            # being swallowed by a bare except.
            continue
    return input_string


def plots_comparison(group1, group2, feature):
    """Draw side-by-side count plots of ``feature`` for two groups.

    The plots are drawn on the current matplotlib figure as a 1x2 grid and
    then shown.

    Parameters
    ----------
    group1, group2 : pandas.DataFrame
        Frames to compare; each must contain the ``feature`` column.
    feature : hashable
        Name of the column whose value counts are plotted.
    """
    # Left panel. The series is passed as x= explicitly: positional data
    # arguments other than `data` are deprecated in recent seaborn releases.
    plt.subplot(1, 2, 1)
    sns.countplot(x=group1[feature])
    # Rotate after plotting, as the right panel does, so the rotation is
    # applied to the tick labels countplot has just set.
    plt.xticks(rotation='vertical')

    # Right panel.
    plt.subplot(1, 2, 2)
    sns.countplot(x=group2[feature])
    plt.xticks(rotation='vertical')

    plt.tight_layout()
    plt.show()

def category_impute(strategy, df, features_to_impute):
    """Impute missing values in selected columns with ``SimpleImputer``.

    Parameters
    ----------
    strategy : str
        Passed to ``SimpleImputer(strategy=...)``.
    df : pandas.DataFrame
        Input frame; it is not modified.
    features_to_impute : list
        Column names to impute.

    Returns
    -------
    pandas.DataFrame
        New frame holding the untouched columns followed by the imputed
        columns (so the imputed columns move to the end).
    """
    imputer = SimpleImputer(strategy=strategy)
    imputed_values = imputer.fit_transform(df[features_to_impute])
    # Reuse df's index: concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows (and introduce NaNs) whenever df
    # has a non-default index, e.g. after rows were filtered out.
    imputed_features = pd.DataFrame(
        data=imputed_values, columns=features_to_impute, index=df.index
    )
    untouched = df.drop(features_to_impute, axis=1)
    return pd.concat([untouched, imputed_features], axis=1)

In [3]:
# Load the listings CSV (path relative to the working directory) into a
# DataFrame using the reader_csv helper defined above.
listings_data = reader_csv('listings.csv')

In [4]:
# Run each exploratory report on the listings frame in turn, printing a blank
# line after every report to separate the outputs.
for report in (drop_duplicates_items, get_data_info, data_nan_count, data_description):
    report(listings_data)
    print()

The number of duplicates is 0

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3818 entries, 0 to 3817
Data columns (total 92 columns):
id                                  3818 non-null int64
listing_url                         3818 non-null object
scrape_id                           3818 non-null int64
last_scraped                        3818 non-null object
name                                3818 non-null object
summary                             3641 non-null object
space                               3249 non-null object
description                         3818 non-null object
experiences_offered                 3818 non-null object
neighborhood_overview               2786 non-null object
notes                               2212 non-null object
transit                             2884 non-null object
thumbnail_url                       3498 non-null object
medium_url                          3498 non-null object
picture_url                         3818 non-null object
xl_pictur

In [5]:
# Columns of the listings data kept for the analysis.
features_list = ['accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'minimum_nights',
            'price', 'host_since', 'host_response_rate', 'host_response_time',
            'host_acceptance_rate', 'host_is_superhost', 'host_has_profile_pic',
            'has_availability', 'number_of_reviews', 'review_scores_rating',
            'cancellation_policy', 'require_guest_profile_picture', 'require_guest_phone_verification',
            'host_identity_verified', 'neighbourhood', 'property_type', 'room_type']

# Binary features. The comma after 'has_availability' is required: without it
# Python concatenates the two adjacent string literals into a single bogus
# name 'has_availabilityrequire_guest_profile_picture'.
vars_bin = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'has_availability',
            'require_guest_profile_picture', 'require_guest_phone_verification']
# Target variable.
var_target = 'price'
# Numeric features ('host_in_years' is not part of features_list).
vars_num = ['host_response_rate', 'host_acceptance_rate', 'accommodates', 'bathrooms', 'bedrooms',
            'beds', 'minimum_nights', 'number_of_reviews', 'review_scores_rating', 'host_in_years']
# Categorical features.
vars_cat = ['host_response_time', 'neighbourhood', 'property_type', 'room_type', 'bed_type',
            'cancellation_policy']

# Keep only the selected features. The explicit .copy() makes df an
# independent frame, so later column assignments do not raise
# SettingWithCopyWarning and can never affect listings_data.
df = listings_data[features_list].copy()