In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt

In [2]:
def convert_price_to_float(data, data_dictionary):
    """
    Function to convert string price columns into float price columns

    For every object-dtype column whose name contains 'price', a new column
    named '<column>_float' is added to the dataframe stored under
    data_dictionary[data]. The original string column is kept as it is.

    :param data: type of data set (key into data_dictionary)
    :param data_dictionary: dictionary with airbnb dataframes
    :return: dictionary containing updated dataframes with float price columns
    """
    frame = data_dictionary[data]
    string_columns = frame.select_dtypes(include=['object']).columns
    price_columns = [col for col in string_columns if 'price' in col]
    for price in price_columns:
        # regex=False is spelled out on purpose: '$' has to be removed as a
        # literal character. Read as a regular expression it is the
        # end-of-string anchor, and the default of `regex` differs between
        # pandas versions.
        frame[price + '_float'] = frame[price]\
            .str.replace('$', '', regex=False)\
            .str.replace(',', '', regex=False)\
            .astype(float)
    return data_dictionary

In [3]:
def split_date(data, data_dictionary, date_column):
    """
    Function to split date column into separate year, month, day columns

    The split is done on '-' and only when date_column exists and has object
    dtype. The three new columns are named '<date_column>_year',
    '<date_column>_month' and '<date_column>_day' and hold the string parts.
    The dataframe stored under data_dictionary[data] is updated in place.

    :param data: type of data set (key into data_dictionary)
    :param data_dictionary: dictionary with airbnb dataframes
    :param date_column: name of date column
    :return: dictionary containing updated dataframes with new date columns
    """
    frame = data_dictionary[data]
    if date_column in frame.select_dtypes(include=['object']).columns:
        # n=2 caps the result at three parts, matching the three target
        # columns. It is passed by keyword because pandas 2.0 made every
        # argument after the pattern keyword-only.
        date_parts = frame[date_column].str.split('-', n=2, expand=True)
        for position, suffix in enumerate(('_year', '_month', '_day')):
            if position in date_parts.columns:
                frame[date_column + suffix] = date_parts[position]
            else:
                # No value had enough separators to produce this part.
                frame[date_column + suffix] = float('nan')
    return data_dictionary

In [4]:
def load_csv(cities, data_sets, source_directory):
    """
    Function to load the csv data sets of every requested city

    Each file is expected at '<source_directory>/<data set>-<city>.csv'. After
    reading, string price columns get a float counterpart and the 'date' and
    'host_since' columns (where present) are split into year, month and day.

    :param cities: list of city names
    :param data_sets: list of data set names
    :param source_directory: location of the csv data sets
    :return: dictionary keyed by city; each value is a dictionary keyed by
             data set name holding the prepared dataframe
    """
    frames_by_city = {}
    for city in cities:
        frames = {}
        for data in data_sets:
            csv_path = '/'.join([source_directory, data + '-' + city + '.csv'])
            # read the raw file first, then run the clean-up helpers on it
            frames[data] = pd.read_csv(csv_path)
            frames = convert_price_to_float(data, frames)
            for date_column in ('date', 'host_since'):
                frames = split_date(data, frames, date_column)
        frames_by_city[city] = frames
    return frames_by_city

In [7]:
def concatenate_dataframes(dictionary, data):
    """
    Function to combine the data sets of both cities into one dataframe

    A 'boston' flag column (0 for Seattle rows, 1 for Boston rows) is written
    onto both stored dataframes. Only columns present in both cities are kept,
    in the order they appear in the Seattle dataframe.

    :param dictionary: complete data dictionary with 'seattle' and 'boston' keys
    :param data: particular data of interest
    :return: dataframe including both cities
    """
    seattle_df = dictionary['seattle'][data]
    boston_df = dictionary['boston'][data]
    seattle_df['boston'] = 0
    boston_df['boston'] = 1
    # Use an ordered list, not a set, to select columns: pandas 2.x rejects
    # set indexers and a set would make the column order vary between runs.
    boston_columns = set(boston_df.columns)
    shared_columns = [col for col in seattle_df.columns if col in boston_columns]
    df = pd.concat([seattle_df[shared_columns], boston_df[shared_columns]])
    return df

In [11]:
def check_missing_values(df, missing_values_maximum):
    """
    Function to find the columns with the largest share of missing data

    The share of missing values is computed per column, separately for the
    Seattle rows ('boston' == 0) and the Boston rows ('boston' == 1). The
    first 60 rows of the comparison are printed.

    :param df: dataframe with data to compare
    :param missing_values_maximum: value specifying the cutoff for missing data
    :return: list of columns whose missing share exceeds the cutoff in at
             least one of the two cities
    """
    def _missing_share(city_flag, label):
        # share of nulls per column for one city, as an ('index', label) frame
        share = df[df['boston'] == city_flag].isnull().mean().reset_index()
        share.columns = ['index', label]
        return share

    boston_share = _missing_share(1, 'boston')
    seattle_share = _missing_share(0, 'seattle')
    by_city = seattle_share.merge(boston_share, how='inner', on='index')
    by_city = by_city.sort_values(by=['boston', 'seattle'], ascending=False)
    print(by_city.head(60))
    above_cutoff = (by_city['boston'] > missing_values_maximum) \
        | (by_city['seattle'] > missing_values_maximum)
    return by_city[above_cutoff]['index'].tolist()

In [12]:
def drop_specified_columns(key, df):
    """
    Function to remove every column whose lower-cased name contains a substring

    :param key: substring to specify columns to delete
    :param df: dataframe to remove columns from
    :return: dataframe without the matching columns; the dataframe passed in
             is returned as it is when no column matches
    """
    matching = list(filter(lambda name: key in name.lower(), df.columns))
    if not matching:
        return df
    return df.drop(columns=matching)

In [None]:
def compare_all_float_columns(df):
    """
    Function to compare numeric series between Boston and Seattle data

    For every bool, int64 or float64 column whose name does not contain 'id',
    the describe() statistics of the Seattle rows ('boston' == 0) and the
    Boston rows ('boston' == 1) are printed side by side. Nothing is returned.

    :param df: dataframe containing airbnb data for Seattle and Boston
    """
    columns_to_check = df\
            .select_dtypes(include=['bool', 'int64', 'float64'])\
            .columns
    seattle_rows = df[df['boston'] == 0]
    boston_rows = df[df['boston'] == 1]
    for column in columns_to_check:
        # Substring test: any column with 'id' anywhere in its name is skipped.
        if 'id' in column:
            continue
        print('\n Describe ' + column)
        seattle_data = seattle_rows[column].describe().reset_index()
        # Same '-<city>' suffix as the Boston label, so both read alike.
        seattle_data.columns = ['index', column + '-seattle']
        boston_data = boston_rows[column].describe().reset_index()
        boston_data.columns = ['index', column + '-boston']
        merged_data = seattle_data.merge(boston_data, how='inner', on='index')
        print(merged_data.head(10))

In [None]:
def compare_specified_float_columns(df, compare_columns, agg_methods):
    """
    Function to print and bar-plot aggregates of chosen columns for both cities

    Every aggregation method yields one '<method>-seattle' and one
    '<method>-boston' column; the rows are the compared columns.

    :param df: dataframe containing airbnb data for Seattle and Boston
    :param compare_columns: columns to compare
    :param agg_methods: aggregation method name, or a list of such names
    """
    methods = agg_methods if isinstance(agg_methods, list) else [agg_methods]
    seattle_rows = df[df['boston'] == 0][compare_columns]
    boston_rows = df[df['boston'] == 1][compare_columns]

    per_method = []
    for method in methods:
        seattle_agg = seattle_rows.agg(method)
        boston_agg = boston_rows.agg(method)
        # a Series result is turned into a one-column frame so it can be labelled
        if isinstance(seattle_agg, pd.Series):
            seattle_agg, boston_agg = seattle_agg.to_frame(), boston_agg.to_frame()
        seattle_agg.columns = [method + '-seattle']
        boston_agg.columns = [method + '-boston']
        per_method.append(
            seattle_agg.merge(boston_agg, how='inner', left_index=True, right_index=True)
        )

    merged_data_all = pd.concat(per_method, axis=1, join='inner')
    print(merged_data_all.head(len(compare_columns)))
    ax = plt.gca()
    merged_data_all.plot(kind='bar', ax=ax)
    plt.show()

In [None]:
def run_linear_regression(df, y_column, x_columns):
    """
    Function to prepare training/testing data sets and run a linear regression

    The data is split 70/30 into training and test rows with a fixed random
    state, and the model is fitted on the training rows only.

    :param df: dataframe containing data to use regression on
    :param y_column: name of the dependent (LHS) column
    :param x_columns: names of the explanatory (RHS) columns
    :return: dictionary with two entries: 'coefficients', a dataframe of the
             fitted coefficients indexed by column name and sorted from
             largest to smallest, and 'r2_test', the R^2 score on the test rows
    """
    X = df[x_columns]
    y = df[y_column]

    # Split data into training and test data, and fit a linear model
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=42)
    # The `normalize` argument is not passed: scikit-learn 1.2 removed it and
    # raises a TypeError when it is given. NOTE(review): for a plain
    # least-squares fit on full-rank data the reported coefficients and the
    # predictions should match the old normalize=True result up to
    # floating-point error - confirm against stored results.
    lm_model = LinearRegression()
    lm_model.fit(X_train, y_train)
    coeff_df = pd.DataFrame(np.transpose(lm_model.coef_), X.columns, columns=['Coefficient'])\
        .sort_values('Coefficient', ascending=False)
    y_test_preds = lm_model.predict(X_test)
    r2_test = r2_score(y_test, y_test_preds)
    results = dict()
    results['coefficients'] = coeff_df
    results['r2_test'] = r2_test
    return results

In [15]:
def compare_grouby_results(df, groupby_colunmns, agg_columns, method):
    """
    Function to aggregate Boston and Seattle data per group and bar-plot both

    Seattle rows ('boston' == 0) and Boston rows ('boston' == 1) are grouped
    and aggregated separately, then joined on the group keys with an outer
    merge so groups present in only one city are kept.

    :param df: dataframe containing airbnb data for Seattle and Boston
    :param groupby_colunmns: columns to use as a basis for aggregation
    :param agg_columns: columns to aggregate and compare
    :param method: method to use for aggregation
    """
    per_city = {}
    for city_flag, city in ((0, 'seattle'), (1, 'boston')):
        city_rows = df[df['boston'] == city_flag]
        aggregated = city_rows.groupby(groupby_colunmns)[agg_columns].agg(method)
        aggregated.columns = [col + '-' + city for col in agg_columns]
        per_city[city] = aggregated

    merged_data = per_city['seattle'].merge(
        per_city['boston'], how='outer', left_index=True, right_index=True
    )
    ax = plt.gca()
    merged_data.plot(kind='bar', ax=ax)
    plt.show()