# Missing Values

In [4]:
import pandas as pd
df = pd.read_csv('listings_summary.csv.zip')

In [9]:
pd.set_option('display.max_rows',100)

In [2]:
def nas_sorted(df):
    return df.isnull().sum().sort_values(ascending = False)

In [5]:
sorted_nas = nas_sorted(df)

In [6]:
sorted_nas[0:3]

xl_picture_url        22552
jurisdiction_names    22552
thumbnail_url         22552
dtype: int64

### Look for additional na values

Oftentimes, we can have missing values that are stored as other values.  For example, we may have values that are stored as -999 or positive 999.  Our values may be stored as empty strings.  

Pandas provides us with a `df.replace` function that allows us to perform find and replace across a dataframe.  However, we may not like to replace our values until we view the values we are about to replace.  After all, perhaps they indicate real values that we do not wish to replace.

In [32]:
def column_matches(df, column, match_value):
    return np.array([column, df[df[column] == match_value].index.to_numpy()])

In [33]:
import numpy as np
def any_matches(df, match_value):
    column_idx_matches = np.array([column_matches(df, column, match_value) for column in df.columns])
    return np.array(np.array([match for match in column_idx_matches if match[1].any()]))

In [45]:
matches = any_matches(df, 3.76)

In [46]:
matches

array([['reviews_per_month', array([    0,  7464, 12313, 16992])]],
      dtype=object)

In [36]:
def view_matches(df, match_value):
    matches = any_matches(df, match_value)
    if not matches.any():
        print('NO MATCHES FOR PROVIDED VALUE')
        return pd.DataFrame()
    match_columns = matches[:, 0]
    rows = np.concatenate(matches[:, 1])
    return df[match_columns].iloc[rows]

In [53]:
matched_df = view_matches(df, 3.76)

In [54]:
matched_df

Unnamed: 0,reviews_per_month
0,3.76
7464,3.76
12313,3.76
16992,3.76


If we feel like we want to remove numbers like 3.76, we can do so with the `df.replace` method.

In [55]:
new_df = df.replace(to_replace = 3.76, value = 'foobar')

In [56]:
view_matches(new_df, 'foobar')

  result = method(y)


Unnamed: 0,reviews_per_month
0,foobar
7464,foobar
12313,foobar
16992,foobar


### Find outliers 

Sometimes our missing value numbers are not stored as numbers that we cannot predict.  We may discover them to be outliers in our dataset.  It's a good idea to have some methods so that we can explore outliers in our dataset.

In [272]:
np.arange(-3, 4, 1)

array([-3, -2, -1,  0,  1,  2,  3])

In [60]:
from scipy import stats

def percentiles(column):
    z_scores = stats.zscore(column)
    # segment based on number of standard deviations away from the mean     
    hist, bin_edges = np.histogram(z_scores, bins=np.arange(-3, 4, 1), density=True)
    return np.stack((hist, bin_edges[1:]))

In [106]:
percentiles(df['availability_30'])

array([[ 0.        ,  0.        ,  0.71368393,  0.12034409,  0.07901738,
         0.08695459],
       [-2.        , -1.        ,  0.        ,  1.        ,  2.        ,
         3.        ]])

In [112]:
def too_many_outliers(column, threshold = .05):
    #  expected .021 if normal distribution
    z_less_neg_two = percentiles(column)[0, 0]
    z_gt_two = percentiles(column)[0, -1]
    if z_less_neg_two > threshold or z_gt_two > threshold:
        return np.hstack((column.name, z_less_neg_two, z_gt_two))
    else:
        False

In [113]:
def outlier_columns(df, threshold = .05):
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    outlier_columns = np.array([too_many_outliers(df[column]) for column in numeric_columns])
    return np.array([column for column in outlier_columns if column is not None])

In [135]:
def select_outliers(column, upper_tail = True):
    if upper_tail:
        return column[stats.zscore(column) > 2]
    else:
        return column[stats.zscore(column) < -2]

In [137]:
outlier_columns(df)

  return (a - mns) / sstd
  return n/db/n.sum(), bin_edges


array([['host_id', '0.0', '0.06784320681092586'],
       ['availability_30', '0.0', '0.08695459382759844'],
       ['availability_60', '0.0', '0.08043632493792124'],
       ['availability_90', '0.0', '0.07338595246541327'],
       ['availability_365', '0.0', '0.08526959914863427']], dtype='<U32')

In [138]:
select_outliers(df['availability_30']).value_counts()

29    448
30    344
23    246
27    234
28    228
25    180
24    148
26    133
Name: availability_30, dtype: int64

### Add Columns

In [153]:
def informative(df):
    non_informative = [column for column in df.columns if len(df[column].unique()) == 1]
    informative_columns = list(set(df.columns.to_list()) - set(non_informative))
    return df[informative_columns]
    
def some_nans(df):
    informative_df = informative(df)
    some_nans_bools = pd.isnull(informative_df).any()
    return some_nans_bools.index[some_nans_bools]
    
def new_na_columns(df):
    nan_columns = some_nans(df)
    df_nans = pd.isnull(df[nan_columns])
    column_name_nas = ["{column_nan}_is_na".format(column_nan = column_nan) for column_nan in nan_columns]
    df_nans.columns = column_name_nas
    return df_nans

def new_df_with_na_cols(df):
    return pd.concat([df, new_na_columns(df)], axis = 1)

In [145]:
len(df.columns)

96

In [147]:
len(informative(df).columns)

85

In [161]:
df_with_na_cols = new_df_with_na_cols(df)

In [162]:
len(df_with_na_cols.columns)

144

### Impute Means

We should impute the means only after properly coercing columns into the correct format.

In [139]:
# def impute_means_on_nas(df):
#     copied_df = df.copy()
#     mean_series = df.mean(axis = 0, skipna = True, numeric_only = True)
#     values = dict(zip(mean_series.index, mean_series.values))
#     copied_df.fillna(value = values)

### Resources

[detecting missing values](https://towardsdatascience.com/ways-to-detect-and-remove-the-outliers-404d16608dba)

[detecting outliers](https://machinelearningmastery.com/how-to-use-statistics-to-identify-outliers-in-data/)