#### Practical Statistics for Data Scientists (Python)
#### Chapter 1. Exploratory Data Analysis
##### > (c) 2019 Peter C. Bruce, Andrew Bruce, Peter Gedeck


In [2]:

from pathlib import Path

import pandas as pd
import numpy as np
from scipy.stats import trim_mean
from statsmodels import robust
import wquantiles

import seaborn as sns
import matplotlib.pylab as plt

# Locate the directory that holds the data sets. If a `common` module can be
# imported, its dataDirectory() result is used; otherwise the notebook falls
# back to `dataset/data` below the current working directory.
try:
    import common
    DATA = common.dataDirectory()
except ImportError:
    # Any ImportError raised in the try body (typically: no `common` module on
    # the path) lands here. Path().resolve() is the absolute current directory.
    DATA = Path().resolve() / 'dataset/data'

In [3]:
# Paths to the individual data set files, all built relative to DATA.
# If you don't keep your data in the same directory as the code, adapt the
# path names.

AIRLINE_STATS_CSV = DATA / 'airline_stats.csv'
KC_TAX_CSV = DATA / 'kc_tax.csv.gz'
LC_LOANS_CSV = DATA / 'lc_loans.csv'
AIRPORT_DELAYS_CSV = DATA / 'dfw_airline.csv'
SP500_DATA_CSV = DATA / 'sp500_data.csv.gz'
SP500_SECTORS_CSV = DATA / 'sp500_sectors.csv'
STATE_CSV = DATA / 'state.csv'


In [5]:
## Estimates of Location
### Example: Location Estimates of Population and Murder Rates

# Table 1-2: read the state data set into a data frame and print its first
# eight rows.
state = pd.read_csv(STATE_CSV)
print(state.head(n=8))

         State  Population  Murder.Rate Abbreviation
0      Alabama     4779736          5.7           AL
1       Alaska      710231          5.6           AK
2      Arizona     6392017          4.7           AZ
3     Arkansas     2915918          5.6           AR
4   California    37253956          4.4           CA
5     Colorado     5029196          2.8           CO
6  Connecticut     3574097          2.4           CT
7     Delaware      897934          5.8           DE


To compute the mean and median in Python we can use the pandas methods of the data frame. The trimmed mean requires the `trim_mean` function from `scipy.stats`.

In [9]:
# Three estimates of location for the state populations, printed one per line:
# the mean, the 10% trimmed mean, and the median.
print(
    state['Population'].mean(),
    trim_mean(state['Population'], proportiontocut=0.1),
    state['Population'].median(),
    sep='\n',
)

# The mean is bigger than the trimmed mean, which is bigger than the median.
# This is because the trimmed mean excludes the largest and smallest five
# states (trim=0.1 drops 10% from each end).

6162876.3
4783697.125
4436369.5


In [12]:
# Murder rate weighted by state population, printed one per line:
# the weighted mean, then the weighted median.
print(
    np.average(state['Murder.Rate'], weights=state['Population']),
    wquantiles.median(state['Murder.Rate'], weights=state['Population']),
    sep='\n',
)

# In this case, the weighted mean and the weighted median are about the same.

4.445833981123393
4.4
