In [2]:
# Import packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [9]:
# Read in CSV data
df = pd.read_csv('data/product_purchases.csv')

# Preview the first rows, followed by a blank separator line
print(df.head(), "\n")

# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() would emit a stray "None" after the report.
# Call it directly and print the blank separator on its own.
df.info()
print()

     id  nps_score  subscribed_to_newsletter    interests  number_of_purchases
0  1462        3.0                     False  Fly Fishing                    8
1  1491        2.0                      True  Fly Fishing                   14
2  1492        NaN                      True  Fly Fishing                   13
3  1551        NaN                     False  Fly Fishing                   12
4  1553        NaN                     False  Fly Fishing                   10 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        1000 non-null   int64  
 1   nps_score                 391 non-null    float64
 2   subscribed_to_newsletter  1000 non-null   bool   
 3   interests                 789 non-null    object 
 4   number_of_purchases       1000 non-null   int64  
dtypes: bool(1), float64(1), int64(2), object(1)

## Summary Statistics

### Mean, Median, Min, Max

In [18]:
# Select the column once; every statistic below is derived from it
purchase_col = df['number_of_purchases']

# Central tendency
mean_purchases = purchase_col.mean()
median_purchases = purchase_col.median()

# Extremes
min_purchases = purchase_col.min()
max_purchases = purchase_col.max()

# Report each statistic (the !s conversion mirrors an explicit str() call)
print(f"Mean Purchases: {mean_purchases!s}")
print(f"Median Purchases: {median_purchases!s}")
print(f"Min Purchases: {min_purchases!s}")
print(f"Max Purchases: {max_purchases!s}")

Mean Purchases: 15.852
Median Purchases: 16.0
Min Purchases: 3
Max Purchases: 31


### Inter-quartile range (IQR)

In [21]:
# Inter-quartile range, which is the 75th percentile minus the 25th percentile
# Using the .agg() method with a custom function
def iqr(column):
    """Return the inter-quartile range of ``column``.

    The result is the 75th percentile minus the 25th percentile, each
    obtained from ``column.quantile``.
    """
    upper_quartile = column.quantile(0.75)
    lower_quartile = column.quantile(0.25)
    return upper_quartile - lower_quartile

# IQR on its own, via the custom aggregation function
print(df['number_of_purchases'].agg(iqr))

# Print the IQR and Median
# The median is requested by name ("median") rather than by passing np.median:
# recent pandas versions emit a FutureWarning for NumPy callables in .agg()
# and recommend the string form. The result and its "median" label are unchanged.
print(df['number_of_purchases'].agg([iqr, "median"]))

5.0
iqr        5.0
median    16.0
Name: number_of_purchases, dtype: float64


### Percentiles

In [8]:
# Define a function to return the 30th percentile
def pct30(column):
    """Return the 30th percentile of ``column`` via ``column.quantile``."""
    thirtieth = 0.3  # quantile level corresponding to the 30th percentile
    return column.quantile(q=thirtieth)

# Aggregate the 'number_of_purchases' column with pct30 to get its 30th percentile
# (.aggregate is the full name of the .agg() method)
df['number_of_purchases'].aggregate(pct30)

14.0

### .describe() Method

In [23]:
# .describe() bundles count, mean, std, min, the 25%/50%/75% quartiles and max
# into a single call, covering the statistics computed individually above.
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.describe.html

purchase_summary = df['number_of_purchases'].describe()
purchase_summary

count    1000.000000
mean       15.852000
std         4.089354
min         3.000000
25%        13.000000
50%        16.000000
75%        18.000000
max        31.000000
Name: number_of_purchases, dtype: float64

## Computing the variance

In [13]:
# Extract the purchase counts as a NumPy array
purchases = df['number_of_purchases'].values

# Deviation of every observation from the mean: differences
differences = purchases - purchases.mean()

# Squared deviations: diff_sq
diff_sq = np.square(differences)

# Variance "by hand" is the mean of the squared deviations: variance_explicit
variance_explicit = diff_sq.mean()

# The same quantity from NumPy's built-in variance: variance_np
# NOTE: NumPy's default is ddof=0 (divide by N), which is what the explicit
# computation above does. pandas' .describe()/.std() default to ddof=1, so
# the square of the 'std' reported by .describe() will differ slightly.
variance_np = purchases.var()

# Show both results side by side
print("Explicit:", variance_explicit)
print("Numpy:", variance_np)

Explicit: 16.706096
Numpy: 16.706096
