**Exercise 1 – Synthetic Data Generation**

In [3]:
import numpy as np
import time as time

In [4]:
#help(np.random)
help(np.std)

Help on function std in module numpy:

std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=<no value>, *, where=<no value>)
    Compute the standard deviation along the specified axis.
    
    Returns the standard deviation, a measure of the spread of a distribution,
    of the array elements. The standard deviation is computed for the
    flattened array by default, otherwise over the specified axis.
    
    Parameters
    ----------
    a : array_like
        Calculate the standard deviation of these values.
    axis : None or int or tuple of ints, optional
        Axis or axes along which the standard deviation is computed. The
        default is to compute the standard deviation of the flattened array.
    
        .. versionadded:: 1.7.0
    
        If this is a tuple of ints, a standard deviation is performed over
        multiple axes, instead of a single axis or all the axes as before.
    dtype : dtype, optional
        Type to use in computing the standard deviation. 

In [5]:
# Initialise the seed of the random number generator using the current time

#round(time.time())
np.random.seed(round(time.time()))


In [6]:
# Generate 5,000 customers’ purchase amounts in dollars. The data should follow a normal distribution with mean of 100 and standard deviation of 25.

mean = 100.0
std_deviation = 25.0
size = 5000
purchase_amt = np.random.normal(mean, std_deviation, size)

# Clip to ensure that there is no negative spending
purchase_amt = np.clip(purchase_amt,0,None)
purchase_amt

array([112.44368529, 133.74011347, 102.50371516, ...,  81.70934881,
        98.16644721,  91.84024439])

**Exercise 2 – Basic Data Analysis**

In [None]:
#average spend

average_spend = np.mean(purchase_amt)
print(f'The average spend of 5000 customer is: ${average_spend:.2f}')

the average spend of 5000 customer is: $99.72


In [9]:
#standard deviation

std_deviation =np.std(purchase_amt)
print(f'The standard deviation is: ${std_deviation:.2f}')

The standard deviation is: $25.13


In [49]:
#Find high-value customers who spend more than $100. State the number of such customers and the percentage.

high_spend_cust = purchase_amt[purchase_amt > 100]    #[]array
high_spend_cust_pct = float(len(high_spend_cust)/len(purchase_amt)*100)

print(f'The no. of high value-customers (>$100): {len(high_spend_cust)}')
print(F'The total spend amt of high-value customers: ${sum(high_spend_cust):.2f}')
print(F'The percentage of high-value customers: {high_spend_cust_pct:.1f}%')

# for amt in high_spend_cust:
#     print(f'The high-value customers: ${amt}')



The no. of high value-customers (>$100): 2470
The total spend amt of high-value customers: $296641.81
The percentage of high-value customers: 49.4%


#State the number of top 5% spenders and the spending threshold.

# 95th percentile spending threshold
top5_pct_threshold = np.percentile(purchase_amt,95)

# Find customers who are in the top 5%
top_spenders = purchase_amt[purchase_amt >= top5_pct_threshold]

print(f'The no. of top 5% spenders: {len(top_spenders)}')
print(f'The top 5% spending threshold: ${top5_pct_threshold:.2f}')
print(f'Total spend amount of top 5% spenders: ${sum(top_spenders):.2f}')
#print(top_spenders)


In [53]:
#Calculate the total revenue contributed by these 5,000 customers

total_revenue = np.sum(purchase_amt)
print(f'${total_revenue:.2f}')

$498606.85


**Exercise 3 – Advanced Data Analysis**

In [24]:
# Outliers: below or above 2 standard deviations from the mean

#find mean and std deviation
average_spend = np.mean(purchase_amt)
std_deviation = np.std(purchase_amt)

#Define the threshold (2 standard deviations)
num_std_dev = 2

#define the lower and upper bound
lower_bound = average_spend - (num_std_dev * std_deviation)
upper_bound = average_spend + (num_std_dev * std_deviation)

print(f'Avg spend is ${average_spend:.2f}')
print(f'Standard deviation is ${std_deviation:.2f}')
print(f'Lower bound threshold: ${lower_bound:.2f}')
print(f'Upper bound threshold: ${upper_bound:.2f}')

lower_outliers = purchase_amt[purchase_amt > upper_bound]
upper_outliers = purchase_amt[purchase_amt < lower_bound]

# Using a list comprehension to format each number in the array
formatted_high = [f'${x:.2f}' for x in upper_outliers]
formatted_low = [f'${x:.2f}' for x in lower_outliers]
formatted_high.sort(reverse=True)
formatted_low.sort()

print(f'No. of high outliers purchase: {len(lower_outliers)}')
print(f'No. of lower outliers purchase: {len(upper_outliers)}')
print(f'low outliers purchase: {formatted_low}')
print(f'high outliers purchase: {formatted_high}')


Avg spend is $100.11
Standard deviation is $25.36
Lower bound threshold: $49.38
Upper bound threshold: $150.84
No. of high outliers purchase: 106
No. of lower outliers purchase: 127
low outliers purchase: ['$150.88', '$151.01', '$151.01', '$151.15', '$151.17', '$151.19', '$151.19', '$151.53', '$151.57', '$151.64', '$151.68', '$151.80', '$151.89', '$151.94', '$151.95', '$152.00', '$152.20', '$152.27', '$152.30', '$152.32', '$152.52', '$152.64', '$152.80', '$153.05', '$153.26', '$153.29', '$153.53', '$154.07', '$154.15', '$154.32', '$154.66', '$154.89', '$154.94', '$154.95', '$155.08', '$155.25', '$155.39', '$155.52', '$155.66', '$155.73', '$155.82', '$155.94', '$156.29', '$156.43', '$157.08', '$157.08', '$157.25', '$157.43', '$157.47', '$157.58', '$157.59', '$157.74', '$158.10', '$158.16', '$158.16', '$158.35', '$158.46', '$158.47', '$158.76', '$158.81', '$158.89', '$159.39', '$159.69', '$159.89', '$160.01', '$160.32', '$160.42', '$160.50', '$160.72', '$161.34', '$161.66', '$162.59', '$

In [38]:
# Group the customers in the purchase amounts dataset into 5 groups based on the following thresholds:
# • Very Low Spender: 0 < $ <= 50
# • Low Spender: 50 < $ <= 100
# • Average Spender: 100 < $ <= 150
# • High Spender: 150 < $ <= 175
# • Very High Spender: 175 < $
# Report the number of customers in each of the 5 groups.

bins = [0, 50, 100, 150, 175]
# labels = ['Very Low Spender', 'Low Spender', 'Average Spender', 'High Spender', 'Very High Spender']
labels = ['Very Low Spender (<=$50)', 
          'Low Spender (<=$100)', 
          'Average Spender (<=$150)', 
          'High Spender (<=$175)' , 
          'Very High Spender (>$175)'
          ]

spend_tier = np.digitize(purchase_amt, bins, right=True) # looks at value in purchase_amt array and identify the order no. of bins
np.searchsorted(bins, purchase_amt, side='left')

print(f"Thresholds used: {bins}")
print(f"Purchase amounts: {purchase_amt}")
print(f"Spending Tiers: {spend_tier}")


# Count customers in each tier
for i, label in enumerate(labels):      #enumerate() provides both the index (i) and the value (label) for each item in the labels list.
    count = np.sum(spend_tier == i+1)   #This is the core counting logic using NumPy.
    print(f"{label}: {count} customers")

Thresholds used: [0, 50, 100, 150, 175]
Purchase amounts: [112.44368529 133.74011347 102.50371516 ...  81.70934881  98.16644721
  91.84024439]
Spending Tiers: [3 3 3 ... 2 2 2]
Very Low Spender (<=$50): 138 customers
Low Spender (<=$100): 2342 customers
Average Spender (<=$150): 2403 customers
High Spender (<=$175): 113 customers
Very High Spender (>$175): 4 customers
