## NumPy Library

In [None]:
# pip install numpy
import numpy as np

### Creating NumPy Arrays

##### Using Array Methods

In [None]:
# --- 1-D array: monthly North America sales, in thousands of USD ---
monthly_na_sales_kusd = np.array([120.5, 135.2, 140.0, 130.8, 145.6])
print(f" Loaded 1D time series: {monthly_na_sales_kusd.size} months of NA sales")

print(f"\n Monthly Sales (North America, K USD): {monthly_na_sales_kusd}")
print(
    f"1D Monthly Sales | Shape: {monthly_na_sales_kusd.shape}"
    f" | Dim: {monthly_na_sales_kusd.ndim}"
    f" | Size: {monthly_na_sales_kusd.size}"
)

# --- 2-D array: rows are regions, columns are product categories ---
regions = ["North", "South"]
products = ["Laptops", "Accessories"]
sales_by_region_product_kusd = np.array(
    [
        [250, 80],  # North row: Laptops, Accessories
        [180, 95],  # South row: Laptops, Accessories
    ]
)

print(" Built 2D sales matrix: Region vs Product")
print(f"\n Sales by Region & Product (K USD):\n{sales_by_region_product_kusd}")
print(
    f"2D Sales Matrix | Shape: {sales_by_region_product_kusd.shape}"
    f" | Dim: {sales_by_region_product_kusd.ndim}"
    f" | Size: {sales_by_region_product_kusd.size}"
)


##### Special ndarrays

In [None]:
# Constant-filled arrays: a 2x5 block of zeros and a 3x3 identity matrix
zeros_template = np.zeros((2, 5))  # Blank report template
identity_matrix = np.identity(3)   # For financial model stability checks

# 2x2 array of ones. NOTE: the result is not assigned, and because this is not
# the last expression of the cell it is not displayed either.
np.ones(shape=(2,2))

# Integers 0..4 via np.arange (result discarded, see note above)
np.arange(5)

# Standard-normal samples; randn takes the shape as separate arguments
# (2, 2) rather than a tuple. The result is discarded, but the call still
# advances the global NumPy random state.
np.random.randn(2,2)

# 50 evenly spaced revenue targets from 100 to 500 (both endpoints included)
revenue_targets = np.linspace(100, 500, num=50)

# Evenly spaced (deterministic, NOT random) values starting at 0.5 with a
# step of 0.8, stopping before 10.4.
customer_flow = np.arange(0.5, 10.4, 0.8)

In [None]:
# Planning horizon and the matching evenly spaced revenue targets (K USD)
planning_periods = 50
revenue_targets_kusd = np.linspace(start=100, stop=500, num=planning_periods)
print(f" Generated {planning_periods} revenue targets for strategic planning")

# Empty report grid (e.g. 2 teams x 5 KPIs) and a 3x3 identity used for validation
blank_performance_template = np.zeros(shape=(2, 5))
model_stability_matrix = np.identity(3)
print(" Initialized blank templates and identity matrix for QA")

# Last expression of the cell: shown as the cell output
np.asarray([-1, 0, 1], dtype=np.float64)

##### Using the `np.eye` Method and Random Numbers

In [None]:
# Seed once, BEFORE the first random draw, so that every random result in this
# cell (not only the ones after the demand simulation) is reproducible on a
# fresh "Restart Kernel -> Run All".
np.random.seed(42)

# 7x7 identity matrix
eyes_array = np.eye(7)
print(eyes_array)

# Uniform random (e.g., sensitivity testing):
# random_sample() draws from [0, 1); scaling by (max - min) and shifting by
# min maps the samples onto [min, max).
sensitivity_min, sensitivity_max = -3.4, 5.9
sensitivity_noise = (sensitivity_max - sensitivity_min) * np.random.random_sample((3, 4)) + sensitivity_min
print(f"\n Sensitivity Noise (Range [{sensitivity_min}, {sensitivity_max}]):\n{np.round(sensitivity_noise, 2)}")

# Daily demand simulation (Normal dist: mean=300, std=50), shape 4 x 5
simulated_daily_demand = np.random.randn(4, 5) * 50 + 300
# astype(int) truncates towards zero; used for display only
print(f"\n Simulated Daily Demand (units):\n{simulated_daily_demand.astype(int)}")
print("Simulated 4 weeks of demand for inventory planning")

# Random order sizes: 5 integers in [20, 100) (upper bound exclusive)
order_quantities = np.random.randint(20, 100, size=5)
print(f" Simulated Order Sizes: {order_quantities}")
print("Generated random order sizes for procurement testing")

# Random integers in [5, 50) (upper bound exclusive)
integer_random = np.random.randint(5, 50, 5)
print (integer_random)

### Reshaping NumPy Arrays

In [None]:
# 4-D revenue cube, axes = (year, quarter, region, product), each of length 2.
# Values run 100, 110, ..., 250 in C (row-major) order.
revenue_cube_kusd = 100 + 10 * np.arange(16).reshape(2, 2, 2, 2)
print(" Initialized 4D revenue cube for multi-year forecasting")

print(f"\n 4D Revenue Cube Shape: {revenue_cube_kusd.shape}")
print(
    f"4D Revenue Cube | Shape: {revenue_cube_kusd.shape}"
    f" | Dim: {revenue_cube_kusd.ndim}"
    f" | Size: {revenue_cube_kusd.size}"
)

# flatten() returns a 1-D *copy*, suitable for flat-file export (e.g. CSV)
flattened_revenue_export = revenue_cube_kusd.flatten(order="C")
print(f" Flattened cube for export: {flattened_revenue_export.size} values")
print(f"Flattened Data (first 8): {flattened_revenue_export[:8]} ...")

#### Array Indexing And Slicing

In [None]:
# 1-D indexing: integers 1..9
s = np.arange(1, 10)
print(s)
print(s[1])  # second element (indices start at 0)

# Slices include the lower index and exclude the upper index
for window in (slice(1, 9), slice(None, 5), slice(5, None)):
    print(s[window])

# 2-D indexing: three rows stacked into a 3x3 array
row1, row2, row3 = [10, 12, 13], [43, 32, 21], [65, 75, 85]
nums_2d = np.array([row1, row2, row3])

print(nums_2d[0:2, :])  # first two rows, every column
print(nums_2d[:, 0:2])  # every row, first two columns

In [None]:
# Simulated profit margins (%) for 3 regions (rows) x 3 products (columns),
# drawn from a normal distribution with mean ~5 and std ~15.
np.random.seed(42)  # fixed seed => same matrix on every run
profit_matrix = 5 + 15 * np.random.randn(3, 3)
print("\n Simulated Profit Margins (%):")
print(profit_matrix.round(1))

# Row access and the largest margin within the first row
print("First region:", profit_matrix[0])
print("Best product in Region 1:", profit_matrix[0].max())

# Boolean mask: True wherever the margin is negative
loss_makers = np.less(profit_matrix, 0)
print("Loss-making cells:", profit_matrix[loss_makers])

# Combining conditions element-wise (equivalent to the `&` operator):
# margins strictly between -5% and +10%
moderate = np.logical_and(profit_matrix > -5, profit_matrix < 10)
print("Moderate performers:", np.round(profit_matrix[moderate], 1))

# Slice: rows 1.. (regions 2-3) and columns ..1 (products 1-2)
subset = profit_matrix[1:, 0:2]
print(" Subset (Regions 2-3, Products 1-2):")
print(subset)

In [None]:
# First row of the margin matrix and the largest value inside it
first_region_margins = profit_matrix[0]
best_product_in_first_region = first_region_margins.max()

print(f" Region 1 best product margin: {best_product_in_first_region:.1f}%")

# Boolean indexing: keep only the negative margins
loss_making_mask = profit_matrix < 0
loss_making_values = profit_matrix[loss_making_mask]

print(f" Loss-making SKUs found: {loss_making_values.size} products")
if loss_making_values.size:
    print(f" {loss_making_values.size} products operating at a loss")

# Rows from index 1 onwards, first two columns
regional_subset = profit_matrix[1:, 0:2]
print(f"\n Subset (Regions 2–3, Products 1–2):\n{regional_subset}")

#### Arithmetic operation with ndarrays

In [None]:
# Overall summary statistics of the margin matrix, rounded to 1 decimal
print("\n Profitability Summary Report:")
min_margin = np.round(profit_matrix.min(), 1)
max_margin = np.round(profit_matrix.max(), 1)
avg_margin = np.round(profit_matrix.mean(), 1)

for stat_label, stat_value in (("Min", min_margin), ("Max", max_margin), ("Avg", avg_margin)):
    print(f"• {stat_label} Margin: {stat_value}%")

# axis=1 averages across columns (one value per row / region);
# axis=0 averages across rows (one value per column / product)
regional_avg_margins = profit_matrix.mean(axis=1).round(1)
product_avg_margins = profit_matrix.mean(axis=0).round(1)

print(f" • By Region: {[(f'R{i}', m) for i, m in enumerate(regional_avg_margins, start=1)]}")
print(f" • By Product: {[(f'P{i}', m) for i, m in enumerate(product_avg_margins, start=1)]}")

# Index of the row with the highest (rounded) average margin
best_region_idx = regional_avg_margins.argmax()
print(f"\n Best Performing Region: R{best_region_idx + 1}")
print(f" Average Margin: {regional_avg_margins[best_region_idx]:.1f}%")
print(f" Region {best_region_idx + 1} identified as top performer")

# Running total along each row (accumulates across the product columns)
cumulative_margin_by_product = profit_matrix.cumsum(axis=1)
print(f"\n Cumulative Margin by Product (per region):\n{np.round(cumulative_margin_by_product, 1)}")

#### Arithmetic Operations

In [None]:
# Element-wise math on a plain Python list (NumPy converts it to an array).
nums = [100, 200, 300, 400, 500]

# Applied in order: square root, natural log, exponential, sine, cosine.
# sin/cos interpret the inputs as radians.
for elementwise_fn in (np.sqrt, np.log, np.exp, np.sin, np.cos):
    print(elementwise_fn(nums))

#### Linear Algebra Operations

In [None]:
# 1. INPUT-OUTPUT TRANSFORMATION MATRICES
# Modeling interdependencies between production sectors (e.g., manufacturing, logistics)
production_coefficients = np.array([
    [2.0, 3.0],  # Manufacturing: raw materials → finished goods
    [3.0, 5.0]   # Logistics: distribution scaling with feedback loops
], dtype=np.float64)

market_response_matrix = np.array([
    [1.0, 2.0],  # Consumer demand elasticity
    [5.0, -1.0]  # Competitive response (positive & negative shocks)
], dtype=np.float64)

print(f" Production Coefficients (A):\n   {production_coefficients}")
print(f" Market Response (B):\n   {market_response_matrix}\n")

# 2. MATRIX MULTIPLICATION: SYSTEMIC IMPACT MODELING
# A × B = Total ripple effect across economic sector
total_impact_matrix = np.dot(production_coefficients, market_response_matrix)
print(total_impact_matrix)

# 3. CROSS PRODUCT: DIRECTIONAL RISK VECTOR ANALYSIS
# Measuring orthogonal risk exposure (e.g., portfolio hedging, market momentum)
print("DIRECTIONAL RISK ASSESSMENT\n")

cross_ab = np.cross(production_coefficients[:, 0], market_response_matrix[:, 0])
cross_ba = np.cross(market_response_matrix[:, 0], production_coefficients[:, 0])

print(f"Risk Vector (A → B): {cross_ab:.1f} (counterclockwise exposure)")
print(f"Risk Vector (B → A): {cross_ba:.1f} (clockwise exposure)")

In [None]:
# 4. ELEMENT-WISE MULTIPLICATION: DISCOUNT & SCALING FACTORS
# np.multiply multiplies matching entries (Hadamard product), unlike np.dot.
pricing_multiplier_grid = np.multiply(production_coefficients, market_response_matrix)

print(" TARGETED DISCOUNT ENGINE (Element-wise Scaling)\n")
print(f"Per-Component Pricing Adjustment Matrix (A ⊙ B):\n{pricing_multiplier_grid}")

# 5. TRANSPOSE: DATA PIVOTING FOR REPORTING
# Swaps rows and columns.
production_transposed = np.transpose(production_coefficients)

print(" DATA PIVOT OPERATION (Matrix Transpose)\n")
print(f"Original (Rows = Inputs, Cols = Outputs):\n{production_coefficients}")
print(f"Transposed (Rows = Outputs, Cols = Inputs):\n{production_transposed}")

# 6. INVERSE & DETERMINANT: MODEL STABILITY CHECK
print(" MODEL STABILITY DIAGNOSTIC (Invertibility Test)\n")

# Check the determinant BEFORE inverting: np.linalg.inv raises LinAlgError on a
# singular matrix, and the "invertible" message must only appear when it holds.
production_determinant = np.linalg.det(production_coefficients)

if np.isclose(production_determinant, 0.0):
    # NOTE(review): isclose uses an absolute tolerance here; adjust if the
    # matrix entries are on a very small scale.
    production_inverse = None
    print(f" Matrix is singular (determinant = {production_determinant:.2f}); no inverse exists")
else:
    production_inverse = np.linalg.inv(production_coefficients)
    print(f" Matrix is invertible. Determinant = {production_determinant:.2f}")
    print(f"Inverse Matrix (A⁻¹) — used for root-cause analysis:\n{production_inverse}")

In [None]:
# 7. LINEAR SYSTEM SOLVER: DEMAND BACKCASTING
# Solve coeff_matrix @ x = const_vector for x.
print("LINEAR SYSTEM RESOLUTION: Demand Backcasting\n")

coeff_matrix = np.array([[2.0, 4.0], [6.0, 8.0]], dtype=np.float64)  # System equations
const_vector = np.array([5.0, 6.0])                                 # Observed outcomes

solution_vector = np.linalg.solve(coeff_matrix, const_vector)
print(f" System solved. Original demand vector: x = [{solution_vector[0]:.2f}, {solution_vector[1]:.2f}]")

# 8. EIGEN-DECOMPOSITION
# NOTE(review): eigen_matrix is not symmetric, so this is a general
# eigen-decomposition rather than PCA of a covariance matrix; the
# "variance explained" wording in the output is illustrative only.
print(" PATTERN DETECTION: Eigen-Analysis for Strategic Drivers\n")

eigen_matrix = np.array([
    [1.0, 2.0, 3.0],
    [1.0, 3.0, 4.0],
    [3.0, 2.0, 1.0]
], dtype=np.float64)

eigenvalues, eigenvectors = np.linalg.eig(eigen_matrix)

# np.linalg.eig does not return eigenvalues in any guaranteed order. Sort the
# pairs by descending magnitude so that index 0 really is the dominant
# eigenvalue and column 0 its eigenvector (columns pair with eigenvalues).
dominance_order = np.argsort(-np.abs(eigenvalues))
eigenvalues = eigenvalues[dominance_order]
eigenvectors = eigenvectors[:, dominance_order]

print(f"Top Eigenvalues (Variance explained):")
for i, val in enumerate(eigenvalues):
    print(f" λ{i+1} = {val:.2f}")

print(f"\nPrincipal Eigenvector (Strategic Direction):")
print(f"{eigenvectors[:, 0]}")

# ANALYSIS COMPLETE
print(" OMNIANALYTICS ENGINE: Linear Algebra Module Execution Complete")
print(" All results ready for export to dashboard, report, or API endpoint.\n")

#### BROADCASTING

In [None]:
# Base sales (K USD): one row per region, one column per product
base_sales_kusd = np.array(
    [
        [110, 120, 130],  # Region 1
        [210, 220, 230],  # Region 2
        [310, 320, 330],  # Region 3
    ]
)

# One growth multiplier per region
regional_growth_rates = np.array([1.1, 1.2, 1.05])

# Reshape the rates to a (3, 1) column so broadcasting stretches each
# region's multiplier across all of that region's product columns.
growth_column = regional_growth_rates.reshape(-1, 1)
projected_sales_kusd = base_sales_kusd * growth_column
projected_sales_int = projected_sales_kusd.astype(int)  # truncates the fraction

print(f"\n Projected Sales After Growth (K USD):\n{projected_sales_int}")

#### Random Numbers and Probability

In [None]:
# Seed the global NumPy generator so the whole cell is reproducible on a
# fresh "Restart Kernel -> Run All".
np.random.seed(42)

# A single float drawn uniformly from [4, 5). NOTE: the two positional
# arguments of uniform() are (low, high), not a shape; pass size=... for an array.
uniform_random = np.random.uniform(low=4, high=5)
normal_random = np.random.randn(4, 5)           # 4x5 standard-normal samples
integer_random = np.random.randint(10, 50, 5)   # 5 integers in [10, 50)

# 10 floats in [0, 1). The result is discarded (not the last expression of
# the cell), but the call still advances the random state.
np.random.random(10)

# Samples from an arbitrary interval [a, b), where a must be less than b:
# (b - a) * random_sample() + a
a = -3.4
b = 5.9
A = (b - a) * np.random.random_sample((3, 4)) + a

# Simulate 4 weeks of daily demand (normal distribution, mean=300, std=50)
daily_demand = np.random.randn(4, 5) * 50 + 300
print("\n Simulated Daily Demand:")
print(daily_demand.astype(int))  # truncated to whole units for display

# Random order quantities: 5 integers in [20, 100)
order_sizes = np.random.randint(20, 100, 5)
print(" Simulated Order Sizes:", order_sizes)

# Sensitivity test: a second, independent draw from the same [a, b) interval
# (a and b are reused from above instead of being re-assigned).
noise = (b - a) * np.random.random_sample((3, 4)) + a
print("Sensitivity Noise:", np.round(noise, 2))

## Pandas Library

In [None]:
# pip install pandas
import pandas as pd
import numpy as np

In [None]:
# A Series pairs an index (shown on the left when displayed) with the data
# (shown on the right). Without an explicit index a default 0..n-1 one is used.
ser1 = pd.Series(data=range(1, 6))

# Same data with custom index labels 'a'..'e'
ser2 = pd.Series(data=range(1, 6), index=list("abcde"))

# Only the last expression of a cell is displayed, so .index is evaluated but
# not shown here; .values is what appears as the cell output.
ser2.index
ser2.values

In [None]:
# Custom indexed series for regional risk tiers
risk_tier_scores = pd.Series(
    data=[1.2, 1.8, 2.1, 1.5, 3.0],
    index=['Tier_A', 'Tier_B', 'Tier_C', 'Tier_D', 'Tier_E'],
    name="Risk_Score_Profile"
)
print(risk_tier_scores)
print(risk_tier_scores.info())
print(risk_tier_scores.describe())

In [None]:
# One tuple per applicant: (name, loan amount in USD, age, city of residence)
_APPLICATION_FIELDS = ("Applicant_Name", "Loan_Amount_USD", "Applicant_Age", "Residence_City")
_application_records = [
    ("Jones", 10000, 45, "Taipei"),
    ("Smith", 20000, 38, "Kaohsiung"),
    ("Lynn", 1000, 25, "Taichung"),
    ("Rebecca", 500, 29, "Hsinchu"),
    ("Vizon", 700, 31, "Tainan"),
    ("Phyllis", 850, 42, "Pingtung"),
    ("Roberto", 900, 50, "Yilan"),
    ("Sybil", 1500, 33, "Hualien"),
    ("Fernando", 12000, 27, "Keelung"),
    ("Eric", 16000, 36, "Miaoli"),
    ("Michael", 1350, 40, "Changhua"),
    ("Fredrick", 16000, 48, "Nantou"),
    ("Allan", 8000, 34, "Yunlin"),
    ("Mary", 7500, 39, "Chiayi"),
    ("Joseph", 850, 44, "Taitung"),
]

# Transpose the records into a column-oriented dict: {field name: list of values}
loan_application_data_raw = {
    field: list(column)
    for field, column in zip(_APPLICATION_FIELDS, zip(*_application_records))
}

# Stage 1: raw ingestion into a DataFrame (default integer index)
df_applications_raw = pd.DataFrame(data=loan_application_data_raw)
print("\n Raw Loan Application Data (First 5 Records):")
print(df_applications_raw.head(5).to_string(index=False))  # head(n) returns the first n rows

# set_index() promotes a column to the row index; reset_index() undoes it
df_applications_indexed = df_applications_raw.set_index('Applicant_Name')
print(df_applications_indexed.head(3))

# Quick exploratory look: dimensions and numeric summary statistics
print(f"\n Dataset Shape: {df_applications_indexed.shape}")
print("\n Descriptive Statistics (Numerical Fields):")
print(df_applications_indexed.describe())

# For export to CSV (reset index)
#df_for_export = df_applications_indexed.reset_index()
#export_columns = df_for_export.columns.tolist()

#### Selection and Filtering

In [None]:
#### Filtering Rows and Columns
# Small mock frame: three feature columns plus a risk score, one row per customer
feature_matrix = pd.DataFrame(
    data={
        'Feature_A': range(1, 5),
        'Feature_B': range(10, 50, 10),
        'Feature_C': range(100, 500, 100),
        'Loan_Risk_Score': [800, 1200, 2500, 3100],
    },
    index=['Cust_01', 'Cust_02', 'Cust_03', 'Cust_04'],
)

print(feature_matrix)

# Column selection: a list of names returns a DataFrame with just those columns
selected_features = feature_matrix.loc[:, ['Feature_A', 'Feature_C']]
print(selected_features)

# Boolean filtering: keep rows whose score lies in the band (1000, 3000]
high_risk_loans = feature_matrix.loc[
    lambda frame: (frame['Loan_Risk_Score'] > 1000) & (frame['Loan_Risk_Score'] <= 3000)
]
print(f"Identified {len(high_risk_loans)} medium-to-high risk applicants")
print(high_risk_loans)

# .loc selects by index label, .iloc by integer position
target_customers = feature_matrix.loc[['Cust_03', 'Cust_04']]
first_applicant = feature_matrix.iloc[0]
print(target_customers)
print(first_applicant)

# drop() returns a NEW frame; the original feature_matrix is left untouched
feature_matrix_clean = feature_matrix.drop(labels='Cust_02', axis=0)
print(" Removed test record: Cust_02")
print(feature_matrix_clean)

# Remove two columns by name (axis=1 means "columns")
feature_matrix_reduced = feature_matrix_clean.drop(['Feature_A', 'Feature_B'], axis=1)
print(feature_matrix_reduced)

# Remove fully duplicated rows, if any
feature_matrix_unique = feature_matrix_reduced.drop_duplicates()
print(" Deduplication complete")

#### Sorting values and Ranking

In [None]:
print(feature_matrix)

# Sort rows by the risk-score VALUES, highest first
sorted_by_risk = feature_matrix.sort_values('Loan_Risk_Score', ascending=False)
print(sorted_by_risk)

# Rank loan amounts (1 = smallest); method='max' gives tied values the
# highest rank of their group
ranked_by_amount = df_applications_indexed.loc[:, ['Loan_Amount_USD']].rank(method='max')
print(f"\n Ranking by Loan Amount:\n{ranked_by_amount.head(3)}")

#### Descriptive Statistics

In [None]:
# 4x2 frame with missing values, built column by column
df = pd.DataFrame(
    {
        "one": [2.4, 6.3, np.nan, 0.75],
        "two": [np.nan, -5.4, np.nan, -1.3],
    },
    index=list("abcd"),
)

# Only the LAST expression of the cell is displayed; the earlier ones are
# evaluated and discarded. NaN values are skipped by default.
df.describe()              # summary statistics per column
df.sum()                   # column totals
df.sum(axis=1)             # row totals
df.mean(axis=1)            # row means, ignoring NaN

df.mean(axis=1, skipna=False)  # row means; any NaN in the row gives NaN
df.idxmax()                # index label of each column's maximum
df.idxmin()                # index label of each column's minimum
df.cumsum()                # running column totals (shown as the cell output)

#### Merging and combining multiple DataFrames

In [None]:
# DATA MERGING & JOINING
# Internal customer tags (note: tag "c" appears twice)
df_internal = pd.DataFrame({
    "Customer_Tag": ["a", "b", "c", "c", "d", "e"],
    "Internal_Score": range(6)
})
print(df_internal)

# External credit bureau match
df_external = pd.DataFrame({
    "Bureau_Key": ["b", "c", "e", "f"],
    "Credit_Rating": range(4)
})
print(df_external)

# Inner join: keeps only rows whose key exists on BOTH sides. The key columns
# have different names, hence left_on / right_on.
matched_customers = pd.merge(
    df_internal, df_external,
    left_on="Customer_Tag", right_on="Bureau_Key",
    how="inner"
)
print(f" Inner join: {len(matched_customers)} matched customers")
print(matched_customers)

# Left join: keeps every internal record; unmatched rows get NaN in the
# columns coming from df_external
full_internal_view = pd.merge(df_internal, df_external,
                             left_on="Customer_Tag", right_on="Bureau_Key",
                             how="left")

# Concatenate vertically (new batch), then restore alphabetical index order
batch_1 = pd.DataFrame({'X': [1, 2], 'Y': [3, 4]}, index=['A', 'C'])
batch_2 = pd.DataFrame({'X': [5, 6], 'Y': [7, 8]}, index=['B', 'D'])
combined_batches = pd.concat([batch_1, batch_2]).sort_index()
print(batch_1)
print(batch_2)
print(combined_batches)

# Reference (kept as comments: a trailing string literal would be echoed as
# the cell output, and df1 / df2 are placeholders that are not defined here):
#   pd.concat([df1, df2])          # stack rows (axis=0, the default)
#   pd.concat([df1, df2], axis=1)  # place side by side, aligned on the index

#### Grouping operations

In [None]:
# GROUPING & AGGREGATION
# Import placed first in the cell so the dependency is visible up front.
from scipy import stats

segmentation_data = pd.DataFrame({
    'Customer_Segment': ['Premium', 'Standard', 'Premium', 'Standard', 'Premium', 'Standard'],
    'Loan_Value_USD': [10000, 20000, 30000, 40000, 50000, 60000]
})
print(segmentation_data)

# groupby() is lazy: it only records how to split the rows by segment
grouped_by_segment = segmentation_data.groupby('Customer_Segment')
print(grouped_by_segment.describe())

# Aggregations collapse each group to one value
mean_per_segment = grouped_by_segment['Loan_Value_USD'].mean()
sum_per_segment = grouped_by_segment['Loan_Value_USD'].sum()
print(mean_per_segment)
print(sum_per_segment)

# transform() returns a result aligned with the ORIGINAL rows, so the z-score
# of each value is computed relative to its own segment.
normalized_risk = grouped_by_segment['Loan_Value_USD'].transform(stats.zscore)
segmentation_data['Risk_Z_Score'] = normalized_risk
print(segmentation_data)

# Reference (kept as comments: a trailing string literal would be echoed as
# the cell output). DataFrameGroupBy.agg(...) accepts functions / function
# names and aggregates each listed column for each group:
#   df.groupby('Category').agg({'Value': ['sum', 'count']})

#### Missing data

In [None]:
# HANDLING MISSING DATA
# Work on a copy so the indexed source frame stays intact, then blank out two
# cells to simulate missing values.
df_with_missing = df_applications_indexed.copy()
df_with_missing.loc['Jones', 'Applicant_Age'] = np.nan
df_with_missing.loc['Smith', 'Loan_Amount_USD'] = np.nan

print(f"\n  Records with Missing Data:")
print(df_with_missing[df_with_missing.isnull().any(axis=1)])

# Cleaning strategy:
# - Drop the row if the loan amount is missing (critical field)
# - Impute a missing age with the overall mean age of the remaining rows
df_loan_present = df_with_missing.dropna(subset=['Loan_Amount_USD'])
mean_age_imputed = df_loan_present['Applicant_Age'].mean()  # NaN is skipped

# Build the cleaned frame with assign() instead of
# `frame[col].fillna(..., inplace=True)`: that chained in-place call acts on a
# temporary object, raises a chained-assignment warning in pandas 2.x and has
# no effect once Copy-on-Write is active.
df_cleaned = df_loan_present.assign(
    Applicant_Age=df_loan_present['Applicant_Age'].fillna(mean_age_imputed)
)

print(f" Imputed missing age with mean value: {mean_age_imputed:.1f}")
print(f"\n Cleaned Data Sample:\n{df_cleaned[['Loan_Amount_USD', 'Applicant_Age']].head()}")

#### Transformation and Mappings

In [None]:
# DATA TRANSFORMATION & CATEGORIZATION
loan_binning_data = pd.DataFrame(
    data={
        "Applicant": ["Jones", "Smith", "Lynn", "Rebecca", "Eric", "Fernando"],
        "Loan_Amount_USD": [10000, 20000, 1000, 500, 16000, 12000],
    }
)

# Equal-width bins: three tiers need four evenly spaced edges from min to max
labels = ["Low", "Medium", "High"]
min_loan, max_loan = loan_binning_data['Loan_Amount_USD'].agg(['min', 'max'])
bins = np.linspace(min_loan, max_loan, len(labels) + 1)

loan_binning_data = loan_binning_data.assign(
    # cut(): fixed edges; include_lowest=True keeps the minimum value in the first bin
    Risk_Tier_EqualWidth=lambda frame: pd.cut(
        frame["Loan_Amount_USD"], bins=bins, labels=labels, include_lowest=True
    ),
    # qcut(): equal-frequency bins; q=3 splits at the terciles so each tier
    # holds roughly the same number of rows
    Risk_Tier_EqualFreq=lambda frame: pd.qcut(
        frame["Loan_Amount_USD"], q=3, precision=1, labels=labels
    ),
)

print(f"\n Loan Tier Distribution (Equal Frequency):\n{loan_binning_data['Risk_Tier_EqualFreq'].value_counts()}")

In [None]:
# REGION MAPPING & PIVOT REPORTING
# City -> region lookup, grouped by region for readability
region_mapping = {
    'Taipei': 'North', 'Kaohsiung': 'South', 'Taichung': 'Central',
    'Hsinchu': 'North', 'Tainan': 'South', 'Pingtung': 'South',
    'Yilan': 'North', 'Hualien': 'East', 'Keelung': 'North',
    'Miaoli': 'North', 'Changhua': 'Central', 'Nantou': 'Central',
    'Yunlin': 'Central', 'Chiayi': 'South', 'Taitung': 'East',
}

# reset_index() already returns a new frame with the applicant name back as a
# regular column; the derived columns are then added to it.
df_final = df_cleaned.reset_index()
df_final['Region'] = df_final['Residence_City'].map(region_mapping)  # dict lookup per city
df_final['Risk_Group'] = np.where(
    df_final['Loan_Amount_USD'] > 10000, 'High_Value', 'Standard'
)

# Pivot: total loan amount per (Region, Risk_Group); empty combinations become 0
pivot_loan_volume = df_final.pivot_table(
    values='Loan_Amount_USD',
    index='Region',
    columns='Risk_Group',
    aggfunc='sum',
    fill_value=0,
).astype(int)

print(" Generated pivot table for regional risk exposure")
print(f"\n Loan Volume by Region & Risk Group (USD):\n{pivot_loan_volume}")

#### Numpy Array and Pandas DataFrame

In [None]:
# ARRAY ↔ DATAFRAME INTEROPERABILITY
# NumPy -> pandas: wrap a 2x3 array and name its columns
raw_numerical_data = np.array([[11, 22, 33], [44, 55, 66]])
df_from_array = pd.DataFrame(
    data=raw_numerical_data,
    columns=[f'Feature_{position}' for position in (1, 2, 3)],
)
print("Converted NumPy array to Pandas DataFrame for feature engineering")

# pandas -> NumPy: to_numpy() returns the values as a 2-D array
# (row/column labels are not carried over)
numerical_df = pd.DataFrame(
    data={
        'Age': [25, 47, 38],
        'Birth_Year': [1995, 1973, 1982],
        'Graduation_Year': [2016, 2000, 2005],
    }
)
model_input_array = numerical_df.to_numpy()

#### Apply Function

In [None]:
# Step 1: sample customer data (stand-in for a CSV or database extract)
data = {
    'customer_id': ['C001', 'C002', 'C003', 'C004', 'C005'],
    'annual_spending': [12000, 8000, 15000, 3000, 500],
    'purchase_frequency': [5, 3, 6, 1, 0.5],
}
df = pd.DataFrame(data)


# Step 2: row-wise classifier
def assign_tier(row):
    """Return the customer tier for one row.

    ``row`` must support lookup by the keys 'annual_spending' and
    'purchase_frequency' (e.g. a DataFrame row or a dict).

    - 'Premium'  : spending > 10000 and frequency >= 4
    - 'Standard' : spending > 5000 and frequency >= 2
    - 'Basic'    : everything else
    """
    if row['annual_spending'] > 10000 and row['purchase_frequency'] >= 4:
        return 'Premium'
    if row['annual_spending'] > 5000 and row['purchase_frequency'] >= 2:
        return 'Standard'
    return 'Basic'


# Step 3: axis=1 hands each ROW to the function; the results form the new column
df['customer_tier'] = df.apply(assign_tier, axis=1)

# Step 4: show the result
print(df)

### C. Data Visualization

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Global plotting defaults for every figure below:
# matplotlib's stock style and an 8x5 inch default figure size.
plt.style.use('default')
plt.rcParams.update({'figure.figsize': (8, 5)})

#### Trend Line Chart

In [None]:
# Monthly sales trend drawn through the explicit Figure/Axes interface
months_numeric = np.arange(1, 7)
actual_sales_kusd = np.array([120, 135, 140, 130, 145, 160])

fig, ax = plt.subplots(figsize=(9, 5))
ax.plot(
    months_numeric, actual_sales_kusd,
    color='green', linestyle='-', linewidth=2,
    marker='o', markersize=8, markeredgecolor='darkblue',
    label='Actual Sales',
)  # <--- plot

# Two-line title: headline plus subtitle after the newline
ax.set_title("Monthly Sales Trend (2025)\nJan–Jun Performance", fontweight='bold')
ax.set_xlabel("Month")
ax.set_ylabel("Sales (K USD)")
# Replace the numeric tick positions with month names
ax.set_xticks(months_numeric)
ax.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun'])
ax.grid(True, alpha=0.3)
ax.legend()
fig.tight_layout()
# fig.savefig("sales_trend_2025.png", dpi=150, bbox_inches='tight')
plt.show()

#### SCATTER PLOT: AD SPEND VS. SALES

In [None]:
# Marketing ROI: one point per observation, ad spend (x) against revenue (y)
ad_spend_kusd = np.array([50, 60, 70, 80, 90, 100])
sales_revenue_kusd = np.array([120, 130, 145, 150, 160, 175])

fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(ad_spend_kusd, sales_revenue_kusd, s=60, alpha=0.7, color='#2ca02c')  # <--- scatter

ax.set_title("Advertising Spend vs. Sales Performance", fontweight='bold')
ax.set_xlabel("Ad Spend (K USD)")
ax.set_ylabel("Sales Revenue (K USD)")
ax.grid(True, alpha=0.3)
fig.tight_layout()
# fig.savefig("ad_spend_vs_sales.png", dpi=150)
plt.show()

#### BAR CHART

In [None]:
# Revenue per product category as a vertical bar chart
products = ["Laptops", "Phones", "Tablets", "Accessories"]
revenue_kusd = [240, 180, 90, 70]
colors = ['skyblue', 'lightcoral', 'lightgreen', 'gold']

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(products, revenue_kusd, color=colors)  # <--- bar
ax.set_title("Revenue by Product Category", fontweight='bold')

# Value label centred horizontally on each bar, 5 units above its top
for bar in bars:
    yval = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    ax.text(label_x, yval + 5, f'{int(yval)}K',
            ha='center', va='bottom', fontsize=10)

ax.set_ylabel("Revenue (K USD)")
ax.tick_params(axis='x', rotation=15)
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
# fig.savefig("revenue_by_product.png", dpi=150)
plt.show()

In [None]:
# HORIZONTAL BAR
# One horizontal bar per store, bar length = performance index
stores = ["Taipei", "Kaohsiung", "Taichung", "Hsinchu", "Tainan"]
performance_index = [95, 88, 92, 85, 80]

fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(stores, performance_index, color='steelblue')  # <---- barh

# Two-line title: headline plus subtitle after the newline
ax.set_title("Store Performance Score\nQ1 2025 Review", fontweight='bold')
ax.set_xlabel("Performance Index (0–100)")
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
# fig.savefig("store_performance.png", dpi=150)
plt.show()

In [None]:
# HISTOGRAM
# 500 simulated ages ~ Normal(38, 10), clipped into the range [18, 70]
np.random.seed(42)
customer_ages = np.clip(np.random.normal(38, 10, 500), 18, 70)

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(customer_ages, bins=20, color='teal', alpha=0.7, edgecolor='black')  # <--- hist

# Two-line title: headline plus sample size after the newline
ax.set_title("Customer Age Distribution\nn=500", fontweight='bold')
ax.set_xlabel("Age")
ax.set_ylabel("Number of Customers")
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
# fig.savefig("age_distribution.png", dpi=150)
plt.show()

In [None]:
# PIE CHART
# Regional market share; the first wedge (North) is pulled out for emphasis
regions = ["North", "South", "Central", "East"]
market_share_pct = [40, 30, 20, 10]
explode = [0.1, 0, 0, 0]  # offset per wedge, as a fraction of the radius

fig, ax = plt.subplots(figsize=(7, 7))
ax.pie(
    market_share_pct,
    labels=regions,
    explode=explode,
    colors=['#ff9999', '#66b3ff', '#99ff99', '#ffcc99'],
    autopct='%1.1f%%',  # percentage label with one decimal
    startangle=90,
)  # <--- pie

# Two-line title: headline plus subtitle after the newline
ax.set_title("Market Share by Region\nNorth region receives strategic focus", fontweight='bold')
fig.tight_layout()
# fig.savefig("market_share.png", dpi=150)
plt.show()

In [None]:
# COMPARATIVE LINE PLOT
# Two series on shared axes: realised sales against the target
months = np.arange(1, 7)
actual_sales = np.array([120, 135, 140, 130, 145, 160])
target_sales = np.array([125, 130, 135, 140, 145, 150])

fig, ax = plt.subplots(figsize=(9, 5))
ax.plot(months, actual_sales, color='blue', marker='o',
        linewidth=2, label='Actual Sales')  # <--- plot 1
ax.plot(months, target_sales, color='red', marker='s', linestyle='--',
        linewidth=2, label='Sales Target')  # <--- plot 2

# Two-line title: headline plus subtitle after the newline
ax.set_title("Actual vs. Target Sales\nPerformance Gap Analysis", fontweight='bold')
ax.set_xlabel("Month")
ax.set_ylabel("Sales (K USD)")
ax.legend()
ax.grid(True, alpha=0.3)
ax.set_xticks(months)
ax.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun'])
fig.tight_layout()
# fig.savefig("actual_vs_target.png", dpi=150)
plt.show()

In [None]:
# SUBPLOTS
# --- Data for the four dashboard panels ---
np.random.seed(123)
months = np.arange(1, 7)
actual_sales = np.array([120, 135, 140, 130, 145, 160])

products = ["Laptops", "Phones", "Tablets", "Accessories"]
revenue = [240, 180, 90, 70]

customer_ages = np.clip(np.random.normal(38, 10, 500), 18, 70)

regions = ["North", "South", "Central", "East"]
market_share = [40, 30, 20, 10]

# --- 2x2 grid; panels are unpacked row by row ---
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
ax1, ax2, ax3, ax4 = axes.flat
fig.suptitle("Business Intelligence Dashboard — Q2 2025", fontsize=16, fontweight='bold')  # <---suptitle

# Panel 1 (top-left): sales trend line
ax1.plot(months, actual_sales, marker='o', color='blue')  # <---plot
ax1.set(title="Sales Trend", ylabel="Sales (K USD)")
ax1.grid(True, alpha=0.3)
ax1.set_xticks(months)
ax1.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun'], rotation=15)

# Panel 2 (top-right): revenue per product
ax2.bar(products, revenue, color='skyblue')  # <--- bar
ax2.set(title="Revenue by Product")
ax2.tick_params(axis='x', rotation=15)

# Panel 3 (bottom-left): age histogram
ax3.hist(customer_ages, bins=15, color='teal', alpha=0.7)  # <--- hist
ax3.set(title="Customer Age Distribution", xlabel="Age", ylabel="Count")

# Panel 4 (bottom-right): market share pie
ax4.pie(market_share, labels=regions, autopct='%1.1f%%', startangle=90,
        colors=['#ff9999', '#66b3ff', '#99ff99', '#ffcc99'])  # <-- pie
ax4.set(title="Market Share")

# rect leaves the top 4% of the figure free for the suptitle
fig.tight_layout(rect=[0, 0, 1, 0.96])
# fig.savefig("executive_dashboard_q1_2025.png", dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# TIME SERIES
# Trend area plot of cumulative passenger counts over 60 month-ends.

# Seed here so the simulated series is reproducible even when this cell is
# run on its own (the other stochastic plotting cells seed themselves too).
np.random.seed(42)

# Month-end timestamps starting with 2021-01-31. An explicit MonthEnd offset
# is used instead of the string alias 'M', which is deprecated in
# pandas >= 2.2 (replacement alias 'ME' is not accepted by older versions).
dates = pd.date_range('2021-01-01', periods=60, freq=pd.offsets.MonthEnd())

# Monthly counts in [300, 500), accumulated into a running total
passengers = np.random.randint(300, 500, size=60).cumsum()

df_flights = pd.DataFrame({'Passengers': passengers}, index=dates)

df_flights.plot.area(y='Passengers', figsize=(9, 5),
                     title="Cumulative Passengers Over Time",
                     color='orange', alpha=0.7) # <--- area
plt.ylabel("Total Passengers")
plt.xlabel("Year")
plt.grid(True, alpha=0.3)
plt.tight_layout()
# plt.savefig("passenger_growth_area.png", dpi=150)
plt.show()

In [None]:
# BOX PLOT
# Distribution of bill totals per weekday, one horizontal box per day
np.random.seed(42)
weekday_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri']
tip_data = pd.DataFrame({
    # Draw order matters for reproducibility: days first, then bill amounts
    'Day': np.random.choice(['Mon', 'Tue', 'Wed', 'Thu', 'Fri'], 200),
    'Total_Bill_USD': 10 * np.random.lognormal(3, 0.5, 200),
})

# Ordered categorical so the boxes follow Mon..Fri rather than alphabetical order
tip_data['Day'] = pd.Categorical(tip_data['Day'], categories=weekday_order, ordered=True)

tip_data.boxplot(column=['Total_Bill_USD'], by='Day',
                 vert=False, grid=False, figsize=(9, 5))  # <--- boxplot
plt.title("Sales Distribution by Day of Week")
plt.suptitle("")  # clear the automatic "grouped by" super-title
plt.xlabel("Total Bill (USD)")
plt.tight_layout()
# plt.savefig("sales_by_day_boxplot.png", dpi=150)
plt.show()

In [None]:
# GROUPED BAR
# Mean customer age per gender, drawn as one bar per group
np.random.seed(123)
df_customers = pd.DataFrame({
    # Draw order matters for reproducibility: genders first, then ages
    'Gender': np.random.choice(['Female', 'Male'], 100),
    'Age': np.random.randint(18, 70, 100),
})

# Group means, with the row order fixed to Female, Male
avg_age_by_gender = (
    df_customers
    .groupby('Gender')['Age']
    .mean()
    .reindex(['Female', 'Male'])
)

# Two-column frame (Gender, Average_Age) used as the plotting input
df_gender = pd.DataFrame({
    'Gender': avg_age_by_gender.index,
    'Average_Age': avg_age_by_gender.values,
})

ax = df_gender.plot.bar(
    x='Gender', y='Average_Age',
    color=['pink', 'lightblue'],
    legend=False,
    title="Average Customer Age by Gender",
    figsize=(8, 5),
)  # plot.bar
ax.set_ylabel("Age")
ax.tick_params(axis='x', rotation=0)  # keep the category labels horizontal
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
# plt.savefig("avg_age_by_gender.png", dpi=150)
plt.show()