<a href="https://colab.research.google.com/github/kyledang14-web/MIS-Python-Practice./blob/main/Coding_Exercise_ML_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Part 1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Import specific metrics functions
from sklearn.metrics import r2_score, mean_squared_error

# Hard-coded housing sample used for the price-regression exercise.
# Data source (per the original author's note): Zillow active for-sale
# listings in San Jose.
data = {
    'square_footage': [
        1040, 1391, 960, 1068, 1484, 1470, 1956, 623, 832, 1100,
        771, 2068, 1220, 2012, 2066, 1201, 2470, 2660, 1662, 1280,
        1450, 2100, 1750, 1890, 1600, 1525, 1400, 2005, 1780, 1850,
        1950, 1720, 1885, 2050, 2130, 1700, 2300, 2400, 2250, 2150,
        1420, 1550, 1675, 1980, 1760, 1855, 1620, 1930, 2075, 2180,
        2290, 2450, 2340, 2230, 2125, 2000, 1820, 1755, 1680, 1585,
        1410, 1495, 1555, 1630, 1788, 1844, 1900, 1966, 2015, 2099,
        2205, 2305, 2405, 2500, 2600, 2700, 2800, 2900, 3000, 3100,
        1455, 1535, 1615, 1745, 1825, 1910, 2055, 2155, 2255, 2355,
        2455, 2555, 2655, 2755, 2855, 2955, 3055, 3155, 3255, 3355,
        2100, 1785, 1650, 1920, 1545, 1825
    ],
    'location': [
        'Downtown', 'Berryessa', 'Golden Wheel', 'Downtown', 'Downtown',
        'Naglee Park', 'Naglee Park', 'Martha', 'Monterey', 'Downtown',
        'Downtown', 'Downtown', 'Naglee Park', 'Taylor', 'Downtown',
        'Bevin Brook', 'Downtown', 'Downtown', 'Downtown', 'Downtown',
        'San Jose', 'San Jose', 'San Jose', 'San Jose', 'San Jose',
        'San Jose', 'San Jose', 'San Jose', 'San Jose', 'San Jose',
        'San Jose', 'San Jose', 'San Jose', 'San Jose', 'San Jose',
        'San Jose', 'San Jose', 'San Jose', 'San Jose', 'San Jose',
        'San Jose', 'San Jose', 'San Jose', 'San Jose', 'San Jose',
        'San Jose', 'San Jose', 'San Jose', 'San Jose', 'San Jose',
        'San Jose', 'San Jose', 'San Jose', 'San Jose', 'San Jose',
        'San Jose', 'San Jose', 'San Jose', 'San Jose', 'San Jose',
        'San Jose', 'San Jose', 'San Jose', 'San Jose', 'San Jose',
        'San Jose', 'San Jose', 'San Jose', 'San Jose', 'San Jose',
        'San Jose', 'San Jose', 'San Jose', 'San Jose', 'San Jose',
        'San Jose', 'San Jose', 'San Jose', 'San Jose', 'San Jose',
        'San Jose', 'San Jose', 'San Jose', 'San Jose', 'San Jose',
        'San Jose', 'San Jose', 'San Jose', 'San Jose', 'San Jose',
        'San Jose', 'San Jose', 'San Jose', 'San Jose', 'San Jose',
        'San Jose', 'San Jose', 'San Jose', 'San Jose', 'San Jose',
        'San Jose', 'San Jose', 'San Jose', 'San Jose', 'San Jose',
        'San Jose', 'San Jose', 'San Jose', 'San Jose', 'San Jose'
    ],
    'price': [
        998000, 845000, 288000, 1099999, 1190888, 1186000, 1199000,
        455000, 299000, 374337, 850000, 1299000, 999999, 1399888,
        990000, 779000, 1150000, 1798000, 799000, 898000,
        1250000, 1345000, 1420000, 1280000, 1195000, 1230000,
        1310000, 1450000, 1398000, 1485000, 1520000, 1260000,
        1360000, 1490000, 1535000, 1620000, 1680000, 1725000,
        1780000, 1850000, 1210000, 1295000, 1380000, 1425000,
        1500000, 1575000, 1630000, 1700000, 1755000, 1820000,
        1885000, 1940000, 1995000, 2050000, 2105000, 2170000,
        2235000, 2290000, 2355000, 2420000, 2485000, 2550000,
        2615000, 2680000, 2745000, 2810000, 2875000, 2940000,
        3005000, 3070000, 3135000, 3200000, 3265000, 3330000,
        3395000, 3460000, 3525000, 3590000, 3655000, 3720000,
        1268000, 1325000, 1410000, 1495000, 1550000, 1625000,
        1690000, 1755000, 1820000, 1885000, 1950000, 2015000,
        2080000, 2145000, 2210000, 2275000, 2340000, 2405000,
        2470000, 2535000,
        1488000, 1350000, 1289000, 1525000, 1268000, 1499000
    ]
}

# The raw lists above are not the same length (106, 110 and 106 entries) and
# pandas needs equal-length columns, so every column is cut to its first 100
# entries. Everything past index 99 is discarded.
n_rows = 100
data = {column: values[:n_rows] for column, values in data.items()}

df = pd.DataFrame(data)

# Show the table in the notebook with the price column rendered as dollars
print("Original DataFrame:")
price_format = {'price': '${:,.2f}'}
display(df.style.format(price_format))

# Model inputs (size and neighbourhood) and the value to predict (price)
feature_columns = ['square_footage', 'location']
X = df.loc[:, feature_columns]
y = df.loc[:, 'price']

# Preprocessing: expand the text 'location' column into 0/1 indicator columns.
# handle_unknown='ignore' encodes a location not seen during fitting as all
# zeros instead of raising an error.
location_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[('location', location_encoder, ['location'])],
    remainder='passthrough',  # square_footage is forwarded unchanged
)

# Chain the encoding step and the linear regression into a single estimator
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the encoder and the regression on the training rows only
model.fit(X_train, y_train)

import sklearn.metrics as metrics

# Evaluate once on the held-out test set.
# R^2 is the share of price variance explained by the model; MSE is the mean
# squared prediction error (in squared dollars, hence the very large number).
# The predictions and both metrics used to be computed and printed twice in a
# row with identical results; a single pass is enough.
y_pred = model.predict(X_test)
print(f"R^2: {metrics.r2_score(y_test, y_pred):.3f}")
print(f"MSE: {metrics.mean_squared_error(y_test, y_pred):,.0f}")



# Price a hypothetical listing: 2000 sq ft in Downtown
new_house = pd.DataFrame({'square_footage': [2000], 'location': ['Downtown']})
predicted_price = model.predict(new_house)
print(f"\nPredicted price for a 2000 sq ft house in Downtown: ${predicted_price[0]:,.2f}")

# Label the regression coefficients. The preprocessor outputs the one-hot
# location columns first and the passed-through square_footage column last,
# so the names are assembled in that same order.
fitted_encoder = model.named_steps['preprocessor'].named_transformers_['location']
feature_names = fitted_encoder.get_feature_names_out(['location']).tolist()
feature_names.append('square_footage')
coefficients = model.named_steps['regressor'].coef_

print("\nModel Coefficients:")
for name, weight in zip(feature_names, coefficients):
    print(f"{name}: {weight:.2f}")

Original DataFrame:


Unnamed: 0,square_footage,location,price
0,1040,Downtown,"$998,000.00"
1,1391,Berryessa,"$845,000.00"
2,960,Golden Wheel,"$288,000.00"
3,1068,Downtown,"$1,099,999.00"
4,1484,Downtown,"$1,190,888.00"
5,1470,Naglee Park,"$1,186,000.00"
6,1956,Naglee Park,"$1,199,000.00"
7,623,Martha,"$455,000.00"
8,832,Monterey,"$299,000.00"
9,1100,Downtown,"$374,337.00"


R^2: 0.570
MSE: 323,940,671,859
R^2: 0.570
MSE: 323,940,671,859

Predicted price for a 2000 sq ft house in Downtown: $1,195,090.91

Model Coefficients:
location_Berryessa: -82489.00
location_Bevin Brook: -36922.95
location_Downtown: -89996.63
location_Golden Wheel: -386410.23
location_Martha: -21527.29
location_Monterey: -300249.94
location_Naglee Park: 75935.91
location_San Jose: 733905.94
location_Taylor: 107754.18
square_footage: 587.19


In [None]:
#Part 2
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Hard-coded sample of 100 customers for the churn-classification exercise
data = {
    'age': [
        22, 25, 28, 30, 32, 35, 36, 38, 40, 41,
        42, 44, 45, 47, 48, 49, 50, 52, 54, 55,
        23, 26, 29, 31, 33, 34, 37, 39, 43, 46,
        27, 24, 21, 53, 56, 58, 59, 60, 20, 22,
        25, 28, 30, 32, 35, 36, 38, 40, 41, 42,
        44, 45, 47, 48, 49, 50, 52, 54, 55, 23,
        26, 29, 31, 33, 34, 37, 39, 43, 46, 27,
        24, 21, 53, 56, 58, 59, 60, 20, 22, 25,
        28, 30, 32, 35, 36, 38, 40, 41, 42, 44,
        45, 47, 48, 49, 50, 52, 54, 55, 23, 26
    ],
    'monthly_usage_hours': [
        10, 15, 20, 25, 30, 35, 40, 45, 50, 55,
        12, 18, 22, 28, 32, 38, 42, 48, 52, 60,
        11, 14, 19, 23, 29, 33, 37, 41, 46, 53,
        16, 21, 26, 31, 36, 39, 44, 49, 54, 59,
        13, 17, 24, 27, 34, 43, 47, 51, 56, 61,
        15, 20, 25, 30, 35, 40, 45, 50, 55, 60,
        12, 19, 23, 28, 32, 38, 42, 48, 52, 58,
        14, 18, 22, 29, 33, 37, 41, 46, 53, 59,
        16, 21, 26, 31, 36, 39, 44, 49, 54, 61,
        13, 17, 24, 27, 34, 43, 47, 51, 56, 62
    ],
    'purchase_amount': [
        100, 120, 150, 180, 200, 220, 250, 280, 300, 320,
        110, 130, 160, 190, 210, 240, 260, 290, 310, 330,
        95,  125, 155, 175, 205, 225, 245, 275, 295, 315,
        105, 135, 165, 185, 215, 235, 255, 285, 305, 325,
        115, 140, 170, 195, 230, 250, 270, 295, 315, 335,
        120, 150, 180, 200, 220, 240, 260, 280, 300, 320,
        125, 155, 185, 205, 225, 245, 265, 285, 305, 325,
        130, 160, 190, 210, 235, 255, 275, 295, 315, 335,
        135, 165, 195, 215, 240, 260, 280, 300, 320, 340,
        140, 170, 200, 220, 245, 265, 285, 305, 325, 345
    ],
    'customer_service_calls': [
        5, 2, 8, 6, 1, 3, 7, 4, 0, 2,
        6, 5, 3, 7, 2, 8, 4, 1, 0, 3,
        7, 6, 5, 2, 8, 1, 4, 0, 3, 7,
        2, 6, 5, 3, 8, 4, 1, 0, 2, 7,
        6, 5, 4, 3, 2, 8, 1, 0, 7, 6,
        5, 4, 3, 2, 1, 0, 8, 7, 6, 5,
        4, 3, 2, 1, 0, 8, 7, 6, 5, 4,
        3, 2, 1, 0, 8, 7, 6, 5, 4, 3,
        2, 1, 0, 8, 7, 6, 5, 4, 3, 2,
        1, 0, 8, 7, 6, 5, 4, 3, 2, 1
    ],
    'region': [
        'North','South','East','West','North','South','East','West','North','South',
        'East','West','North','South','East','West','North','South','East','West',
        'North','South','East','West','North','South','East','West','North','South',
        'East','West','North','South','East','West','North','South','East','West',
        'North','South','East','West','North','South','East','West','North','South',
        'East','West','North','South','East','West','North','South','East','West',
        'North','South','East','West','North','South','East','West','North','South',
        'East','West','North','South','East','West','North','South','East','West',
        'North','South','East','West','North','South','East','West','North','South',
        'East','West','North','South','East','West','North','South','East','West'
    ],
    'churn': [
        1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
        1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
        1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
        1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
        1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
        1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
        1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
        1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
        1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
        1, 0, 1, 0, 0, 1, 0, 1, 0, 0
    ]
}
df = pd.DataFrame(data)

# Features and target: the four numeric columns plus 'region' are the model
# inputs; 'churn' (1 = churned, 0 = stayed) is the label to predict.
input_columns = ['age', 'monthly_usage_hours', 'purchase_amount',
                 'customer_service_calls', 'region']
X = df.loc[:, input_columns]
y = df.loc[:, 'churn']
# Preprocessing: standardize the numeric columns and one-hot encode 'region'.
# The transformer order ('num' first, then 'cat') is also the column order of
# the matrix handed to the classifier.
numeric_columns = ['age', 'monthly_usage_hours', 'purchase_amount',
                   'customer_service_calls']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', OneHotEncoder(sparse_output=False), ['region']),
    ]
)

# Chain preprocessing and logistic regression into a single estimator
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42)),
]
model = Pipeline(steps=pipeline_steps)

# Hold out 20% of the customers for testing; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler, the encoder and the classifier on the training rows only
model.fit(X_train, y_train)

# Accuracy check: share of held-out customers the model classifies correctly
test_accuracy = model.score(X_test, y_test)
print("Model accuracy on test set:", test_accuracy)

# Score a single hypothetical customer
new_customer = pd.DataFrame({
    'age': [35],
    'monthly_usage_hours': [20],
    'purchase_amount': [150],
    'customer_service_calls': [5],
    'region': ['West'],
})

# predict_proba returns one row per customer; index 1 is read as the
# probability of churn (class 1)
class_probabilities = model.predict_proba(new_customer)[0]
churn_probability = class_probabilities[1]

# Turn the probability into a yes/no label with a 0.5 cut-off
threshold = 0.5
churn_prediction = int(churn_probability > threshold)

print(f"Churn Probability for new customer: {churn_probability:.2f}")
print(f"Churn Prediction (1 = churn, 0 = no churn): {churn_prediction}")
# Display model coefficients.
# The ColumnTransformer emits its output columns in the order its transformers
# are listed: the four scaled numeric features ('num') first, then the one-hot
# region columns ('cat'). The labels have to be assembled in that same order;
# listing the region names first would print every coefficient next to the
# wrong feature name.
numeric_feature_names = ['age', 'monthly_usage_hours', 'purchase_amount',
                         'customer_service_calls']
region_feature_names = (model.named_steps['preprocessor']
                        .named_transformers_['cat']
                        .get_feature_names_out(['region'])).tolist()
feature_names = numeric_feature_names + region_feature_names

# coef_ has shape (1, n_features) for a binary classifier; take the single row
coefficients = model.named_steps['classifier'].coef_[0]

# zip() would silently drop entries on a length mismatch, so fail loudly instead
if len(feature_names) != len(coefficients):
    raise ValueError(
        f"Expected {len(coefficients)} feature names, got {len(feature_names)}"
    )

print("\nModel Coefficients:")
for feature, coef in zip(feature_names, coefficients):
    print(f"{feature}: {coef:.2f}")

Model accuracy on test set: 0.5
Churn Probability for new customer: 0.56
Churn Prediction (1 = churn, 0 = no churn): 1

Model Coefficients:
region_East: 0.37
region_North: -0.27
region_South: -0.26
region_West: -0.04
age: -0.40
monthly_usage_hours: -0.08
purchase_amount: 0.15
customer_service_calls: 0.33


In [None]:
#Part 3
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Hard-coded sample of 100 customers for the segmentation exercise
data = {
    'annual_spending': [
        350, 420, 560, 680, 820, 950, 1100, 1250, 1400, 1550,
        390, 450, 600, 720, 880, 1020, 1180, 1320, 1450, 1600,
        320, 400, 540, 700, 860, 980, 1120, 1280, 1420, 1580,
        370, 430, 590, 750, 900, 1040, 1200, 1350, 1500, 1650,
        300, 380, 520, 660, 810, 930, 1080, 1230, 1380, 1520,
        410, 470, 620, 770, 920, 1060, 1210, 1360, 1490, 1630,
        340, 410, 560, 710, 860, 1000, 1150, 1300, 1440, 1590,
        360, 440, 600, 760, 910, 1050, 1190, 1340, 1480, 1620,
        330, 390, 530, 690, 840, 980, 1130, 1270, 1410, 1560,
        420, 500, 650, 800, 960, 1100, 1250, 1400, 1550, 1700
    ],
    'purchase_frequency': [
        3, 5, 7, 9, 12, 14, 16, 18, 20, 10,
        4, 6, 8, 11, 13, 15, 17, 19, 21, 9,
        2, 4, 6, 8, 10, 12, 14, 16, 18, 7,
        3, 5, 7, 9, 11, 13, 15, 17, 19, 8,
        2, 3, 5, 7, 9, 12, 14, 16, 18, 6,
        4, 6, 8, 10, 13, 15, 17, 19, 21, 9,
        3, 5, 7, 9, 11, 12, 14, 16, 18, 8,
        2, 4, 6, 8, 10, 13, 15, 17, 20, 7,
        3, 5, 7, 9, 12, 14, 16, 18, 21, 10,
        4, 6, 8, 11, 13, 15, 17, 19, 22, 9
    ],
    'age': [
        22, 24, 27, 29, 31, 34, 36, 38, 41, 43,
        25, 28, 30, 33, 35, 37, 39, 42, 45, 47,
        23, 26, 28, 32, 34, 36, 40, 44, 46, 49,
        21, 25, 29, 31, 33, 35, 38, 41, 44, 48,
        24, 27, 30, 32, 35, 37, 40, 43, 46, 50,
        22, 26, 28, 31, 34, 36, 39, 42, 45, 47,
        23, 25, 27, 30, 33, 35, 37, 40, 43, 46,
        24, 28, 31, 34, 36, 39, 41, 44, 48, 50,
        21, 23, 26, 29, 32, 35, 38, 41, 45, 49,
        22, 25, 28, 31, 34, 37, 40, 43, 47, 51
    ],
    'region': [
        'North','South','East','West','North','South','East','West','North','South',
        'East','West','North','South','East','West','North','South','East','West',
        'North','South','East','West','North','South','East','West','North','South',
        'East','West','North','South','East','West','North','South','East','West',
        'North','South','East','West','North','South','East','West','North','South',
        'East','West','North','South','East','West','North','South','East','West',
        'North','South','East','West','North','South','East','West','North','South',
        'East','West','North','South','East','West','North','South','East','West',
        'North','South','East','West','North','South','East','West','North','South',
        'East','West','North','South','East','West','North','South','East','West'
    ]
}
df = pd.DataFrame(data)

# Only the three numeric columns are used for clustering; 'region' is kept in
# the table but left out of the feature matrix.
features = ['annual_spending', 'purchase_frequency', 'age']
X = df.loc[:, features]
# Standardize the features (zero mean, unit variance per column) so that the
# large-valued spending column does not dominate the distance computation.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Determine optimal number of clusters using elbow method: fit K-Means for
# K = 1..5 and record the inertia (within-cluster sum of squared distances).
# n_init is pinned to 10, the same value used for the final model, so the
# curve is computed with the same number of restarts as the model it is used
# to configure (the library default for n_init differs between scikit-learn
# versions).
inertia = []
K = range(1, 6)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Plot elbow curve and write it to disk. The figure is closed right after
# saving, so it is released rather than left open in the notebook session.
plt.figure(figsize=(8, 5))
plt.plot(K, inertia, 'bo-')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal K')
plt.savefig('elbow_plot.png')
plt.close()
# Final segmentation with the K chosen from the elbow curve (3 here)
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
kmeans.fit(X_scaled)
# labels_ holds the cluster index assigned to each training row
df['cluster'] = kmeans.labels_

# How many customers landed in each cluster, ordered by cluster id
print("\nCluster counts:")
cluster_sizes = df['cluster'].value_counts().sort_index()
print(cluster_sizes)

# Per-cluster averages of the clustering features, on the original
# (unscaled) values
cluster_means = df.groupby('cluster')[features].mean()
cluster_summary = cluster_means.round(2)
print("Cluster Characteristics:")
print(cluster_summary)
# Example of targeted strategies: pick one marketing message per cluster from
# its average profile. Spending is checked first, so a cluster that is both
# high-spending and frequent gets the high-spending message.
for cluster in range(optimal_k):
    print(f"\nCluster {cluster} Strategy:")
    profile = cluster_summary.loc[cluster]
    if profile['annual_spending'] > 1000:
        strategy = "High-spending customers: Offer exclusive promotions or loyalty rewards."
    elif profile['purchase_frequency'] > 10:
        strategy = "Frequent buyers: Provide bulk discounts or subscription plans."
    else:
        strategy = "Low-engagement customers: Send personalized re-engagement campaigns."
    print(strategy)

# Persist every customer together with the assigned cluster (row index omitted)
df.to_csv('customer_segments.csv', index=False)


Cluster counts:
cluster
0    35
1    31
2    34
Name: count, dtype: int64
Cluster Characteristics:
         annual_spending  purchase_frequency    age
cluster                                            
0                 488.29                5.43  26.29
1                 950.00               12.48  35.06
2                1425.00               15.35  44.32

Cluster 0 Strategy:
Low-engagement customers: Send personalized re-engagement campaigns.

Cluster 1 Strategy:
Frequent buyers: Provide bulk discounts or subscription plans.

Cluster 2 Strategy:
High-spending customers: Offer exclusive promotions or loyalty rewards.
