<a href="https://colab.research.google.com/github/kyledang14-web/MIS-Python-Practice./blob/main/Coding_Exercise_ML_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Part 1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

#Import specific metrics functions
from sklearn.metrics import r2_score, mean_squared_error

#uploads Csv in Colab
from google.colab import files
uploaded = files.upload()

#Read Dataset that I uploaded
df = pd.read_csv('ML Practice - Sheet1.csv')

df['location'] = np.random.choice(['Downtown', 'Rural', 'Suburb'], size=len(df))

#Display the DataFrame with better formatting
print("Original DataFrame:")
display(df.style.format({'price': '${:,.2f}'}))

#Features and target
X = df[['square_footage', 'location']]
y = df['price']

#Preprocessing: One-hot encode the location column
preprocessor = ColumnTransformer(
    transformers=[
        ('location', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['location'])
    ],
    remainder='passthrough'
)

# Create pipeline with preprocessing and model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

#Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

#Train model
model.fit(X_train, y_train)

import sklearn.metrics as metrics

#Evaluate on the test set
y_pred = model.predict(X_test)   # <-- this line goes right here
print(f"R^2: {r2_score(y_test, y_pred):.3f}")
print(f"MSE: {mean_squared_error(y_test, y_pred):,.0f}")

#Make prediction for a new house: 2000 sq ft in Downtown
new_house = pd.DataFrame({'square_footage': [2000], 'location': ['Downtown']})
predicted_price = model.predict(new_house)
print(f"\nPredicted price for a 2000 sq ft house in Downtown: ${predicted_price[0]:,.2f}")

#Display model coefficients
feature_names = (model.named_steps['preprocessor']
                 .named_transformers_['location']
                 .get_feature_names_out(['location'])).tolist() + \
                ['square_footage']
coefficients = model.named_steps['regressor'].coef_
print("\nModel Coefficients:")
for feature, coef in zip(feature_names, coefficients):
    print(f"{feature}: {coef:.2f}")

Saving ML Practice - Sheet1.csv to ML Practice - Sheet1 (3).csv
Original DataFrame:


Unnamed: 0,price,square_footage,location
0,"$998,000.00",1040,Rural
1,"$845,000.00",1391,Suburb
2,"$288,000.00",960,Suburb
3,"$1,099,999.00",1068,Rural
4,"$1,190,888.00",1484,Suburb
5,"$1,186,000.00",1470,Rural
6,"$1,199,000.00",1956,Rural
7,"$455,000.00",623,Rural
8,"$299,000.00",832,Rural
9,"$374,337.00",1100,Downtown


R^2: 0.552
MSE: 18,689,368,157

Predicted price for a 2000 sq ft house in Downtown: $1,208,085.02

Model Coefficients:
location_Downtown: -13311.42
location_Rural: 11174.89
location_Suburb: 2136.54
square_footage: 573.24


In [None]:
#Part 2
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from google.colab import files

#Upload CSV
uploaded = files.upload()
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

#Fix TotalCharges column (some blanks → NaN)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'].astype(str).str.strip(), errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

print(df.head())

#Features and target
X = df[['tenure', 'MonthlyCharges', 'TotalCharges', 'Contract', 'PaymentMethod']]
y = df['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)

#Preprocessing: scale numeric + one-hot categorical
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['tenure', 'MonthlyCharges', 'TotalCharges']),
        ('cat', OneHotEncoder(sparse_output=False), ['Contract', 'PaymentMethod'])
    ]
)

#Create pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

#Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Fit model
model.fit(X_train, y_train)

#Evaluate
print("Model accuracy on test set:", model.score(X_test, y_test))

#Example: predict for a new customer
new_customer = pd.DataFrame({
    'tenure': [12],
    'MonthlyCharges': [70],
    'TotalCharges': [840],
    'Contract': ['Month-to-month'],
    'PaymentMethod': ['Electronic check']
})
churn_probability = model.predict_proba(new_customer)[0][1]
print(f"Churn Probability: {churn_probability:.2f}")
print(f"Prediction (1 = churn): {1 if churn_probability > 0.5 else 0}")


Saving archive.zip to archive (9).zip
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingT

In [21]:
#Part 3
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from google.colab import files

#Upload CSV
uploaded = files.upload()
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

#Pick columns from Telco and rename them to match starter code
df_cluster = df[['MonthlyCharges', 'tenure', 'SeniorCitizen', 'Contract']].copy()

#Rename to match the expected features
df_cluster = df_cluster.rename(columns={
    'MonthlyCharges': 'annual_spending',
    'tenure': 'purchase_frequency',
    'SeniorCitizen': 'age',
    'Contract': 'region'
})

print(df_cluster.head())

#Preprocess data: Select numerical features and scale them
features = ['annual_spending', 'purchase_frequency', 'age']
X = df_cluster[features]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#Determine optimal number of clusters using elbow method
inertia = []
K = range(1, 6)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

#Plot elbow curve
plt.figure(figsize=(8, 5))
plt.plot(K, inertia, 'bo-')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal K')
plt.savefig('elbow_plot.png')
plt.close()

#Apply K-Means with optimal K (e.g., 3)
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df_cluster['cluster'] = kmeans.fit_predict(X_scaled)

#Show how many people are in each cluster
print("\nCluster counts:")
print(df_cluster['cluster'].value_counts().sort_index())

#Analyze clusters
cluster_summary = df_cluster.groupby('cluster')[features].mean().round(2)
print("Cluster Characteristics:")
print(cluster_summary)

#Example of targeted strategies
for cluster in range(optimal_k):
    print(f"\nCluster {cluster} Strategy:")
    if cluster_summary.loc[cluster, 'annual_spending'] > 1000:
        print("High-spending customers: Offer exclusive promotions or loyalty rewards.")
    elif cluster_summary.loc[cluster, 'purchase_frequency'] > 10:
        print("Frequent buyers: Provide bulk discounts or subscription plans.")
    else:
        print("Low-engagement customers: Send personalized re-engagement campaigns.")

# Save cluster assignments to CSV
df_cluster.to_csv('customer_segments.csv', index=False)


Saving archive.zip to archive (11).zip
   annual_spending  purchase_frequency  age          region
0            29.85                   1    0  Month-to-month
1            56.95                  34    0        One year
2            53.85                   2    0  Month-to-month
3            42.30                  45    0        One year
4            70.70                   2    0  Month-to-month

Cluster counts:
cluster
0    3354
1    2547
2    1142
Name: count, dtype: int64
Cluster Characteristics:
         annual_spending  purchase_frequency  age
cluster                                          
0                  50.87               14.04  0.0
1                  76.30               56.10  0.0
2                  79.82               33.30  1.0

Cluster 0 Strategy:
Frequent buyers: Provide bulk discounts or subscription plans.

Cluster 1 Strategy:
Frequent buyers: Provide bulk discounts or subscription plans.

Cluster 2 Strategy:
Frequent buyers: Provide bulk discounts or subscription 