## **Model Building**

In [7]:
# Import libraries
import os
import geopandas as gp
import shapely
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
def load_data(filepath):
    """
    Load the dataset from a specified filepath and display basic information.

    Parameters:
        filepath (str): The path to the dataset file.

    Returns:
        GeoDataFrame or DataFrame: Loaded data, as returned by geopandas.read_file.
    """
    data = gp.read_file(filepath)
    print("Basic Information:")
    # info() writes its report to stdout itself and returns None, so it is
    # called directly; wrapping it in print() appended a stray "None" line.
    data.info()
    return data
    
# Dataset location. Defaults to the original workspace path; set the
# BDC_DATA_PATH environment variable to run the notebook elsewhere.
DATA_PATH = os.environ.get('BDC_DATA_PATH', '/work/DigitalEquity/DATA/BDC_fixed.gpkg')
data = load_data(DATA_PATH)

Basic Information:
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 34390 entries, 0 to 34389
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   building_type_code             34390 non-null  object  
 1   land_use_code                  34390 non-null  int64   
 2   provider_id                    34390 non-null  int64   
 3   brand_name                     34390 non-null  object  
 4   technology                     34390 non-null  int64   
 5   technology_name                34390 non-null  object  
 6   max_advertised_download_speed  34390 non-null  int64   
 7   max_advertised_upload_speed    34390 non-null  int64   
 8   low_latency                    34390 non-null  int64   
 9   business_residential_code      34390 non-null  object  
 10  h3_9                           34390 non-null  object  
 11  up_down_ratio                  29073 non-null  float64 
 12  geome

In [5]:
def create_features(data):
    """
    Create new features based on existing data.

    The input frame is left untouched; the new columns are added to a copy so
    that re-running the cell (or calling this on a frame displayed earlier)
    never changes already-inspected data.

    Parameters:
        data (pd.DataFrame): Original dataset. Must contain the columns
            'max_advertised_upload_speed' and 'max_advertised_download_speed'.

    Returns:
        pd.DataFrame: Copy of the dataset with two extra columns:
            'speed_ratio' (upload / download, NaN where download speed is 0)
            and 'log_max_download_speed' (log(1 + download speed)).
    """
    features = data.copy()
    upload = features['max_advertised_upload_speed']
    download = features['max_advertised_download_speed']

    # Relative proportion of upload vs download speed. A zero download speed
    # makes the division produce +/-inf (or NaN for 0/0); infinities are mapped
    # to NaN so downstream imputers/scalers do not choke on them.
    features['speed_ratio'] = (upload / download).replace([np.inf, -np.inf], np.nan)

    # Logarithmic transformation of the download speed to reduce skewness.
    # log1p(x) is log(1 + x), computed accurately for small x.
    features['log_max_download_speed'] = np.log1p(download)

    return features

# Add the engineered columns and preview the first rows of the result.
data = data.pipe(create_features)
data.head()

Unnamed: 0,building_type_code,land_use_code,provider_id,brand_name,technology,technology_name,max_advertised_download_speed,max_advertised_upload_speed,low_latency,business_residential_code,h3_9,up_down_ratio,geometry,speed_ratio,log_max_download_speed
0,R,4,130235,Charter Communications Inc,40,Cable,1000,35,1,R,8927636a07bffff,0.035,POINT (-85.50221 46.35000),0.035,6.908755
1,R,4,130235,Charter Communications Inc,40,Cable,1000,35,1,R,8927636a2b7ffff,0.035,POINT (-85.50681 46.35198),0.035,6.908755
2,X,4,130235,Charter Communications Inc,40,Cable,1000,35,1,B,8927636a2b3ffff,0.035,POINT (-85.51158 46.35514),0.035,6.908755
3,R,4,130235,Charter Communications Inc,40,Cable,1000,35,1,R,8927636a357ffff,0.035,POINT (-85.51040 46.33740),0.035,6.908755
4,R,4,130235,Charter Communications Inc,40,Cable,1000,35,1,R,8927636a07bffff,0.035,POINT (-85.50482 46.35135),0.035,6.908755


- **Speed Ratio**: This feature provides insights into the balance between download and upload speeds, which can be critical for applications that rely on symmetric internet usage (e.g., video conferencing requires good upload speed).
- **Logarithmic Max Download Speed**: Log transformations are used to handle skewed data and make it more normally distributed. This can improve the performance of many machine learning algorithms that assume normality.


**Insights**
- **Speed Ratio**: Understanding the balance in internet speed offerings can guide decisions in service enhancements tailored for specific user groups.
- **Log Transformation**: Stabilizing variance and making the data more symmetric allows for better modeling and prediction, thus facilitating more reliable forecasts of demand and user satisfaction.

# Model Preparation
Preparing the data involves encoding categorical variables and scaling numerical values so that the machine learning algorithms perform optimally. These preprocessing steps are crucial for handling non-numeric data and for ensuring that differences in variable scale do not distort the importance that some algorithms assign to different features.

In [6]:
def prepare_data(data, target_variable):
    """
    Prepare data for modeling: encode categorical variables, split data, and scale features.

    Parameters:
        data (pd.DataFrame): The dataset.
        target_variable (str): The target variable name for prediction.

    Returns:
        tuple: Split data (X_train, X_test, y_train, y_test). X_train and
            X_test are the outputs of the fitted preprocessor; y_train and
            y_test are the untransformed target values.
    """
    # Selecting features and target
    X = data.drop(target_variable, axis=1)
    y = data[target_variable]

    # Defining numerical and categorical columns. The target is removed from
    # BOTH lists: X no longer contains it, so leaving an object-dtype target in
    # categorical_cols would make the ColumnTransformer fail on a missing
    # column. sort=False keeps the categorical columns in their original order.
    categorical_cols = data.select_dtypes(include=['object']).columns.difference(
        [target_variable], sort=False
    )
    numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns.difference([target_variable])

    # Creating transformers for categorical and numerical columns
    categorical_transformer = Pipeline(steps=[
        # Categories unseen during fit are encoded as all zeros instead of raising
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    # Combining transformers into a preprocessor with ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Splitting data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the preprocessor on the training set only, then apply the fitted
    # transformation to the test set so no test statistics reach the model
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    return X_train, X_test, y_train, y_test


- **OneHot Encoding**: Converts each categorical variable into a set of binary indicator columns, a numeric form that ML algorithms can work with.
- **Scaling**: Puts the numerical features on a comparable scale so that features with large raw values do not dominate scale-sensitive models.


In [13]:
from sklearn.impute import SimpleImputer

def prepare_data(data, target_variable):
    """
    Prepare data for modeling: impute and scale numerical features, encode
    categorical variables, and split the data.

    Parameters:
        data (pd.DataFrame): The dataset.
        target_variable (str): The target variable name for prediction.

    Returns:
        tuple: Split data (X_train, X_test, y_train, y_test). X_train and
            X_test are the outputs of the fitted preprocessor; y_train and
            y_test are the untransformed target values.
    """
    # Selecting features and target
    X = data.drop(target_variable, axis=1)
    y = data[target_variable]

    # Defining numerical and categorical columns. The target is removed from
    # BOTH lists: X no longer contains it, so leaving an object-dtype target in
    # categorical_cols would make the ColumnTransformer fail on a missing
    # column. sort=False keeps the categorical columns in their original order.
    categorical_cols = data.select_dtypes(include=['object']).columns.difference(
        [target_variable], sort=False
    )
    numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns.difference([target_variable])

    # Creating transformers for categorical and numerical columns
    categorical_transformer = Pipeline(steps=[
        # Categories unseen during fit are encoded as all zeros instead of raising
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')), # Imputing missing values
        ('scaler', StandardScaler())
    ])

    # Combining transformers into a preprocessor with ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Splitting data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the preprocessor (medians, scaling statistics, categories) on the
    # training set only, then apply the fitted transformation to the test set
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    return X_train, X_test, y_train, y_test

# Inspect the shapes of the prepared splits. Model training is left to the
# cells further down: build_and_evaluate_model is defined after this cell, so
# calling it here raises NameError on Restart Kernel -> Run All.
[part.shape for part in prepare_data(data, 'log_max_download_speed')]

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

def build_and_evaluate_model(X_train, X_test, y_train, y_test):
    """
    Fit a random forest regressor and score it on the held-out split.

    Parameters:
        X_train, y_train: Features and target used to fit the model.
        X_test, y_test: Features and target used to evaluate it.

    Returns:
        model: The fitted RandomForestRegressor.
        metrics (dict): Test-set scores under the keys 'MSE' and 'R^2'.
    """
    # fit() returns the estimator itself, so construction and training chain
    regressor = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

    # Score the held-out predictions
    predictions = regressor.predict(X_test)
    scores = {
        'MSE': mean_squared_error(y_test, predictions),
        'R^2': r2_score(y_test, predictions),
    }

    return regressor, scores
# Train in this cell so `model` and `metrics` exist when they are printed on a
# fresh kernel (Restart Kernel -> Run All), rather than relying on values left
# over from out-of-order execution.
model, metrics = build_and_evaluate_model(*prepare_data(data, 'log_max_download_speed'))
print(model, metrics)

RandomForestRegressor(random_state=42) {'MSE': 5.198279268474537e-25, 'R^2': 1.0}


In [17]:
# Building and evaluating the model
X_train, X_test, y_train, y_test = prepare_data(data, 'log_max_download_speed')
model, metrics = build_and_evaluate_model(X_train, X_test, y_train, y_test)
print("Model Metrics:", metrics)

Model Metrics: {'MSE': 5.198279268474537e-25, 'R^2': 1.0}


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b828342d-3de7-46aa-b87a-bd160a6c7e7e' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>