In [279]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn import metrics

## Preprocessing Data

### Import data

In [280]:
# Read the grid/census table from disk and preview its first four rows.
census_csv_path = '../Data/PGH/DemandPrediction/pgh_grid_census.csv'
df = pd.read_csv(filepath_or_buffer=census_csv_path)
df.head(n=4)

Unnamed: 0.1,Unnamed: 0,cell_num,stations,ids,tractce10,total_households,median_hh_income,mean_hh_income,population_16yrs+,perc_employed,...,public_transit,walked,other,wfh,mean_travel_time_to_work,perc_public,perc_alone,perc_walk,perc_other,outflow
0,0,51,0,set(),563000,1306,67409,69254,2660,0.634586,...,265,63,0,58,23,0.156342,0.638938,0.037168,0.0,0
1,1,52,0,set(),563000,1306,67409,69254,2660,0.634586,...,265,63,0,58,23,0.156342,0.638938,0.037168,0.0,0
2,2,82,0,set(),563000,1306,67409,69254,2660,0.634586,...,265,63,0,58,23,0.156342,0.638938,0.037168,0.0,0
3,3,83,0,set(),563000,1306,67409,69254,2660,0.634586,...,265,63,0,58,23,0.156342,0.638938,0.037168,0.0,0


In [281]:
# Display the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'cell_num', 'stations', 'ids', 'tractce10',
       'total_households', 'median_hh_income', 'mean_hh_income',
       'population_16yrs+', 'perc_employed', 'perc_unemployed',
       'Workers_16yrs+', 'own_alone', 'carpool', 'public_transit', 'walked',
       'other', 'wfh', 'mean_travel_time_to_work', 'perc_public', 'perc_alone',
       'perc_walk', 'perc_other', 'outflow'],
      dtype='object')

### Fix Tract Feature for Label Encoding

In [282]:
# Store the tract code as text so it can be length-checked and zero-padded.
df['tractce10'] = df['tractce10'].astype(str)

def fixtract(df, column):
    """Zero-pad tract codes and replace invalid cells with 0, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    column : str
        Name of the column holding tract codes as strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    Codes shorter than six characters are left-padded with ``'0'`` up to six
    characters (the previous implementation added only a single ``'0'``, so
    codes shorter than five characters stayed too short).
    Every cell in every column that is missing (NaN/None) or equal to the
    string ``'-'`` is replaced with ``0``.
    """
    # Vectorised left-padding to six characters; longer values are unchanged.
    df[column] = df[column].str.zfill(6)

    # Clean one column at a time so each column keeps its own dtype rules.
    for name in df.columns:
        values = df[name]
        invalid = values.isna() | (values == '-')
        if invalid.any():
            df[name] = values.mask(invalid, 0)

    return df

# Clean the frame, treating 'tractce10' as the tract-code column.
df = fixtract(df, column='tractce10')

### Remove Some Features
Removing 'Unnamed: 0', 'cell_num', and 'ids'

In [283]:
# Discard the listed columns; the remaining columns keep their order.
df = df.drop(['Unnamed: 0', 'cell_num', 'ids'], axis=1)

### Encoding Census Tract feature

In [284]:
# Fit a label encoder on the tract codes and replace them with its integer labels.
# The fitted encoder stays available as `le` (e.g. for inverse_transform).
le = preprocessing.LabelEncoder()
df['tractce10'] = le.fit_transform(df['tractce10'])

In [285]:
# Show the first row to check the encoded tract column.
df.head(n=1)

Unnamed: 0,stations,tractce10,total_households,median_hh_income,mean_hh_income,population_16yrs+,perc_employed,perc_unemployed,Workers_16yrs+,own_alone,...,public_transit,walked,other,wfh,mean_travel_time_to_work,perc_public,perc_alone,perc_walk,perc_other,outflow
0,0,117,1306,67409,69254,2660,0.634586,0.04812,1695,1083,...,265,63,0,58,23,0.156342,0.638938,0.037168,0.0,0


In [286]:
# df['tractce10'] = le.inverse_transform(df['tractce10'])

In [287]:
# df.head(1)

### Split data for training and test

We have a small number of locations that have bike stations. First, we need to filter out any cell that does not have any stations.

In [288]:
# Keep only the grid cells that contain at least one bike station.
stations_df = df[df['stations'] > 0]

# Pick the target by name instead of by position, so adding or dropping a
# column upstream cannot silently shift which column is predicted.
TARGET_COLUMN = 'outflow'
X = stations_df.drop(columns=[TARGET_COLUMN]).values
y = stations_df[TARGET_COLUMN].values

# Hold out 30% of the station cells for testing; fixed seed for repeatable splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

### Perform Feature Scaling

In [289]:
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

## Defining Models

In [290]:
# Random Forest regressor (the estimator is a regressor, not a classifier).
rf_model = RandomForestRegressor(
    n_estimators=150,
    # 'absolute_error' is the current spelling of the criterion formerly named
    # 'mae'; the old name is rejected by recent scikit-learn releases.
    # On scikit-learn older than 1.0, use 'mae' here instead.
    criterion='absolute_error',
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    # 1.0 means "consider every feature at each split", which is what the
    # removed 'auto' option meant for regressors.
    max_features=1.0,
    max_leaf_nodes=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,
    random_state=0,
    verbose=0,
    warm_start=False
)

## Training our Random Forest Model

In [291]:
# Fit on the training split, then predict on the held-out split.
# fit() returns the estimator itself, so the two calls can be chained.
y_pred = rf_model.fit(X_train, y_train).predict(X_test)

In [292]:
# Compute each error metric once, then print them in a fixed order.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

for label, value in (
    ('Mean Absolute Error:', mae),
    ('Mean Squared Error:', mse),
    ('Root Mean Squared Error:', rmse),
):
    print(label, value)

Mean Absolute Error: 6690.431159420289
Mean Squared Error: 108077496.10321012
Root Mean Squared Error: 10396.03270979897
