In [3]:
import pandas as pd
from sodapy import Socrata
import numpy as np
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [4]:
def fetch_data(MyAppToken,username,password):
    MyAppToken = MyAppToken
    try:
        client = Socrata('data.cityofnewyork.us', MyAppToken, username=username, password=password)
        results = client.get_all("2xir-kwzz")
        df = pd.DataFrame.from_records(results)
        print("Data fetched successfully from API.")
        return df
    except Exception as e:
        print(str(e))
        print("Unable to fetch from API, loading from pre-existing dataset.")
        try:
            df = pd.read_csv('Beach_Water_Samples_20240210.csv')
            return df
        except FileNotFoundError:
            print("Dataset file not found.")
    return df

In [5]:
df = fetch_data('YOUR-APP-TOKEN','name@example.com', 'FakePassword')

Data fetched successfully from API.


In [6]:
df

Unnamed: 0,sample_id,sample_date,beach_name,sample_location,enterococci_results,units_or_notes
0,JB2309201015-1.1,2023-09-20T00:00:00.000,WEST FORDHAM STREET ASSOCIATION,Left,9.9,MPN/100 ml
1,JB2309201015-1.2,2023-09-20T00:00:00.000,WEST FORDHAM STREET ASSOCIATION,Center,10,MPN/100 ml
2,JB2309201015-1.3,2023-09-20T00:00:00.000,WEST FORDHAM STREET ASSOCIATION,Right,10,MPN/100 ml
3,JB2309131035-1.2,2023-09-13T00:00:00.000,WEST FORDHAM STREET ASSOCIATION,Center,74,MPN/100 ml
4,JB2309131100-1.2,2023-09-13T00:00:00.000,MORRIS YACHT AND BEACH CLUB,Center,158,MPN/100 ml
...,...,...,...,...,...,...
26994,050205LA13,2005-05-02T00:00:00.000,SEAGATE BEACH - 42ND STREET,Left,31.00,MPN/100 ml
26995,050205BH09,2005-05-02T00:00:00.000,SOUTH BEACH,Right,9.90,MPN/100 ml
26996,050205LA08,2005-05-02T00:00:00.000,KINGSBOROUGH COMMUNITY COLLEGE,Center,31.00,MPN/100 ml
26997,050205BH11,2005-05-02T00:00:00.000,CONEY ISLAND WEST 16TH - WEST 27TH,Center,9.90,MPN/100 ml


In [7]:
df.dtypes

sample_id              object
sample_date            object
beach_name             object
sample_location        object
enterococci_results    object
units_or_notes         object
dtype: object

In [8]:
df.isnull().sum()

sample_id                 0
sample_date               0
beach_name                0
sample_location          37
enterococci_results    7445
units_or_notes            0
dtype: int64

In [9]:
df.drop(columns=['sample_id', 'units_or_notes','sample_date'], inplace=True)#Removing irrelevant columns

df['enterococci_results'] = df['enterococci_results'].astype(float) # object to float 
df.rename(columns={'enterococci_results': 'enterococci_results(MPN/100 ml)'}, inplace=True)#renaming the column

#If anything apart from Center, Left or Right in sample_location, replace it with a blank (NaN) value
df['sample_location'] = df['sample_location'].apply(lambda x: x if x in ['Center', 'Left', 'Right'] else np.nan)

#Replacing missing values in sample_location with most frequent observation - "Center"
imputer = SimpleImputer(strategy='most_frequent') 
df['sample_location'] = imputer.fit_transform(df[['sample_location']])

In [10]:
df.isnull().sum()

beach_name                            0
sample_location                      38
enterococci_results(MPN/100 ml)    7445
dtype: int64

In [11]:
df

Unnamed: 0,beach_name,sample_location,enterococci_results(MPN/100 ml)
0,WEST FORDHAM STREET ASSOCIATION,Left,9.9
1,WEST FORDHAM STREET ASSOCIATION,Center,10.0
2,WEST FORDHAM STREET ASSOCIATION,Right,10.0
3,WEST FORDHAM STREET ASSOCIATION,Center,74.0
4,MORRIS YACHT AND BEACH CLUB,Center,158.0
...,...,...,...
26994,SEAGATE BEACH - 42ND STREET,Left,31.0
26995,SOUTH BEACH,Right,9.9
26996,KINGSBOROUGH COMMUNITY COLLEGE,Center,31.0
26997,CONEY ISLAND WEST 16TH - WEST 27TH,Center,9.9


In [12]:
location_encoder = OneHotEncoder(sparse=False, drop='first', handle_unknown='ignore')
encoded_df = pd.DataFrame(location_encoder.fit_transform(df[['sample_location']]))

encoded_df.columns = ['sample_location_Left', 'sample_location_Center']

df.drop('sample_location', axis=1, inplace=True)

df = pd.concat([df, encoded_df], axis=1)

df = df.astype({'sample_location_Left': int, 'sample_location_Center': int})

In [13]:
df

Unnamed: 0,beach_name,enterococci_results(MPN/100 ml),sample_location_Left,sample_location_Center
0,WEST FORDHAM STREET ASSOCIATION,9.9,1,0
1,WEST FORDHAM STREET ASSOCIATION,10.0,0,0
2,WEST FORDHAM STREET ASSOCIATION,10.0,0,1
3,WEST FORDHAM STREET ASSOCIATION,74.0,0,0
4,MORRIS YACHT AND BEACH CLUB,158.0,0,0
...,...,...,...,...
26994,SEAGATE BEACH - 42ND STREET,31.0,1,0
26995,SOUTH BEACH,9.9,0,1
26996,KINGSBOROUGH COMMUNITY COLLEGE,31.0,0,0
26997,CONEY ISLAND WEST 16TH - WEST 27TH,9.9,0,0


In [14]:
df['beach_name_encoded'] = TargetEncoder(cols=['beach_name']).fit_transform(df['beach_name'], df['enterococci_results(MPN/100 ml)'])

In [15]:
df = df[['beach_name_encoded','sample_location_Left','sample_location_Center', 'enterococci_results(MPN/100 ml)']]

In [16]:
df.dtypes

beach_name_encoded                 float64
sample_location_Left                 int32
sample_location_Center               int32
enterococci_results(MPN/100 ml)    float64
dtype: object

In [17]:
df.isnull().sum()

beach_name_encoded                    0
sample_location_Left                  0
sample_location_Center                0
enterococci_results(MPN/100 ml)    7445
dtype: int64

In [19]:
df = df.dropna()

In [20]:
# Final dataset for modeling
df

Unnamed: 0,beach_name_encoded,sample_location_Left,sample_location_Center,enterococci_results(MPN/100 ml)
0,71.724390,1,0,9.9
1,71.724390,0,0,10.0
2,71.724390,0,1,10.0
3,71.724390,0,0,74.0
4,120.912515,0,0,158.0
...,...,...,...,...
26994,24.995265,1,0,31.0
26995,73.906800,0,1,9.9
26996,62.341729,0,0,31.0
26997,35.031897,0,0,9.9


In [21]:
X = df.drop(columns=['enterococci_results(MPN/100 ml)'])
y = df['enterococci_results(MPN/100 ml)'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [22]:
rf_regressor = RandomForestRegressor()
rf_regressor.fit(X_train, y_train)

y_pred = rf_regressor.predict(X_test)

In [23]:
print(round(np.sqrt(mean_squared_error(y_test, y_pred)),4))
print(round(mean_absolute_error(y_test, y_pred),4))
print(round(r2_score(y_test, y_pred),4))

345.1706
108.6698
-0.0008


In [24]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50,100],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [40, 50, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_leaf_nodes': [None, 10, 20],
    'max_samples': [None, 0.5]
}

rf = RandomForestRegressor()

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

In [25]:
best_params

{'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': 10,
 'max_samples': None,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 50}

In [26]:
# RandomForestRegressor with the best parameters
best_rf = RandomForestRegressor(**best_params)
best_rf.fit(X_train, y_train)

y_pred = best_rf.predict(X_test)

In [27]:
print(round(np.sqrt(mean_squared_error(y_test, y_pred)),4))
print(round(mean_absolute_error(y_test, y_pred),4))
print(round(r2_score(y_test, y_pred),4))

344.005
107.6693
0.006
