In [1]:
import os
import warnings

import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sodapy import Socrata
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
def fetch_data(MyAppToken,username,password):
    """Load the beach water-sample dataset, preferring the live Socrata API.

    The function first requests dataset ``2xir-kwzz`` from
    ``data.cityofnewyork.us``. If that attempt raises for any reason, it falls
    back to the local snapshot ``Beach_Water_Samples_20240210.csv``.

    Args:
        MyAppToken: Socrata application token handed to the client.
        username: Socrata account user name.
        password: Socrata account password.

    Returns:
        pandas.DataFrame built from the API records, or from the local CSV
        when the API attempt failed.

    Raises:
        FileNotFoundError: the API attempt failed and the fallback CSV does
            not exist either.
    """
    try:
        client = Socrata('data.cityofnewyork.us', MyAppToken, username=username, password=password)
        try:
            # get_all's result is consumed here, while the client is still open.
            results = client.get_all("2xir-kwzz")
            df = pd.DataFrame.from_records(results)
        finally:
            # Release the underlying HTTP session whether or not the download worked.
            client.close()
    except Exception as e:
        # Deliberate best-effort: any API failure triggers the CSV fallback below.
        print(str(e))
        print("Unable to fetch from API, loading from pre-existing dataset.")
    else:
        print("Data fetched successfully from API.")
        return df

    # NOTE(review): the CSV snapshot may not share the API's column names/dtypes — confirm
    # that downstream cells work on the fallback data.
    try:
        return pd.read_csv('Beach_Water_Samples_20240210.csv')
    except FileNotFoundError:
        print("Dataset file not found.")
        # Previously execution fell through to `return df` with `df` unbound, which
        # surfaced as an unrelated UnboundLocalError; propagate the real cause instead.
        raise

In [34]:
# Read the Socrata credentials from the environment so real secrets never have to be
# typed into (or saved with) the notebook. The defaults are the original placeholder
# values, so behaviour is unchanged when the variables are not set.
df = fetch_data(
    os.environ.get('SOCRATA_APP_TOKEN', 'YOUR-APP-TOKEN'),
    os.environ.get('SOCRATA_USERNAME', 'name@example.com'),
    os.environ.get('SOCRATA_PASSWORD', 'FakePassword'),
)

Data fetched successfully from API.


In [35]:
# Display the raw frame returned by fetch_data.
df

Unnamed: 0,sample_id,sample_date,beach_name,sample_location,enterococci_results,units_or_notes
0,JB2309201015-1.1,2023-09-20T00:00:00.000,WEST FORDHAM STREET ASSOCIATION,Left,9.9,MPN/100 ml
1,JB2309201015-1.2,2023-09-20T00:00:00.000,WEST FORDHAM STREET ASSOCIATION,Center,10,MPN/100 ml
2,JB2309201015-1.3,2023-09-20T00:00:00.000,WEST FORDHAM STREET ASSOCIATION,Right,10,MPN/100 ml
3,JB2309131035-1.2,2023-09-13T00:00:00.000,WEST FORDHAM STREET ASSOCIATION,Center,74,MPN/100 ml
4,JB2309131100-1.2,2023-09-13T00:00:00.000,MORRIS YACHT AND BEACH CLUB,Center,158,MPN/100 ml
...,...,...,...,...,...,...
26994,050205BH11,2005-05-02T00:00:00.000,CONEY ISLAND WEST 16TH - WEST 27TH,Center,9.90,MPN/100 ml
26995,050205BH05,2005-05-02T00:00:00.000,MIDLAND BEACH,Center,20.00,MPN/100 ml
26996,050205LA09,2005-05-02T00:00:00.000,KINGSBOROUGH COMMUNITY COLLEGE,Right,10.00,MPN/100 ml
26997,050205LA08,2005-05-02T00:00:00.000,KINGSBOROUGH COMMUNITY COLLEGE,Center,31.00,MPN/100 ml


In [36]:
# Show the dtype pandas assigned to each column of the raw frame.
df.dtypes

sample_id              object
sample_date            object
beach_name             object
sample_location        object
enterococci_results    object
units_or_notes         object
dtype: object

In [37]:
# Count missing values per column before any cleaning.
df.isnull().sum()

sample_id                 0
sample_date               0
beach_name                0
sample_location          37
enterococci_results    7449
units_or_notes            0
dtype: int64

In [38]:
# Remove the identifier and unit columns; neither is used further down.
df = df.drop(columns=['sample_id', 'units_or_notes'])
# Convert the timestamp strings into datetime values (YYYY-MM-DD).
df['sample_date'] = pd.to_datetime(df['sample_date'])

# enterococci_results is loaded as object dtype; cast it to float and put the unit in the name.
df['enterococci_results'] = df['enterococci_results'].astype(float)
df = df.rename(columns={'enterococci_results': 'enterococci_results(MPN/100 ml)'})

# Anything other than Center, Left or Right in sample_location is replaced with NaN.
df['sample_location'] = df['sample_location'].apply(lambda x: x if x in ['Center', 'Left', 'Right'] else np.nan)
# Fill missing sample_location values with the most frequent observation ("Center" in this data).
mode = df['sample_location'].mode()[0]
# Assign the filled column back. Calling fillna(..., inplace=True) on a column selection is
# chained assignment: it emits a FutureWarning on recent pandas and leaves df unchanged
# once Copy-on-Write is active.
df['sample_location'] = df['sample_location'].fillna(mode)

# Drop every row that still has a null; in the run shown, only the enterococci_results
# column has any left at this point.
# NOTE(review): the original comment attributes these nulls to readings below the
# detection limit — confirm, since dropping them removes the lowest readings from the target.
df = df.dropna()

In [39]:
# Re-count missing values per column after cleaning; every count should now be zero.
df.isnull().sum()

sample_date                        0
beach_name                         0
sample_location                    0
enterococci_results(MPN/100 ml)    0
dtype: int64

In [40]:
# Display the frame after the cleaning cell.
df

Unnamed: 0,sample_date,beach_name,sample_location,enterococci_results(MPN/100 ml)
0,2023-09-20,WEST FORDHAM STREET ASSOCIATION,Left,9.9
1,2023-09-20,WEST FORDHAM STREET ASSOCIATION,Center,10.0
2,2023-09-20,WEST FORDHAM STREET ASSOCIATION,Right,10.0
3,2023-09-13,WEST FORDHAM STREET ASSOCIATION,Center,74.0
4,2023-09-13,MORRIS YACHT AND BEACH CLUB,Center,158.0
...,...,...,...,...
26994,2005-05-02,CONEY ISLAND WEST 16TH - WEST 27TH,Center,9.9
26995,2005-05-02,MIDLAND BEACH,Center,20.0
26996,2005-05-02,KINGSBOROUGH COMMUNITY COLLEGE,Right,10.0
26997,2005-05-02,KINGSBOROUGH COMMUNITY COLLEGE,Center,31.0


In [41]:
# One-hot encode sample_location. dtype=int keeps the indicator columns as 0/1 on every
# pandas version (pandas >= 2.0 would otherwise produce booleans).
df = pd.concat([df, pd.get_dummies(df['sample_location'], prefix='sample_location', dtype=int)], axis=1)
# Target-encode beach_name against the enterococci measurements.
# NOTE(review): the encoder is fitted on every row, including rows that later end up in
# the test split, so target information leaks into this feature — consider fitting the
# encoder on the training rows only.
df['beach_name_encoded'] = ce.TargetEncoder(cols=['beach_name']).fit_transform(df['beach_name'], df['enterococci_results(MPN/100 ml)'])

In [42]:
# Keep only the modelling columns: the encoded features first, the target last.
df = df[
    [
        'beach_name_encoded',
        'sample_location_Center',
        'sample_location_Left',
        'sample_location_Right',
        'enterococci_results(MPN/100 ml)',
    ]
]

In [43]:
# Final dataset for modeling: the four encoded feature columns followed by the target.
df

Unnamed: 0,beach_name_encoded,sample_location_Center,sample_location_Left,sample_location_Right,enterococci_results(MPN/100 ml)
0,71.840946,0,1,0,9.9
1,71.840946,1,0,0,10.0
2,71.840946,0,0,1,10.0
3,71.840946,1,0,0,74.0
4,118.371635,1,0,0,158.0
...,...,...,...,...,...
26994,35.031897,1,0,0,9.9
26995,39.123370,1,0,0,20.0
26996,62.331704,0,0,1,10.0
26997,62.331704,1,0,0,31.0


In [44]:
# Target vector: the enterococci measurement. Feature matrix: every other column.
y = df['enterococci_results(MPN/100 ml)']
X = df.drop(columns=[y.name])

In [45]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Seed the forest as well: without random_state the bootstrap samples differ on every
# run, so the metrics reported below could not be reproduced.
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train, y_train)

# Predictions for the held-out rows, used by the metrics cell.
y_pred = rf_regressor.predict(X_test)

In [46]:
# Print RMSE, MAE and R^2 for the held-out rows, one per line, rounded to 4 decimals.
print(
    *(
        round(score, 4)
        for score in (
            np.sqrt(mean_squared_error(y_test, y_pred)),
            mean_absolute_error(y_test, y_pred),
            r2_score(y_test, y_pred),
        )
    ),
    sep="\n",
)

450.0569
114.8196
0.0031
