In [1]:
import pandas as pd
from sodapy import Socrata
import numpy as np
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
def fetch_data(MyAppToken,username,password):
    MyAppToken = MyAppToken
    try:
        client = Socrata('data.cityofnewyork.us', MyAppToken, username=username, password=password)
        results = client.get_all("2xir-kwzz")
        df = pd.DataFrame.from_records(results)
        print("Data fetched successfully from API.")
        return df
    except Exception as e:
        print(str(e))
        print("Unable to fetch from API, loading from pre-existing dataset.")
        try:
            df = pd.read_csv('Beach_Water_Samples_20240210.csv')
            return df
        except FileNotFoundError:
            print("Dataset file not found.")
    return df

In [3]:
df = fetch_data('YOUR-APP-TOKEN','name@example.com', 'FakePassword')

Data fetched successfully from API.


In [4]:
df

Unnamed: 0,sample_id,sample_date,beach_name,sample_location,enterococci_results,units_or_notes
0,JB2309201015-1.3,2023-09-20T00:00:00.000,WEST FORDHAM STREET ASSOCIATION,Right,10,MPN/100 ml
1,JB2309201015-1.1,2023-09-20T00:00:00.000,WEST FORDHAM STREET ASSOCIATION,Left,9.9,MPN/100 ml
2,JB2309201015-1.2,2023-09-20T00:00:00.000,WEST FORDHAM STREET ASSOCIATION,Center,10,MPN/100 ml
3,KB2309130925-1.3,2023-09-13T00:00:00.000,DOUGLASTON HOMEOWNERS ASSOCIATION,Right,41,MPN/100 ml
4,JB2309131035-1.1,2023-09-13T00:00:00.000,WEST FORDHAM STREET ASSOCIATION,Left,97,MPN/100 ml
...,...,...,...,...,...,...
26994,050205BH11,2005-05-02T00:00:00.000,CONEY ISLAND WEST 16TH - WEST 27TH,Center,9.90,MPN/100 ml
26995,050205BH05,2005-05-02T00:00:00.000,MIDLAND BEACH,Center,20.00,MPN/100 ml
26996,050205LA09,2005-05-02T00:00:00.000,KINGSBOROUGH COMMUNITY COLLEGE,Right,10.00,MPN/100 ml
26997,050205LA08,2005-05-02T00:00:00.000,KINGSBOROUGH COMMUNITY COLLEGE,Center,31.00,MPN/100 ml


In [5]:
df.dtypes

sample_id              object
sample_date            object
beach_name             object
sample_location        object
enterococci_results    object
units_or_notes         object
dtype: object

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26999 entries, 0 to 26998
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   sample_id            26999 non-null  object
 1   sample_date          26999 non-null  object
 2   beach_name           26999 non-null  object
 3   sample_location      26962 non-null  object
 4   enterococci_results  19550 non-null  object
 5   units_or_notes       26999 non-null  object
dtypes: object(6)
memory usage: 1.2+ MB


In [7]:
df.isnull().sum()

sample_id                 0
sample_date               0
beach_name                0
sample_location          37
enterococci_results    7449
units_or_notes            0
dtype: int64

In [8]:
#Removing irrelevant columns
df.drop(columns=['sample_id', 'units_or_notes'], inplace=True)

df['sample_date'] = pd.to_datetime(df['sample_date']) #YYY-MM-DD
df['month'] = df['sample_date'].dt.month #Fetching month (1 to 12)
df['day_of_week'] = df['sample_date'].dt.dayofweek # Fetching day of the week (0 to 6)
df.drop(columns=['sample_date'], inplace=True) # Removing the existing date column

#Putting month & day of the week column at the very start of the dataframe
cols = df.columns.tolist()
cols = cols[-2:] + cols[:-2]
df = df[cols] 

df['enterococci_results'] = df['enterococci_results'].astype(float)
df.rename(columns={'enterococci_results': 'enterococci_results(MPN/100 ml)'}, inplace=True)
df['sample_location'] = df['sample_location'].apply(lambda x: x if x in ['Center', 'Left', 'Right'] else np.nan)

df.dropna(inplace=True)

le_location = LabelEncoder()
df['sample_location'] = le_location.fit_transform(df['sample_location']) #Center = 0, Left = 1, Right = 2

In [9]:
df

Unnamed: 0,month,day_of_week,beach_name,sample_location,enterococci_results(MPN/100 ml)
0,9,2,WEST FORDHAM STREET ASSOCIATION,2,10.0
1,9,2,WEST FORDHAM STREET ASSOCIATION,1,9.9
2,9,2,WEST FORDHAM STREET ASSOCIATION,0,10.0
3,9,2,DOUGLASTON HOMEOWNERS ASSOCIATION,2,41.0
4,9,2,WEST FORDHAM STREET ASSOCIATION,1,97.0
...,...,...,...,...,...
26994,5,0,CONEY ISLAND WEST 16TH - WEST 27TH,0,9.9
26995,5,0,MIDLAND BEACH,0,20.0
26996,5,0,KINGSBOROUGH COMMUNITY COLLEGE,2,10.0
26997,5,0,KINGSBOROUGH COMMUNITY COLLEGE,0,31.0


In [10]:
df.dtypes

month                                int64
day_of_week                          int64
beach_name                          object
sample_location                      int32
enterococci_results(MPN/100 ml)    float64
dtype: object

In [11]:
# Initializing the TargetEncoder to encode beach_name column
beach_encoder = ce.TargetEncoder(cols=['beach_name'])
beach_encoder.fit(df['beach_name'], df['enterococci_results(MPN/100 ml)'])
df['beach_name_encoded'] = beach_encoder.transform(df['beach_name'], df['enterococci_results(MPN/100 ml)'])

columns = df.columns.tolist()
new_columns = columns[:-1] 
new_columns.insert(2, columns[-1]) 
df = df[new_columns] 

# Removing the original beach_name column
df = df.drop(columns=['beach_name'])

In [12]:
df

Unnamed: 0,month,day_of_week,beach_name_encoded,sample_location,enterococci_results(MPN/100 ml)
0,9,2,71.620942,2,10.0
1,9,2,71.620942,1,9.9
2,9,2,71.620942,0,10.0
3,9,2,206.551311,2,41.0
4,9,2,71.620942,1,97.0
...,...,...,...,...,...
26994,5,0,35.031897,0,9.9
26995,5,0,39.082385,0,20.0
26996,5,0,62.468216,2,10.0
26997,5,0,62.468216,0,31.0


In [49]:
df

Unnamed: 0,month,day_of_week,beach_name_encoded,sample_location,enterococci_results(MPN/100 ml)
0,9,2,71.853997,1,9.9
1,9,2,71.853997,0,10.0
2,9,2,71.853997,2,10.0
3,9,2,71.853997,0,74.0
4,9,2,118.490012,0,158.0
...,...,...,...,...,...
26994,5,0,35.031897,0,9.9
26995,5,0,39.123370,0,20.0
26996,5,0,62.468216,2,10.0
26997,5,0,62.468216,0,31.0


In [14]:
X = df[['month', 'day_of_week','beach_name_encoded', 'sample_location']]
y = df['enterococci_results(MPN/100 ml)']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

linear_reg_model = LinearRegression()
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)

linear_reg_model.fit(X_train, y_train)
random_forest_model.fit(X_train, y_train)

y_pred_linear_reg = linear_reg_model.predict(X_test)
y_pred_random_forest = random_forest_model.predict(X_test)


In [15]:
r2_linear_reg = r2_score(y_test, y_pred_linear_reg)
print("R-squared (Linear Regression):", r2_linear_reg)

r2_random_forest = r2_score(y_test, y_pred_random_forest)
print("R-squared (Random Forest):", r2_random_forest)

R-squared (Linear Regression): 0.009976559151497533
R-squared (Random Forest): -0.2046619135732617
