In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load the dataset from london_merged.csv and preview the first rows as read.
df = pd.read_csv("london_merged.csv")
print(df.head())

# Parse the timestamp strings into datetimes, then derive calendar features.
# The assignment order is kept as-is because it fixes the column order of the
# feature matrix built later in this cell.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour'] = df['timestamp'].dt.hour
df['year'] = df['timestamp'].dt.year
df['day_of_week'] = df['timestamp'].dt.dayofweek
df['month'] = df['timestamp'].dt.month

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
# (The bare `df.head()` that used to sit here was dropped: mid-cell expressions
# are not displayed, so it had no effect.)
df.info()

# Preprocessing: report how many values are missing in each column.
missing_values = df.isnull().sum()
print(missing_values)

# Fill gaps in the continuous readings with the column mean. The result is
# assigned back to the frame: `df[col].fillna(..., inplace=True)` acts on an
# intermediate Series, which pandas warns about and which no longer updates
# `df` once Copy-on-Write is enabled.
# NOTE(review): the means are taken over the whole frame, i.e. before the
# train/test split further down, so held-out rows influence the imputed values.
# Acceptable for a quick baseline; move imputation after the split otherwise.
for col in ('t1', 't2', 'hum', 'wind_speed'):
    df[col] = df[col].fillna(df[col].mean())

# weather_code looks like a category code rather than a measurement (TODO
# confirm), so the previous observed value is carried forward instead of
# averaging. `Series.ffill()` replaces the deprecated `fillna(method='ffill')`.
df['weather_code'] = df['weather_code'].ffill()

# Binary target: 1 when a row's `cnt` is strictly above the overall mean of
# `cnt`, otherwise 0.
threshold = df['cnt'].mean()
df['user_type'] = df['cnt'].gt(threshold).astype(int)

# `cnt` defines the label and `timestamp` has already been expanded into
# calendar columns, so neither is kept in the frame.
df = df.drop(['cnt', 'timestamp'], axis=1)

# Separate the label vector from the feature matrix.
y = df['user_type']
X = df.drop(columns='user_type')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Standardise the features with statistics learned from the training rows only.
# NOTE(review): the scaled arrays are not used anywhere in the code visible
# here; the forest below is fitted on the unscaled frames. Confirm whether
# anything else relies on them before removing this step.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a 100-tree random forest on the training rows and predict the held-out rows.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Print the headline metrics for the held-out predictions (same order and
# formatting as before), followed by the per-class breakdown.
print("\nEvaluation:\n")
for template, metric in (
    ("Accuracy=%.4f", accuracy_score),
    ("Precision=%.4f", precision_score),
    ("F1-score=%.4f", f1_score),
    ("Recall=%.4f", recall_score),
):
    print(template % metric(y_test, rf_pred))
print(classification_report(y_test, rf_pred))

             timestamp  cnt   t1   t2    hum  wind_speed  weather_code  \
0  2015-01-04 00:00:00  182  3.0  2.0   93.0         6.0           3.0   
1  2015-01-04 01:00:00  138  3.0  2.5   93.0         5.0           1.0   
2  2015-01-04 02:00:00  134  2.5  2.5   96.5         0.0           1.0   
3  2015-01-04 03:00:00   72  2.0  2.0  100.0         0.0           1.0   
4  2015-01-04 04:00:00   47  2.0  0.0   93.0         6.5           1.0   

   is_holiday  is_weekend  season  
0         0.0         1.0     3.0  
1         0.0         1.0     3.0  
2         0.0         1.0     3.0  
3         0.0         1.0     3.0  
4         0.0         1.0     3.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17414 entries, 0 to 17413
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   timestamp     17414 non-null  datetime64[ns]
 1   cnt           17414 non-null  int64         
 2   t1            17414 non-n

  df['weather_code'].fillna(method= 'ffill', inplace= True)



Evaluation:

Accuracy=0.9489
Precision=0.9253
F1-score=0.9353
Recall=0.9456
              precision    recall  f1-score   support

           0       0.96      0.95      0.96      3183
           1       0.93      0.95      0.94      2042

    accuracy                           0.95      5225
   macro avg       0.94      0.95      0.95      5225
weighted avg       0.95      0.95      0.95      5225

