<a href="https://colab.research.google.com/github/hadeelfarash/INTERMEDIATE_MACHINE_LEARNING/blob/main/Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.impute  import SimpleImputer

pd.set_option("display.max_columns",200)
pd.set_option("display.max_info_rows",800)
pd.set_option("display.max_info_columns",800)

from sklearn import set_config
set_config(transform_output= "pandas")

In [20]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

def classification_metrics(y_true, y_pred, label='',
                           output_dict=False, figsize=(8,4),
                           normalize='true', cmap='Blues',
                           colorbar=False):
    # Get the classification report
    report = classification_report(y_true, y_pred)
    ## Print header and report
    header = "-"*70
    print(header, f" Classification Metrics: {label}", header, sep='\n')
    print(report)
    ## CONFUSION MATRICES SUBPLOTS
    fig, axes = plt.subplots(ncols=2, figsize=figsize)
    # create a confusion matrix  of raw counts
    ConfusionMatrixDisplay.from_predictions(y_true, y_pred,
                normalize=None, cmap='gist_gray', colorbar=colorbar,
                ax = axes[0],);
    axes[0].set_title("Raw Counts")
    # create a confusion matrix with the test data
    ConfusionMatrixDisplay.from_predictions(y_true, y_pred,
                normalize=normalize, cmap=cmap, colorbar=colorbar,
                ax = axes[1]);
    axes[1].set_title("Normalized Confusion Matrix")
    # Adjust layout and show figure
    fig.tight_layout()
    plt.show()
    # Return dictionary of classification_report
    if output_dict==True:
        report_dict = classification_report(y_true, y_pred, output_dict=True)
        return report_dict


def evaluate_classification(model, X_train, y_train, X_test, y_test,
                         figsize=(6,4), normalize='true', output_dict = False,
                            cmap_train='Blues', cmap_test="Reds",colorbar=False):
    # Get predictions for training data
    y_train_pred = model.predict(X_train)
    # Call the helper function to obtain regression metrics for training data
    results_train = classification_metrics(y_train, y_train_pred, #verbose = verbose,
                                     output_dict=True, figsize=figsize,
                                         colorbar=colorbar, cmap=cmap_train,
                                     label='Training Data')
    print()
    # Get predictions for test data
    y_test_pred = model.predict(X_test)
    # Call the helper function to obtain regression metrics for test data
    results_test = classification_metrics(y_test, y_test_pred, #verbose = verbose,
                                  output_dict=True,figsize=figsize,
                                         colorbar=colorbar, cmap=cmap_test,
                                    label='Test Data' )
    if output_dict == True:
        # Store results in a dataframe if ouput_frame is True
        results_dict = {'train':results_train,
                    'test': results_test}
        return results_dict

In [21]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
df = pd.read_csv("/content/drive/MyDrive/CodingDojo/05-IntermediateML/Week17/Data/bikeshare_train - bikeshare_train.csv")
df.head()
df_model1=df.copy()

In [23]:
df = df.drop(columns = ['casual','registered'])
df

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0000,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0000,40
2,2011-01-01 2:00:00,1,0,0,1,9.02,13.635,80,0.0000,32
3,2011-01-01 3:00:00,1,0,0,1,9.84,14.395,75,0.0000,13
4,2011-01-01 4:00:00,1,0,0,1,9.84,14.395,75,0.0000,1
...,...,...,...,...,...,...,...,...,...,...
10881,2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,336
10882,2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,241
10883,2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013,168
10884,2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,129


In [24]:
target = 'count'
y = df[target]
X = df.drop(columns = [target])

In [25]:
df['datetime'] = pd.to_datetime(df['datetime'])

# Create new columns
df['Month'] = df['datetime'].dt.month_name()
df['Day_of_Week'] = df['datetime'].dt.day_name()
df['Hour_of_Day'] = df['datetime'].dt.hour

# Convert new columns to 'object' datatype
df[['Month', 'Day_of_Week', 'Hour_of_Day']] = df[['Month', 'Day_of_Week', 'Hour_of_Day']].astype('object')

# Drop redundant columns
df = df.drop(['datetime', 'season'], axis=1)

df

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,Month,Day_of_Week,Hour_of_Day
0,0,0,1,9.84,14.395,81,0.0000,16,January,Saturday,0
1,0,0,1,9.02,13.635,80,0.0000,40,January,Saturday,1
2,0,0,1,9.02,13.635,80,0.0000,32,January,Saturday,2
3,0,0,1,9.84,14.395,75,0.0000,13,January,Saturday,3
4,0,0,1,9.84,14.395,75,0.0000,1,January,Saturday,4
...,...,...,...,...,...,...,...,...,...,...,...
10881,0,1,1,15.58,19.695,50,26.0027,336,December,Wednesday,19
10882,0,1,1,14.76,17.425,57,15.0013,241,December,Wednesday,20
10883,0,1,1,13.94,15.910,61,15.0013,168,December,Wednesday,21
10884,0,1,1,13.94,17.425,61,6.0032,129,December,Wednesday,22


In [26]:
# Assuming 'temp' and 'atemp' are the temperature columns
df['temp'] = df['temp'].apply(lambda celsius: (celsius * 9/5) + 32)
df['atemp'] = df['atemp'].apply(lambda celsius: (celsius * 9/5) + 32)
df

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,Month,Day_of_Week,Hour_of_Day
0,0,0,1,49.712,57.911,81,0.0000,16,January,Saturday,0
1,0,0,1,48.236,56.543,80,0.0000,40,January,Saturday,1
2,0,0,1,48.236,56.543,80,0.0000,32,January,Saturday,2
3,0,0,1,49.712,57.911,75,0.0000,13,January,Saturday,3
4,0,0,1,49.712,57.911,75,0.0000,1,January,Saturday,4
...,...,...,...,...,...,...,...,...,...,...,...
10881,0,1,1,60.044,67.451,50,26.0027,336,December,Wednesday,19
10882,0,1,1,58.568,63.365,57,15.0013,241,December,Wednesday,20
10883,0,1,1,57.092,60.638,61,15.0013,168,December,Wednesday,21
10884,0,1,1,57.092,63.365,61,6.0032,129,December,Wednesday,22


In [27]:
# Calculate the temperature variance
df['temp_variance'] = df['temp'] - df['atemp']

# If the current temperature is warmer than average, set temp_variance to be positive
df['temp_variance'] = df['temp_variance'].apply(lambda x: x if x >= 0 else -x)
df



Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,Month,Day_of_Week,Hour_of_Day,temp_variance
0,0,0,1,49.712,57.911,81,0.0000,16,January,Saturday,0,8.199
1,0,0,1,48.236,56.543,80,0.0000,40,January,Saturday,1,8.307
2,0,0,1,48.236,56.543,80,0.0000,32,January,Saturday,2,8.307
3,0,0,1,49.712,57.911,75,0.0000,13,January,Saturday,3,8.199
4,0,0,1,49.712,57.911,75,0.0000,1,January,Saturday,4,8.199
...,...,...,...,...,...,...,...,...,...,...,...,...
10881,0,1,1,60.044,67.451,50,26.0027,336,December,Wednesday,19,7.407
10882,0,1,1,58.568,63.365,57,15.0013,241,December,Wednesday,20,4.797
10883,0,1,1,57.092,60.638,61,15.0013,168,December,Wednesday,21,3.546
10884,0,1,1,57.092,63.365,61,6.0032,129,December,Wednesday,22,6.273


In [28]:
# Drop the 'atemp' column
df = df.drop('atemp', axis=1)


In [29]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=321)

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Model1 with privios features

features_model1 = ['temp', 'windspeed', 'datetime', 'holiday', 'workingday', 'weather', 'temp','humidity','season']
target = 'count'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_model1[features_model1 ], df_model1[target], test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor
model1 = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model1.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model1.predict(X_test)

# Evaluate the model performance
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error (RMSE): {rmse}')



ValueError: could not convert string to float: '2011-07-06 5:00:00'

In [32]:


# Model2 with New features

features_model2 = ['holiday',	'workingday',	'weather',	'temp',	'humidity',	'windspeed'	,'Month',	'Day_of_Week','Hour_of_Day','temp_variance']
target = 'count'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df[features_model2], df[target], test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor
model2 = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model2.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model2.predict(X_test)

# Evaluate the model performance
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error (RMSE): {rmse}')

ValueError: could not convert string to float: 'July'