<a href="https://colab.research.google.com/github/kailliang/Data-Science-with-Machine-Learning-Models/blob/main/Grid_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.svm import SVC
# from xgboost import XGBClassifier
random_state = 2023

# Read data
data = pd.read_csv('/content/drive/MyDrive/data.csv')

date_columns = ['reported_date', 'target_finish', 'actual_finish']
# Parse date-time columns
for col in ['reported_date', 'target_finish', 'actual_finish']:
    data[col] = pd.to_datetime(data[col], dayfirst=True, errors='coerce')

# Drop rows with invalid datetime values
data = data.dropna(subset=date_columns)
# Create "on_time" column
data['on_time'] = (data['actual_finish'] <= data['target_finish']).astype(int)

# Create "job_duration" column
data['job_duration'] = (data['target_finish'] - data['reported_date']).dt.total_seconds() / 3600

# Handle missing data
data = data.dropna()

# Create time features
data['reported_hour'] = data['reported_date'].dt.hour
data['reported_day_of_week'] = data['reported_date'].dt.dayofweek
data['reported_month'] = data['reported_date'].dt.month
data['reported_year'] = data['reported_date'].dt.year

data['target_hour'] = data['target_finish'].dt.hour
data['target_day_of_week'] = data['target_finish'].dt.dayofweek
data['target_month'] = data['target_finish'].dt.month
data['target_year'] = data['target_finish'].dt.year

# Encode categorical features
le = LabelEncoder()
data['location_type'] = le.fit_transform(data['location_type'])
data['raised_within_workhours'] = le.fit_transform(data['raised_within_workhours'])

# Select features and target
features = data[['priority', 'location_type', 'raised_within_workhours', 'reported_hour', 'reported_day_of_week', 
                 'reported_month', 'reported_year', 'target_hour', 'target_day_of_week', 'target_month', 'target_year']]

target = data['on_time']

# Handle Class Imbalance with SMOTE
sm = SMOTE(random_state=random_state)
features_res, target_res = sm.fit_resample(features, target)
# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features_res, target_res, test_size=0.2, random_state=2023)

# Create and train the model

svm_model = SVC(kernel='rbf', gamma = 0.05, C=50)        # 77.36%
# svm_model = SVC(kernel='linear', C=50)

gb_model = GradientBoostingClassifier(random_state=random_state)  # 71.80%

# xgbc_model = XGBClassifier(eval_metric='logloss')       # 77.70%




In [4]:

from sklearn.model_selection import GridSearchCV
# from sklearn.grid_search import GridSearchCV
rf = RandomForestClassifier()    
param_grid = {   'n_estimators': [500,2000],  # no. of trees    
                               'max_depth': [None, 10, 30],     # max deepth of trees   
                               'min_samples_split': [2, 10],    # minimun samples to split a tree    
                               'min_samples_leaf': [2, 4],      # minimun samples in a leaf, too few will be pruned
                               'bootstrap': [True]}             # whether sampled data will be put back
                              
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("最佳参数：", best_params)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
最佳参数： {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 500}


In [5]:

best_params = grid_search.best_params_
rf_model = RandomForestClassifier(**best_params)
model = rf_model

# Define K-fold cross validation
kfold = KFold(n_splits=10, random_state=random_state, shuffle=True)

# Perform K-fold cross validation
results = cross_val_score(model, features_res, target_res, cv=kfold)

# Print cross validation score
print("Accuracy: %.2f%%" % (results.mean()*100))


Accuracy: 78.15%
