In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv('data.csv', delimiter=';')

# Data Preprocessing
# Assuming the preprocessing is needed based on the dataset structure
# Handle missing values, encode categorical variables if any

# Encode target variable
label_encoder = LabelEncoder()
df['Target'] = label_encoder.fit_transform(df['Target'])

# Feature Engineering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.drop('Target', axis=1))

pca = PCA(n_components=0.95)  # Retain 95% variance
X_pca = pca.fit_transform(X_scaled)

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X_pca, df['Target'], test_size=0.3, random_state=42)

# Apply SMOTE
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Model Training and Hyperparameter Tuning
# Random Forest
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}
rf = RandomForestClassifier()
rf_grid = GridSearchCV(estimator=rf, param_grid=rf_params, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
rf_grid.fit(X_train_smote, y_train_smote)
rf_best = rf_grid.best_estimator_

# XGBoost
xgb_params = {
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
    'subsample': [0.7, 0.9]
}
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_grid = GridSearchCV(estimator=xgb, param_grid=xgb_params, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
xgb_grid.fit(X_train_smote, y_train_smote)
xgb_best = xgb_grid.best_estimator_

# SVM
svm_params = {
    'C': [1, 10],
    'gamma': [0.001, 0.01]
}
svm = SVC(probability=True)
svm_grid = GridSearchCV(estimator=svm, param_grid=svm_params, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
svm_grid.fit(X_train_smote, y_train_smote)
svm_best = svm_grid.best_estimator_

# Ensemble Method - Voting Classifier
voting_clf = VotingClassifier(
    estimators=[('rf', rf_best), ('xgb', xgb_best), ('svm', svm_best)],
    voting='soft'
)
voting_clf.fit(X_train_smote, y_train_smote)

# Evaluation
y_pred = voting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Ensemble Model Accuracy: %.2f%%" % (accuracy * 100.0))


Fitting 3 folds for each of 48 candidates, totalling 144 fits
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Ensemble Model Accuracy: 73.64%


In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv('data.csv', delimiter=';')

# Basic Data Preprocessing
# Handle missing values (if any) and encode categorical variables

# Encoding the target variable
label_encoder = LabelEncoder()
df['Target'] = label_encoder.fit_transform(df['Target'])

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.drop('Target', axis=1))

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, df['Target'], test_size=0.3, random_state=42)

# Balancing Classes with SMOTE
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Random Forest Model with Basic Hyperparameter Tuning
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2]
}

rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train_smote, y_train_smote)
best_rf = grid_search.best_estimator_

# Evaluation
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Simplified Random Forest Accuracy: %.2f%%" % (accuracy * 100.0))


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Simplified Random Forest Accuracy: 77.03%
