In [10]:
# Install imbalanced-learn library
!pip install imbalanced-learn

import pandas as pd
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from collections import Counter

# Measure the time taken for each step
start_time = time.time()

# Load a larger sample of the dataset
file_path = 'retractions.csv'
data = pd.read_csv(file_path, encoding='ISO-8859-1', nrows=5000)  # Load more rows
print("Data loaded in {:.2f} seconds".format(time.time() - start_time))

# Data preprocessing
data = data.drop(columns=['Unnamed: 20'])
data = data.dropna(subset=['RetractionNature'])
data = data.fillna('Unknown')

label_encoder = LabelEncoder()
data['RetractionNature'] = label_encoder.fit_transform(data['RetractionNature'])

selected_features = ['Journal', 'Publisher', 'Country', 'OriginalPaperDate', 'RetractionNature']
data = data[selected_features]
data = pd.get_dummies(data, columns=['Journal', 'Publisher', 'Country', 'OriginalPaperDate'])

print("Data preprocessed in {:.2f} seconds".format(time.time() - start_time))

# Split the data into features and target
X = data.drop(columns=['RetractionNature'])
y = data['RetractionNature']

# Check the distribution of classes
class_distribution = Counter(y)
print("Class distribution before SMOTE:", class_distribution)

# Apply SMOTE to balance the dataset if there are enough samples per class
min_samples_required = 6  # Minimum number of samples required for SMOTE
if all(count >= min_samples_required for count in class_distribution.values()):
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    print("SMOTE applied.")
else:
    X_resampled, y_resampled = X, y
    print("SMOTE not applied due to insufficient samples in one or more classes.")

# Check the distribution of classes after SMOTE
class_distribution_resampled = Counter(y_resampled)
print("Class distribution after SMOTE:", class_distribution_resampled)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

print("Data split into training and testing sets in {:.2f} seconds".format(time.time() - start_time))


rf_model = GridSearchCV(RandomForestClassifier(random_state=42), param_grid={
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None]
}, cv=3, n_jobs=-1)

# Train Random Forest
rf_start_time = time.time()
rf_model.fit(X_train, y_train)

# Scale the data for Logistic Regression and SVM
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize classifiers with hyperparameter tuning
classifiers = {
    "Random Forest": GridSearchCV(RandomForestClassifier(random_state=42), param_grid={
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None]
    }, cv=3, n_jobs=-1),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Support Vector Machine": GridSearchCV(SVC(random_state=42), param_grid={
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    }, cv=3, n_jobs=-1),
    "Gradient Boosting": GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid={
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1]
    }, cv=3, n_jobs=-1)
}

# Train and evaluate each classifier
results = {}
for name, clf in classifiers.items():
    clf_start_time = time.time()
    if name in ["Logistic Regression", "Support Vector Machine"]:
        clf.fit(X_train_scaled, y_train)
        y_pred = clf.predict(X_test_scaled)
    else:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    clf_end_time = time.time()
    clf_total_time = clf_end_time - clf_start_time
    results[name] = {
        "Accuracy": accuracy,
        "Classification Report": report,
        "Training Time (s)": clf_total_time
    }

end_time = time.time()
total_time = end_time - start_time

# Display the results
for name, result in results.items():
    print(f"\nClassifier: {name}")
    print(f"Accuracy: {result['Accuracy']}")
    print(result["Classification Report"])
    print(f"Training Time: {result['Training Time (s)']:.2f} seconds")

print("Total time taken: {:.2f} seconds".format(total_time))



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Data loaded in 0.07 seconds
Data preprocessed in 0.11 seconds
Class distribution before SMOTE: Counter({3: 4829, 1: 114, 0: 45, 2: 12})
SMOTE applied.
Class distribution after SMOTE: Counter({3: 4829, 0: 4829, 1: 4829, 2: 4829})
Data split into training and testing sets in 0.74 seconds

Classifier: Random Forest
Accuracy: 0.9772256728778468
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       969
           1       0.97      0.95      0.96      1009
           2       1.00      1.00      1.00       917
           3       0.98      0.98      0.98       969

    accuracy                           0.98      3864
   macro avg       0.98      0.98      0.98      3864
weighted avg       0.98   

In [11]:
import pickle

with open('random_forest_model.pkl', 'wb') as rf_file:
    pickle.dump(rf_model, rf_file)