In [None]:
# pip install --upgrade gensim
# pip install sentence-transformers

In [13]:
# Import libraries
import os
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# file path
relative_path_to_file = os.path.join("..", "..", "data", "merged_label.csv")
absolute_path_to_file = os.path.realpath(relative_path_to_file)

# read csv file
data = pd.read_csv(absolute_path_to_file)
data.head()

Unnamed: 0,link,label,text
0,https://www.science.org/doi/10.1126/science.ad...,1,Improved charge extraction in inverted perovsk...
1,https://www.nature.com/articles/s41566-019-0398-2,1,Surface passivation of perovskite film for eff...
2,https://www.nature.com/articles/s41560-020-007...,1,Intact 2D/3D halide junction perovskite solar ...
3,https://www.science.org/doi/10.1126/science.ab...,1,Deterministic fabrication of 3D/2D perovskite ...
4,https://www.nature.com/articles/s41467-021-236...,1,Multication perovskite 2D/3D interfaces form v...


In [8]:
# list to keep record of performance metric
recall_before = []
accuracy_before = []
ber_before = []

recall_after = []
accuracy_after = []
ber_after = []

# Function to calculate Balanced Error Rate
def balanced_error_rate(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    ber = 1 - (sensitivity + specificity) / 2
    return ber

# Model Evaluation Function with Train/Test Accuracy
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name="Model"):
    # Predictions on test and train sets
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    
    # Accuracy scores
    test_accuracy = accuracy_score(y_test, y_pred_test)
    train_accuracy = accuracy_score(y_train, y_pred_train)
    
    # Classification report for recall
    report = classification_report(y_test, y_pred_test, output_dict=True)
    test_recall = report['1']['recall'] 
    
    # Balanced Error Rate for test set
    test_ber = balanced_error_rate(y_test, y_pred_test)
    
    print(f"\nEvaluation Report for {model_name}:\n")
    print("Classification Report (Test Set):\n", classification_report(y_test, y_pred_test))
    print("Confusion Matrix (Test Set):\n", confusion_matrix(y_test, y_pred_test))
    print(f"Test Accuracy: {test_accuracy}")
    print(f"Train Accuracy: {train_accuracy}")
    print(f"Test Recall): {test_recall}")
    print("Balanced Error Rate (Test Set):", test_ber)
    
    # Return metrics as dictionary for further use
    return {
        "model_name": model_name,
        "train_accuracy": train_accuracy,
        "test_accuracy": test_accuracy,
        "test_recall": test_recall,
        "test_ber": test_ber
    }




In [9]:
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np

# Step 1: Train Word2Vec model on your corpus
sentences = [text.split() for text in data['text']]  # Tokenize text
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)

# Step 2: Transform each document into an averaged Word2Vec vector
def document_vector(doc):
    doc = [word for word in doc if word in word2vec_model.wv.index_to_key]
    return np.mean(word2vec_model.wv[doc], axis=0) if len(doc) > 0 else np.zeros(100)

X = np.array([document_vector(text.split()) for text in data['text']])
y = data['label']

# Step 3: Split data and train Random Forest
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Step 4: Evaluate model
predictions = rf_model.predict(X_test)


AttributeError: 'float' object has no attribute 'split'

In [10]:
# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
temp = evaluate_model(lr_model, X_train, X_test, y_train, y_test, model_name="Logistic Regression")
recall_before = recall_before + [temp.get('test_recall')]
accuracy_before = accuracy_before + [temp.get('test_accuracy')]
ber_before = ber_before + [temp.get('test_ber')]

NameError: name 'X_train' is not defined

In [11]:
# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
temp = evaluate_model(nb_model, X_train, X_test, y_train, y_test, model_name="Naive Bayes")
recall_before = recall_before + [temp.get('test_recall')]
accuracy_before = accuracy_before + [temp.get('test_accuracy')]
ber_before = ber_before + [temp.get('test_ber')]

NameError: name 'X_train' is not defined

In [None]:
# Support Vector Machine
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train, y_train)
temp = evaluate_model(svm_model, X_train, X_test, y_train, y_test, model_name="SVM")
recall_before = recall_before + [temp.get('test_recall')]
accuracy_before = accuracy_before + [temp.get('test_accuracy')]
ber_before = ber_before + [temp.get('test_ber')]

In [None]:
# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
temp = evaluate_model(rf_model, X_train, X_test, y_train, y_test, model_name="Random Forest")
recall_before = recall_before + [temp.get('test_recall')]
accuracy_before = accuracy_before + [temp.get('test_accuracy')]
ber_before = ber_before + [temp.get('test_ber')]

In [None]:
# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, random_state=42)
xgb_model.fit(X_train, y_train)
temp = evaluate_model(xgb_model, X_train, X_test, y_train, y_test, model_name="XGBoost")
recall_before = recall_before + [temp.get('test_recall')]
accuracy_before = accuracy_before + [temp.get('test_accuracy')]
ber_before = ber_before + [temp.get('test_ber')]

In [None]:
# from gensim.models import Doc2Vec, TaggedDocument
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Step 1: Prepare data for Doc2Vec
tagged_data = [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(data['text'])]
doc2vec_model = Doc2Vec(tagged_data, vector_size=100, window=5, min_count=2, epochs=20)

# Step 2: Create document vectors for each document in the dataset
X = np.array([doc2vec_model.infer_vector(text.split()) for text in data['text']])
y = data['label']

# Step 3: Split data and train Random Forest
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Step 4: Evaluate model
predictions = rf_model.predict(X_test)


In [None]:
# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
temp = evaluate_model(lr_model, X_train, X_test, y_train, y_test, model_name="Logistic Regression")
recall_before = recall_before + [temp.get('test_recall')]
accuracy_before = accuracy_before + [temp.get('test_accuracy')]
ber_before = ber_before + [temp.get('test_ber')]

In [None]:
# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
temp = evaluate_model(nb_model, X_train, X_test, y_train, y_test, model_name="Naive Bayes")
recall_before = recall_before + [temp.get('test_recall')]
accuracy_before = accuracy_before + [temp.get('test_accuracy')]
ber_before = ber_before + [temp.get('test_ber')]

In [None]:
# Support Vector Machine
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train, y_train)
temp = evaluate_model(svm_model, X_train, X_test, y_train, y_test, model_name="SVM")
recall_before = recall_before + [temp.get('test_recall')]
accuracy_before = accuracy_before + [temp.get('test_accuracy')]
ber_before = ber_before + [temp.get('test_ber')]

In [None]:
# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
temp = evaluate_model(rf_model, X_train, X_test, y_train, y_test, model_name="Random Forest")
recall_before = recall_before + [temp.get('test_recall')]
accuracy_before = accuracy_before + [temp.get('test_accuracy')]
ber_before = ber_before + [temp.get('test_ber')]

In [None]:
# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, random_state=42)
xgb_model.fit(X_train, y_train)
temp = evaluate_model(xgb_model, X_train, X_test, y_train, y_test, model_name="XGBoost")
recall_before = recall_before + [temp.get('test_recall')]
accuracy_before = accuracy_before + [temp.get('test_accuracy')]
ber_before = ber_before + [temp.get('test_ber')]

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Step 1: Load SBERT model and create embeddings for each document
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
X = sbert_model.encode(data['text'].tolist())
y = data['label']

# Step 2: Split data and train Random Forest
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Step 3: Evaluate model
predictions = rf_model.predict(X_test)


In [None]:
# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
temp = evaluate_model(lr_model, X_train, X_test, y_train, y_test, model_name="Logistic Regression")
recall_before = recall_before + [temp.get('test_recall')]
accuracy_before = accuracy_before + [temp.get('test_accuracy')]
ber_before = ber_before + [temp.get('test_ber')]

In [None]:
# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
temp = evaluate_model(nb_model, X_train, X_test, y_train, y_test, model_name="Naive Bayes")
recall_before = recall_before + [temp.get('test_recall')]
accuracy_before = accuracy_before + [temp.get('test_accuracy')]
ber_before = ber_before + [temp.get('test_ber')]

In [None]:
# Support Vector Machine
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train, y_train)
temp = evaluate_model(svm_model, X_train, X_test, y_train, y_test, model_name="SVM")
recall_before = recall_before + [temp.get('test_recall')]
accuracy_before = accuracy_before + [temp.get('test_accuracy')]
ber_before = ber_before + [temp.get('test_ber')]

In [None]:
# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
temp = evaluate_model(rf_model, X_train, X_test, y_train, y_test, model_name="Random Forest")
recall_before = recall_before + [temp.get('test_recall')]
accuracy_before = accuracy_before + [temp.get('test_accuracy')]
ber_before = ber_before + [temp.get('test_ber')]

In [None]:
# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, random_state=42)
xgb_model.fit(X_train, y_train)
temp = evaluate_model(xgb_model, X_train, X_test, y_train, y_test, model_name="XGBoost")
recall_before = recall_before + [temp.get('test_recall')]
accuracy_before = accuracy_before + [temp.get('test_accuracy')]
ber_before = ber_before + [temp.get('test_ber')]

In [None]:
import tensorflow_hub as hub
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np

# Step 1: Load USE model from TensorFlow Hub
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Step 2: Generate embeddings for each document
texts = data['text'].tolist()
embeddings = use_model(texts).numpy()  # Convert tensor to numpy array for compatibility

# Step 3: Prepare labels
y = data['label']

# Step 4: Split data and train Random Forest
X_train, X_test, y_train, y_test = train_test_split(embeddings, y, test_size=0.2, random_state=42)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Step 5: Evaluate the model
predictions = rf_model.predict(X_test)

In [7]:
from transformers import AutoTokenizer, AutoModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import torch
import numpy as np

# Step 1: Load ELECTRA model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")
model = AutoModel.from_pretrained("google/electra-small-discriminator")

# Step 2: Tokenize the text data and get embeddings
texts = data['text'].tolist()
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)

with torch.no_grad():
    outputs = model(**inputs)

# Use mean pooling to get a single embedding per sentence/document
embeddings = outputs.last_hidden_state.mean(dim=1).numpy()  # Shape: (num_documents, embedding_dim)

# Step 3: Prepare labels
y = data['label']

# Step 4: Split data and train Random Forest
X_train, X_test, y_train, y_test = train_test_split(embeddings, y, test_size=0.2, random_state=42)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Step 5: Evaluate the model
predictions = rf_model.predict(X_test)


TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [None]:
evaluate_model(rf_model, X_train, X_test, y_train, y_test, model_name="SVM")

In [None]:
# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
temp = evaluate_model(lr_model, X_train, X_test, y_train, y_test, model_name="Logistic Regression")
recall_before = recall_before + [temp.get('test_recall')]
accuracy_before = accuracy_before + [temp.get('test_accuracy')]
ber_before = ber_before + [temp.get('test_ber')]

In [None]:
# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, random_state=42)
xgb_model.fit(X_train, y_train)
temp = evaluate_model(xgb_model, X_train, X_test, y_train, y_test, model_name="XGBoost")
recall_before = recall_before + [temp.get('test_recall')]
accuracy_before = accuracy_before + [temp.get('test_accuracy')]
ber_before = ber_before + [temp.get('test_ber')]

In [12]:
from sklearn.pipeline import Pipeline