## Perovskite Solar Cells 
**Literature Classification using TF-IDF vectorizer**

This notebook documents one of the approaches we've undertaken to classify relevant literature for our project. Using the dataset `merged_label.csv`, which contains articles categorized as "relevant" or "irrelevant" by our mentor, we aim to build a model that effectively differentiates these categories.

The dataframe is as follows:
- `link`: The URL or reference to the article or journal.
- `label`: A binary label indicating whether the article is relevant (1) or irrelevant (0).
- `text`: The article content extracted from PDFs for text-based analysis.

**1. Importing and Preparing Files**

In [14]:
data['link'].nunique()

465

In [9]:
# file path
relative_path_to_file = os.path.join("..", "..", "merged_label.csv")
absolute_path_to_file = os.path.realpath(relative_path_to_file)

# read csv file
data = pd.read_csv(absolute_path_to_file)
data.shape

(466, 3)

In [8]:
# file path
relative_path_to_file = os.path.join("..", "..", "data", "merged_label.csv")
absolute_path_to_file = os.path.realpath(relative_path_to_file)

# read csv file
data = pd.read_csv(absolute_path_to_file)
data.shape

(466, 3)

In [4]:
# Import libraries
import os
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# file path
relative_path_to_file = os.path.join("..", "..", "data", "merged_label.csv")
absolute_path_to_file = os.path.realpath(relative_path_to_file)

# read csv file
data = pd.read_csv(absolute_path_to_file)
data.head()

Unnamed: 0,link,label,text
0,https://www.science.org/doi/10.1126/science.ad...,1,Improved charge extraction in inverted perovsk...
1,https://www.nature.com/articles/s41566-019-0398-2,1,Surface passivation of perovskite film for eff...
2,https://www.nature.com/articles/s41560-020-007...,1,Intact 2D/3D halide junction perovskite solar ...
3,https://www.science.org/doi/10.1126/science.ab...,1,Deterministic fabrication of 3D/2D perovskite ...
4,https://www.nature.com/articles/s41467-021-236...,1,Multication perovskite 2D/3D interfaces form v...


### 2. Data Exploration

In [5]:
data.info()
print("\nMissing Values:\n", data.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466 entries, 0 to 465
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   link    466 non-null    object
 1   label   466 non-null    int64 
 2   text    464 non-null    object
dtypes: int64(1), object(2)
memory usage: 11.0+ KB

Missing Values:
 link     0
label    0
text     2
dtype: int64


### 3. Text Vectorization

In [None]:
pip install transformers

In [6]:
from transformers import AutoTokenizer, AutoModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import torch
import numpy as np

# Step 1: Load ELECTRA model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")
model = AutoModel.from_pretrained("google/electra-small-discriminator")

# Step 2: Tokenize the text data and get embeddings
texts = data['text'].tolist()
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)

with torch.no_grad():
    outputs = model(**inputs)

# Use mean pooling to get a single embedding per sentence/document
embeddings = outputs.last_hidden_state.mean(dim=1).numpy()  # Shape: (num_documents, embedding_dim)

# Step 3: Prepare labels
y = data['label']

# Step 4: Split data and train Random Forest
X_train, X_test, y_train, y_test = train_test_split(embeddings, y, test_size=0.2)

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

### 4. Model Selection / Exploration

In [None]:
models = ['Logistic Regression', 'SVM', 'Random Forest', 'XGBoost']

# list to keep record of performance metric
recall_before = []
accuracy_before = []
ber_before = []

recall_after = []
accuracy_after = []
ber_after = []

# Function to calculate Balanced Error Rate
def balanced_error_rate(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    ber = 1 - (sensitivity + specificity) / 2
    return ber

# Model Evaluation Function with Train/Test Accuracy
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name="Model"):
    # Predictions on test and train sets
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    
    # Accuracy scores
    test_accuracy = accuracy_score(y_test, y_pred_test)
    train_accuracy = accuracy_score(y_train, y_pred_train)
    
    # Classification report for recall
    report = classification_report(y_test, y_pred_test, output_dict=True)
    test_recall = report['1']['recall'] 
    
    # Balanced Error Rate for test set
    test_ber = balanced_error_rate(y_test, y_pred_test)
    
    print(f"\nEvaluation Report for {model_name}:\n")
    print("Classification Report (Test Set):\n", classification_report(y_test, y_pred_test))
    print("Confusion Matrix (Test Set):\n", confusion_matrix(y_test, y_pred_test))
    print(f"Test Accuracy: {test_accuracy}")
    print(f"Train Accuracy: {train_accuracy}")
    print(f"Test Recall): {test_recall}")
    print("Balanced Error Rate (Test Set):", test_ber)
    
    # Return metrics as dictionary for further use
    return {
        "model_name": model_name,
        "train_accuracy": train_accuracy,
        "test_accuracy": test_accuracy,
        "test_recall": test_recall,
        "test_ber": test_ber
    }

In [None]:
# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
temp = evaluate_model(rf_model, X_train, X_test, y_train, y_test, model_name="Random Forest")
recall_before = recall_before + [temp.get('test_recall')]
accuracy_before = accuracy_before + [temp.get('test_accuracy')]
ber_before = ber_before + [temp.get('test_ber')]

In [None]:
# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False)
xgb_model.fit(X_train, y_train)
temp = evaluate_model(xgb_model, X_train, X_test, y_train, y_test, model_name="XGBoost")
recall_before = recall_before + [temp.get('test_recall')]
accuracy_before = accuracy_before + [temp.get('test_accuracy')]
ber_before = ber_before + [temp.get('test_ber')]

In [3]:
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import torch

# Load dataset
relative_path_to_file = os.path.join("..", "..", "data", "merged_label.csv")
absolute_path_to_file = os.path.realpath(relative_path_to_file)
df = pd.read_csv(absolute_path_to_file)

# Ensure your dataset has 'text' and 'label' columns
texts = df['text'].tolist()
labels = df['label'].tolist()

# Step 1: Load ELECTRA Model and Tokenizer for Classification
tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")
model = AutoModelForSequenceClassification.from_pretrained("google/electra-small-discriminator", num_labels=2)

# Step 2: Tokenize the dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Step 3: Split into train and test sets
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
val_dataset = TextDataset(val_texts, val_labels, tokenizer)

# Step 4: Define TrainingArguments and Trainer
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Step 5: Train the model
trainer.train()

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`