In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

In [10]:
class PsychologyResearchAnalyzer:
    def __init__(self, data_path):
        self.df = pd.read_csv(data_path)
        self.tokenizer = None
        self.max_length = None
        self.vocab_size = None
        
    def preprocess_data(self, text_column='Abstract', year_column='Year'):

        # Basic text cleaning
        self.df[text_column] = self.df[text_column].astype(str).str.lower()
        self.df[text_column] = self.df[text_column].str.replace(r'[^a-zA-Z\s]', '', regex=True)
        
        # Tokenization
        self.tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
        self.tokenizer.fit_on_texts(self.df[text_column])
        
        # Set vocab size explicitly
        self.vocab_size = len(self.tokenizer.word_index) + 1
        
        # Convert text to sequences
        sequences = self.tokenizer.texts_to_sequences(self.df[text_column])
        
        # Padding sequences
        self.max_length = max(len(seq) for seq in sequences)
        self.padded_sequences = pad_sequences(sequences, maxlen=self.max_length, padding='post')
        
        # Encode publication years
        label_encoder = LabelEncoder()
        self.df['year_encoded'] = label_encoder.fit_transform(self.df[year_column])
        
        return self.padded_sequences, self.df['year_encoded']
    
    def create_transformer_model(self, embedding_dim=128, num_heads=8, ff_dim=32):
        """
        Create a Transformer-based model for time series prediction
        """
        inputs = tf.keras.Input(shape=(self.max_length,))
        
        # Embedding Layer with explicit vocab_size and input_length
        x = tf.keras.layers.Embedding(
            input_dim=self.vocab_size, 
            output_dim=embedding_dim,
            input_length=self.max_length
        )(inputs)
        
        # Flatten the embedding layer
        x = tf.keras.layers.Flatten()(x)
        
        # Dense layers
        x = tf.keras.layers.Dense(ff_dim, activation='relu')(x)
        x = tf.keras.layers.Dense(ff_dim//2, activation='relu')(x)
        
        outputs = tf.keras.layers.Dense(1, activation='linear')(x)
        
        model = tf.keras.Model(inputs=inputs, outputs=outputs)
        model.compile(optimizer='adam', loss='mse')
        
        return model
    
    def train_model(self, X, y, test_size=0.2, epochs=50):

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )
        
        model = self.create_transformer_model()
        history = model.fit(
            X_train, y_train, 
            validation_data=(X_test, y_test),
            epochs=epochs
        )
        
        return model, history


In [5]:
analyzer = PsychologyResearchAnalyzer('articles_tokenize.csv')
X, y = analyzer.preprocess_data()
model, training_history = analyzer.train_model(X, y)



Epoch 1/50
[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 40ms/step - loss: 15.3282 - val_loss: 7.4309
Epoch 2/50
[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 32ms/step - loss: 4.0388 - val_loss: 6.0377
Epoch 3/50
[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 33ms/step - loss: 0.9565 - val_loss: 5.8227
Epoch 4/50
[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 34ms/step - loss: 0.3907 - val_loss: 6.0750
Epoch 5/50
[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 35ms/step - loss: 0.3182 - val_loss: 5.9556
Epoch 6/50
[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 33ms/step - loss: 0.4395 - val_loss: 5.9854
Epoch 7/50
[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 33ms/step - loss: 0.4729 - val_loss: 5.8825
Epoch 8/50
[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 34ms/step - loss: 0.3857 - val_loss: 5.8020
Epoch 9/50
[1m754/754

In [9]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from sklearn.preprocessing import LabelEncoder

class PsychologyResearchAnalyzer:
    def __init__(self, data_path):

        self.df = pd.read_csv(data_path)
        
        # Load XLM-RoBERTa NER model and tokenizer
        self.ner_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
        self.ner_model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
        self.ner_pipeline = pipeline("ner", model=self.ner_model, tokenizer=self.ner_tokenizer)
        
        # Additional attributes for ML processing
        self.max_length = None
        self.vocab_size = None
    
    def extract_named_entities(self, text_column='Abstract_tokens'):

        # Apply NER to each abstract
        self.df['named_entities'] = self.df[text_column].apply(
            lambda x: self.ner_pipeline(str(x))
        )
        
        # Create separate columns for different entity types
        self.df['persons'] = self.df['named_entities'].apply(
            lambda entities: [e['word'] for e in entities if e['entity'] == 'I-PER']
        )
        self.df['locations'] = self.df['named_entities'].apply(
            lambda entities: [e['word'] for e in entities if e['entity'] == 'I-LOC']
        )
        
        return self.df[['persons', 'locations']]
    
    def preprocess_data(self, text_column='Abstract', year_column='Year'):
        """
        Enhanced preprocessing with XLM-RoBERTa tokenization
        """
        # Clean text
        self.df[text_column] = self.df[text_column].astype(str).str.lower()
        
        # Tokenize using XLM-RoBERTa tokenizer
        encoded_inputs = self.ner_tokenizer(
            self.df[text_column].tolist(), 
            padding=True, 
            truncation=True, 
            return_tensors='tf'
        )
        
        # Set max length and vocab size
        self.max_length = encoded_inputs['input_ids'].shape[1]
        self.vocab_size = self.ner_tokenizer.vocab_size
        
        # Encode publication years
        label_encoder = LabelEncoder()
        self.df['year_encoded'] = label_encoder.fit_transform(self.df[year_column])
        
        return encoded_inputs, self.df['year_encoded']
    
    def create_transformer_model(self, embedding_dim=128, num_heads=8, ff_dim=32):
        """
        Create a Transformer-based model with XLM-RoBERTa inspired architecture
        """
        inputs = tf.keras.Input(shape=(self.max_length,))
        
        # Use pre-trained embedding layer concept from XLM-RoBERTa
        x = tf.keras.layers.Embedding(
            input_dim=self.vocab_size, 
            output_dim=embedding_dim,
            input_length=self.max_length
        )(inputs)
        
        # Multi-head attention inspired layers
        x = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, 
            key_dim=embedding_dim
        )(x, x)
        
        # Flatten and dense layers
        x = tf.keras.layers.Flatten()(x)
        x = tf.keras.layers.Dense(ff_dim, activation='relu')(x)
        x = tf.keras.layers.Dropout(0.3)(x)
        x = tf.keras.layers.Dense(ff_dim//2, activation='relu')(x)
        
        outputs = tf.keras.layers.Dense(1, activation='linear')(x)
        
        model = tf.keras.Model(inputs=inputs, outputs=outputs)
        model.compile(optimizer='adam', loss='mse')
        
        return model
    
    def train_model(self, X, y, test_size=0.2, epochs=50):

        X_train, X_test, y_train, y_test = train_test_split(
            X['input_ids'], y, test_size=test_size, random_state=42
        )
        
        model = self.create_transformer_model()
        history = model.fit(
            X_train, y_train, 
            validation_data=(X_test, y_test),
            epochs=epochs
        )
        
        return model, history

# Example usage
analyzer = PsychologyResearchAnalyzer('articles_tokenize.csv')

# Extract named entities
named_entities = analyzer.extract_named_entities()

# Preprocess data
X, y = analyzer.preprocess_data()

# Train model
model, training_history = analyzer.train_model(X, y)

ImportError: 
AutoModelForTokenClassification requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFAutoModelForTokenClassification".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.


In [6]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.48.1-py3-none-any.whl.metadata (44 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.17.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Downloading huggingface_hub-0.27.1-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.5.2-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.24.0->transformers)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.48.1-py3-none-any.whl (9.7 MB)
   ---------------------------------------- 0.0/9.7 MB ? eta -:--:--
   --------------- ------------------------ 3.7/9.7 MB 18.2 MB/s eta 0:00:01
   ------------------------------------ --- 8.9/9.7 MB 22.2 MB/s eta 0:00

In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import KFold, train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

class PsychologyResearchAnalyzer:
    def __init__(self, data_path):
        self.df = pd.read_csv(data_path)
        
        # Load XLM-RoBERTa NER model and tokenizer
        self.ner_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
        self.ner_model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
        self.ner_pipeline = pipeline("ner", model=self.ner_model, tokenizer=self.ner_tokenizer)
        
        # Additional attributes for ML processing
        self.max_length = None
        self.vocab_size = None
    
    def preprocess_data(self, text_column='Abstract_tokens', year_column='Year'):
        """
        Enhanced preprocessing with XLM-RoBERTa tokenization
        """
        # Clean text
        self.df[text_column] = self.df[text_column].astype(str).str.lower()
        
        # Tokenize using XLM-RoBERTa tokenizer
        encoded_inputs = self.ner_tokenizer(
            self.df[text_column].tolist(), 
            padding=True, 
            truncation=True, 
            return_tensors='tf'
        )
        
        # Set max length and vocab size
        self.max_length = encoded_inputs['input_ids'].shape[1]
        self.vocab_size = self.ner_tokenizer.vocab_size
        
        # Encode publication years
        label_encoder = LabelEncoder()
        self.df['year_encoded'] = label_encoder.fit_transform(self.df[year_column])
        
        return encoded_inputs, self.df['year_encoded']
    
    def create_transformer_model(self, embedding_dim=128, num_heads=8, ff_dim=32):
        """
        Create a Transformer-based model with XLM-RoBERTa inspired architecture
        """
        inputs = tf.keras.Input(shape=(self.max_length,))
        
        x = tf.keras.layers.Embedding(
            input_dim=self.vocab_size, 
            output_dim=embedding_dim,
            input_length=self.max_length
        )(inputs)
        
        x = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, 
            key_dim=embedding_dim
        )(x, x)
        
        x = tf.keras.layers.Flatten()(x)
        x = tf.keras.layers.Dense(ff_dim, activation='relu')(x)
        x = tf.keras.layers.Dropout(0.3)(x)
        x = tf.keras.layers.Dense(ff_dim//2, activation='relu')(x)
        
        outputs = tf.keras.layers.Dense(1, activation='linear')(x)
        
        model = tf.keras.Model(inputs=inputs, outputs=outputs)
        model.compile(optimizer='adam', loss='mse')
        
        return model
    
    def cross_validate(self, X, y, n_splits=5, epochs=50):
        """
        Perform k-fold cross-validation with comprehensive metrics
        """
        # Prepare input data
        input_ids = X['input_ids']
        
        # Initialize cross-validation
        kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        
        # Metrics storage
        cv_scores = {
            'mse': [],
            'r2': []
        }
        
        # Fold-wise model training and evaluation
        for fold, (train_indices, val_indices) in enumerate(kfold.split(input_ids), 1):
            # Split data
            X_train, X_val = input_ids[train_indices], input_ids[val_indices]
            y_train, y_val = y[train_indices], y[val_indices]
            
            # Create and train model
            model = self.create_transformer_model()
            model.fit(
                X_train, y_train, 
                validation_data=(X_val, y_val),
                epochs=epochs,
                verbose=0
            )
            
            # Predict and evaluate
            y_pred = model.predict(X_val).flatten()
            
            # Calculate metrics
            mse = mean_squared_error(y_val, y_pred)
            r2 = r2_score(y_val, y_pred)
            
            # Store metrics
            cv_scores['mse'].append(mse)
            cv_scores['r2'].append(r2)
            
            print(f"Fold {fold}: MSE = {mse:.4f}, R² = {r2:.4f}")
        
        # Compute average cross-validation scores
        avg_mse = np.mean(cv_scores['mse'])
        avg_r2 = np.mean(cv_scores['r2'])
        
        print("\nCross-Validation Results:")
        print(f"Average MSE: {avg_mse:.4f}")
        print(f"Average R²: {avg_r2:.4f}")
        
        return cv_scores

# Example usage
analyzer = PsychologyResearchAnalyzer('articles_tokenize.csv')

# Preprocess data
X, y = analyzer.preprocess_data()

# Perform cross-validation
cv_results = analyzer.cross_validate(X, y, n_splits=5)
cv_results

ImportError: 
AutoModelForTokenClassification requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFAutoModelForTokenClassification".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.
