In [1]:
! pip install spacy scikit-learn
! python -m spacy download en_core_web_sm


2023-11-26 13:57:09.596926: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-26 13:57:09.596999: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-26 13:57:09.597034: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-26 13:57:09.609856: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-sm==3.6.0
  Downloading htt

In [3]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

df = pd.read_csv('news_price.csv')
# Drop rows where 'summary', 'subject', or 'action' are NaN
df = df.dropna(subset=['summary', 'subject', 'action'])

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Preprocessing function using spaCy
def preprocess(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# Apply preprocessing to the 'summary' column
df['processed_summary'] = df['summary'].apply(preprocess)

# Combine 'subject' and 'processed_summary' as features
df['features'] = df['subject'] + " " + df['processed_summary']

# Feature extraction using TF-IDF
tfidf = TfidfVectorizer(max_features=1000)
X = tfidf.fit_transform(df['features'])
y = df['action']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training using RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Model evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

        long       0.60      0.70      0.64       172
       short       0.46      0.35      0.40       125

    accuracy                           0.55       297
   macro avg       0.53      0.52      0.52       297
weighted avg       0.54      0.55      0.54       297



In [4]:
# Combine 'title' and 'summary' as features
df['combined_features'] = df['title'] + " " + df['summary']
df['combined_features'] = df['combined_features'].apply(preprocess)

# Feature extraction using TF-IDF
tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1,2), min_df=2)
X = tfidf.fit_transform(df['combined_features'])
y = df['action']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Trying a different model, e.g., Logistic Regression
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

        long       0.60      0.69      0.64       172
       short       0.47      0.38      0.42       125

    accuracy                           0.56       297
   macro avg       0.53      0.53      0.53       297
weighted avg       0.54      0.56      0.55       297



In [5]:
! pip install transformers torch datasets transformers[torch] accelerate -U


Collecting torch
  Downloading torch-2.1.1-cp310-cp310-manylinux1_x86_64.whl (670.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.10

In [1]:
! pip install seqeval


Collecting seqeval
  Using cached seqeval-1.2.2.tar.gz (43 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=451ea9e71d432ce36096c6c3dbf1b6fec1c192474fd9e399119879bee6a0a57b
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset

# Load your DataFrame
df = pd.read_csv('news_price.csv')
# Drop rows where 'summary', 'subject', or 'action' are NaN
df = df.dropna(subset=['summary', 'subject', 'action'])

# Combine 'title' and 'summary' into a single text column
df['text'] = df['title'] + " " + df['summary']

# Map your labels to integers if they are not already
label_dict = {'long': 0, 'short': 1}
df['label'] = df['action'].map(label_dict)

# Split the data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert DataFrame to Hugging Face's Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Load pre-trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenize the input (text)
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

# Define training arguments
training_args = TrainingArguments(
    output_dir='.',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='.',
    load_best_model_at_end=True,
    save_strategy="steps",  # or "epoch" depending on your preference
    evaluation_strategy="steps",  # should match save_strategy
    logging_steps=50,  # How often to log loss; adjust to your preference
    eval_steps=50,  # How often to run evaluation; adjust to your preference
)


# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Evaluate the model
#trainer.evaluate()


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1184 [00:00<?, ? examples/s]

Map:   0%|          | 0/297 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
50,0.7062,0.684932
100,0.6946,0.690787
150,0.6864,0.694001
200,0.6905,0.676002
250,0.6887,0.688612
300,0.6842,0.682856
350,0.6695,0.67715
400,0.6805,0.679466


TrainOutput(global_step=444, training_loss=0.6866910006548907, metrics={'train_runtime': 24571.3267, 'train_samples_per_second': 0.145, 'train_steps_per_second': 0.018, 'total_flos': 898063809707520.0, 'train_loss': 0.6866910006548907, 'epoch': 3.0})

In [3]:
results = trainer.evaluate()

{'eval_loss': 0.6935545802116394,
 'eval_runtime': 428.3735,
 'eval_samples_per_second': 0.693,
 'eval_steps_per_second': 0.044,
 'epoch': 3.0}

In [None]:
import numpy as np
from transformers import EvalPrediction
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Define the trainer with the compute_metrics function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)


Step,Training Loss,Validation Loss
