In [1]:
import requests
from bs4 import BeautifulSoup
import os, psutil
import re
import numpy as np
import pandas as pd
import datetime
import nltk
from nltk.tokenize import sent_tokenize
import glob
# nltk.download('punkt_tab')

import torch
from transformers import BertTokenizer, BertForSequenceClassification, AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
# from .autonotebook import tqdm as notebook_tqdm

  from .autonotebook import tqdm as notebook_tqdm


# Initialize Model

In [2]:
# Run inference on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Checkpoint directories of the sentiment models to ensemble (one entry per model).
# NOTE(review): absolute cluster path -- adjust when running on another machine.
model_paths = [
    '/cluster/home/hlee37/git/gutenberg/phase3/best-sentiment-bert',
]

In [4]:
# Load every checkpoint once, move it to the selected device and switch it to
# evaluation mode, so the same model objects can be reused for every batch.
model_list = []
for p in model_paths:
    model = AutoModelForSequenceClassification.from_pretrained(p).to(device)
    model.eval()
    model_list.append(model)

# A single tokenizer, taken from the first checkpoint, feeds all models.
# NOTE(review): this assumes every checkpoint shares the same vocabulary -- confirm
# before adding more entries to `model_paths`.
tokenizer = AutoTokenizer.from_pretrained(model_paths[0])

# Define Functions for Operation

In [5]:
def classify_batch(sentences, threshold=0.5):
    """Score a batch of sentences with the model ensemble.

    Each model in ``model_list`` produces softmax probabilities; these are
    averaged across models and the probability of class index 1 is reported
    as the positive probability. The negative probability is its complement.

    Args:
        sentences: List of strings to classify. An empty list yields ``[]``.
        threshold: Currently unused; kept so existing callers keep working.

    Returns:
        One dict per input sentence, in input order, with the keys
        'sentence', 'positive_probability' and 'negative_probability'.
    """
    if not sentences:
        return []

    # Pad to the longest sentence in the batch and cut anything over 512 tokens.
    inputs = tokenizer(
        sentences,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    ).to(device)

    with torch.no_grad():
        # One [B, C] probability tensor per model, stacked to [M, B, C].
        stacked = torch.stack(
            [torch.softmax(member(**inputs).logits, dim=1) for member in model_list]
        )
        ensemble = torch.mean(stacked, dim=0)  # [B, C]
        pos_probs = ensemble[:, 1].cpu().numpy()  # [B]
        neg_probs = 1 - pos_probs

    return [
        {
            'sentence': text,
            'positive_probability': float(p_pos),
            'negative_probability': float(p_neg)
        }
        for text, p_pos, p_neg in zip(sentences, pos_probs, neg_probs)
    ]
 

In [9]:
def mount_diary(file_path):
    """Load one sentence CSV and parse its metadata from the file name.

    The file name, with a trailing ".csv" removed, is split on "_" and read as
    ``<id>_<title>[_<batch>]``. The batch number defaults to "0" when the name
    has only two parts.

    Args:
        file_path: Path to a CSV file that contains a 'sentence' column.

    Returns:
        A tuple ``(all_sentences, id_, title, batch_num, labeled_file)``:
        ``all_sentences`` is the list of non-null values of the 'sentence'
        column and ``labeled_file`` is the full DataFrame as read from disk.
        Because null sentences are dropped only from the list, positions in
        ``all_sentences`` do not necessarily match row positions in
        ``labeled_file``.
        Returns None (after printing a message) when the file name has no
        "_" separator and the metadata cannot be extracted.
    """
    now = datetime.datetime.now()

    labeled_file = pd.read_csv(file_path)
    all_sentences = labeled_file['sentence'].dropna().tolist()

    filename = os.path.basename(file_path)
    # Strip only the trailing extension; str.replace would also remove a
    # ".csv" that happens to appear in the middle of the name.
    stem = filename[:-len(".csv")] if filename.endswith(".csv") else filename
    parts = stem.split("_")

    # At least "<id>_<title>" is required.
    if len(parts) < 2:
        print(f"‚ùå Could not extract metadata: {file_path}")
        return None

    id_, title = parts[0], parts[1]
    batch_num = parts[2] if len(parts) > 2 else "0"

    print(f"‚úÖ Extracted {len(all_sentences)} sentences from {title} (ID: {id_})")
    print(f"‚è±Ô∏è Time taken: {datetime.datetime.now() - now}")

    return all_sentences, id_, title, batch_num, labeled_file


In [12]:
import os
import glob
import datetime
import pandas as pd
from tqdm import tqdm

def main(folder_path, batch_size=32, output_dir="./output"):
    """Classify every CSV in a folder and write one result CSV per input file.

    For each ``*.csv`` in ``folder_path`` the sentences are loaded with
    ``mount_diary``, scored in batches with ``classify_batch`` and saved as
    ``<output_dir>/<id>_<title>_<batch>.csv`` with the columns
    'sentence', 'pos_prob', 'neg_prob' and 'book_name'. Files whose output
    already exists are skipped. Any error while handling a file is printed
    and processing continues with the next file.

    Args:
        folder_path: Directory that contains the input CSV files.
        batch_size: Number of sentences passed to ``classify_batch`` at once.
        output_dir: Directory for the result files; created if missing.
    """
    file_path_list = glob.glob(os.path.join(folder_path, "*.csv"))
    print(f"üìÇ Found {len(file_path_list)} files: {file_path_list}")

    for file_path in tqdm(file_path_list, desc="üìñ Processing Diaries"):
        file_start_time = datetime.datetime.now()
        try:
            mounted = mount_diary(file_path)
            if mounted is None:
                # mount_diary has already printed why the file was rejected.
                continue
            all_sentences, diary_id, diary_title, batch_num, input_df = mounted
            len_sentences = len(all_sentences)

            os.makedirs(output_dir, exist_ok=True)
            output_path = os.path.join(output_dir, f"{diary_id}_{diary_title}_{batch_num}.csv")

            if os.path.exists(output_path):
                print(f"‚è© {diary_id}_{diary_title}_{batch_num} already exists. Skipping.")
                continue

            print(f"üìù Number of Sentences: {len_sentences}")

            # all_sentences has null sentences removed, while input_df does not.
            # Select book names from the same non-null rows so every sentence
            # keeps its own book instead of shifting after a missing value.
            book_names = input_df.loc[input_df['sentence'].notna(), 'book_name'].tolist()

            records = []
            with tqdm(total=len_sentences, desc="üíø Classifying Sentences", mininterval=10) as pbar:
                for start in range(0, len_sentences, batch_size):
                    batch = all_sentences[start:start + batch_size]
                    records.extend(classify_batch(batch))
                    pbar.update(len(batch))

            # Naming the columns explicitly keeps the frame well-formed even
            # when there are no sentences (header-only output).
            df = pd.DataFrame(
                records,
                columns=['sentence', 'positive_probability', 'negative_probability'],
            ).rename(columns={
                'positive_probability': 'pos_prob',
                'negative_probability': 'neg_prob',
            })
            df['book_name'] = book_names
            df.to_csv(output_path, index=False)
            print(f"üèÅ Finished: {diary_id} , {diary_title}, batch {batch_num} | Time: {datetime.datetime.now() - file_start_time}")

        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print(f"üî• Error: {e}, file: {file_path}")


# Run Model

In [14]:
# Classify every CSV in the phase-2 filtered-results folder (default batch size).
main(
    folder_path='/cluster/home/hlee37/git/gutenberg/phase2/1.1/filtered_results'
)

üìÇ Found 1 files: ['/cluster/home/hlee37/git/gutenberg/phase2/1.1/filtered_results/filtered_sentences.csv']


üìñ Processing Diaries:   0%|                                                                                                                                      | 0/1 [00:00<?, ?it/s]

‚úÖ Extracted 3115 sentences from sentences (ID: filtered)
‚è±Ô∏è Time taken: 0:00:00.009335
üìù Number of Sentences: 3115



üíø Classifying Sentences: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3115/3115 [00:04<00:00, 683.15it/s][A
üìñ Processing Diaries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:04<00:00,  4.60s/it]

üèÅ Finished: filtered , sentences, batch 0 | Time: 0:00:04.602858





In [16]:
# Load the classified sentences for analysis.
# NOTE(review): presumably the file written by main() above -- confirm the path.
result = pd.read_csv(
    '/cluster/home/hlee37/git/gutenberg/phase4/output/filtered_sentences_0.csv'
)

In [27]:
# Label each sentence by thresholding its positive probability at 0.5.
# np.where is vectorised and avoids DataFrame.applymap, which is deprecated
# in recent pandas; missing probabilities still fall into 'negative'.
result['pred'] = np.where(result['pos_prob'] >= 0.5, 'positive', 'negative')

In [28]:
# Number of sentences for each (book, predicted label) pair.
result.groupby(['book_name', 'pred'])[['sentence']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,sentence
book_name,pred,Unnamed: 2_level_1
A. C. Gregory,negative,80
A. C. Gregory,positive,28
Allan Cunningham,negative,1
Allan Cunningham,positive,1
Burke & Wills,negative,26
Burke & Wills,positive,70
Capt. Chas. Sturt,negative,28
Capt. Chas. Sturt,positive,28
D. Lindsay,negative,44
D. Lindsay,positive,73


In [30]:
# One row per book, one column per predicted label; cells hold sentence counts
# (0 where a book has no sentences with that label).
pivot_result = result.pivot_table(
    index='book_name',
    columns='pred',
    values='sentence',
    aggfunc='count',
    fill_value=0,
)
pivot_result

pred,negative,positive
book_name,Unnamed: 1_level_1,Unnamed: 2_level_1
A. C. Gregory,80,28
Allan Cunningham,1,1
Burke & Wills,26,70
Capt. Chas. Sturt,28,28
D. Lindsay,44,73
Edmund B. Kennedy,3,3
Edward John Eyre,320,225
Ernest Giles,229,125
Frank & Alexander Jardine,49,17
Hon. D.C. Carnegie,84,55
