In [1]:
import os
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
import torch # if on apple silicon, must be imported or model will fall to cpu

In [2]:
# Directory holding the input CSVs. Set the TWEET_DATA_DIR environment variable
# to run this notebook on another machine; the default is the original location.
DATA_DIR = os.environ.get(
    'TWEET_DATA_DIR',
    '/Users/Kayla/Library/CloudStorage/OneDrive-ThePennsylvaniaStateUniversity/RA_SPR_2024/data/kayla_data/',
)

# Changing the working directory makes every relative path used later in the
# notebook (input and output CSVs) resolve inside DATA_DIR.
os.chdir(DATA_DIR)
df = pd.read_csv('tweets_final_cleaned_july23.csv')

In [3]:
# Each candidate label is substituted into the "{}" of the hypothesis template
# to build one hypothesis per label for the zero-shot classifier.
candidate_labels = ["takes a position", "does not take a position"]
hypothesis_template = "This tweet {} on a political issue"

# Pick the best available accelerator instead of assuming Apple silicon:
# MPS first (the original behaviour), then CUDA, and CPU as a last resort.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0",
    device=device,
    batch_size=32,
)



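Defining the classification functions. Results are saved to a checkpoint file after every batch (8,000 rows by default) so an interrupted run can resume in case of failure.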

In [4]:
def process_batch(batch, id_col, text_col, hypothesis_template, candidate_labels):
    """Run the zero-shot classifier over one slice of the tweet DataFrame.

    Uses the module-level ``classifier`` pipeline.

    Args:
        batch: DataFrame slice containing the rows to classify.
        id_col: Name of the column in ``batch`` holding the tweet IDs.
        text_col: Name of the column in ``batch`` holding the tweet text.
        hypothesis_template: Template string passed through to the classifier.
        candidate_labels: Labels passed through to the classifier.

    Returns:
        DataFrame with columns ``tweet_id``, ``text``, ``predicted_label`` and
        ``score``, one row per input row. ``predicted_label`` / ``score`` are
        the first entries of each result's ``labels`` / ``scores`` lists
        (presumably the highest-scoring label -- confirm against the
        pipeline documentation).
    """
    tweet_texts = batch[text_col].tolist()
    predictions = classifier(
        tweet_texts,
        candidate_labels=candidate_labels,
        hypothesis_template=hypothesis_template,
        multi_label=False,
    )

    # Keep only the first label/score pair from each prediction.
    top_labels = []
    top_scores = []
    for prediction in predictions:
        top_labels.append(prediction['labels'][0])
        top_scores.append(prediction['scores'][0])

    return pd.DataFrame({
        'tweet_id': batch[id_col],  # passed as a Series, so the result keeps the batch's index
        'text': tweet_texts,
        'predicted_label': top_labels,
        'score': top_scores,
    })

def _completed_batch_files(checkpoint_dir):
    """Return ``{batch index: path}`` for every ``batch_<n>.csv`` file in ``checkpoint_dir``."""
    completed = {}
    for name in os.listdir(checkpoint_dir):
        stem, ext = os.path.splitext(name)
        index = stem[len('batch_'):]
        if ext == '.csv' and stem.startswith('batch_') and index.isdecimal():
            completed[int(index)] = os.path.join(checkpoint_dir, name)
    return completed


def predict_political(df, id_col, text_col, hypothesis_template, candidate_labels, batch_size=8000, checkpoint_dir='checkpoints_large', final_output_path='final_classified_tweets.csv'):
    """Classify every row of ``df`` in batches, checkpointing each batch to disk.

    Each batch is written to ``<checkpoint_dir>/batch_<i>.csv``. On a re-run,
    batches whose checkpoint file already exists are skipped, so an interrupted
    run resumes where it stopped. Checkpoints are keyed by batch index, so a
    resumed run must use the same ``df`` and ``batch_size`` as the run that
    created them.

    Args:
        df: DataFrame of tweets to classify.
        id_col: Name of the tweet-ID column in ``df``.
        text_col: Name of the tweet-text column in ``df``.
        hypothesis_template: Template string forwarded to ``process_batch``.
        candidate_labels: Labels forwarded to ``process_batch``.
        batch_size: Number of rows per checkpointed batch.
        checkpoint_dir: Directory for per-batch CSVs (created if missing).
        final_output_path: Where the combined results CSV is written.

    Returns:
        DataFrame with the concatenated batch results, in the row order of ``df``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    os.makedirs(checkpoint_dir, exist_ok=True)
    num_batches = (len(df) + batch_size - 1) // batch_size  # ceiling division

    # Resume from the exact set of missing batches rather than from a file
    # count, so a gap or a stray file cannot cause a batch to be skipped.
    completed = _completed_batch_files(checkpoint_dir)
    pending = [i for i in range(num_batches) if i not in completed]

    for i in tqdm(pending, desc="Processing Batches"):
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, len(df))
        batch = df.iloc[start_idx:end_idx]

        batch_results = process_batch(batch, id_col, text_col, hypothesis_template, candidate_labels)

        # Write to a temporary name and rename into place: a crash mid-write
        # then never leaves a truncated file that looks like a finished batch.
        batch_filename = os.path.join(checkpoint_dir, f'batch_{i}.csv')
        partial_filename = os.path.join(checkpoint_dir, f'.batch_{i}.csv.partial')
        batch_results.to_csv(partial_filename, index=False)
        os.replace(partial_filename, batch_filename)
        completed[i] = batch_filename

        # Release cached accelerator memory between batches, on whichever
        # backend is actually present.
        if torch.backends.mps.is_available():
            torch.mps.empty_cache()
        elif torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Combine in numeric batch order (a plain string sort would put
    # batch_10.csv before batch_2.csv) and ignore checkpoints beyond this run.
    batch_files = [completed[i] for i in range(num_batches)]
    if batch_files:
        final_result_df = pd.concat((pd.read_csv(f) for f in batch_files), ignore_index=True)
    else:
        # Empty input: return an empty frame with the same columns as process_batch.
        final_result_df = pd.DataFrame(columns=['tweet_id', 'text', 'predicted_label', 'score'])

    final_result_df.to_csv(final_output_path, index=False)

    return final_result_df
    


In [None]:
# Settings for the full classification run.
text_col = 'text'
candidate_labels = ['takes a position', 'does not take a position']
hypothesis_template = 'This tweet {} on a political issue'
# NOTE(review): this path is absolute from the filesystem root, unlike the
# relative file names used for the other CSVs -- confirm it is intended.
checkpoint_dir = '/kayla_data/checkpoints/checkpoints_large_aug'

# Classify every tweet in `df`, checkpointing every 5000 rows.
final_results = predict_political(
    df,
    id_col='tweet_id',
    text_col=text_col,
    hypothesis_template=hypothesis_template,
    candidate_labels=candidate_labels,
    batch_size=5000,
    checkpoint_dir=checkpoint_dir,
)

Processing Batches: 100%|██████████| 144/144 [6:00:25<00:00, 150.18s/it]  


In [None]:
# Redundant copy of the combined results, just in case. index=False matches
# the other saves in this notebook and avoids writing the meaningless row
# index as an extra unnamed column.
final_results.to_csv('deblarge_classified_tweets_aug.csv', index=False)

# Getting the positives

In [None]:

# Reload the classified tweets from disk. Note: this rebinds `df`, which
# earlier in the notebook held the raw input tweets.
df = pd.read_csv('/kayla_data/final_classified_tweets_aug19.csv')

In [31]:
# Optional export of the positive rows (left disabled):
#df.loc[df['predicted_label'] == 'takes a position'].to_csv('classified_deblarge_positives.csv')

# Number of tweets whose predicted label is 'takes a position'.
int(df['predicted_label'].eq('takes a position').sum())

431606