In [2]:
import os
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
import torch # if on apple, must be imported or model will fall to cpu

In [None]:
# Switch the working directory to the data folder. Every relative path used
# after this point (the CSV below and later relative output files) resolves
# against this directory, so this must run before any file I/O.
os.chdir('/data/kayla_data/')
# Load the tweets to classify. Later cells pass the 'tweet_id' and 'text'
# columns of this frame to the classifier.
df = pd.read_csv('tweets_final_cleaned_july23.csv')

In [None]:
# Zero-shot setup: each candidate label is substituted into the hypothesis
# template's `{}` placeholder to form the statement scored against a tweet.
candidate_labels = ["takes a position", "does not take a position"]
hypothesis_template = "This tweet {} on a political issue"

# Pick the best available accelerator instead of assuming Apple MPS, so the
# notebook also runs on CUDA machines and falls back to CPU otherwise.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# batch_size here is the pipeline's internal inference batch size; it is
# separate from the checkpoint batch size used by predict_political.
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0", device=device, batch_size = 32)

##### Define the classification functions. Results are saved to a checkpoint file after every batch (8,000 rows by default) so a failed run can resume.

In [None]:
def process_batch(batch, id_col, text_col, hypothesis_template, candidate_labels, classifier_fn=None):
    """Classify one batch of documents and return the top label per document.

    Parameters
    ----------
    batch : pandas.DataFrame
        Rows to classify; must contain ``id_col`` and ``text_col``.
    id_col : str
        Name of the column holding the document identifier.
    text_col : str
        Name of the column holding the text to classify.
    hypothesis_template : str
        Template passed through to the classifier.
    candidate_labels : list of str
        Labels passed through to the classifier.
    classifier_fn : callable, optional
        Callable invoked as ``classifier_fn(texts, candidate_labels=...,
        hypothesis_template=..., multi_label=False)`` and returning, per text,
        a mapping with ``'labels'`` and ``'scores'`` sequences. Defaults to
        the module-level ``classifier``.

    Returns
    -------
    pandas.DataFrame
        Columns ``tweet_id``, ``text``, ``predicted_label`` and ``score``
        (the first entry of each result's labels/scores), indexed like
        ``batch``. An empty batch yields an empty frame without calling the
        classifier.
    """
    texts = batch[text_col].tolist()  # convert documents in batch to a list

    if texts:
        if classifier_fn is None:
            # Fall back to the pipeline created at notebook level.
            classifier_fn = classifier
        results = classifier_fn(texts, candidate_labels=candidate_labels, hypothesis_template=hypothesis_template, multi_label=False)
        # Defensive: tolerate a classifier that hands back a single result
        # mapping instead of a list when given exactly one text.
        if isinstance(results, dict):
            results = [results]
    else:
        # Nothing to classify; skip the model call entirely.
        results = []

    # what we will want for each document
    df_results = pd.DataFrame({
        'tweet_id': batch[id_col].to_numpy(),
        'text': texts,
        'predicted_label': [result['labels'][0] for result in results],
        'score': [result['scores'][0] for result in results]
    }, index=batch.index)

    return df_results

# defining function to apply classification
def predict_political(df, id_col, text_col, hypothesis_template, candidate_labels, batch_size=8000, checkpoint_dir='checkpoints_large', final_output_path='final_classified_tweets.csv'):
    """Classify every row of ``df`` in checkpointed batches and combine the results.

    ``df`` is split into consecutive slices of ``batch_size`` rows. Each slice
    is classified with ``process_batch`` and written to
    ``<checkpoint_dir>/batch_<i>.csv``. Batches whose checkpoint file already
    exists are skipped, so an interrupted run resumes where it stopped. When
    all batches exist they are concatenated in batch order, written to
    ``final_output_path`` and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Documents to classify.
    id_col, text_col : str
        Column names forwarded to ``process_batch``.
    hypothesis_template : str
        Forwarded to ``process_batch``.
    candidate_labels : list of str
        Forwarded to ``process_batch``.
    batch_size : int, default 8000
        Number of rows per checkpoint. Must match the value used for any
        checkpoints already present in ``checkpoint_dir``.
    checkpoint_dir : str, default 'checkpoints_large'
        Directory for per-batch CSV checkpoints; created if missing.
    final_output_path : str, default 'final_classified_tweets.csv'
        Where the combined CSV is written.

    Returns
    -------
    pandas.DataFrame
        All batch results concatenated in original row order.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    os.makedirs(checkpoint_dir, exist_ok=True)
    num_batches = -(-len(df) // batch_size)  # ceiling division

    def batch_path(batch_idx):
        return os.path.join(checkpoint_dir, f'batch_{batch_idx}.csv')

    # in case model fails after hours of running, skip batches already saved.
    # Parse the numeric index out of each checkpoint name rather than counting
    # or string-sorting files: 'batch_10.csv' sorts before 'batch_2.csv', and a
    # file count is wrong if the saved indices are not contiguous.
    prefix, suffix = 'batch_', '.csv'
    completed = set()
    for name in os.listdir(checkpoint_dir):
        if name.startswith(prefix) and name.endswith(suffix):
            idx_text = name[len(prefix):-len(suffix)]
            if idx_text.isdecimal() and str(int(idx_text)) == idx_text:
                completed.add(int(idx_text))
    pending = [i for i in range(num_batches) if i not in completed]

    # torch.mps.empty_cache() is only meaningful when the MPS backend is in use.
    mps_available = torch.backends.mps.is_available()

    for i in tqdm(pending, desc="Processing Batches"):
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, len(df))

        batch = df.iloc[start_idx:end_idx]

        # process batch
        batch_results = process_batch(batch, id_col, text_col, hypothesis_template, candidate_labels)

        # Write to a temporary name first and rename afterwards, so a crash
        # mid-write never leaves a truncated file that looks like a finished
        # checkpoint (the '.tmp' name is ignored by the scan above).
        batch_filename = batch_path(i)
        tmp_filename = batch_filename + '.tmp'
        batch_results.to_csv(tmp_filename, index=False)
        os.replace(tmp_filename, batch_filename)

        if mps_available:
            torch.mps.empty_cache()

    # concatenate all batch files into a single file at the end, in numeric
    # batch order so rows line up with the order of ``df``
    if num_batches == 0:
        # Empty input: pd.concat would raise on zero frames.
        final_result_df = pd.DataFrame(columns=['tweet_id', 'text', 'predicted_label', 'score'])
    else:
        final_result_df = pd.concat((pd.read_csv(batch_path(i)) for i in range(num_batches)), ignore_index=True)

    # save final df to file
    final_result_df.to_csv(final_output_path, index=False)

    return final_result_df
    


In [None]:
# Run configuration for the full classification pass.
text_col = 'text'
candidate_labels = ['takes a position', 'does not take a position']
hypothesis_template = 'This tweet {} on a political issue'
checkpoint_dir = '/checkpoints/checkpoints_large_aug'

# Checkpoint every 5000 rows here instead of the function's default of 8000.
final_results = predict_political(
    df,
    id_col='tweet_id',
    text_col=text_col,
    hypothesis_template=hypothesis_template,
    candidate_labels=candidate_labels,
    batch_size=5000,
    checkpoint_dir=checkpoint_dir,
)

Processing Batches: 100%|██████████| 144/144 [6:00:25<00:00, 150.18s/it]  


This is redundant, but in case of any data loss, save the full dataframe returned by the completed run.

In [6]:
# Backup copy of the in-memory results. index=False matches the other saves in
# this notebook and avoids a stray 'Unnamed: 0' column when the file is re-read.
final_results.to_csv('deblarge_classified_tweets_aug.csv', index=False)

# Getting the positives

In [None]:

# Reload saved classification results for inspection.
# NOTE(review): the leading '/' makes this an absolute path at the filesystem
# root rather than the working directory — confirm that is intended.
# This rebinds `df`, which earlier in the notebook held the raw tweets.
df = pd.read_csv('/final_classified_tweets_aug19.csv')

In [31]:
# Optional export of the positive rows (left disabled):
#df.loc[df['predicted_label'] == 'takes a position'].to_csv('classified_deblarge_positives.csv')

# Number of tweets whose top label is 'takes a position'.
int(df['predicted_label'].eq('takes a position').sum())

431606