In [2]:
import os
import json
import pandas as pd
from googleapiclient import discovery
from dotenv import load_dotenv
from tqdm import tqdm
import time

# Load environment variables
load_dotenv()
API_KEY = os.getenv("API_KEY")

# Initialize Perspective API client
client = discovery.build(
    "commentanalyzer",
    "v1alpha1",
    developerKey=API_KEY,
    discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
    static_discovery=False,
)

# Load the training dataset
file_path = './civility_data/test.tsv'
train_data = pd.read_csv(file_path, sep='\t')

# Function to get the toxicity score from Perspective API
def get_toxicity_score(text):
    analyze_request = {
        'comment': { 'text': text },
        'requestedAttributes': {'TOXICITY': {}},
        'languages': ['en']
    }
    try:
        response = client.comments().analyze(body=analyze_request).execute()
        score = response['attributeScores']['TOXICITY']['summaryScore']['value']
        return score
    except Exception as e:
        print(f"Error for text: {text[:30]}... -> {e}")
        return None

# Apply the function to each text in the dataset
# tqdm is used to show a progress bar as it can take some time to process all texts
toxicity_scores = []
for text in tqdm(train_data['text']):
    score = get_toxicity_score(text)
    toxicity_scores.append(score)
    time.sleep(1)  # Pause to avoid rate limiting; adjust based on API restrictions

# Add the scores to the dataset
train_data['toxicity_score'] = toxicity_scores

# Save the updated dataset to a new file
train_data.to_csv('./civility_data/test_with_perspectiveScore.tsv', sep='\t', index=False)
print("Updated dataset saved with toxicity scores.")
train_data

100%|██████████| 6396/6396 [1:57:41<00:00,  1.10s/it]  

Updated dataset saved with toxicity scores.





Unnamed: 0,text,toxicity_score
0,Black Female Democrats Call for Changes in Par...,0.242826
1,After making fun of myself (and @USER a little...,0.162453
2,You know what I hate? Negros.,0.853334
3,:(( I want him to rest 4ever :((( He is such a...,0.114090
4,I wish I can blink my eyes and I'll magically ...,0.288572
...,...,...
6391,Celtics lose to Bobcats,0.083999
6392,Does anyone know if things got easier out ther...,0.249241
6393,"Nothing like girl time with junk food, warm su...",0.304398
6394,@USER This is a guy who weaponized the judicia...,0.743089


In [1]:
import os
import json
import pandas as pd
from googleapiclient import discovery
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Load environment variables
load_dotenv()
API_KEY = os.getenv("API_KEY")

# Initialize Perspective API client
client = discovery.build(
    "commentanalyzer",
    "v1alpha1",
    developerKey=API_KEY,
    discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
    static_discovery=False,
)

# Load the training dataset
file_path = './civility_data/train.tsv'
train_data = pd.read_csv(file_path, sep='\t')

# Function to get the toxicity score from Perspective API
def get_toxicity_score(text):
    analyze_request = {
        'comment': { 'text': text },
        'requestedAttributes': {'TOXICITY': {}}
    }
    try:
        response = client.comments().analyze(body=analyze_request).execute()
        score = response['attributeScores']['TOXICITY']['summaryScore']['value']
        return score
    except Exception as e:
        print(f"Error for text: {text[:30]}... -> {e}")
        return None

# Process texts concurrently
toxicity_scores = []
with ThreadPoolExecutor(max_workers=10) as executor:
    # Submit all texts to the executor for concurrent processing
    future_to_text = {executor.submit(get_toxicity_score, text): text for text in train_data['text']}
    for future in as_completed(future_to_text):
        try:
            toxicity_scores.append(future.result())
        except Exception as exc:
            print(f"Generated an exception: {exc}")
            toxicity_scores.append(None)

# Add the scores to the dataset
train_data['toxicity_score'] = toxicity_scores

# Save the updated dataset to a new file
train_data.to_csv('./civility_data/train_with_toxicity_scores.tsv', sep='\t', index=False)
print("Updated dataset saved with toxicity scores.")


Error for text: @USER @USER @USER @USER Fascis... -> [SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:2559)Error for text: @USER I second this they’re fu... -> [SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:2559)

Error for text: @USER BREAKING: 9th Circuit Or... -> [SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:2559)Error for text: @USER Why cant ISIS blow up th... -> [SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:2559)

Error for text: @USER @USER @USER @USER @USER ... -> [SSL: DECRYPTION_FAILED_OR_BAD_RECORD_MAC] decryption failed or bad record mac (_ssl.c:2559)Error for text: @USER @USER Free milk is a hea... -> [SSL: DECRYPTION_FAILED_OR_BAD_RECORD_MAC] decryption failed or bad record mac (_ssl.c:2559)

Error for text: @USER Holder is worse than Joh... -> [SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:2559)
Error for text: @USER &amp; this statement by ... -> [SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:2559)
Error for te

: 