In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

from nltk import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jaredfeldman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jaredfeldman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
aiid_data = pd.read_csv("aiid_data_pull_20231213.csv")
aiid_data.head(3)

Unnamed: 0,id,title,description,date,alleged_deployer_of_ai_system,alleged_developer_of_ai_system,alleged_harmed_or_nearly_harmed_parties
0,Incident 1,Google’s YouTube Kids App Presents Inappropria...,YouTube’s content filtering and recommendation...,5/19/15,YouTube,YouTube,Children
1,Incident 2,Warehouse robot ruptures can of bear spray and...,Twenty-four Amazon workers in New Jersey were ...,12/5/18,Amazon,Amazon,Warehouse Workers
2,Incident 3,Crashes with Maneuvering Characteristics Augme...,"A Boeing 737 crashed into the sea, killing 189...",10/27/18,Boeing,Boeing,"Airplane Passengers, Airplane Crew"


In [3]:
aiid_data.dtypes

id                                         object
title                                      object
description                                object
date                                       object
alleged_deployer_of_ai_system              object
alleged_developer_of_ai_system             object
alleged_harmed_or_nearly_harmed_parties    object
dtype: object

In [4]:
aiid_data['alleged_deployer_of_ai_system'].value_counts()

alleged_deployer_of_ai_system
Tesla                                            38
Facebook                                         36
Google                                           28
unknown                                          22
Amazon                                           21
                                                 ..
Pasco Sheriff's Office                            1
unnamed Australian telecommunications company     1
Estée Lauder                                      1
Naver                                             1
The Arena Group, Sports Illustrated               1
Name: count, Length: 351, dtype: int64

In [5]:
aiid_data['alleged_developer_of_ai_system'].value_counts()

alleged_developer_of_ai_system
unknown                                                                  86
Tesla                                                                    42
Facebook                                                                 36
OpenAI                                                                   27
Google                                                                   27
                                                                         ..
Sarawak Information Systems                                               1
Airbnb, Trooly                                                            1
SecurOS                                                                   1
Spanish Secretary of State for Security, Spanish Ministry of Interior     1
Google Bard                                                               1
Name: count, Length: 266, dtype: int64

In [6]:
aiid_data['alleged_harmed_or_nearly_harmed_parties'].value_counts()

alleged_harmed_or_nearly_harmed_parties
Facebook users                                                           7
Tesla drivers                                                            6
Women, Minority Groups                                                   5
unknown                                                                  4
Microsoft                                                                4
                                                                        ..
Facebook content moderators                                              1
Black students                                                           1
XPeng Motors customers                                                   1
Indian voters, Indian social media users, Indian women journalists       1
General public, Readers of Sports Illustrated, Journalistic integrity    1
Name: count, Length: 553, dtype: int64

In [7]:
# convert type of date field
aiid_data['date'] = aiid_data['date'].astype('datetime64[ns]')
aiid_data.dtypes

  aiid_data['date'] = aiid_data['date'].astype('datetime64[ns]')


id                                                 object
title                                              object
description                                        object
date                                       datetime64[ns]
alleged_deployer_of_ai_system                      object
alleged_developer_of_ai_system                     object
alleged_harmed_or_nearly_harmed_parties            object
dtype: object

In [8]:
# add day, month, and year to data
datetimes = pd.to_datetime(aiid_data['date'])

aiid_data['day'] = datetimes.dt.day
aiid_data['month'] = datetimes.dt.month
aiid_data['year'] = datetimes.dt.year

aiid_data.head(2)

Unnamed: 0,id,title,description,date,alleged_deployer_of_ai_system,alleged_developer_of_ai_system,alleged_harmed_or_nearly_harmed_parties,day,month,year
0,Incident 1,Google’s YouTube Kids App Presents Inappropria...,YouTube’s content filtering and recommendation...,2015-05-19,YouTube,YouTube,Children,19,5,2015
1,Incident 2,Warehouse robot ruptures can of bear spray and...,Twenty-four Amazon workers in New Jersey were ...,2018-12-05,Amazon,Amazon,Warehouse Workers,5,12,2018


In [9]:
# top 5 years of incident count
aiid_data['year'].value_counts().head(5)

year
2023    114
2022     93
2020     82
2021     71
2017     48
Name: count, dtype: int64

In [10]:
# explore 522
print(f'**Title**: {aiid_data["title"][522]}')
print(f"**Description**: {aiid_data['description'][522]}")
print(f"**Deployer**: {aiid_data['alleged_deployer_of_ai_system'][522]}")
print(f"**Developer**: {aiid_data['alleged_developer_of_ai_system'][522]}")
print(f"**Date**: {aiid_data['date'][522]}")
print(f"**Victim(s)**: {aiid_data['alleged_harmed_or_nearly_harmed_parties'][522]}")

**Title**: Tesla FSD Misidentified Truck Hauling Traffic Lights as Trail of Traffic Lights
**Description**: A Tesla driver posted on Twitter his Tesla FSD's "glitch," misidentifying deactivated traffic lights being carried by a truck as a constant trail of traffic lights while traveling at high speed on a highway.
**Deployer**: Tesla
**Developer**: Tesla
**Date**: 2021-06-02 00:00:00
**Victim(s)**: Tesla drivers


In [11]:
aiid_data.sort_values(by = 'date', ascending = False).head(2)

Unnamed: 0,id,title,description,date,alleged_deployer_of_ai_system,alleged_developer_of_ai_system,alleged_harmed_or_nearly_harmed_parties,day,month,year
604,Incident 616,Sports Illustrated Is Alleged to Have Used AI ...,"Sports Illustrated, managed by The Arena Group...",2023-11-27,"The Arena Group, Sports Illustrated",unknown,"General public, Readers of Sports Illustrated,...",27,11,2023
601,Incident 613,AI-Generated Images Available through Adobe St...,AI-generated images available through Adobe St...,2023-11-23,Adobe Stock,Various AI image generators,"General public, Journalistic integrity, News s...",23,11,2023


In [12]:
aiid_data['alleged_harmed_or_nearly_harmed_parties'] = aiid_data['alleged_harmed_or_nearly_harmed_parties'].str.replace(', ', ',').str.strip()


In [13]:
# Extract numeric part and convert to integers
aiid_data['id_num'] = aiid_data['id'].str.extract(r'(\d+)').astype(int)

# Move 'id_num' to the first position
aiid_data = aiid_data[['id_num'] + [col for col in aiid_data.columns if col != 'id_num']]

# Sort by id_num
aiid_data.sort_values('id_num', inplace=True)

aiid_data

Unnamed: 0,id_num,id,title,description,date,alleged_deployer_of_ai_system,alleged_developer_of_ai_system,alleged_harmed_or_nearly_harmed_parties,day,month,year
0,1,Incident 1,Google’s YouTube Kids App Presents Inappropria...,YouTube’s content filtering and recommendation...,2015-05-19,YouTube,YouTube,Children,19,5,2015
1,2,Incident 2,Warehouse robot ruptures can of bear spray and...,Twenty-four Amazon workers in New Jersey were ...,2018-12-05,Amazon,Amazon,Warehouse Workers,5,12,2018
2,3,Incident 3,Crashes with Maneuvering Characteristics Augme...,"A Boeing 737 crashed into the sea, killing 189...",2018-10-27,Boeing,Boeing,"Airplane Passengers,Airplane Crew",27,10,2018
3,4,Incident 4,Uber AV Killed Pedestrian in Arizona,An Uber autonomous vehicle (AV) in autonomous ...,2018-03-18,Uber,Uber,"Elaine Herzberg,pedestrians",18,3,2018
4,5,Incident 5,Collection of Robotic Surgery Malfunctions,Study on database reports of robotic surgery m...,2015-07-13,"Hospitals, Doctors",Intuitive Surgical,patients,13,7,2015
...,...,...,...,...,...,...,...,...,...,...,...
600,612,Incident 612,Microsoft AI Poll Allegedly Causes Reputationa...,"An AI-generated poll by Microsoft, displayed a...",2023-10-31,Microsoft,Microsoft,"The Guardian,Family of Lilie James",31,10,2023
601,613,Incident 613,AI-Generated Images Available through Adobe St...,AI-generated images available through Adobe St...,2023-11-23,Adobe Stock,Various AI image generators,"General public,Journalistic integrity,News sou...",23,11,2023
602,614,Incident 614,Google Bard Allegedly Generates False Allegati...,Australian academics reportedly used Google Ba...,2023-11-02,James Guthrie,Google Bard,"James Guthrie,James Guthrie's co-authors,Parli...",2,11,2023
603,615,Incident 615,Colorado Lawyer Filed a Motion Citing Hallucin...,"A Colorado Springs attorney, Zachariah Crabill...",2023-06-13,Zachariah Crabill,"OpenAI, ChatGPT","Legal system,Zachariah Crabill's client,Zachar...",13,6,2023


In [14]:
aiid_data_unique_harmed = pd.DataFrame(columns=aiid_data.columns)

for _, row in aiid_data.iterrows():
    harmed_parties = row['alleged_harmed_or_nearly_harmed_parties'].split(',')
    
    for party in harmed_parties:
        new_row = row.copy()
        new_row['alleged_harmed_or_nearly_harmed_parties'] = party.strip().lower()
        aiid_data_unique_harmed = pd.concat([aiid_data_unique_harmed, pd.DataFrame([new_row])], ignore_index=True)

# Sort the DataFrame by 'id_num' and reset the index
aiid_data_unique_harmed.sort_values('id_num', inplace=True)
aiid_data_unique_harmed.reset_index(drop=True, inplace=True)

aiid_data_unique_harmed

Unnamed: 0,id_num,id,title,description,date,alleged_deployer_of_ai_system,alleged_developer_of_ai_system,alleged_harmed_or_nearly_harmed_parties,day,month,year
0,1,Incident 1,Google’s YouTube Kids App Presents Inappropria...,YouTube’s content filtering and recommendation...,2015-05-19,YouTube,YouTube,children,19,5,2015
1,2,Incident 2,Warehouse robot ruptures can of bear spray and...,Twenty-four Amazon workers in New Jersey were ...,2018-12-05,Amazon,Amazon,warehouse workers,5,12,2018
2,3,Incident 3,Crashes with Maneuvering Characteristics Augme...,"A Boeing 737 crashed into the sea, killing 189...",2018-10-27,Boeing,Boeing,airplane passengers,27,10,2018
3,3,Incident 3,Crashes with Maneuvering Characteristics Augme...,"A Boeing 737 crashed into the sea, killing 189...",2018-10-27,Boeing,Boeing,airplane crew,27,10,2018
4,4,Incident 4,Uber AV Killed Pedestrian in Arizona,An Uber autonomous vehicle (AV) in autonomous ...,2018-03-18,Uber,Uber,elaine herzberg,18,3,2018
...,...,...,...,...,...,...,...,...,...,...,...
1178,615,Incident 615,Colorado Lawyer Filed a Motion Citing Hallucin...,"A Colorado Springs attorney, Zachariah Crabill...",2023-06-13,Zachariah Crabill,"OpenAI, ChatGPT",legal system,13,6,2023
1179,615,Incident 615,Colorado Lawyer Filed a Motion Citing Hallucin...,"A Colorado Springs attorney, Zachariah Crabill...",2023-06-13,Zachariah Crabill,"OpenAI, ChatGPT",zachariah crabill,13,6,2023
1180,616,Incident 616,Sports Illustrated Is Alleged to Have Used AI ...,"Sports Illustrated, managed by The Arena Group...",2023-11-27,"The Arena Group, Sports Illustrated",unknown,readers of sports illustrated,27,11,2023
1181,616,Incident 616,Sports Illustrated Is Alleged to Have Used AI ...,"Sports Illustrated, managed by The Arena Group...",2023-11-27,"The Arena Group, Sports Illustrated",unknown,general public,27,11,2023


In [15]:
### Split the values in the 'alleged_harmed_or_nearly_harmed_parties' column
# and stack them into a new DataFrame
aiid_data_unique_harmed_counts = aiid_data_unique_harmed['alleged_harmed_or_nearly_harmed_parties']

# Rename the resulting Series to 'alleged_harmed_or_nearly_harmed_party'
aiid_data_unique_harmed_counts.name = 'alleged_harmed_or_nearly_harmed_party'

# Count the occurrences of each party
harmed_party_occurrences = aiid_data_unique_harmed_counts.value_counts()

# Display the result
harmed_party_occurrences_df = pd.DataFrame(harmed_party_occurrences).reset_index()
harmed_party_occurrences_df.head(20)

Unnamed: 0,alleged_harmed_or_nearly_harmed_party,count
0,facebook users,28
1,general public,24
2,tesla drivers,20
3,twitter users,15
4,women,12
5,minority groups,12
6,tiktok users,10
7,youtube users,8
8,microsoft,8
9,pedestrians,7


In [16]:
mapping_dict = {
    'user': 'platform_users',
    'users': 'platform_users',
    'children': 'minors',
    'minors': 'minors',
    'teenagers': 'minors',
    'black people': 'minority_groups',
    'jewish people': 'minority_groups',
    'women': 'minority_groups',
    'gender minority groups': 'minority_groups',
    'transgender people': 'minority_groups',
    'black patients': 'minority_groups',
    'communities of color': 'minority_groups',
    'black students': 'minority_groups',
    'neighborhoods of color': 'minority_groups',
    'minority': 'minority_groups'

}

def map_categories(text):
    for term, category in mapping_dict.items():
        if term in text:
            return category
    return text

aiid_data_unique_harmed['alleged_harmed_or_nearly_harmed_generalized'] = aiid_data_unique_harmed['alleged_harmed_or_nearly_harmed_parties'].apply(map_categories)
aiid_data_unique_harmed[['alleged_harmed_or_nearly_harmed_parties','alleged_harmed_or_nearly_harmed_generalized']].head(2)

Unnamed: 0,alleged_harmed_or_nearly_harmed_parties,alleged_harmed_or_nearly_harmed_generalized
0,children,minors
1,warehouse workers,warehouse workers


In [17]:
z = pd.DataFrame(aiid_data_unique_harmed['alleged_harmed_or_nearly_harmed_generalized'].value_counts())
z.iloc[:15]

Unnamed: 0_level_0,count
alleged_harmed_or_nearly_harmed_generalized,Unnamed: 1_level_1
platform_users,214
minority_groups,76
general public,24
tesla drivers,20
minors,17
microsoft,8
pedestrians,7
openai,6
traffic participants,4
uber drivers,4


In [18]:
# Vectorize the text data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(aiid_data_unique_harmed['title'])

# Fit Nearest Neighbors model
nn_model = NearestNeighbors(metric='cosine', algorithm='brute')
nn_model.fit(X)

# Choose a sample row for similarity search (e.g., row with id_num 1)
sample_row_index = 0
sample_row = aiid_data_unique_harmed.iloc[sample_row_index]

# Vectorize the sample row
sample_vector = vectorizer.transform([sample_row['title']])

# Find the 5 most similar rows
_, indices = nn_model.kneighbors(sample_vector, n_neighbors=5)

# Display the most similar rows
similar_rows = aiid_data_unique_harmed.iloc[indices[0]]
z = pd.DataFrame(similar_rows[['id_num', 'title']])
z


Unnamed: 0,id_num,title
0,1,Google’s YouTube Kids App Presents Inappropria...
536,305,YouTube’s Recommendation Algorithm Allegedly P...
537,305,YouTube’s Recommendation Algorithm Allegedly P...
1082,581,Google Ads Allegedly Serving Content on AI-Gen...
1080,581,Google Ads Allegedly Serving Content on AI-Gen...


In [19]:
# explore 581
print(f'**Title**: {aiid_data["title"][581]}')
print(f"**Description**: {aiid_data['description'][581]}")
print(f"**Deployer**: {aiid_data['alleged_deployer_of_ai_system'][581]}")
print(f"**Developer**: {aiid_data['alleged_developer_of_ai_system'][581]}")
print(f"**Date**: {aiid_data['date'][581]}")
print(f"**Victim(s)**: {aiid_data['alleged_harmed_or_nearly_harmed_parties'][581]}")

**Title**: AI Photo Filter Lightens Skin, Changes Eye Color in Student's 'Professional' Image
**Description**: An AI application modified an MIT student's photo to appear 'professional' by lightening her skin and changing her eye color to blue, highlighting the racial bias in the training data of the program.
**Deployer**: Playground AI
**Developer**: Playground AI
**Date**: 2023-07-21 00:00:00
**Victim(s)**: Rona Wang,Racial minorities who may have experienced the same result


In [20]:
# Specify the column to analyze
column_to_analyze = 'description'

# Function to perform text analysis
def analyze_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    
    # Perform sentiment analysis
    sid = SentimentIntensityAnalyzer()
    sentiment_score = sid.polarity_scores(text)['compound']
    
    # Check for specific keywords related to casualties
    keywords = ['kill', 'killing', 'killed', 'fatality', 'fatalities',
                'deaths', 'death', 'casualty', 'casualties',
                'dead', 'deadly', 'lethal']
    casualty_indicator = any(keyword in tokens for keyword in keywords)
    
    return sentiment_score, casualty_indicator

# Print rows with True outcomes
print("Rows with True outcomes for Casualty Indicator:")
for index, row in aiid_data.iterrows():
    text = row[column_to_analyze]
    sentiment_score, casualty_indicator = analyze_text(text)
    
    # Output the results for True outcomes
    if not casualty_indicator:
        print(f"Row {index}: Description - {text}, Sentiment Score - {sentiment_score}")

Rows with True outcomes for Casualty Indicator:
Row 0: Description - YouTube’s content filtering and recommendation algorithms exposed children to disturbing and inappropriate videos., Sentiment Score - -0.5574
Row 1: Description - Twenty-four Amazon workers in New Jersey were hospitalized after a robot punctured a can of bear repellent spray in a warehouse., Sentiment Score - 0.1779
Row 5: Description - Microsoft's Tay, an artificially intelligent chatbot, was released on March 23, 2016 and removed within 24 hours due to multiple racist, sexist, and anit-semitic tweets generated by the bot., Sentiment Score - -0.25
Row 6: Description - Wikipedia bots meant to remove vandalism clash with each other and form feedback loops of repetitve undoing of the other bot's edits., Sentiment Score - 0.0
Row 7: Description - Uber vehicles equipped with technology allowing for autonomous driving running red lights in San Francisco street testing., Sentiment Score - 0.0
Row 8: Description - An algorit

In [21]:
# Counter for True outcomes
true_outcomes_count = 0

# Print rows with True outcomes and count them
print("Rows with True outcomes for Casualty Indicator:")
for index, row in aiid_data.iterrows():
    text = row[column_to_analyze]
    sentiment_score, casualty_indicator = analyze_text(text)
    
    # Output the results for True outcomes
    if casualty_indicator:
        true_outcomes_count += 1
        print(f"Row {index}: Description - {text}, Sentiment Score - {sentiment_score}")

# Print the count of True outcomes
print(f"\nNumber of True outcomes for Casualty Indicator: {true_outcomes_count}")


Rows with True outcomes for Casualty Indicator:
Row 2: Description - A Boeing 737 crashed into the sea, killing 189 people, after faulty sensor data caused an automated manuevering system to repeatedly push the plane's nose downward., Sentiment Score - -0.7717
Row 3: Description - An Uber autonomous vehicle (AV) in autonomous mode struck and killed a pedestrian in Tempe, Arizona., Sentiment Score - -0.7579
Row 4: Description - Study on database reports of robotic surgery malfunctions (8,061), including those ending in injury (1,391) and death (144), between 2000 and 2013., Sentiment Score - -0.7717
Row 23: Description - A Volkswagen plant robot "crushed to death" a worker by pinning him to a metal plate., Sentiment Score - -0.7717
Row 51: Description - A Tesla Model S on autopilot crashed into a white articulated tractor-trailer on Highway US 27A in Williston, Florida, killing the driver., Sentiment Score - -0.6597
Row 68: Description - A factory robot at the SKH Metals Factory in Mane