# CleanLab Task

In [1]:
!pip install "cleanlab[datalab]"



In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from cleanlab.classification import CleanLearning

In [3]:
# Fetch the bundled Iris data and wrap the feature matrix in a DataFrame whose
# columns carry the feature names, then attach the class label as `target`.
# NOTE(review): pandas may build this frame on top of `iris.data` without
# copying, in which case later in-place edits to `df` also change `iris.data`
# -- confirm for the pandas version in use.
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df["target"] = iris.target

# Preview the first five rows.
print(df.head(5))

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


In [24]:
# Overwrite the petal length of ten distinct, randomly chosen rows with values
# drawn uniformly from [5, 7). The fixed seed makes both the chosen rows and the
# drawn values repeatable.
np.random.seed(42)
n_anomalies = 10
anomaly_indices = np.random.choice(df.index, size=n_anomalies, replace=False)
replacement_lengths = np.random.uniform(low=5, high=7, size=n_anomalies)
df.loc[anomaly_indices, "petal length (cm)"] = replacement_lengths

# Show which rows were altered.
anomaly_indices

array([ 73,  18, 118,  78,  76,  31,  64, 141,  68,  82])

In [5]:
# Read features and labels from `df`, the frame whose petal lengths were
# altered, instead of from `iris.data`. `iris.data` only reflects those edits
# when pandas built `df` as a view on the array; where the DataFrame
# constructor copies its input, the model would silently see the clean data.
X = df[iris.feature_names].to_numpy()
y = df["target"].to_numpy()

# Use CleanLearning (default classifier) for anomaly detection.
clf = CleanLearning()
clf.fit(X, y)

# Find potential anomalies in labels; the result has one row per example with
# a boolean "is_label_issue" column.
label_issues = clf.find_label_issues(X, y)

# Positional indices of the flagged examples.
anomalies = np.where(label_issues["is_label_issue"])[0]
print(f"Anomalies detected at indices: {anomalies}")
print(f"Suspected anomaly values: {X[anomalies]}")

Anomalies detected at indices: [ 18  31  68  82 106 119]
Suspected anomaly values: [[5.7        3.8        5.82076585 0.3       ]
 [5.4        3.4        5.57950291 0.4       ]
 [6.2        2.2        6.61624076 1.5       ]
 [5.8        2.7        6.26680751 1.2       ]
 [4.9        2.5        4.5        1.7       ]
 [6.         2.2        5.         1.5       ]]


In [6]:
# Species name for each class label.
flower_species = {0.0: "Setosa", 1.0: "Versicolor", 2.0: "Virginica"}

# Select all flagged rows with a single positional lookup rather than building
# and concatenating a one-row DataFrame per index. This also copes with an
# empty `anomalies` array, where pd.concat on an empty list raises ValueError.
suspect_rows = df.iloc[anomalies]
suspect_labels = suspect_rows["target"].to_numpy()

# Feature columns first, with the original row position as a leading column.
df_all_suspects = suspect_rows[iris.feature_names].reset_index(drop=True)
df_all_suspects.insert(0, "Index", anomalies)

# Labels are stored as floats so the table shows 0.0 / 1.0 / 2.0, and both the
# label and the species name come from the same `target` column of `df`.
df_all_suspects["True Label"] = suspect_labels.astype(float)
df_all_suspects["Flower Species"] = [flower_species[label] for label in suspect_labels]

# Print the full table of suspected anomalies
print("\n                                      Suspected Anomalous Data Points")
print("-----------------------------------------------------------------------------------------------------------")

print(df_all_suspects.to_string(index=False))


                                      Suspected Anomalous Data Points
-----------------------------------------------------------------------------------------------------------
 Index  sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  True Label Flower Species
    18                5.7               3.8           5.820766               0.3         0.0         Setosa
    31                5.4               3.4           5.579503               0.4         0.0         Setosa
    68                6.2               2.2           6.616241               1.5         1.0     Versicolor
    82                5.8               2.7           6.266808               1.2         1.0     Versicolor
   106                4.9               2.5           4.500000               1.7         2.0      Virginica
   119                6.0               2.2           5.000000               1.5         2.0      Virginica


In [18]:
# Reload the dataset as pandas objects and combine data and labels in `df`.
# This rebinds `iris` and `df`, so the frame with altered petal lengths is
# replaced by the unmodified data from here on.
iris = load_iris(as_frame=True)
# Copy before adding the label column: assigning into `iris.data` directly
# would also leave `target` inside the Bunch's feature frame.
df = iris.data.copy()
df['target'] = iris.target

In [16]:
# Mean of every feature within each class label.
df.groupby(by="target").mean()

Unnamed: 0_level_0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5.006,3.428,1.462,0.246
1,5.936,2.77,4.26,1.326
2,6.588,2.974,5.552,2.026


In [17]:
# Standard deviation of every feature within each class label.
df.groupby(by="target").std()

Unnamed: 0_level_0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.35249,0.379064,0.173664,0.105386
1,0.516171,0.313798,0.469911,0.197753
2,0.63588,0.322497,0.551895,0.27465
