In [1]:
pip install openai==0.28
import SECRETS

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
from openAI import OpenAI  # Adjust the import statement according to your project structure

# Initializing the OpenAI classifier with APA API key
classifier = OpenAI(SECRETS.API_KEY)

# Loading the sampled data CSV, containing 250 text samples from every label
df = pd.read_csv('sampled_data.csv')

In [7]:
# Function to safely classify text and handle potential errors
def safe_classify(text):
    """Classify ``text`` with the module-level ``classifier``, never raising.

    Calls ``classifier.classify_openai(text)`` and returns its result. If the
    call raises any ``Exception``, the text and the error are printed and the
    string ``"Error"`` is returned instead, so one failing request does not
    abort a whole ``DataFrame.apply`` run.
    """
    try:
        label = classifier.classify_openai(text)
    except Exception as exc:
        # Report the failure and fall back to a sentinel value
        print(f"Error classifying text: {text}. Error: {exc}")
        label = "Error"
    return label

# Apply the classification function to each text in the DataFrame
df['classification'] = df['text'].apply(safe_classify)

In [5]:
# Saving the updated DataFrame back to CSV
df.to_csv('classified_data35.csv', index=False)

In [6]:
df.head()

Unnamed: 0,newlabels,text,unikey,classification
0,0,Baustelle beendet: Die U4 fährt wieder durch V...,6053033342970432378,0
1,0,Das ist der neue Albertina-Chef Ralph Gleis wi...,7940722851744635373,3
2,0,Ein Fußbad mit Thymian Vorbeugend gegen Infekt...,15362306225343275194,3
3,0,Immer mehr künstliche Intelligenz im Alltag:.....,2260202472747209338,3
4,0,gefunden wurde dieser junge Amstaff-Rüde in Gu...,1217519113130961336,0 (keine dieser Kategorien)


In [4]:
# Checking the labels returned by ChatGPT; some replies are not one of the expected labels
label_counts = df['classification'].value_counts()
label_counts

classification
3                                                                                         622
0                                                                                         194
1                                                                                          76
2                                                                                          65
Kommentar                                                                                  26
Dieser Text gehört zur Kategorie 'Kommentar' (3).                                           2
Das ist ein Interview.                                                                      2
1.                                                                                          1
Kommentar.                                                                                  1
Leserbriefe                                                                                 1
Das ist ein Leserbrief (2).                  

In [10]:
# Normalise the raw replies: keep clean labels, flag everything else as 'Invalid'
def clean_classification(value):
    """Return ``value`` as a clean label string, or ``'Invalid'``.

    Parameters
    ----------
    value : Any
        One cell of the 'classification' column. Usually a string, but
        non-string cells (``None``, NaN, ints, floats) are accepted as well
        instead of raising ``AttributeError`` on ``.strip()``.

    Returns
    -------
    str
        The whitespace-stripped label if it is exactly one of
        ``'0'``, ``'1'``, ``'2'``, ``'3'``; otherwise ``'Invalid'``.
    """
    valid_classifications = {'0', '1', '2', '3'}  # Set of valid classification values

    # A missing reply can never be a valid label
    if value is None:
        return 'Invalid'

    # Integral floats such as 3.0 are collapsed to 3 so they match the label
    # set; NaN and infinities are not integral and fall through to 'Invalid'.
    if isinstance(value, float) and value.is_integer():
        value = int(value)

    candidate = str(value).strip()
    return candidate if candidate in valid_classifications else 'Invalid'

# Apply this function to the dataset
df['classification'] = df['classification'].apply(clean_classification)

In [9]:
label_counts = df['classification'].value_counts()
label_counts

classification
3          622
0          194
1           76
2           65
Invalid     43
Name: count, dtype: int64

In [10]:
# Removing all rows where 'classification' is 'Invalid'
df = df[df['classification'] != 'Invalid']

In [11]:
label_counts = df['classification'].value_counts()
label_counts

classification
3    622
0    194
1     76
2     65
Name: count, dtype: int64

In [12]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [13]:
df.to_csv('classified_data35_clean.csv', index=False)

In [12]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import pandas as pd

In [2]:
df = pd.read_csv('classified_data35_clean.csv')

In [3]:
df['newlabels'] = df['newlabels'].astype(str)
df['classification'] = df['classification'].astype(str)

In [4]:
# Calculating accuracy
accuracy = accuracy_score(df['newlabels'], df['classification'])
print(f"Accuracy: {accuracy}")

Accuracy: 0.477533960292581


In [5]:
# Calculating precision, recall, and F1-score
precision, recall, f1, _ = precision_recall_fscore_support(df['newlabels'], df['classification'], average='weighted')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Precision: 0.6195798793429245
Recall: 0.477533960292581
F1 Score: 0.44995161450729937


In [6]:
# Generating confusion matrix
conf_matrix = confusion_matrix(df['newlabels'], df['classification'], labels=['0', '1', '2', '3'])
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[139   2   3  93]
 [ 24  71  12 136]
 [ 25   2  31 177]
 [  6   1  19 216]]


In [6]:
df2 = pd.read_csv('sampled_data.csv')

In [8]:
df2['classification'] = df2['text'].apply(safe_classify)

In [9]:
# Checking the labels returned by ChatGPT; some replies are not one of the expected labels
label_counts2 = df2['classification'].value_counts()
label_counts2

classification
3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             515
2                                                                                                                                                                                                                                                                                                                                                      

In [11]:
# Keep only the replies that map onto a valid label. The explicit .copy() makes
# the filtered frame independent of the original, so the column assignments in
# later cells do not trigger pandas' SettingWithCopyWarning.
df2['classification'] = df2['classification'].apply(clean_classification)
df2 = df2[df2['classification'] != 'Invalid'].copy()
label_counts2 = df2['classification'].value_counts()
label_counts2

classification
3    515
2    243
0    122
1     94
Name: count, dtype: int64

In [13]:
df2['newlabels'] = df2['newlabels'].astype(str)
df2['classification'] = df2['classification'].astype(str)

In [14]:
accuracy = accuracy_score(df2['newlabels'], df2['classification'])
print(f"Accuracy: {accuracy}")

Accuracy: 0.48254620123203285


In [15]:
precision, recall, f1, _ = precision_recall_fscore_support(df2['newlabels'], df2['classification'], average='weighted')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Precision: 0.5934192251183218
Recall: 0.48254620123203285
F1 Score: 0.4766869299536646
