In [1]:
import pandas as pd

# Load the datasets
domain1_df = pd.read_json('domain1_train_data.json', lines=True)
domain2_df = pd.read_json('domain2_train_data.json', lines=True)

# Calculate label frequencies for domain 1
label_counts_d1 = domain1_df['label'].value_counts()

# Calculate label frequencies for domain 2
label_counts_d2 = domain2_df['label'].value_counts()

print("Domain 1 label frequencies:\n", label_counts_d1)
print("Domain 2 label frequencies:\n", label_counts_d2)


Domain 1 label frequencies:
 1    2500
0    2500
Name: label, dtype: int64
Domain 2 label frequencies:
 0    11500
1     1500
Name: label, dtype: int64


In [2]:
import joblib
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier, IsolationForest, RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [3]:
# Load datasets
def load_data(file_path):
    return pd.read_json(file_path, lines=True)

df_domain1 = load_data('domain1_train_data.json')
df_domain2 = load_data('domain2_train_data.json')
df_test = load_data('test_data.json')

In [4]:
# Combine texts from both domains and test data for vectorization
all_texts = pd.concat([df_domain1['text'], df_domain2['text'], df_test['text']]).apply(lambda x: ' '.join(map(str, x)))
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_all = vectorizer.fit_transform(all_texts)

# Split the vectorized data back into domain1, domain2, and test sets
X_domain1 = X_all[:len(df_domain1)]
X_domain2 = X_all[len(df_domain1):-len(df_test)]
X_test = X_all[-len(df_test):]

# Save vectorizer for later use
joblib.dump(vectorizer, 'vectorizer.pkl')


['vectorizer.pkl']

In [5]:
# Create domain labels (0 for domain1, 1 for domain2)
y_domains = [0]*len(df_domain1) + [1]*len(df_domain2)
X_domains = vectorizer.transform(pd.concat([df_domain1['text'], df_domain2['text']]).apply(lambda x: ' '.join(map(str, x))))

# Train-test split
X_train_domain, X_val_domain, y_train_domain, y_val_domain = train_test_split(X_domains, y_domains, test_size=0.2, random_state=42)

# Train domain classifier
clf1 = LogisticRegression(random_state=42)
clf2 = RandomForestClassifier(random_state=42)
clf3 = SVC(probability=True, random_state=42)

domain_classifier = VotingClassifier(
    estimators=[('lr', clf1), ('rf', clf2), ('svc', clf3)],
    voting='soft'
)

domain_classifier.fit(X_train_domain, y_train_domain)

# Save domain classifier
joblib.dump(domain_classifier, 'domain_classifier.pkl')

y_pred_domain = domain_classifier.predict(X_val_domain)
print("Domain Classifier Accuracy:", accuracy_score(y_val_domain, y_pred_domain))


Domain Classifier Accuracy: 0.9975


In [6]:
# Vectorizing text data for domain 1
df_domain1['text_str'] = df_domain1['text'].apply(lambda tokens: ' '.join(map(str, tokens)))
tfidf_vectorizer_d1 = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_domain1 = tfidf_vectorizer_d1.fit_transform(df_domain1['text_str'])

# Splitting the data into training and validation sets for domain 1
X_train_d1, X_val_d1, y_train_d1, y_val_d1 = train_test_split(X_domain1, df_domain1['label'], test_size=0.2, random_state=42)

In [7]:
# Define the stacking classifier
base_learners = [
    ('dt', DecisionTreeClassifier(random_state=42)), 
    ('svc', SVC(probability=True, random_state=42)), 
    ('lr', LogisticRegression(random_state=42)), 
    ('knn', KNeighborsClassifier())
]

# Meta-learner
meta_learner = RandomForestClassifier(n_estimators=100, random_state=42)

# Stacking classifier
stacking_cls_d1 = StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5)

# Train the stacking classifier
stacking_cls_d1.fit(X_train_d1, y_train_d1)

# Prediction on the test set
y_pred_d1 = stacking_cls_d1.predict(X_val_d1)

# Save classifiers
joblib.dump(stacking_cls_d1, 'ai_human_classifier_d1.pkl')
joblib.dump(tfidf_vectorizer_d1,'tfidf_vectorizer_d1.pkl')


['tfidf_vectorizer_d1.pkl']

In [8]:
# Evaluation (example for domain 1)
print("Accuracy (Domain 1):", accuracy_score(y_val_d1, y_pred_d1))
print("F1-Score (Domain 1):", f1_score(y_val_d1, y_pred_d1))
print("ROC-AUC (Domain 1):", roc_auc_score(y_val_d1, y_pred_d1))

Accuracy (Domain 1): 0.828
F1-Score (Domain 1): 0.8323586744639376
ROC-AUC (Domain 1): 0.8280000000000001


In [9]:
# Vectorizing text data for domain 2
df_domain2['text_str'] = df_domain2['text'].apply(lambda tokens: ' '.join(map(str, tokens)))
tfidf_vectorizer_d2 = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_domain2 = tfidf_vectorizer_d2.fit_transform(df_domain2['text_str'])
y_domain2 = df_domain2['label']

# Splitting data into training and validation sets
X_train_d2, X_val_d2, y_train_d2, y_val_d2 = train_test_split(X_domain2, y_domain2, test_size=0.2, random_state=42)

In [10]:
# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_d2, y_train_d2)

# Initialize a classifier, for example, RandomForest
stacking_cls_d2 = StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5)

# Train the classifier on the oversampled training data
stacking_cls_d2.fit(X_train_smote, y_train_smote)

# Make predictions on the test data
y_pred_d2 = stacking_cls_d2.predict(X_val_d2)

# Save classifiers
joblib.dump(stacking_cls_d2, 'ai_human_classifier_d2.pkl')
joblib.dump(tfidf_vectorizer_d2,'tfidf_vectorizer_d2.pkl')

['tfidf_vectorizer_d2.pkl']

In [11]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_val_d2, y_pred_d2))
print("Accuracy (Domain 2):", accuracy_score(y_val_d2, y_pred_d2))
print("F1-Score (Domain 2):", f1_score(y_val_d2, y_pred_d2))
print("ROC-AUC (Domain 2):", roc_auc_score(y_val_d2, y_pred_d2))

Accuracy: 0.9084615384615384
Accuracy (Domain 2): 0.9084615384615384
F1-Score (Domain 2): 0.4079601990049751
ROC-AUC (Domain 2): 0.6298668734433801


In [12]:
# Load saved models
domain_classifier = joblib.load('domain_classifier.pkl')
ai_human_classifier_d1 = joblib.load('ai_human_classifier_d1.pkl')
ai_human_classifier_d2 = joblib.load('ai_human_classifier_d2.pkl')



In [13]:
# Correctly load pre-fitted vectorizers
tfidf_vectorizer_domain = joblib.load('vectorizer.pkl')  # Adjust file path as necessary

# Assuming df_test is your test DataFrame and it includes a column 'text' containing tokenized texts
df_test['text_str'] = df_test['text'].apply(lambda tokens: ' '.join(map(str, tokens)))

# Correctly use the pre-fitted vectorizers to transform the test data
X_test_domain = tfidf_vectorizer_domain.transform(df_test['text_str'])



In [14]:
# Predict domain for test data
test_domain_preds = domain_classifier.predict(X_test_domain)

# Splitting the test data based on domain predictions
df_test['predicted_domain'] = test_domain_preds  # Assign domain predictions to a new column in the test DataFrame
df_test_d1 = df_test[df_test['predicted_domain'] == 0]  # Filter rows predicted as Domain 1
df_test_d2 = df_test[df_test['predicted_domain'] == 1]  # Filter rows predicted as Domain 2

# Assuming df_test is your test DataFrame and it includes a column 'text' containing tokenized texts
df_test_d1['text_str'] = df_test_d1['text'].apply(lambda tokens: ' '.join(map(str, tokens)))
df_test_d2['text_str'] = df_test_d2['text'].apply(lambda tokens: ' '.join(map(str, tokens)))

# Assuming df_test already has a 'text_str' column from previous steps
# Transform the test data for AI vs Human classification
X_test_d1 = tfidf_vectorizer_domain.transform(df_test_d1['text_str'])
X_test_d2 = tfidf_vectorizer_domain.transform(df_test_d2['text_str'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_d1['text_str'] = df_test_d1['text'].apply(lambda tokens: ' '.join(map(str, tokens)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_d2['text_str'] = df_test_d2['text'].apply(lambda tokens: ' '.join(map(str, tokens)))


In [15]:
# Classify between AI and Human 
test_ai_human_preds_d1 = ai_human_classifier_d1.predict(X_test_d1)
test_ai_human_preds_d2 = ai_human_classifier_d2.predict(X_test_d2)

# Add prediced labels to CSV
df_test_d1['Predicted_Label'] = test_ai_human_preds_d1
df_test_d2['Predicted_Label'] = test_ai_human_preds_d2

# Combine both predictions
df_final_predictions = pd.concat([df_test_d1, df_test_d2]).sort_index()

# Select only 'id', 'predicted_domain', and 'Predicted_Label' columns
df_final_predictions = df_final_predictions[['id', 'predicted_domain', 'Predicted_Label']]

# Save to CSV
df_final_predictions.to_csv('final_predictions_with_domain.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_d1['Predicted_Label'] = test_ai_human_preds_d1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_d2['Predicted_Label'] = test_ai_human_preds_d2


In [16]:
# Assuming 'final_predictions.csv' is your saved CSV file
csv_path = 'final_predictions_with_domain.csv'
df = pd.read_csv(csv_path)

# Assuming 'Predicted_Label' is the column with your labels
domain_counts = df['predicted_domain'].value_counts()
label_counts = df['Predicted_Label'].value_counts()

print(domain_counts)
print(label_counts)


1    2007
0    1993
Name: predicted_domain, dtype: int64
0    2512
1    1488
Name: Predicted_Label, dtype: int64


In [1]:
import joblib
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import L1L2




In [2]:
# Load datasets
def load_data(file_path):
    return pd.read_json(file_path, lines=True)

df_domain1 = load_data('domain1_train_data.json')
df_domain2 = load_data('domain2_train_data.json')
df_test = load_data('test_data.json')

# Preprocessing function to convert lists of tokens into a single string per text
def preprocess_texts(df):
    df['text_str'] = df['text'].apply(lambda tokens: ' '.join(map(str, tokens)))
    return df

df_domain1 = preprocess_texts(df_domain1)
df_domain2 = preprocess_texts(df_domain2)
df_test = preprocess_texts(df_test)

# Combine texts for domain classification
all_texts = pd.concat([df_domain1['text_str'], df_domain2['text_str']])
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_all = vectorizer.fit_transform(all_texts)
y_domains = np.array([0]*len(df_domain1) + [1]*len(df_domain2))

# Train-test split for domain classification
X_train_domain, X_val_domain, y_train_domain, y_val_domain = train_test_split(X_all, y_domains, test_size=0.2, random_state=42)

# Domain classification model
domain_classifier = LogisticRegression(random_state=42)
domain_classifier.fit(X_train_domain, y_train_domain)
joblib.dump(domain_classifier, 'domain_classifier.pkl')



['domain_classifier.pkl']

In [3]:
# Tokenization and sequence padding for LSTM
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df_domain1['text_str'].tolist() + df_domain2['text_str'].tolist())
sequences_domain1 = tokenizer.texts_to_sequences(df_domain1['text_str'].tolist())
sequences_domain2 = tokenizer.texts_to_sequences(df_domain2['text_str'].tolist())
max_len = max(max([len(seq) for seq in sequences_domain1]), max([len(seq) for seq in sequences_domain2]))
X_seq_domain1 = pad_sequences(sequences_domain1, maxlen=max_len)
X_seq_domain2 = pad_sequences(sequences_domain2, maxlen=max_len)
y_seq_domain1 = df_domain1['label'].values
y_seq_domain2 = df_domain2['label'].values



In [4]:
# Train-test split for domain classification
X_train_domain, X_val_domain, y_train_domain, y_val_domain = train_test_split(X_all, y_domains, test_size=0.2, random_state=42)

# Define the classifiers for the VotingClassifier
clf1 = LogisticRegression(random_state=42)
clf2 = RandomForestClassifier(n_estimators=100, random_state=42)
clf3 = SVC(probability=True, random_state=42)

# Train domain classifier using VotingClassifier
domain_classifier = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('svc', clf3)], voting='soft')
domain_classifier.fit(X_train_domain, y_train_domain)

# Save domain classifier
joblib.dump(domain_classifier, 'domain_classifier.pkl')

# Evaluate domain classifier
y_pred_domain = domain_classifier.predict(X_val_domain)
print(f"Domain Classifier Accuracy: {accuracy_score(y_val_domain, y_pred_domain)}")

Domain Classifier Accuracy: 0.9975


In [5]:
# Define LSTM Model
def define_lstm_model(vocab_size):
    
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=100))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2, kernel_regularizer=L1L2(l1=1e-5, l2=1e-4)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model


In [6]:
# Train LSTM models for each domain
vocab_size = len(tokenizer.word_index) + 1
model_domain1 = define_lstm_model(vocab_size)
model_domain1.fit(X_seq_domain1, y_seq_domain1, epochs=10, batch_size=64, validation_split=0.2)
model_domain1.save('lstm_domain1.h5')



Epoch 1/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 2s/step - accuracy: 0.6087 - loss: 0.6953 - val_accuracy: 0.0540 - val_loss: 0.9801
Epoch 2/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 1s/step - accuracy: 0.7013 - loss: 0.5877 - val_accuracy: 0.4500 - val_loss: 0.8200
Epoch 3/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 1s/step - accuracy: 0.7763 - loss: 0.5016 - val_accuracy: 0.5280 - val_loss: 0.8216
Epoch 4/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 1s/step - accuracy: 0.8317 - loss: 0.4052 - val_accuracy: 0.5520 - val_loss: 0.8837
Epoch 5/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 1s/step - accuracy: 0.8469 - loss: 0.3783 - val_accuracy: 0.5090 - val_loss: 1.0242
Epoch 6/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 1s/step - accuracy: 0.8706 - loss: 0.3346 - val_accuracy: 0.5550 - val_loss: 1.0155
Epoch 7/10
[1m63/63[0m [32m━━━━━━━━━━



In [7]:
# Tokenization and sequence padding
tokenizer.fit_on_texts(df_domain2['text_str'].tolist())
sequences_domain2 = tokenizer.texts_to_sequences(df_domain2['text_str'].tolist())
X_seq_domain2 = pad_sequences(sequences_domain2, maxlen=max_len)
y_seq_domain2 = df_domain2['label'].values

# Find the number of samples in the minority and majority classes
counter = Counter(y_seq_domain2)
minority_class = min(counter, key=counter.get)
majority_class = max(counter, key=counter.get)

# Calculate replication factor for minority class to match majority class count
replication_factor = counter[majority_class] // counter[minority_class]

# Oversample the minority class
minority_indices = np.where(y_seq_domain2 == minority_class)[0]
oversampled_minority_seq = np.repeat(X_seq_domain2[minority_indices], replication_factor, axis=0)
oversampled_minority_labels = np.repeat(y_seq_domain2[minority_indices], replication_factor)

# Combine the oversampled minority class with the original dataset
X_seq_domain2_oversampled = np.vstack((X_seq_domain2, oversampled_minority_seq))
y_seq_domain2_oversampled = np.hstack((y_seq_domain2, oversampled_minority_labels))

# Train-test split
X_train_seq, X_val_seq, y_train_seq, y_val_seq = train_test_split(X_seq_domain2_oversampled, y_seq_domain2_oversampled, test_size=0.2, random_state=42)

# Train the LSTM model on the balanced dataset
vocab_size = len(tokenizer.word_index) + 1
max_length = max_len  # Make sure max_len is correctly defined to cover all sequences
model_domain2 = define_lstm_model(vocab_size)
model_domain2.fit(X_train_seq, y_train_seq, epochs=10, batch_size=64, validation_data=(X_val_seq, y_val_seq))

# Save your trained model
model_domain2.save('lstm_domain2.h5')

Epoch 1/10
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m426s[0m 1s/step - accuracy: 0.7237 - loss: 0.5527 - val_accuracy: 0.9017 - val_loss: 0.2717
Epoch 2/10
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m422s[0m 1s/step - accuracy: 0.9160 - loss: 0.2492 - val_accuracy: 0.9145 - val_loss: 0.2449
Epoch 3/10
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m437s[0m 1s/step - accuracy: 0.9467 - loss: 0.1713 - val_accuracy: 0.9445 - val_loss: 0.1745
Epoch 4/10
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m457s[0m 2s/step - accuracy: 0.9611 - loss: 0.1306 - val_accuracy: 0.9368 - val_loss: 0.2115
Epoch 5/10
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m447s[0m 2s/step - accuracy: 0.9764 - loss: 0.0934 - val_accuracy: 0.9472 - val_loss: 0.1815
Epoch 6/10
[1m294/294[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m443s[0m 2s/step - accuracy: 0.9701 - loss: 0.1069 - val_accuracy: 0.9506 - val_loss: 0.1717
Epoch 7/10
[1m294/294



In [8]:
ai_human_classifier_d1 = load_model('lstm_domain1.h5')
ai_human_classifier_d2 = load_model('lstm_domain2.h5')



In [9]:
# Load test data
df_test['text_str'] = df_test['text'].apply(lambda tokens: ' '.join(map(str, tokens)))

# Transform test data for domain classification
X_test_domain = vectorizer.transform(df_test['text_str'])

# Predict domain for test data
test_domain_preds = domain_classifier.predict(X_test_domain)

# Splitting the test data based on domain predictions
df_test['predicted_domain'] = test_domain_preds
df_test_d1 = df_test[df_test['predicted_domain'] == 0]
df_test_d2 = df_test[df_test['predicted_domain'] == 1]

# Transform text for AI vs. Human classification using LSTMs
sequences_test_d1 = tokenizer.texts_to_sequences(df_test_d1['text_str'].tolist())
sequences_test_d2 = tokenizer.texts_to_sequences(df_test_d2['text_str'].tolist())

max_sequence_length_d1 = ai_human_classifier_d1.input_shape[1]
max_sequence_length_d2 = ai_human_classifier_d2.input_shape[1]

X_test_d1_seq = pad_sequences(sequences_test_d1, maxlen=max_sequence_length_d1)
X_test_d2_seq = pad_sequences(sequences_test_d2, maxlen=max_sequence_length_d2)



In [10]:
# Predict AI vs Human for split test data
test_ai_human_preds_d1 = ai_human_classifier_d1.predict(X_test_d1_seq)
test_ai_human_preds_d2 = ai_human_classifier_d2.predict(X_test_d2_seq)

# Convert predictions to binary labels
test_ai_human_preds_d1 = (test_ai_human_preds_d1.flatten() > 0.5).astype(int)
test_ai_human_preds_d2 = (test_ai_human_preds_d2.flatten() > 0.5).astype(int)

# Combine predictions and add to CSV
df_test_d1['Predicted_Label'] = test_ai_human_preds_d1
df_test_d2['Predicted_Label'] = test_ai_human_preds_d2
df_final_predictions = pd.concat([df_test_d1[['id', 'predicted_domain', 'Predicted_Label']], df_test_d2[['id', 'predicted_domain', 'Predicted_Label']]]).sort_index()

# Save to CSV
df_final_predictions.to_csv('final_predictions_with_domain.csv', index=False)


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 281ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 286ms/step


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_d1['Predicted_Label'] = test_ai_human_preds_d1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_d2['Predicted_Label'] = test_ai_human_preds_d2


In [11]:
# Assuming 'final_predictions.csv' is your saved CSV file
csv_path = 'final_predictions_with_domain.csv'
df = pd.read_csv(csv_path)

# Assuming 'Predicted_Label' is the column with your labels
domain_counts = df['predicted_domain'].value_counts()
label_counts = df['Predicted_Label'].value_counts()

print(domain_counts)
print(label_counts)


1    2007
0    1993
Name: predicted_domain, dtype: int64
0    2377
1    1623
Name: Predicted_Label, dtype: int64
