In [1]:
import pandas as pd

# Load the datasets
domain1_df = pd.read_json('domain1_train_data.json', lines=True)
domain2_df = pd.read_json('domain2_train_data.json', lines=True)

# Calculate label frequencies for domain 1
label_counts_d1 = domain1_df['label'].value_counts()

# Calculate label frequencies for domain 2
label_counts_d2 = domain2_df['label'].value_counts()

print("Domain 1 label frequencies:\n", label_counts_d1)
print("Domain 2 label frequencies:\n", label_counts_d2)


Domain 1 label frequencies:
 1    2500
0    2500
Name: label, dtype: int64
Domain 2 label frequencies:
 0    11500
1     1500
Name: label, dtype: int64


In [2]:
import joblib
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier, IsolationForest, RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [3]:
# Load datasets
def load_data(file_path):
    return pd.read_json(file_path, lines=True)

df_domain1 = load_data('domain1_train_data.json')
df_domain2 = load_data('domain2_train_data.json')
df_test = load_data('test_data.json')

In [4]:
# Combine texts from both domains and test data for vectorization
all_texts = pd.concat([df_domain1['text'], df_domain2['text'], df_test['text']]).apply(lambda x: ' '.join(map(str, x)))
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_all = vectorizer.fit_transform(all_texts)

# Split the vectorized data back into domain1, domain2, and test sets
X_domain1 = X_all[:len(df_domain1)]
X_domain2 = X_all[len(df_domain1):-len(df_test)]
X_test = X_all[-len(df_test):]

# Save vectorizer for later use
joblib.dump(vectorizer, 'vectorizer.pkl')


['vectorizer.pkl']

In [5]:
# Create domain labels (0 for domain1, 1 for domain2)
y_domains = [0]*len(df_domain1) + [1]*len(df_domain2)
X_domains = vectorizer.transform(pd.concat([df_domain1['text'], df_domain2['text']]).apply(lambda x: ' '.join(map(str, x))))

# Train-test split
X_train_domain, X_val_domain, y_train_domain, y_val_domain = train_test_split(X_domains, y_domains, test_size=0.2, random_state=42)

# Train domain classifier
clf1 = LogisticRegression(random_state=42)
clf2 = RandomForestClassifier(random_state=42)
clf3 = SVC(probability=True, random_state=42)

domain_classifier = VotingClassifier(
    estimators=[('lr', clf1), ('rf', clf2), ('svc', clf3)],
    voting='soft'
)

domain_classifier.fit(X_train_domain, y_train_domain)

# Save domain classifier
joblib.dump(domain_classifier, 'domain_classifier.pkl')

y_pred_domain = domain_classifier.predict(X_val_domain)
print("Domain Classifier Accuracy:", accuracy_score(y_val_domain, y_pred_domain))


Domain Classifier Accuracy: 0.9975


In [6]:
# Vectorizing text data for domain 1
df_domain1['text_str'] = df_domain1['text'].apply(lambda tokens: ' '.join(map(str, tokens)))
tfidf_vectorizer_d1 = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_domain1 = tfidf_vectorizer_d1.fit_transform(df_domain1['text_str'])

# Splitting the data into training and validation sets for domain 1
X_train_d1, X_val_d1, y_train_d1, y_val_d1 = train_test_split(X_domain1, df_domain1['label'], test_size=0.2, random_state=42)

In [8]:
# Define the stacking classifier
base_learners = [
    ('dt', DecisionTreeClassifier(random_state=42)), 
    ('svc', SVC(probability=True, random_state=42)), 
    ('lr', LogisticRegression(random_state=42)), 
    ('knn', KNeighborsClassifier())
]

# Meta-learner
meta_learner = RandomForestClassifier(n_estimators=100, random_state=42)

# Stacking classifier
stacking_cls_d1 = StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5)

# Train the stacking classifier
stacking_cls_d1.fit(X_train_d1, y_train_d1)

# Prediction on the test set
y_pred_d1 = stacking_cls_d1.predict(X_val_d1)

# Save classifiers
joblib.dump(stacking_cls_d1, 'ai_human_classifier_d1.pkl')
joblib.dump(tfidf_vectorizer_d1,'tfidf_vectorizer_d1.pkl')


['tfidf_vectorizer_d1.pkl']

In [9]:
# Evaluation (example for domain 1)
print("Accuracy (Domain 1):", accuracy_score(y_val_d1, y_pred_d1))
print("F1-Score (Domain 1):", f1_score(y_val_d1, y_pred_d1))
print("ROC-AUC (Domain 1):", roc_auc_score(y_val_d1, y_pred_d1))

Accuracy (Domain 1): 0.828
F1-Score (Domain 1): 0.8323586744639376
ROC-AUC (Domain 1): 0.8280000000000001


In [10]:
# Vectorizing text data for domain 2
df_domain2['text_str'] = df_domain2['text'].apply(lambda tokens: ' '.join(map(str, tokens)))
tfidf_vectorizer_d2 = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_domain2 = tfidf_vectorizer_d2.fit_transform(df_domain2['text_str'])
y_domain2 = df_domain2['label']

# Splitting data into training and validation sets
X_train_d2, X_val_d2, y_train_d2, y_val_d2 = train_test_split(X_domain2, y_domain2, test_size=0.2, random_state=42)

In [14]:
# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_d2, y_train_d2)

# Initialize a classifier, for example, RandomForest
stacking_cls_d2 = StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5)

# Train the classifier on the oversampled training data
stacking_cls_d2.fit(X_train_smote, y_train_smote)

# Make predictions on the test data
y_pred_d2 = stacking_cls_d2.predict(X_val_d2)

# Save classifiers
joblib.dump(stacking_cls_d2, 'ai_human_classifier_d2.pkl')
joblib.dump(tfidf_vectorizer_d2,'tfidf_vectorizer_d2.pkl')

['tfidf_vectorizer_d2.pkl']

In [15]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_val_d2, y_pred_d2))
print("Accuracy (Domain 2):", accuracy_score(y_val_d2, y_pred_d2))
print("F1-Score (Domain 2):", f1_score(y_val_d2, y_pred_d2))
print("ROC-AUC (Domain 2):", roc_auc_score(y_val_d2, y_pred_d2))

Accuracy: 0.9084615384615384
Accuracy (Domain 2): 0.9084615384615384
F1-Score (Domain 2): 0.4079601990049751
ROC-AUC (Domain 2): 0.6298668734433801


In [16]:
# Load saved models
domain_classifier = joblib.load('domain_classifier.pkl')
ai_human_classifier_d1 = joblib.load('ai_human_classifier_d1.pkl')
ai_human_classifier_d2 = joblib.load('ai_human_classifier_d2.pkl')



In [17]:
# Correctly load pre-fitted vectorizers
tfidf_vectorizer_domain = joblib.load('vectorizer.pkl')  # Adjust file path as necessary

# Assuming df_test is your test DataFrame and it includes a column 'text' containing tokenized texts
df_test['text_str'] = df_test['text'].apply(lambda tokens: ' '.join(map(str, tokens)))

# Correctly use the pre-fitted vectorizers to transform the test data
X_test_domain = tfidf_vectorizer_domain.transform(df_test['text_str'])



In [18]:
# Predict domain for test data
test_domain_preds = domain_classifier.predict(X_test_domain)

# Splitting the test data based on domain predictions
df_test['predicted_domain'] = test_domain_preds  # Assign domain predictions to a new column in the test DataFrame
df_test_d1 = df_test[df_test['predicted_domain'] == 0]  # Filter rows predicted as Domain 1
df_test_d2 = df_test[df_test['predicted_domain'] == 1]  # Filter rows predicted as Domain 2

# Assuming df_test is your test DataFrame and it includes a column 'text' containing tokenized texts
df_test_d1['text_str'] = df_test_d1['text'].apply(lambda tokens: ' '.join(map(str, tokens)))
df_test_d2['text_str'] = df_test_d2['text'].apply(lambda tokens: ' '.join(map(str, tokens)))

# Assuming df_test already has a 'text_str' column from previous steps
# Transform the test data for AI vs Human classification
X_test_d1 = tfidf_vectorizer_domain.transform(df_test_d1['text_str'])
X_test_d2 = tfidf_vectorizer_domain.transform(df_test_d2['text_str'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_d1['text_str'] = df_test_d1['text'].apply(lambda tokens: ' '.join(map(str, tokens)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_d2['text_str'] = df_test_d2['text'].apply(lambda tokens: ' '.join(map(str, tokens)))


In [20]:
# Classify between AI and Human 
test_ai_human_preds_d1 = ai_human_classifier_d1.predict(X_test_d1)
test_ai_human_preds_d2 = ai_human_classifier_d2.predict(X_test_d2)

# Add prediced labels to CSV
df_test_d1['Predicted_Label'] = test_ai_human_preds_d1
df_test_d2['Predicted_Label'] = test_ai_human_preds_d2

# Combine both predictions
df_final_predictions = pd.concat([df_test_d1, df_test_d2]).sort_index()

# Select only 'id', 'predicted_domain', and 'Predicted_Label' columns
df_final_predictions = df_final_predictions[['id', 'predicted_domain', 'Predicted_Label']]

# Save to CSV
df_final_predictions.to_csv('final_predictions_with_domain.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_d1['Predicted_Label'] = test_ai_human_preds_d1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_d2['Predicted_Label'] = test_ai_human_preds_d2


In [21]:
# Assuming 'final_predictions.csv' is your saved CSV file
csv_path = 'final_predictions_with_domain.csv'
df = pd.read_csv(csv_path)

# Assuming 'Predicted_Label' is the column with your labels
domain_counts = df['predicted_domain'].value_counts()
label_counts = df['Predicted_Label'].value_counts()

print(domain_counts)
print(label_counts)


1    2007
0    1993
Name: predicted_domain, dtype: int64
0    2512
1    1488
Name: Predicted_Label, dtype: int64


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def feature_select(texts, method="tfidf", sparse=False, **kwargs):
    """
    Converts a list of texts to a feature matrix using specified vectorization method.
    
    Args:
        texts (list[str]): List of text documents.
        method (str, optional): Method for vectorization, "tfidf" or "countvectorize". Defaults to "tfidf".
        sparse (bool, optional): Whether to return a sparse matrix or convert it to a dense dataframe. Defaults to False.
        **kwargs: Additional keyword arguments for the vectorizer.
        
    Returns:
        X (sparse matrix or DataFrame): Transformed feature matrix.
        vectorizer (Vectorizer): Fitted vectorizer object.
    """
    # Choose the vectorization method
    if method == "tfidf":
        vectorizer = TfidfVectorizer(**kwargs)
    elif method == "countvectorize":
        vectorizer = CountVectorizer(**kwargs)
    else:
        raise ValueError(f"Unsupported method: {method}")
    
    # Fit and transform the texts
    X = vectorizer.fit_transform(texts)
    
    if sparse:
        return X, vectorizer
    else:
        feature_names = vectorizer.get_feature_names_out()
        df = pd.DataFrame(X.toarray(), columns=feature_names)
        return df, vectorizer


In [None]:
# Combine the datasets and prepare the text data for vectorization by converting lists of tokens to strings
texts_combined = pd.concat([df_domain1['text'], df_domain2['text']]).apply(lambda x: ' '.join(x)).tolist()

# Now, use the feature_select function to vectorize the combined texts
X_combined, vectorizer_combined = feature_select(texts_combined, method="tfidf", sparse=True)


In [None]:
from sklearn.linear_model import LogisticRegression

domain_classifier = LogisticRegression(max_iter=1000)
domain_classifier.fit(X_train_combined, y_train_combined)

# Evaluate the classifier
domain_accuracy = domain_classifier.score(X_test_combined, y_test_combined)
print(f"Domain classification accuracy: {domain_accuracy}")


In [None]:
# Simplified example for Domain 1 using SVM for AI vs. human classification
from sklearn.svm import SVC

# Prepare data for Domain 1 (you need to split your data into training and test sets)
# X_train_domain1, X_test_domain1, y_train_domain1, y_test_domain1 = train_test_split(...)

svm_domain1 = SVC(probability=True)
svm_domain1.fit(X_train_domain1, y_train_domain1)

# For Domain 2, you might use a One-Class SVM or other anomaly detection methods


In [None]:
# This step involves transforming your test data using the same feature extraction
# process used for training data, predicting the domain, and then applying the
# appropriate domain-specific model for final classification.


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Split combined domain dataset
X_train_domain, X_test_domain, y_train_domain, y_test_domain = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

# Train a Random Forest classifier for domain classification
domain_classifier = RandomForestClassifier()
domain_classifier.fit(X_train_domain, y_train_domain)


In [None]:
from sklearn.svm import SVC

# Assuming X_domain1, y_domain1 are features and labels for domain 1
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_domain1, y_domain1)


In [None]:
from sklearn.ensemble import IsolationForest

# Assuming X_domain2 are features for domain 2
# Note: Isolation Forest doesn't need labels for fitting
iso_forest = IsolationForest()
iso_forest.fit(X_domain2)


In [None]:
# Assuming X_test are features of the test set
test_domain_predictions = domain_classifier.predict(X_test)


In [None]:
# For each test sample, check predicted domain and classify accordingly
ai_human_predictions = []
for i, domain in enumerate(test_domain_predictions):
    if domain == 1:  # Domain 1
        pred = svm_classifier.predict([X_test[i]])
    else:  # Domain 2 (using anomaly detection scores to determine class)
        score = iso_forest.decision_function([X_test[i]])
        pred = [1 if s < 0 else 0 for s in score]  # Assuming negative scores are anomalies (human)
    ai_human_predictions.append(pred)
