In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

from openai import OpenAI
import tiktoken

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, auc
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import os
import glob
import pickle

In [2]:
def check_AUC_ROC(df, show_plot=True):
    HC_edits = df[df['edited'] == 1]['prediction'].values
    HC_original = df[df['edited'] == 0]['prediction'].values
    
    y_test = [1 for _ in range(len(HC_edits))] + [0 for _ in range(len(HC_original))]
    y_prob = list(HC_edits) + list(HC_original)

    # Calculate the ROC curve
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    
    # Find the index of the point on the ROC curve closest to (0, 1)
    best_threshold_index = np.argmax(tpr - fpr)

    # Get the threshold corresponding to the point on the ROC curve
    best_threshold = thresholds[best_threshold_index]
    
    if show_plot:
        # Calculate the AUC-ROC score
        auc_roc = auc(fpr, tpr)

        # Plot the ROC curve
        plt.figure(figsize=(8, 8))
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {auc_roc:.2f} Threshold = {best_threshold:.2f}')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate (1 - Specificity)')
        plt.ylabel('True Positive Rate (Sensitivity)')
        plt.title('Receiver Operating Characteristic (ROC) Curve')
        plt.legend(loc="lower right")
        plt.show()
    
    return best_threshold

def check_results(df):
    if 'edited' not in df.columns:
        raise RuntimeError('Column "edited" is missing')
        
    if 'prediction' not in df.columns:
        raise RuntimeError('Column "prediction" is missing')
        
    y_true = df['edited'].values.astype('int')
    y_pred = df['prediction'].values.astype('int')

    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    conf_matrix = confusion_matrix(y_true, y_pred)
    fp = conf_matrix[0, 1]
    tn = conf_matrix[0, 0]
    fn = conf_matrix[1, 0]
    tp = conf_matrix[1, 1]
    fpr = fp / (fp + tn)
    fnr = fn / (fn + tp)

    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("FPR:", fpr)
    print("FNR:", fnr)

#### **SVM(word counts + tfidf)**

In [3]:
# Read article from the json string
def get_text_article(article_str, get_edits=False, edit_ratio=None, save_number_sentences=False):
    text = ''
    number_of_sentences = article_str.count('sentence":')
    article_obj = eval(article_str)
    if edit_ratio is not None:
        num_of_edits = number_of_sentences * edit_ratio // 1
    else:
        num_of_edits = article_str.count('alternative":')
                
    for sub_title in article_obj['sub_titles']:
        for sentence in sub_title['sentences']:
            current_text = f"{sentence['sentence']}\n"
            if get_edits and num_of_edits > 0 and 'alternative' in sentence:
                if save_number_sentences: # If save the number of sentences as the original article
                    current_text = f"{sentence['alternative']}\n"
                else:
                    current_text += f"{sentence['alternative']}\n"
                num_of_edits -= 1
            text += current_text
    return text

def get_articles_text_lst(df):
    article_text_lst = []
    article_label_lst = []
    
    for i in range(len(df)):
        article_obj = eval(df.iloc[i]['article_json'])
        for edited in [False, True]:
            article_text_lst.append(get_text_article(article_obj, edited))
            article_label_lst.append(int(edited))
    
    return article_text_lst, article_label_lst

In [51]:
#train SVM on combined dataset
number_of_sentences =200
edit_ratio = 0.1

topics = ['locations_articles']#'AbstractDataset', 'NewsDataset', 'WikiDataset']

for topic in topics:
    print(topic)
    files_path = f'D:\\.Idan\\תואר שני\\תזה\\mainDataset\\generatedArticles\\{topic}'
    
    df_train = pd.read_csv(f'{files_path}\\{topic}_null.csv').sample(frac=1)
    df_validation = df_train[int(len(df_train) * 0.8 // 1):]
    df_train = df_train[:int(len(df_train) * 0.8 // 1)]
    df_test = pd.read_csv(f'{files_path}\\{topic}_test.csv')
    
    X_train, y_train = get_articles_text_lst(df_train)
    X_val  , y_val = get_articles_text_lst(df_validation)
    X_test , y_test = get_articles_text_lst(df_test)
    
    best_model = SVC(C=0.1, gamma=1, kernel='poly', probability=True, random_state=555)
    pipe = Pipeline([('count', CountVectorizer(stop_words='english', max_df=0.75, min_df=5, max_features=10000, ngram_range=(1, 2))),
                     ('tfid', TfidfTransformer())]).fit(X_train)

    X_train = pipe.transform(X_train).toarray()
    X_train = StandardScaler().fit_transform(X_train)
    y_train = y_train
    best_model.fit(X_train, y_train)
    
    pred = best_model.predict(X_train)
    print(f'Train accuracy {accuracy_score(y_train, pred)}')
    
    # Get best threshold for the validation set
    X_val = pipe.transform(X_val).toarray()
    X_val = StandardScaler().fit_transform(X_val)
    y_pred = best_model.predict_proba(X_val)
    y_pred = y_pred[:, 1]

    auc_roc, best_threshold = check_AUC_ROC(y_val, y_pred, threshold_FPR_at=None)
    _, best_threshold_at_fpr_005 = check_AUC_ROC(y_val, y_pred, threshold_FPR_at=0.05)
    
    # Evaluate the final model on the test set
    X_test = pipe.transform(X_test).toarray()
    X_test = StandardScaler().fit_transform(X_test)
    y_pred = best_model.predict_proba(X_test)
    y_pred = y_pred[:, 1]
    
    y_pred_threshold = list((y_pred >= best_threshold).astype(int))
    y_pred_threshold_fpr_005 = list((y_pred >= best_threshold_at_fpr_005).astype(int))
    
    print(f'Test accuracy {accuracy_score(y_test, y_pred_threshold)}')
    print(f'Test accuracy FPR@0.05 {accuracy_score(y_test, y_pred_threshold_fpr_005)}')

locations_articles
Train accuracy 0.5
Test accuracy 0.5
Test accuracy FPR@0.05 0.5


#### **embedding-3-small**

In [13]:
def normalize_l2(x):
    x = np.array(x)
    if x.ndim == 1:
        norm = np.linalg.norm(x)
        if norm == 0:
            return x
        return x / norm
    else:
        norm = np.linalg.norm(x, 2, axis=1, keepdims=True)
        return np.where(norm == 0, x, x / norm)
    
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def create_df_embedding(file_path, embedding_size=370, get_edits=False, edit_ratio=None):
    df = pd.read_csv(file_path)

    client = OpenAI(api_key='XXXXXXXXX')
    columns = ['article_index'] + [f'dim_{i+1}' for i in range(embedding_size)]
    embedding_df = pd.DataFrame(columns=columns)

    for i in tqdm(range(len(df))):
        article_str = df.iloc[i]['article_json']
        article_text = get_text_article(article_str, get_edits, edit_ratio, save_number_sentences=True) #save_number_sentences=True

        # Max token
        if num_tokens_from_string(article_text, "cl100k_base") > 8191:
            continue

        try:
            response = client.embeddings.create(model="text-embedding-3-small", input=article_text, encoding_format="float")
        except:
            continue

        cut_dim = response.data[0].embedding[:embedding_size]
        norm_dim = normalize_l2(cut_dim)

        embedding_df = pd.concat([embedding_df, pd.DataFrame([[i] + list(norm_dim)], columns=columns)])
        
    return embedding_df

In [19]:
# Create embedded data main dataset
edit_ratios = [0.05, 0.1, 0.15]
topics = ['video_games_series_movies_articles', 'war_articles'] #'characters_articles', 'locations_articles', 'nature_articles', 
has_edits = [False, True]

for topic in topics:
    for edit_ratio in edit_ratios:
        for get_edits in has_edits:
            print(f'topic: {topic} edit_ratio: {edit_ratio} get_edits: {get_edits}')
            suffix = 'edited' if get_edits else 'not_edited'
            file_path = f'D:\\.Idan\\תואר שני\\תזה\\mainDataset\\generatedArticles\\Cross_validation\\{topic}\\edit_ratio_{edit_ratio}\\{topic}.csv'
            dest_path = f'D:\\.Idan\\תואר שני\\תזה\\mainDataset\\generatedArticles\\Cross_validation\\{topic}\\edit_ratio_{edit_ratio}\\{topic}_embedded_{suffix}.csv'

            embedding_df = create_df_embedding(file_path, embedding_size=370, get_edits=get_edits, edit_ratio=None)
            embedding_df.to_csv(dest_path)
            
#             file_path = f'D:\\.Idan\\תואר שני\\תזה\\mainDataset\\generatedArticles\\Cross_validation\\{topic}\\{topic}_test.csv'
#             dest_path = f'D:\\.Idan\\תואר שני\\תזה\\mainDataset\\generatedArticles\\Cross_validation\\{topic}\\edit_ratio_{edit_ratio}\\{topic}_test_embedded_{suffix}.csv'

#             embedding_df = create_df_embedding(file_path, embedding_size=370, get_edits=get_edits, edit_ratio=None)
#             embedding_df.to_csv(dest_path)

topic: video_games_series_movies_articles edit_ratio: 0.05 get_edits: False


100%|████████████████████████████████████████████████████████████████████████████████| 183/183 [01:53<00:00,  1.61it/s]


topic: video_games_series_movies_articles edit_ratio: 0.05 get_edits: True


100%|████████████████████████████████████████████████████████████████████████████████| 183/183 [01:55<00:00,  1.58it/s]


topic: video_games_series_movies_articles edit_ratio: 0.1 get_edits: False


100%|████████████████████████████████████████████████████████████████████████████████| 183/183 [01:58<00:00,  1.54it/s]


topic: video_games_series_movies_articles edit_ratio: 0.1 get_edits: True


100%|████████████████████████████████████████████████████████████████████████████████| 183/183 [01:54<00:00,  1.60it/s]


topic: video_games_series_movies_articles edit_ratio: 0.15 get_edits: False


100%|████████████████████████████████████████████████████████████████████████████████| 183/183 [01:51<00:00,  1.64it/s]


topic: video_games_series_movies_articles edit_ratio: 0.15 get_edits: True


100%|████████████████████████████████████████████████████████████████████████████████| 183/183 [01:45<00:00,  1.73it/s]


topic: war_articles edit_ratio: 0.05 get_edits: False


100%|████████████████████████████████████████████████████████████████████████████████| 195/195 [01:48<00:00,  1.79it/s]


topic: war_articles edit_ratio: 0.05 get_edits: True


100%|████████████████████████████████████████████████████████████████████████████████| 195/195 [01:53<00:00,  1.72it/s]


topic: war_articles edit_ratio: 0.1 get_edits: False


100%|████████████████████████████████████████████████████████████████████████████████| 195/195 [01:51<00:00,  1.76it/s]


topic: war_articles edit_ratio: 0.1 get_edits: True


100%|████████████████████████████████████████████████████████████████████████████████| 195/195 [01:54<00:00,  1.70it/s]


topic: war_articles edit_ratio: 0.15 get_edits: False


100%|████████████████████████████████████████████████████████████████████████████████| 195/195 [01:51<00:00,  1.75it/s]


topic: war_articles edit_ratio: 0.15 get_edits: True


100%|████████████████████████████████████████████████████████████████████████████████| 195/195 [01:54<00:00,  1.70it/s]


In [52]:
# Create embedding small data
for file_type in ['null', 'test']:
    for get_edits in [True, False]:
        for edit_ratio in [0.05, 0.1, 0.15]: 
            suffix = 'edited' if get_edits else 'not_edited'
            file_path = f'D:\\.Idan\\תואר שני\\תזה\\mainDataset\\generatedArticles\\characters_articles\\edit_ratio_{edit_ratio}\\test_somthing\\{file_type}_data_chars.csv'
            dest_path = f'D:\\.Idan\\תואר שני\\תזה\\mainDataset\\generatedArticles\\characters_articles\\edit_ratio_{edit_ratio}\\test_somthing\\{file_type}_data_chars_embedded_{suffix}.csv'

            embedding_df = create_df_embedding(file_path, embedding_size=370, get_edits=get_edits, edit_ratio=None)
            embedding_df.to_csv(dest_path)
            
for edit_ratio in [0.05, 0.1, 0.15]: 
    for file_type in ['null', 'test']:
        df_lst = []
        for i, get_edits in enumerate([False, True]):
            suffix = 'edited' if get_edits else 'not_edited'
            dest_path = f'D:\\.Idan\\תואר שני\\תזה\\mainDataset\\generatedArticles\\characters_articles\\edit_ratio_{edit_ratio}\\test_somthing\\{file_type}_data_chars_embedded_{suffix}.csv'
            temp_df = pd.read_csv(dest_path)
            temp_df['has_edits'] = i # not edited = 0 edit = 1
            df_lst.append(temp_df)
        df = pd.concat(df_lst, ignore_index=True)
        df.to_csv(f'D:\\.Idan\\תואר שני\\תזה\\mainDataset\\generatedArticles\\characters_articles\\edit_ratio_{edit_ratio}\\test_somthing\\{file_type}_data_chars_embedded.csv')

100%|████████████████████████████████████████████████████████████████████████████████| 121/121 [01:13<00:00,  1.65it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 121/121 [01:30<00:00,  1.33it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 121/121 [02:57<00:00,  1.47s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 121/121 [02:39<00:00,  1.32s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 121/121 [01:16<00:00,  1.58it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 121/121 [00:59<00:00,  2.03it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 81/81 [00:53<00:00,  1.52it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 81/81 [05:54<00:00,  4.38s/it]
100%|███████████████████████████████████

In [1]:
# Create embedded data second dataset
edit_ratios = [10] #, 20
num_of_sentences = [200] #50, 100, 
topics = ['AbstractDataset'] #'WikiDataset', 'NewsDataset', 
has_edits = [False, True]

for topic in topics:
    for num in num_of_sentences:
        for edit_ratio in edit_ratios:
            for get_edits in has_edits:
                print(f'topic: {topic} edit_ratio: {edit_ratio} get_edits: {get_edits}')
                suffix = 'edited' if get_edits else 'not_edited'
                
                file_name = 'model_name_Research_Abstracts_null';
                file_name = 'model_name_news_articles_null' if topic == 'NewsDataset' else file_name
                file_name = 'model_name_wiki_intro_null' if topic == 'WikiDataset' else file_name
                
                
                file_path = f'D:\\.Idan\\תואר שני\\תזה\\SecondDataset\\{topic}\\{num}_sentences\\{edit_ratio}\\{file_name}.csv'
                dest_path = f'D:\\.Idan\\תואר שני\\תזה\\SecondDataset\\{topic}\\{num}_sentences\\{edit_ratio}\\{file_name}_embedded_{suffix}.csv'

                embedding_df = create_df_embedding(file_path, embedding_size=370, get_edits=get_edits)
                embedding_df.to_csv(dest_path)

In [50]:
# Combine the files
edit_ratios = [20] #, 20
num_of_sentences = [200] #, 100, 200
topics = ['AbstractDataset', 'NewsDataset', 'WikiDataset'] # 
has_edits = [False, True]
df = None
for topic in topics:
    for num in num_of_sentences:
        for edit_ratio in edit_ratios:
            for get_edits in has_edits:
                suffix = 'edited' if get_edits else 'not_edited'
                
                file_name = 'model_name_Research_Abstracts_null';
                file_name = 'model_name_news_articles_null' if topic == 'NewsDataset' else file_name
                file_name = 'model_name_wiki_intro_null' if topic == 'WikiDataset' else file_name
                
                file_path = f'D:\\.Idan\\תואר שני\\תזה\\SecondDataset\\{topic}\\{num}_sentences\\{edit_ratio}\\{file_name}_embedded_{suffix}.csv'
                if df is None:
                    df = pd.read_csv(file_path)
                else:
                    df = pd.concat([df, pd.read_csv(file_path)])

dest_path = f'D:\\.Idan\\תואר שני\\תזה\\SecondDataset\\Combined\\{num_of_sentences[0]}_sentences\\{edit_ratios[0]}'
df = df.sample(frac=1)
df.to_csv(f'{dest_path}\\model_name_combined_train.csv')

In [4]:
def get_datasets(file_name_train, file_name_test):
    df_not_edited = pd.read_csv(f'{file_name_train}_embedded_not_edited.csv')
    df_edited = pd.read_csv(f'{file_name_train}_embedded_edited.csv')

    df_not_edited.drop(columns=['Unnamed: 0'], inplace=True)
    df_edited.drop(columns=['Unnamed: 0'], inplace=True)

    df = pd.concat([df_not_edited, df_edited])
    df['y'] = [0 for _ in range(len(df_not_edited))] + [1 for _ in range(len(df_edited))]
    df = df.sample(frac=1).reset_index(drop=True)

    df_not_edited_test = pd.read_csv(f'{file_name_test}_embedded_not_edited.csv')
    df_edited_test = pd.read_csv(f'{file_name_test}_embedded_edited.csv')

    df_not_edited_test.drop(columns=['Unnamed: 0'], inplace=True)
    df_edited_test.drop(columns=['Unnamed: 0'], inplace=True)

    df_test = pd.concat([df_not_edited_test, df_edited_test])
    df_test['y'] = [0 for _ in range(len(df_not_edited_test))] + [1 for _ in range(len(df_edited_test))]
    df_test = df_test.sample(frac=1).reset_index(drop=True)

    return df, df_test

In [5]:
def find_threhsold_at_FPR(y_test, y_prob, threshold_FPR_at, steps=0.01):
    # Find the threshold for FPR = threshold_FPR_at
    min_dist_type = y_prob.min()
    max_dist_type = y_prob.max()
    best_threshold = max_dist_type
    best_acc = 0
    
    for threshold in np.arange(min_dist_type, max_dist_type, steps):
        y_score = (y_prob >= threshold).astype(int)
        tn, fp, fn, tp = confusion_matrix(y_test, y_score).ravel()
        TPR = fp / (fp + tn)
        if TPR <= threshold_FPR_at:
            acc = (tp + tn) / (tp + tn + fp + fn)
            if acc > best_acc:
                best_acc = acc
                best_threshold = threshold 
            
    return best_threshold

def find_threhsold_best_accuracy(y_test, y_prob, threshold_FPR_at, steps=0.01):
    # Find the threshold for FPR = threshold_FPR_at
    min_dist_type = y_prob.min()
    max_dist_type = y_prob.max()
    best_threshold = max_dist_type
    best_acc = 0
    
    for threshold in np.arange(min_dist_type, max_dist_type, steps):
        y_score = (y_prob >= threshold).astype(int)
        tn, fp, fn, tp = confusion_matrix(y_test, y_score).ravel()
        acc = (tp + tn) / (tp + tn + fp + fn)
        if acc > best_acc:
            best_acc = acc
            best_threshold = threshold 
            
    return best_threshold

def check_AUC_ROC(y_test, y_prob, title='', dist_type='HC', show_plot=False, threshold_FPR_at=None, steps=0.01):
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)

    # Calculate the ROC curve
    if threshold_FPR_at == None:
        # Find the index of the point on the ROC curve closest to (0, 1)
        best_threshold_index = np.argmax(tpr - fpr)
        # Get the threshold corresponding to the point on the ROC curve
        best_threshold = thresholds[best_threshold_index]
    elif threshold_FPR_at == -1:
        best_threshold = find_threhsold_best_accuracy(y_test, y_prob, threshold_FPR_at, steps=steps)
    else:
        best_threshold = find_threhsold_at_FPR(y_test, y_prob, threshold_FPR_at, steps=steps)
    
    # Calculate the AUC-ROC score
    auc_roc = auc(fpr, tpr)
    if show_plot:
        # Plot the ROC curve
        plt.figure(figsize=(8, 8))
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {auc_roc:.2f} Threshold = {best_threshold:.2f}')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate (1 - Specificity)')
        plt.ylabel('True Positive Rate (Sensitivity)')
        plt.title(f'Receiver Operating Characteristic (ROC) Curve on {dist_type} \n{title}')
        plt.legend(loc="lower right")
        plt.show()
    
    return best_threshold, auc_roc

def get_AUC(df, dist_type='HC', label='edited'):
    HC_edits = df[df[label] == 1][dist_type].values
    HC_original = df[df[label] == 0][dist_type].values

    y_test = [1 for _ in range(len(HC_edits))] + [0 for _ in range(len(HC_original))]
    y_prob = list(HC_edits) + list(HC_original)
    
    NaN_list = []
    # Check for NaN
    for i, v in enumerate(y_prob):
        if np.isnan(v):
            NaN_list.append(i)
            
    for i in NaN_list:
        y_test.pop(i)
        y_prob.pop(i)

    # Calculate the ROC curve
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    auc_roc = auc(fpr, tpr)
    return auc_roc

def calculate_fpr(y, pred):
    # Calculate True Negatives (TN) and False Positives (FP)
    tn = np.sum((y == 0) & (pred == 0))
    fp = np.sum((y == 0) & (pred == 1))
    
    # Calculate False Positive Rate
    fpr = fp / (fp + tn) if (fp + tn) != 0 else 0
    return fpr

In [55]:
# Do a grid search for hyper parameters
RANDOM_STATE=555

df_train = pd.read_csv('D:\\.Idan\\תואר שני\\תזה\\mainDataset\\generatedArticles\\characters_articles\\edit_ratio_0.15\\test_somthing\\null_data_chars_embedded.csv')
df_test = pd.read_csv('D:\\.Idan\\תואר שני\\תזה\\mainDataset\\generatedArticles\\characters_articles\\edit_ratio_0.15\\test_somthing\\test_data_chars_embedded.csv')

df_train = df_train.sample(frac=1, random_state=RANDOM_STATE)
df_test = df_test.sample(frac=1, random_state=RANDOM_STATE)

dim_coulmns = [column for column in df_train.columns if 'dim_' in column]

X_train = df_train[dim_coulmns]
X_test  = df_test[dim_coulmns]
X_val = X_test[int(len(X_test) // 2):]
X_test = X_test[:int(len(X_test) // 2)]

y_train = df_train['has_edits']
y_test  = df_test['has_edits']
y_val = y_test[int(len(y_test) // 2):]
y_test = y_test[:int(len(y_test) // 2)]

# Set the parameter grid for hyperparameter tuning
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear']
}

# Train SVM model
clf = SVC(random_state=RANDOM_STATE)

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print(f"Best hyperparameters: {grid_search.best_params_}")

# Train the final model on the combined train and validation sets
final_model = SVC(C=grid_search.best_params_['C'], gamma=grid_search.best_params_['gamma'], kernel=grid_search.best_params_['kernel']
                  , probability=True, random_state=RANDOM_STATE)

final_model.fit(X_train, y_train)

# Evaluate the final model on the test set
y_pred = final_model.predict_proba(X_val)
y_pred = y_pred[:, 1]

best_threshold, auc_roc = check_AUC_ROC(y_val, y_pred, threshold_FPR_at=None)
best_threshold_at_fpr_005, _ = check_AUC_ROC(y_val, y_pred, threshold_FPR_at=0.05)

# Evaluate the final model on the test set
y_pred = final_model.predict_proba(X_test)
y_pred = y_pred[:, 1]

y_pred_threshold = list((y_pred >= best_threshold).astype(int))
y_pred_threshold_fpr_005 = list((y_pred >= best_threshold_at_fpr_005).astype(int))

tn, fp, fn, tp = confusion_matrix(y_test, y_pred_threshold).ravel()
FPR = fp / (fp + tn)

print(f'AUC {auc_roc}')
print(f'Test accuracy {accuracy_score(y_test, y_pred_threshold)} FPR@{FPR}')
print(f'Test accuracy {accuracy_score(y_test, y_pred_threshold_fpr_005)} FPR@0.05')

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best hyperparameters: {'C': 100, 'gamma': 1, 'kernel': 'linear'}
AUC 0.8165007112375533
Test accuracy 0.7567567567567568 FPR@0.32432432432432434
Test accuracy 0.6216216216216216 FPR@0.05


In [38]:
# Run SVM on specifi data SecondDataset asd
number_of_sentences = 50
edit_ratio = 10

topics = ['AbstractDataset']#, 'NewsDataset', 'WikiDataset']
auc_avg = 0
thrshld = 0
thrshld_005 = 0
models = []

for topic in topics:
    print(topic)
    
    files_path = f'D:\\.Idan\\תואר שני\\תזה\\SecondDataset\\Combined\\{number_of_sentences}_sentences\\{edit_ratio}'
    
#     # Read data
#     df_edited_train = pd.read_csv(f'{files_path}\\{topic}_embedded_edited.csv')
#     df_not_edited_train = pd.read_csv(f'{files_path}\\{topic}_embedded_not_edited.csv')
#     df_edited_test = pd.read_csv(f'{files_path}\\{topic}_test_embedded_edited.csv')
#     df_not_edited_test = pd.read_csv(f'{files_path}\\{topic}_test_embedded_not_edited.csv')
    
#     # Add label
#     df_edited_train['has_edits'] = 1
#     df_not_edited_train['has_edits'] = 0
#     df_edited_test['has_edits'] = 1
#     df_not_edited_test['has_edits'] = 0
    
#     #Combine the data
#     df_train = pd.concat([df_edited_train, df_not_edited_train], ignore_index=True)
#     df_train = df_train.sample(frac=1)
    
    df_train = pd.read_csv(f'{files_path}\\model_name_combined_train.csv')
    df_validation = pd.read_csv(f'{files_path}\\model_name_combined_validation.csv')
    df_test = pd.read_csv(f'{files_path}\\model_name_combined_test.csv')
    
    df_train = df_train[df_train['topic']==topic]
    df_validation = df_validation[df_validation['topic']==topic]
    df_test = df_test[df_test['topic']==topic]
    
    df_train = df_train.sample(frac=1)[:700]
    df_validation = df_validation.sample(frac=1)[:700]
    df_test = df_test.sample(frac=1)[:700]
    
    dim_coulmns = [column for column in df_train.columns if 'dim_' in column]

    X_train = df_train[dim_coulmns] #df_train.drop(columns=['Unnamed: 0', 'has_edits', 'article_index', 'topic', 'edit_ratio', 'num_of_sentences'])
    X_val   = df_validation[dim_coulmns] #df_validation.drop(columns=['Unnamed: 0', 'has_edits', 'article_index', 'topic', 'edit_ratio', 'num_of_sentences'])
    X_test  = df_test[dim_coulmns] #df_test.drop(columns=['Unnamed: 0','has_edits', 'article_index', 'topic', 'edit_ratio', 'num_of_sentences'])

    y_train = df_train['has_edits']
    y_val   = df_validation['has_edits']
    y_test  = df_test['has_edits']

    # Train the final model on the combined train and validation sets
    final_model = SVC(C=100, gamma=1, kernel='rbf', probability=True, random_state=555)
    final_model.fit(X_train, y_train)
    models.append(final_model)

    # Evaluate the final model on the test set
    y_pred = final_model.predict_proba(X_val)
    y_pred = y_pred[:, 1]

    auc_roc, best_threshold = check_AUC_ROC(y_val, y_pred, threshold_FPR_at=None)
    _, best_threshold_at_fpr_005 = check_AUC_ROC(y_val, y_pred, threshold_FPR_at=0.05)
    
    # Evaluate the final model on the test set
    y_pred = final_model.predict_proba(X_test)
    y_pred = y_pred[:, 1]
    
    y_pred_threshold = list((y_pred >= best_threshold).astype(int))
    y_pred_threshold_fpr_005 = list((y_pred >= best_threshold_at_fpr_005).astype(int))
    
    print(f'Test accuracy {accuracy_score(y_test, y_pred_threshold)}')
    print(f'Test accuracy FPR@0.05 {accuracy_score(y_test, y_pred_threshold_fpr_005)}')

AbstractDataset
Test accuracy 0.6135458167330677
Test accuracy FPR@0.05 0.5936254980079682


In [64]:
# Run SVM on specific data mainDataset
number_of_sentences = 200
edit_ratio = 0.1
RANDOM_STATE=555

topics = ['war_articles']
auc_avg = 0
thrshld = 0
thrshld_005 = 0
models = []

for topic in topics:
    print(topic)
    
    files_path = f'D:\\.Idan\\תואר שני\\תזה\\mainDataset\\generatedArticles\\{topic}\\edit_ratio_{edit_ratio}'
    
    # Read data
    df_edited_train = pd.read_csv(f'{files_path}\\{topic}_embedded_edited.csv')
    df_not_edited_train = pd.read_csv(f'{files_path}\\{topic}_embedded_not_edited.csv')
    df_edited_test = pd.read_csv(f'{files_path}\\{topic}_test_embedded_edited.csv')
    df_not_edited_test = pd.read_csv(f'{files_path}\\{topic}_test_embedded_not_edited.csv')
    
    # Add label
    df_edited_train['has_edits'] = 1
    df_not_edited_train['has_edits'] = 0
    df_edited_test['has_edits'] = 1
    df_not_edited_test['has_edits'] = 0
    
    #Combine the data
    df_train = pd.concat([df_edited_train, df_not_edited_train], ignore_index=True).sample(frac=1, random_state=RANDOM_STATE)
    df_val = df_train[int(len(df_train) * 0.8):]
    df_train = df_train[:int(len(df_train) * 0.8)]
    df_test = pd.concat([df_edited_test, df_not_edited_test], ignore_index=True).sample(frac=1)
    
    dim_coulmns = [column for column in df_train.columns if 'dim_' in column]

    X_train = df_train[dim_coulmns]
    X_val   = df_val[dim_coulmns]
    X_test  = df_test[dim_coulmns]

    y_train = df_train['has_edits']
    y_val   = df_val['has_edits']
    y_test  = df_test['has_edits']

    # Train the final model on the combined train and validation sets
    final_model = SVC(C=100, gamma='scale', kernel='linear', probability=True, random_state=RANDOM_STATE)
    final_model.fit(X_train, y_train)
    models.append(final_model)

    # Evaluate the final model on the test set
    y_pred = final_model.predict_proba(X_val)
    y_pred = y_pred[:, 1]

    auc_roc, best_threshold = check_AUC_ROC(y_val, y_pred, threshold_FPR_at=None)
    _, best_threshold_at_fpr_005 = check_AUC_ROC(y_val, y_pred, threshold_FPR_at=0.05)
    
    # Evaluate the final model on the test set
    y_pred = final_model.predict_proba(X_test)
    y_pred = y_pred[:, 1]
    
    y_pred_threshold = list((y_pred <= best_threshold).astype(int))
    y_pred_threshold_fpr_005 = list((y_pred <= best_threshold_at_fpr_005).astype(int))
    
    print(f'Test accuracy {accuracy_score(y_test, y_pred_threshold)}')
    print(f'Test accuracy FPR@0.05 {accuracy_score(y_test, y_pred_threshold_fpr_005)}')

war_articles
Test accuracy 0.5235294117647059
Test accuracy FPR@0.05 0.5235294117647059


### **Cross Validation main dataset**

In [23]:
# corss validation Maindataset
edit_ratios = [0.05, 0.1, 0.15] 
topics = ['characters_articles', 'locations_articles', 'nature_articles', 'video_games_series_movies_articles', 'war_articles'] 
k_folds = 10
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}
SEED = 555

for topic in topics:
    for edit_ratio in edit_ratios:
#         print(f'topic: {topic} edit_ratio: {edit_ratio}')
        
        # Get data
        files_path = f'D:\\.Idan\\תואר שני\\תזה\\mainDataset\\generatedArticles\\Cross_validation\\{topic}\\edit_ratio_{edit_ratio}'
        file_edited_path = f'{files_path}\\{topic}_embedded_edited.csv'
        file_not_edited_path = f'{files_path}\\{topic}_embedded_not_edited.csv'
        
        df_edited = pd.read_csv(file_edited_path)
        df_not_edited = pd.read_csv(file_not_edited_path)
        
        # Add label
        df_edited['edited'] = 1
        df_not_edited['edited'] = 0
        
        dim_columns = [column for column in df_edited.columns if 'dim_' in column]
        results_df = pd.DataFrame(columns=['topic', 'model', 'edit_ratio', 'fold', 'C', 'gamma', 'kernel', 'accuracy', 'accuracy_at_FPR_005'])
        
        # Get splits
        splits = [i for i in range(0, len(df_edited), len(df_edited) // k_folds)]

        # Do it k_folds times
        for fold in tqdm(range(k_folds)):

            # Create the split
            df_null = None
            df_test = None
            window_size = 1 # Determine the window size, make it easy to adjust the train/test ratio. Example 1 => 90-10 when k_folds=10

            for i in range(len(splits) - 1):
                temp_df = pd.concat([df_edited.iloc[splits[i]:splits[i+1]], df_not_edited.iloc[splits[i]:splits[i+1]]], ignore_index=True)
                if i == fold: # Take test
                    if df_test is None:
                        df_test = temp_df
                    else:
                        df_test = pd.concat([df_test, temp_df])
                else: # Take null
                    if window_size > 1:
                        if df_test is None:
                            df_test = temp_df
                        else:
                            df_test = pd.concat([df_test, temp_df])
                        window_size -= 1
                        continue

                    if df_null is None:
                        df_null = temp_df
                    else:
                        df_null = pd.concat([df_null, temp_df])
               
            if df_test is None:
                continue

            df_null = df_null.sample(frac=1, random_state=SEED)
            df_test = df_test.sample(frac=1, random_state=SEED)

            df_val = df_null[int(len(df_null) * 0.2):]                
            df_null = df_null[:int(len(df_null) * 0.2)]
            
            best_parameters = {
                'topic': [topic], 
                'model': ['embedding'],
                'edit_ratio': [edit_ratio],
                'fold': [fold],
                'C': [0],
                'gamma': [0],
                'kernel': [''],
                'accuracy': [0]
            }
                  
            if df_test is None:
                continue
                
            # Do a grid search
#             for c in param_grid['C']:
#                 for gamma in param_grid['gamma']:
#                     for kernel in param_grid['kernel']:
            ###
            # Train the model on the train set
            model = LogisticRegression(random_state=SEED)
#             model = SVC(C=c, gamma=gamma, kernel=kernel, probability=True, random_state=SEED)
            model.fit(df_null[dim_columns], df_null['edited'].values)

            pred = model.predict_proba(df_val[dim_columns])[:, 1]

            # Compute the ROC curve
            fpr, tpr, thresholds = roc_curve(df_val['edited'].values, pred)

            # Find the threshold that corresponds to the desired FPR (0.05)
            desired_fpr = 0.05
            threshold = thresholds[np.where(fpr <= desired_fpr)[0][-1]]

            pred = model.predict_proba(df_test[dim_columns])[:, 1]

            # Apply the threshold to get final predictions
            pred_adjusted = (pred >= threshold).astype(int)

            acc_at_fpr_005 = accuracy_score(df_test['edited'].values, pred_adjusted)

            pred = model.predict(df_test[dim_columns])
            acc = accuracy_score(df_test['edited'].values, pred)

            fpr = calculate_fpr(df_test['edited'].values, pred)

            if acc > best_parameters['accuracy'][0]:
                best_parameters['C'] = [c]
                best_parameters['gamma'] = [gamma]
                best_parameters['kernel'] = [kernel]
                best_parameters['accuracy'] = [acc]
                best_parameters['FPR'] = [fpr]
                best_parameters['accuracy_at_FPR_005'] = [acc_at_fpr_005]
            ###
            
            # Save reults for the current fold
            results_df = pd.concat([results_df, pd.DataFrame(best_parameters)])

        results_df.reset_index(drop=True)
        results_df.to_csv(f'{files_path}\\folds_results_embedding.csv')

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 28.86it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 28.52it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 29.20it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 29.93it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 28.18it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 27.91it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 27.72it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 28.42it/s]
100%|███████████████████████████████████

In [24]:
# Append results to the results file
# results_path = 'D:\\.Idan\\תואר שני\\תזה\\mainDataset_results_.csv'
# df_results = pd.read_csv(results_path)

edit_ratios = [0.05, 0.1, 0.15] 
topics = ['characters_articles', 'locations_articles', 'nature_articles', 'video_games_series_movies_articles', 'war_articles'] 
lst_df = []
for topic in topics:
    for edit_ratio in edit_ratios:
        df = pd.read_csv(f'D:\\.Idan\\תואר שני\\תזה\\mainDataset\\generatedArticles\\Cross_validation\\{topic}\\edit_ratio_{edit_ratio}\\folds_results_embedding.csv')
        results_obj = {
            'topic': [topic],
            'model': ['embedding'],
            'edit_ratio': [edit_ratio],
            'accuracy': [df['accuracy'].mean()],
            'FPR': [df['FPR'].mean()],
            'accuracy_std': [df['accuracy'].std()], 
            'accuracy_005': [df['accuracy_at_FPR_005'].mean()],
            'accuracy_005_std': [df['accuracy_at_FPR_005'].std()]
        }
        lst_df.append(pd.DataFrame(results_obj))
#         df_results = pd.concat([df_results, pd.DataFrame(results_obj)], ignore_index=True)

df_results = pd.concat(lst_df, ignore_index=True)
df_results.to_csv('SVM_results.csv', index=False)

In [30]:
# corss validation Maindataset combine topics
edit_ratios = [0.05, 0.1, 0.15] 
topics = ['characters_articles', 'locations_articles', 'nature_articles', 'video_games_series_movies_articles', 'war_articles'] 
k_folds = 10
SEED = 555
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

for edit_ratio in edit_ratios:
    
    # Combine topics
    lst_edited = []
    lst_not_edited = []
    for topic in topics:
        files_path = f'D:\\.Idan\\תואר שני\\תזה\\mainDataset\\generatedArticles\\Cross_validation\\{topic}\\edit_ratio_{edit_ratio}'
        file_edited_path = f'{files_path}\\{topic}_embedded_edited.csv'
        file_not_edited_path = f'{files_path}\\{topic}_embedded_not_edited.csv'
        
        df_edited = pd.read_csv(file_edited_path)
        df_not_edited = pd.read_csv(file_not_edited_path)
        
        # Add label
        df_edited['edited'] = 1
        df_not_edited['edited'] = 0
        
        lst_edited.append(df_edited)
        lst_not_edited.append(df_not_edited)
        
    df_edited = pd.concat(lst_edited, ignore_index=True)
    df_not_edited = pd.concat(lst_not_edited, ignore_index=True)
    
    df_edited = df_edited.sample(frac=1, random_state=SEED)[:int(len(df_edited)*0.2)]
    df_not_edited = df_not_edited.sample(frac=1, random_state=SEED)[:int(len(df_not_edited)*0.2)]
        
    dim_columns = [column for column in df_edited.columns if 'dim_' in column]
    results_df = pd.DataFrame(columns=['topic', 'model', 'edit_ratio', 'fold', 'accuracy', 'accuracy_at_FPR_005'])

    # Get splits
    splits = [i for i in range(0, len(df_edited), len(df_edited) // k_folds)]

    # Do it k_folds times
    for fold in tqdm(range(k_folds)):

        # Create the split
        df_null = None
        df_test = None
        window_size = 1 # Determine the window size, make it easy to adjust the train/test ratio. Example 1 => 90-10

        for i in range(len(splits) - 1):
            temp_df = pd.concat([df_edited.iloc[splits[i]:splits[i+1]], df_not_edited.iloc[splits[i]:splits[i+1]]], ignore_index=True)
            if i == fold: # Take test
                if df_test is None:
                    df_test = temp_df
                else:
                    df_test = pd.concat([df_test, temp_df])
            else: # Take null
                if window_size > 1:
                    if df_test is None:
                        df_test = temp_df
                    else:
                        df_test = pd.concat([df_test, temp_df])
                    window_size -= 1
                    continue

                if df_null is None:
                    df_null = temp_df
                else:
                    df_null = pd.concat([df_null, temp_df])
                    
        if df_test is None:
            continue

        df_null = df_null.sample(frac=1, random_state=SEED)
        df_test = df_test.sample(frac=1, random_state=SEED)
        
        df_val = df_null[int(len(df_null) * 0.2):]                
        df_null = df_null[:int(len(df_null) * 0.2)]

        best_parameters = {
            'topic': ['combine'], 
            'model': ['embedding'],
            'edit_ratio': [edit_ratio],
            'fold': [fold],
            'accuracy': [0]
        }

        
        # Do a grid search
        for c in param_grid['C']:
            for gamma in param_grid['gamma']:
                for kernel in param_grid['kernel']:
                    ###
                    # Train the model on the train set
                    model = LogisticRegression(random_state=SEED)
#                     model = SVC(C=c, gamma=gamma, kernel=kernel, probability=True, random_state=SEED)
                    model.fit(df_null[dim_columns], df_null['edited'].values)

                    pred = model.predict_proba(df_val[dim_columns])[:, 1]

                    # Compute the ROC curve
                    fpr, tpr, thresholds = roc_curve(df_val['edited'].values, pred)

                    # Find the threshold that corresponds to the desired FPR (0.05)
                    desired_fpr = 0.05
                    threshold = thresholds[np.where(fpr <= desired_fpr)[0][-1]]

                    pred = model.predict_proba(df_test[dim_columns])[:, 1]

                    # Apply the threshold to get final predictions
                    pred_adjusted = (pred >= threshold).astype(int)

                    acc_at_fpr_005 = accuracy_score(df_test['edited'].values, pred_adjusted)

                    pred = model.predict(df_test[dim_columns])
                    acc = accuracy_score(df_test['edited'].values, pred)

                    fpr = calculate_fpr(df_test['edited'].values, pred)

                    if acc > best_parameters['accuracy'][0]:
                        best_parameters['accuracy'] = [acc]
                        best_parameters['FPR'] = [fpr]
                        best_parameters['accuracy_at_FPR_005'] = [acc_at_fpr_005]
                    ###
        # Save reults for the current fold
        results_df = pd.concat([results_df, pd.DataFrame(best_parameters)])

    print({
        'topic': ['combine'],
        'model': ['embedding'],
        'edit_ratio': [edit_ratio],
        'accuracy': [results_df['accuracy'].mean()],
        'FPR': [results_df['FPR'].mean()],
        'accuracy_std': [results_df['accuracy'].std()], 
        'accuracy_005': [results_df['accuracy_at_FPR_005'].mean()],
        'accuracy_005_std': [results_df['accuracy_at_FPR_005'].std()]
    })

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.23it/s]


{'topic': ['combine'], 'model': ['embedding'], 'edit_ratio': [0.05], 'accuracy': [0.5222222222222221], 'FPR': [0.8222222222222222], 'accuracy_std': [0.05037581669902016], 'accuracy_005': [0.4916666666666667], 'accuracy_005_std': [0.03220838357578906]}


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:07<00:00,  1.30it/s]


{'topic': ['combine'], 'model': ['embedding'], 'edit_ratio': [0.1], 'accuracy': [0.5123456790123456], 'FPR': [0.6666666666666665], 'accuracy_std': [0.1021670207903422], 'accuracy_005': [0.5030864197530864], 'accuracy_005_std': [0.025776686864953817]}


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.15it/s]

{'topic': ['combine'], 'model': ['embedding'], 'edit_ratio': [0.15], 'accuracy': [0.511764705882353], 'FPR': [0.3941176470588236], 'accuracy_std': [0.09112901991076276], 'accuracy_005': [0.49411764705882355], 'accuracy_005_std': [0.02702754657272593]}





In [100]:
# Run Cross validation on specifi data SecondDataset
number_of_sentences = 200
edit_ratio = 10
k_folds = 10
topics = ['WikiDataset']#'AbstractDataset', 'NewsDataset', 'WikiDataset']
results_df = pd.DataFrame(columns=['topic', 'model', 'edit_ratio', 'fold', 'C', 'gamma', 'kernel', 'accuracy', 'accuracy_at_FPR_005'])

for topic in topics:
    print(topic)
    
    # Get data
    files_path = f'D:\\.Idan\\תואר שני\\תזה\\SecondDataset\\Combined\\{number_of_sentences}_sentences\\{edit_ratio}'
    df_train = pd.read_csv(f'{files_path}\\model_name_combined_train.csv')
    df_validation = pd.read_csv(f'{files_path}\\model_name_combined_validation.csv')
    df_test = pd.read_csv(f'{files_path}\\model_name_combined_test.csv')
    
    # Get topics' data
    df_train = df_train[df_train['topic']==topic]
    df_validation = df_validation[df_validation['topic']==topic]
    df_test = df_test[df_test['topic']==topic]
    
    df_combined = pd.concat([df_train, df_validation, df_test], ignore_index=True).sample(frac=1)
    dim_coulmns = [column for column in df_combined.columns if 'dim_' in column]
    
    # Do cross validation split and test
    splits = [i for i in range(0, len(df_combined), len(df_combined) // k_folds)]
    
    # Do it k_folds times
    for fold in tqdm(range(k_folds)):

        # Create the split
        df_null = None
        df_test = None

        for i in range(len(splits) - 1):
            tem_df = df_combined.iloc[splits[i]:splits[i+1]]
            if i == fold: # Take test
                df_test = tem_df
            else: # Take null
                if df_null is None:
                    df_null = tem_df
                else:
                    df_null = pd.concat([df_null, tem_df])

        df_val = df_null[int(len(df_null) * 0.8):]
        df_null = df_null[:int(len(df_null) * 0.8)]

        best_parameters = {
            'topic': [topic], 
            'model': ['embedding'],
            'edit_ratio': [edit_ratio],
            'fold': [fold],
            'C': [0],
            'gamma': [0],
            'kernel': [''],
            'accuracy': [0]
        }

        if df_test is None:
            continue

        # Do a grid search
        for c in param_grid['C']:
            for gamma in param_grid['gamma']:
                for kernel in param_grid['kernel']:

                    # Train the model on the train set
                    model = SVC(C=c, gamma=gamma, kernel=kernel, probability=True, random_state=SEED)
                    model.fit(df_null[dim_columns], df_null['has_edits'])

                    # Evaluate the final model on the validation set
                    y_pred = model.predict_proba(df_val[dim_columns])[:, 1]
                    
                    auc_roc, best_threshold = check_AUC_ROC(df_val['has_edits'], y_pred, threshold_FPR_at=None)
                    _, best_threshold_at_fpr_005 = check_AUC_ROC(df_val['has_edits'], y_pred, threshold_FPR_at=0.05)
                    
                    # Evaluate the final model on the test set
                    y_pred = model.predict_proba(df_test[dim_columns])[:, 1]

                    y_pred_threshold = list((y_pred >= best_threshold).astype(int))
                    y_pred_threshold_fpr_005 = list((y_pred >= best_threshold_at_fpr_005).astype(int))
                    
                    acc = accuracy_score(df_test['has_edits'], y_pred_threshold)
                    acc_at_fpr_005 = accuracy_score(df_test['has_edits'], y_pred_threshold_fpr_005)

                    if acc > best_parameters['accuracy'][0]:
                        best_parameters['C'] = [c]
                        best_parameters['gamma'] = [gamma]
                        best_parameters['kernel'] = [kernel]
                        best_parameters['accuracy'] = [acc]
                        best_parameters['accuracy_at_FPR_005'] = [acc_at_fpr_005]

        # Save reults for the current fold
        results_df = pd.concat([results_df, pd.DataFrame(best_parameters)])

    results_df.reset_index(drop=True)
    results_df.to_csv(f'{files_path}\\folds_results_embedding.csv')

WikiDataset


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:05<00:00,  1.88it/s]
