# Percentage of Test Data prediction
In this experiment, we varied the percentage (10-80%) of test data used alongside the training data. The model was trained independently using both Urdu and English, Italian, German languages and evaluated by adjusting the proportion of the testing language included in the training data.

In [20]:
# dependencies
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

In [21]:
# data
df_urdu = pd.read_csv(r'..\Data\Urdu\features_urdu.csv')
df_english = pd.read_csv(r'..\Data\English\features_english.csv')
df_german = pd.read_csv(r'..\Data\German\features_german.csv')
df_italian = pd.read_csv(r'..\Data\Italian\features_italian.csv')

western_dfs = [df_english, df_german, df_italian]

Define Functions for better overview and easier loops

In [22]:
def add_decile_test_data_to_train_data(df_train, df_test, decile: int = 0):
    shuffled_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)
    if not (0 <= decile <= 8):
        return ValueError("Value Error: Input out of range")
    else:
        # Split up test data into 10 deciles for gradual increase 
        df_test_parts = np.array_split(shuffled_test, 10)
        
        # 1 to 8 deciles (= 10-80%) shall be used for training
        if decile == 0:
            df_test_for_training = df_test_parts[decile]
        else:
            df_test_for_training = pd.concat([df for df in df_test_parts[:decile]])
    
        df_train = pd.concat([df_train, df_test_for_training], ignore_index=True)

        # remove used data from test data
        df_test = pd.concat([df for df in df_test_parts[decile:]], ignore_index=True)

        return df_train, df_test

def separate_feats_labs(df_train, df_test):
    X_train = df_train.drop(["emotion", "speaker_id", "filename", "valence"], axis=1) # Features
    y_train = df_train['valence']

    X_test = df_test.drop(["emotion", "speaker_id", "filename", "valence"], axis=1) # Features
    y_test = df_test['valence']
    return X_train, y_train, X_test, y_test

def compare_models(models: dict, X_train, y_train, X_test, y_test, results):
    
    for name, model in models.items():
    
        # Define pipeline: Scaling + Model
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', model)
            ])
   
        # Fit the model on training data
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
    
        # Metrics calculation
        accuracy = accuracy_score(y_test, y_pred)
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
    
        # Save results
        results.append({
            'Model': name,
            'Accuracy': accuracy,
            'Balanced Accuracy': balanced_accuracy,
            'Precision (Weighted)': precision,
            'Recall (Weighted)': recall,
            'F1-Score (Weighted)': f1
        })

    return results

def add_random_dummy(y_train, y_test, results):
    # Generate random predictions from the existing labels
    y_random = np.random.choice(y_train.unique(), size=len(y_test), replace=True)

    # Metrics for Random Baseline
    random_accuracy = accuracy_score(y_test, y_random)
    random_balanced_accuracy = balanced_accuracy_score(y_test, y_random)
    random_precision = precision_score(y_test, y_random, average='weighted', zero_division=0)
    random_recall = recall_score(y_test, y_random, average='weighted')
    random_f1 = f1_score(y_test, y_random, average='weighted')

    # Save Random Baseline results
    results.append({
        'Model': 'stratified Dummy',
        'Accuracy': random_accuracy,
        'Balanced Accuracy': random_balanced_accuracy,
        'Precision (Weighted)': random_precision,
        'Recall (Weighted)': random_recall,
        'F1-Score (Weighted)': random_f1
    })

    return results

def extract_language_from_df(df):
    filename = df["filename"][0]
    language = filename.split("\\")[2].lower()
    return language

In [23]:
# Define models for comparison
models = {
    'SVM (linear)': SVC(kernel='linear', C=1.0, random_state=42, probability=True),
    'SVM (rbf)': SVC(kernel='rbf', C=1.0, random_state=42, probability=True),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='mlogloss')
}

In [None]:
def train_with_test_data(df_train, df_test):
    output_df = pd.DataFrame()
    for decile in range(8):
        # Define train and test dfs
        df_train, df_test = add_decile_test_data_to_train_data(df_train, df_test, decile)

        # Separate features and labels
        X_train, y_train, X_test, y_test = separate_feats_labs(df_train, df_test)

        # Save results
        results = []

        # Compare different models
        results = compare_models(models, X_train, y_train, X_test, y_test, results)

        # add random baseline/dummy
        results = add_random_dummy(y_train, y_test, results)

        # convert results to dataframe
        results_df = pd.DataFrame(results)

        percentage = (decile + 1) * 10

        results_df['Percentage'] = percentage

        output_df = pd.concat([output_df, results_df], ignore_index=True)
        
    file_path = f"../Evaluation/Percentage/train_{extract_language_from_df(df_train)}_test_{extract_language_from_df(df_test)}_percentage.csv"

    # save the data as csv
    output_df.to_csv(file_path, index=False)

In [25]:
for df in western_dfs:
    train_with_test_data(df_train=df, df_test=df_urdu)
    train_with_test_data(df_train=df_urdu, df_test=df)

  return bound(*args, **kwds)
Parameters: { "use_label_encoder" } are not used.

  return bound(*args, **kwds)
Parameters: { "use_label_encoder" } are not used.

  return bound(*args, **kwds)
Parameters: { "use_label_encoder" } are not used.

  return bound(*args, **kwds)
Parameters: { "use_label_encoder" } are not used.

  return bound(*args, **kwds)
Parameters: { "use_label_encoder" } are not used.

  return bound(*args, **kwds)
Parameters: { "use_label_encoder" } are not used.

  return bound(*args, **kwds)
Parameters: { "use_label_encoder" } are not used.

  return bound(*args, **kwds)


KeyboardInterrupt: 