# Cross-lingual prediction

In [140]:
# dependencies
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

In [141]:
# data
# Per-language feature tables. Forward slashes are used because they resolve on
# Windows, Linux and macOS alike; backslash separators only work on Windows.
df_urdu = pd.read_csv('../Data/Urdu/features_urdu.csv')
df_english = pd.read_csv('../Data/English/features_english.csv')
df_german = pd.read_csv('../Data/German/features_german.csv')
df_italian = pd.read_csv('../Data/Italian/features_italian.csv')

# Pooled German + English + Italian rows, re-indexed 0..n-1
df_western = pd.concat([df_german, df_english, df_italian], ignore_index=True)

In [142]:
def add_decile_test_data_to_train_data(df_train, df_test, decile: int = 0):
    """Move the leading `decile` tenths of `df_test` into `df_train`.

    `df_test` is cut, in row order, into 10 consecutive chunks of (near-)equal
    size. The first `decile` chunks are appended to `df_train`; the remaining
    chunks are returned as the new test set, so no row ends up in both frames.

    Parameters
    ----------
    df_train : pd.DataFrame
        Base training data. The caller's frame is not modified.
    df_test : pd.DataFrame
        Data to be split between training and testing. Not modified.
    decile : int, default 0
        Number of chunks (0 to 8) moved into the training data, i.e. 0-80 % of
        `df_test`. With 0 nothing is moved and the full `df_test` stays
        held out.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        The enlarged training frame and the reduced test frame, both with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If `decile` is outside the range 0..8.
    """
    if not (0 <= decile <= 8):
        # Raise (not return) the error so callers that unpack the result fail
        # with a clear message instead of an unrelated unpacking error.
        raise ValueError("Value Error: Input out of range")

    # Chunk sizes follow np.array_split: the first len(df_test) % 10 chunks get
    # one extra row. Row positions are split instead of the frame itself, which
    # avoids passing a DataFrame through np.array_split (that path emits a
    # deprecation warning on recent pandas versions).
    chunk_positions = np.array_split(np.arange(len(df_test)), 10)
    n_moved = sum(len(chunk) for chunk in chunk_positions[:decile])

    if n_moved == 0:
        # Nothing to move: keep the two sets fully disjoint instead of leaking
        # the first chunk into training while it is still part of the test set.
        return df_train.reset_index(drop=True), df_test.reset_index(drop=True)

    df_train = pd.concat([df_train, df_test.iloc[:n_moved]], ignore_index=True)

    # remove used data from test data
    df_test = df_test.iloc[n_moved:].reset_index(drop=True)

    return df_train, df_test

In [143]:
# Define train and test dfs
# decile=7 moves the first 7 of 10 chunks of the Italian data into the Urdu
# training data; the remaining chunks form the held-out test set.
df_train, df_test = add_decile_test_data_to_train_data(df_train=df_urdu, df_test=df_italian, decile=7)

# Separate features and labels
# Metadata columns and the target itself are excluded from the feature matrix.
non_feature_columns = ["emotion", "speaker_id", "filename", "valence"]

X_train = df_train.drop(columns=non_feature_columns)  # Features
y_train = df_train['valence']

X_test = df_test.drop(columns=non_feature_columns)  # Features
y_test = df_test['valence']

# Define models for comparison
# `use_label_encoder` is no longer passed to XGBClassifier: the installed
# version ignores it and only prints a "Parameters ... are not used" warning.
models = {
    'SVM (linear)': SVC(kernel='linear', C=1.0, random_state=42, probability=True),
    'SVM (rbf)': SVC(kernel='rbf', C=1.0, random_state=42, probability=True),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': XGBClassifier(n_estimators=100, random_state=42, eval_metric='mlogloss')
}

# Save results: one dict of metrics per model
results = []

# Compare different models
for name, model in models.items():

    # Define pipeline: Scaling + Model
    # The scaler is fitted on the training data only, inside the pipeline.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', model)
    ])

    # Fit the model on training data and predict the held-out test data
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Metrics calculation
    # zero_division=0 scores a class that is never predicted as 0 without
    # emitting a warning, consistent with the random-baseline cell.
    accuracy = accuracy_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # Save results
    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Balanced Accuracy': balanced_accuracy,
        'Precision (Weighted)': precision,
        'Recall (Weighted)': recall,
        'F1-Score (Weighted)': f1
    })

  return bound(*args, **kwds)
Parameters: { "use_label_encoder" } are not used.



In [144]:
# add random baseline/dummy
# Draw one random label per test row, using the class frequencies observed in
# y_train as sampling probabilities, so the baseline really is the "stratified"
# dummy its result row is labelled as. The fixed seed makes the baseline
# reproducible, in line with random_state=42 used for the models.
rng = np.random.default_rng(42)
class_priors = y_train.value_counts(normalize=True)
y_random = rng.choice(
    class_priors.index.to_numpy(),
    size=len(y_test),
    replace=True,
    p=class_priors.to_numpy(),
)

# Metrics for Random Baseline
# zero_division=0 is applied to all three scores so a class that is never
# drawn is scored as 0 without a warning.
random_accuracy = accuracy_score(y_test, y_random)
random_balanced_accuracy = balanced_accuracy_score(y_test, y_random)
random_precision = precision_score(y_test, y_random, average='weighted', zero_division=0)
random_recall = recall_score(y_test, y_random, average='weighted', zero_division=0)
random_f1 = f1_score(y_test, y_random, average='weighted', zero_division=0)

# Save Random Baseline results
# Drop a baseline row left over from an earlier run of this cell first, so
# re-running the cell does not add duplicate rows to the results.
results = [row for row in results if row['Model'] != 'stratified Dummy']
results.append({
    'Model': 'stratified Dummy',
    'Accuracy': random_accuracy,
    'Balanced Accuracy': random_balanced_accuracy,
    'Precision (Weighted)': random_precision,
    'Recall (Weighted)': random_recall,
    'F1-Score (Weighted)': random_f1
})

In [145]:
# Display results
# Build the comparison table from the collected metric rows, then rank it so
# the model with the highest balanced accuracy comes first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Balanced Accuracy', ascending=False)
results_df

Unnamed: 0,Model,Accuracy,Balanced Accuracy,Precision (Weighted),Recall (Weighted),F1-Score (Weighted)
0,SVM (linear),0.611429,0.576531,0.616571,0.611429,0.572857
2,Logistic Regression,0.611429,0.576531,0.616571,0.611429,0.572857
4,Gradient Boosting,0.605714,0.575603,0.602957,0.605714,0.577601
3,Random Forest,0.6,0.557978,0.613867,0.6,0.539975
1,SVM (rbf),0.588571,0.543599,0.599654,0.588571,0.516735
5,stratified Dummy,0.542857,0.53757,0.544146,0.542857,0.543431


In [146]:
# save the data as csv
# NOTE(review): the split cell above passes decile=7 (7 of 10 chunks moved into
# the training data), while this file name says "80percent" - confirm which of
# the two is intended before relying on the file name.
results_path = "../Evaluation/Percentage/train_urdu_test_italian_80percent.csv"
results_df.to_csv(results_path, index=False)