# Cross-lingual prediction

In [98]:
# dependencies
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

In [99]:
# data
# Cross-lingual setup: train on one language's feature file, test on another.
# Forward slashes are used so the relative paths resolve on Windows, Linux
# and macOS alike (backslash separators only work on Windows).
# To evaluate a different language pair, point these two paths at the
# corresponding features_<language>.csv file (e.g. German or Italian).
df_train = pd.read_csv('../Data/Urdu/features_urdu.csv')
df_test = pd.read_csv('../Data/English/features_english.csv')


In [100]:
# Separate features and labels.
# These columns are metadata or targets, not acoustic features.
non_feature_cols = ["emotion", "speaker_id", "filename", "valence"]

X_train = df_train.drop(columns=non_feature_cols)  # Features
y_train = df_train['valence']

X_test = df_test.drop(columns=non_feature_cols)  # Features
y_test = df_test['valence']

# The two corpora come from separate CSV files, so make sure they expose the
# same feature set and present it in the same column order before predicting.
mismatched_cols = set(X_train.columns) ^ set(X_test.columns)
assert not mismatched_cols, f"Train/test feature columns differ: {sorted(mismatched_cols)}"
X_test = X_test[X_train.columns]

# Define models for comparison
models = {
    'SVM (linear)': SVC(kernel='linear', C=1.0, random_state=42, probability=True),
    'SVM (rbf)': SVC(kernel='rbf', C=1.0, random_state=42, probability=True),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` is no longer accepted by XGBoost and only triggered a
    # "Parameters ... are not used" warning, so it is omitted here.
    'Gradient Boosting': XGBClassifier(n_estimators=100, random_state=42, eval_metric='mlogloss')
}

# Save results (reset here so re-running this cell does not duplicate rows)
results = []

# Compare different models
for name, model in models.items():

    # Define pipeline: Scaling + Model.
    # The scaler is fitted on the training language only and then applied to
    # the test language inside `predict`.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', model)
    ])

    # Fit on the training-language data, predict on the test-language data
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Metrics calculation.
    # zero_division=0 yields the same value as the default but without a
    # warning when a class is never predicted; it matches the baseline cell.
    accuracy = accuracy_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # Save results
    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Balanced Accuracy': balanced_accuracy,
        'Precision (Weighted)': precision,
        'Recall (Weighted)': recall,
        'F1-Score (Weighted)': f1
    })

Parameters: { "use_label_encoder" } are not used.



In [101]:
# add random baseline/dummy
# Draw random predictions according to the class distribution of the training
# labels, so the baseline really is "stratified" as its name says. A fixed
# seed makes the baseline row reproducible across kernel restarts.
baseline_name = 'stratified Dummy'
baseline_rng = np.random.default_rng(42)
class_priors = y_train.value_counts(normalize=True).sort_index()
y_random = baseline_rng.choice(
    class_priors.index.to_numpy(),
    size=len(y_test),
    replace=True,
    p=class_priors.to_numpy(),
)

# Metrics for Random Baseline
random_accuracy = accuracy_score(y_test, y_random)
random_balanced_accuracy = balanced_accuracy_score(y_test, y_random)
random_precision = precision_score(y_test, y_random, average='weighted', zero_division=0)
random_recall = recall_score(y_test, y_random, average='weighted', zero_division=0)
random_f1 = f1_score(y_test, y_random, average='weighted', zero_division=0)

# Save Random Baseline results.
# Drop any baseline row left by an earlier run of this cell so that
# re-executing it does not add duplicates to the results table.
results = [row for row in results if row['Model'] != baseline_name]
results.append({
    'Model': baseline_name,
    'Accuracy': random_accuracy,
    'Balanced Accuracy': random_balanced_accuracy,
    'Precision (Weighted)': random_precision,
    'Recall (Weighted)': random_recall,
    'F1-Score (Weighted)': random_f1
})

In [102]:
# Build the comparison table, best accuracy first, and show it
results_df = (
    pd.DataFrame(results)
    .sort_values(by='Accuracy', ascending=False)
)
results_df

Unnamed: 0,Model,Accuracy,Balanced Accuracy,Precision (Weighted),Recall (Weighted),F1-Score (Weighted)
1,SVM (rbf),0.54375,0.54375,0.543757,0.54375,0.543732
2,Logistic Regression,0.5125,0.5125,0.518162,0.5125,0.471296
0,SVM (linear),0.510417,0.510417,0.522243,0.510417,0.435365
4,Gradient Boosting,0.495833,0.495833,0.492785,0.495833,0.436292
5,stratified Dummy,0.485417,0.485417,0.485416,0.485417,0.485414
3,Random Forest,0.460417,0.460417,0.460167,0.460417,0.45957


In [None]:
# save the data as csv
# The file name must reflect the language pair actually evaluated in this
# notebook (trained on Urdu, tested on English); keep it in sync with the
# paths in the data-loading cell when switching languages.
results_df.to_csv("../Evaluation/Crosslingual/train_urdu_test_english.csv", index=False)