In [None]:
# Load the previously saved model for the PCA plot (most of this code repeats earlier cells)

In [None]:
# Locations of the saved artifacts under the Kaggle working directory
classifier_save_path = "/kaggle/working/saved_classifier"
tokenizer_save_path = "/kaggle/working/saved_tokenizer"
model_save_path = "/kaggle/working/saved_model"

# Restore the BERT encoder and its tokenizer from those directories
model_loaded = TFAutoModel.from_pretrained(model_save_path)
tokenizer_loaded = AutoTokenizer.from_pretrained(tokenizer_save_path)

In [None]:
# Function to tokenize data with reduced max_length
def tokenize(batch):
    """Tokenize the ``text_combined`` entries of a batch.

    Every sequence is padded to, and truncated at, 128 tokens so that all
    encoded examples have the same length.
    """
    texts = batch["text_combined"]
    return tokenizer_loaded(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128,
    )

# Country codes whose original-language text is already English
english_speaking_countries = ['gb']
data_dir = "/kaggle/input/data-all"

def load_data(file_path):
    """Read one per-country TSV file and return its usable rows.

    The country code is the second dash-separated field of the file *name*.
    Only the base name is inspected: directories in the path may themselves
    contain dashes (``data_dir`` does), which would otherwise shift the
    fields and make the English check below never match.

    Files whose code is listed in ``english_speaking_countries`` take their
    text from the ``text`` column; all others use ``text_en``.

    Args:
        file_path: Path to a tab-separated file.

    Returns:
        A DataFrame with the columns ``text_combined``, ``label`` and
        ``file_path``. Rows whose text is missing or empty are removed.
        If the file cannot be read, the error is printed and an empty
        DataFrame is returned.
    """
    name_parts = os.path.basename(file_path).split('-')
    # A name without a dash has no country field; treat it as non-English.
    country_code = name_parts[1] if len(name_parts) > 1 else None
    try:
        df = pd.read_csv(file_path, delimiter='\t', quoting=3, on_bad_lines='skip')
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return pd.DataFrame()

    source_column = 'text' if country_code in english_speaking_countries else 'text_en'
    df['text_combined'] = df[source_column]
    # Drop rows where 'text_combined' is NaN or empty
    has_text = df['text_combined'].notna() & (df['text_combined'] != '')
    # Copy so that adding a column below does not write into a view of df.
    df = df.loc[has_text].copy()
    df['file_path'] = file_path  # Add file path information
    return df[['text_combined', 'label', 'file_path']]


# Function to split data within each country
def train_val_test_split_country(df, test_size=0.2, val_size=0.1):
    """Split one frame into train, validation and test partitions.

    ``test_size`` and ``val_size`` are fractions of the full frame. The
    validation share is rescaled because it is carved out of the rows that
    remain once the test rows have been set aside. Both splits use
    ``random_state=42``.

    Returns:
        Tuple ``(train, val, test)``.
    """
    remainder, test_part = train_test_split(df, test_size=test_size, random_state=42)
    val_fraction = val_size / (1 - test_size)
    train_part, val_part = train_test_split(remainder, test_size=val_fraction, random_state=42)
    return train_part, val_part, test_part


In [None]:
# Combine data from all files, splitting within each country first
train_dfs, val_dfs, test_dfs = [], [], []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the concatenated splits identical from run to run.
for filename in sorted(os.listdir(data_dir)):
    if not filename.endswith(".tsv"):
        continue
    file_path = os.path.join(data_dir, filename)
    df = load_data(file_path)
    if df.empty:
        # Unreadable file or no usable rows: nothing to split.
        continue
    train_df, val_df, test_df = train_val_test_split_country(df)
    train_dfs.append(train_df)
    val_dfs.append(val_df)
    test_dfs.append(test_df)

if not train_dfs:
    # pd.concat would raise a ValueError here anyway; say why.
    raise ValueError(f"No usable .tsv files found in {data_dir}")

# Concatenate all country-specific splits
train_data = pd.concat(train_dfs, ignore_index=True)
val_data = pd.concat(val_dfs, ignore_index=True)
test_data = pd.concat(test_dfs, ignore_index=True)

In [None]:
# Wrap each pandas split in a Dataset object (train, validation, test in that order)
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_frame)
    for split_frame in (train_data, val_data, test_data)
)

In [None]:
# Run the tokenizer over every split in batched mode
train_encoded, val_encoded, test_encoded = (
    split.map(tokenize, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

In [None]:
# Expose the model inputs, the label and the source path in TensorFlow format
tf_columns = ('input_ids', 'attention_mask', 'token_type_ids', 'label', 'file_path')
for encoded_split in (train_encoded, val_encoded, test_encoded):
    # Each split receives its own list, as in three separate calls.
    encoded_split.set_format('tf', columns=list(tf_columns))


In [None]:
# Convert datasets to TensorFlow format
def to_tf_dataset(encoded_dataset, batch_size):
    """Build a batched ``tf.data.Dataset`` from an encoded dataset.

    Each element is a 3-tuple ``(features, label, file_path)`` where
    ``features`` is a dict holding ``input_ids``, ``attention_mask`` and
    ``token_type_ids``. Every feature is declared as an int32 vector of
    length 128, the label as a scalar int64 and the path as a scalar string.

    Args:
        encoded_dataset: Iterable of examples indexable by the keys above
            plus ``label`` and ``file_path``.
        batch_size: Number of examples per batch.
    """
    feature_keys = ('input_ids', 'attention_mask', 'token_type_ids')

    def iter_examples():
        for row in encoded_dataset:
            features = {key: row[key] for key in feature_keys}
            yield features, row['label'], row['file_path']

    feature_spec = {
        key: tf.TensorSpec(shape=(128,), dtype=tf.int32) for key in feature_keys
    }
    label_spec = tf.TensorSpec(shape=(), dtype=tf.int64)
    path_spec = tf.TensorSpec(shape=(), dtype=tf.string)

    unbatched = tf.data.Dataset.from_generator(
        iter_examples,
        output_signature=(feature_spec, label_spec, path_spec),
    )
    return unbatched.batch(batch_size)

BATCH_SIZE = 64

# Build the batched tf.data pipelines for the three splits
train_dataset = to_tf_dataset(train_encoded, BATCH_SIZE)
# The shuffle runs on the already-batched pipeline, so it reorders whole batches.
train_dataset = train_dataset.shuffle(1000)
val_dataset = to_tf_dataset(val_encoded, BATCH_SIZE)
test_dataset = to_tf_dataset(test_encoded, BATCH_SIZE)

In [None]:
import re
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Ensure inline plotting for Jupyter notebooks
%matplotlib inline

# Extract country code from file path
def extract_country_code(file_path):
    """Return the country code embedded in a training-file path.

    Searches for ``orientation-<code>-train.tsv`` where ``<code>`` is two
    lowercase letters, optionally followed by a dash and two more
    (for example ``gb`` or ``es-ct``).

    Args:
        file_path: Path or file name to inspect.

    Returns:
        The matched code, or ``None`` (after printing a notice) when the
        pattern is not found.
    """
    # The dot is escaped so that only a literal ".tsv" extension matches.
    match = re.search(r'orientation-([a-z]{2}(?:-[a-z]{2})?)-train\.tsv', file_path)
    if match is None:
        print(f"No match found for: {file_path}")
        return None
    return match.group(1)

# Derive each training row's country code from the file it was read from
train_data['country_code'] = train_data['file_path'].map(extract_country_code)

# Function to get BERT representations for the train data
def get_bert_representations(dataset, mask_padding=True):
    """Encode every batch with the loaded model and mean-pool the token states.

    Args:
        dataset: Iterable of ``(inputs, label, file_paths)`` batches, where
            ``inputs`` is a dict of model inputs that includes
            ``attention_mask``.
        mask_padding: When true (default), the mean is taken only over
            positions whose attention mask is non-zero. Pass ``False`` to
            average over every position, padding included.

    Returns:
        Tuple of three NumPy arrays: the pooled representations (one row per
        example), the labels, and the country code extracted from each
        example's file path.
    """
    representations = []
    labels = []
    countries = []
    for inputs, label, file_paths in dataset:
        inputs = {key: tf.convert_to_tensor(val) for key, val in inputs.items()}
        outputs = model_loaded(inputs)[0]  # Get the last hidden state
        if mask_padding:
            # Sequences are padded to a fixed length, so a plain mean would
            # blend padding positions into the sentence vector. Weight each
            # position by its attention mask instead.
            mask = tf.cast(inputs['attention_mask'], outputs.dtype)[:, :, tf.newaxis]
            # Guard against an all-padding row to avoid dividing by zero.
            token_counts = tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)
            mean_representation = tf.reduce_sum(outputs * mask, axis=1) / token_counts
        else:
            mean_representation = tf.reduce_mean(outputs, axis=1)
        representations.extend(mean_representation.numpy())
        labels.extend(label.numpy())
        countries.extend(
            extract_country_code(fp.numpy().decode('utf-8')) for fp in file_paths
        )
    return np.array(representations), np.array(labels), np.array(countries)

# Get BERT representations for the train data.
# Despite its name, train_file_paths receives the per-example country codes
# (the third array returned by get_bert_representations).
train_representations, train_labels, train_file_paths = get_bert_representations(train_dataset)

# Average the representations inside every (country, label) group that has data
mean_representations = {}
for country in np.unique(train_file_paths):
    in_country = train_file_paths == country
    for label in (0, 1):
        country_label_indices = in_country & (train_labels == label)
        if not country_label_indices.any():
            continue
        group_rows = train_representations[country_label_indices]
        mean_representations[(country, label)] = group_rows.mean(axis=0)

# Prepare data for PCA: one row per (country, label) key, in dict order
countries_labels = list(mean_representations)
mean_reps = np.array([mean_representations[key] for key in countries_labels])

# Project the group means onto two principal components
pca = PCA(n_components=2)
pca_result = pca.fit_transform(mean_reps)

# Plot the PCA results
fig, ax = plt.subplots(figsize=(14, 7))

# 2-D coordinates of every group, addressable by its (country, label) key
coords_by_key = dict(zip(countries_labels, pca_result))

# One scatter call per label so that each colour gets a legend entry:
# label 0 is drawn in red ("left"), label 1 in blue ("right").
for label_value, color, legend_name in (
    (0, 'red', 'Left (label 0)'),
    (1, 'blue', 'Right (label 1)'),
):
    points = [xy for (_, lbl), xy in coords_by_key.items() if lbl == label_value]
    if points:
        xs, ys = zip(*points)
        ax.scatter(xs, ys, color=color, label=legend_name)

# Connect the two points of each country with a line and write the country
# code at the midpoint; countries with only one label present are skipped.
for country in sorted({c for c, _ in countries_labels}):
    left = coords_by_key.get((country, 0))
    right = coords_by_key.get((country, 1))
    if left is None or right is None:
        continue
    ax.plot([left[0], right[0]], [left[1], right[1]], 'k-')
    mid_x = (left[0] + right[0]) / 2
    mid_y = (left[1] + right[1]) / 2
    ax.text(mid_x, mid_y, country, fontsize=12, color='black', ha='center', va='center')

ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.set_title('PCA of Mean Representations for Left-Wing and Right-Wing Speeches by Country')
ax.grid(True)
ax.legend()
plt.show()