In [1]:
# Mount Google Drive into the Colab filesystem at /content/gdrive/.
# This only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [2]:
import csv
import numpy as np
from scipy.stats import pearsonr

# --- Input / output locations (on the mounted Google Drive) -----------------
input_path1 = '/content/gdrive/MyDrive/PerfPred/Experiment/data/data_na_kept.csv'
input_path2 = '/content/gdrive/MyDrive/PerfPred/Experiment/data/data_na_disc.csv'
output_path1 = '/content/gdrive/MyDrive/PerfPred/Experiment/data/correlation1.csv'
output_path2 = '/content/gdrive/MyDrive/PerfPred/Experiment/data/correlation2.csv'

# --- Variable name -> zero-based CSV column index ---------------------------
# The *_DIST columns sit at the same indices in both layouts, so they are
# declared once and unpacked into each mapping below.
_dist_columns = {
    'GEO_DIST': 9,
    'GEN_DIST': 10,
    'SYN_DIST': 11,
    'PHO_DIST': 12,
    'INV_DIST': 13,
    'FEA_DIST': 14,
}

# Variables correlated in the single-stage analysis (stage == 1).
col_mapping1 = {'TRAIN_SIZE': 4, 'TRAIN_JSD': 5, **_dist_columns}

# Variables correlated in the two-stage analysis (any other stage value).
col_mapping2 = {
    'TRAIN1_SIZE': 1,
    'TRAIN1_JSD': 2,
    'TRAIN2_SIZE': 4,
    'TRAIN2_JSD': 5,
    **_dist_columns,
}

# Values matched against column 8 when correlating per language.
langs = ['ka', 'gu', 'hi', 'si', 'ta']

# Column index of the score every variable is correlated against.
spBLEU = 15

In [3]:
def calculate_and_write(x, y, var, lang, reader, results):
    """Compute the Pearson correlation of ``x`` and ``y`` and record it.

    Despite the name, nothing is written to disk here: one
    ``(var, lang, coefficient, p_value)`` tuple is appended to ``results``.

    Args:
        x: Sequence of numeric values passed to ``pearsonr``.
        y: Sequence of numeric values passed to ``pearsonr``.
        var: Label stored as the first element of the recorded tuple.
        lang: Label stored as the second element of the recorded tuple.
        reader: Not used by this function.
        results: List that receives the new tuple (mutated in place).
    """
    # pearsonr's result is indexable/unpackable as (coefficient, p-value).
    pearson_coefficient, p_value = pearsonr(x, y)
    results.append((var, lang, pearson_coefficient, p_value))

In [4]:
def filtered(reader, var, stage, filter_lang=None):
    """Extract paired (variable, spBLEU) value lists from CSV rows.

    Args:
        reader: Iterable of rows, each an indexable sequence of strings.
        var: Key into ``col_mapping1`` when ``stage == 1``, otherwise into
            ``col_mapping2``; selects the column used for the x values.
        stage: ``1`` selects the single-stage column layout and additionally
            drops every row whose column 1 is not the string ``'0'``.
        filter_lang: When truthy, only rows whose column 8 equals this value
            are kept (presumably the language code; confirm against the CSV).

    Returns:
        A tuple ``(x, y)`` of equal-length float lists: the selected variable
        column and the ``spBLEU`` column of every kept row.
    """
    single_stage = stage == 1
    mapping = col_mapping1 if single_stage else col_mapping2

    xs, ys = [], []
    for record in reader:
        # Column 1 is only inspected in the single-stage case.
        keep = not (single_stage and record[1] != '0')
        if keep and filter_lang:
            keep = record[8] == filter_lang
        if not keep:
            continue
        xs.append(float(record[mapping[var]]))
        ys.append(float(record[spBLEU]))
    return xs, ys

In [5]:
def write_to_csv(output_path, results):
    """Write the collected correlation rows to a CSV file, then empty the list.

    Args:
        output_path: Destination file path; an existing file is overwritten.
        results: List of row sequences to write after the header line.
            The list is cleared in place once the file has been written.
    """
    header = ['Variable', 'Lang', 'Pearson Correlation Coefficient', 'p-value']
    with open(output_path, 'w', newline='') as sink:
        out = csv.writer(sink)
        out.writerow(header)
        out.writerows(results)
    # The caller's list is emptied so it can be reused for the next run.
    results.clear()

In [6]:
def nested_for_loop(stage, reader, results):
    """Correlate every mapped variable with spBLEU, overall and per language.

    For each variable of the stage's column mapping, one 'overall' result is
    appended to ``results``, followed by one result per entry of ``langs``.

    The data rows are read from ``reader`` once and reused for every
    variable/language pair, so the function needs only the arguments it is
    given and does not re-read the underlying file.

    Args:
        stage: ``1`` uses ``col_mapping1``; any other value uses
            ``col_mapping2``.
        reader: Iterable of CSV rows positioned at the first data row, i.e.
            the header row must already have been consumed by the caller.
        results: List that receives the result tuples (mutated in place).
    """
    col_mapping = col_mapping1 if stage == 1 else col_mapping2

    # Materialise the remaining rows so they can be scanned repeatedly.
    rows = list(reader)

    for var in col_mapping:
        # Direct: all rows, no language filter.
        x, y = filtered(rows, var, stage)
        calculate_and_write(x, y, var, 'overall', reader, results)

        # Filter by lang
        for lang in langs:
            x, y = filtered(rows, var, stage, filter_lang=lang)
            calculate_and_write(x, y, var, lang, reader, results)

1 Stage

In [None]:
# Single stage
# Accumulates one tuple per computed correlation.
results = []

with open(input_path1, 'r') as file:
    reader = csv.reader(file)
    next(reader)  # skip the first row (presumably the header; confirm)

    # stage=1 selects the col_mapping1 variables.
    nested_for_loop(1, reader, results)

# Writes the collected rows to output_path1; write_to_csv also empties `results`.
write_to_csv (output_path1, results)



In [None]:
# Two stages
# Accumulates one tuple per computed correlation.
results = []

with open(input_path2, 'r') as file:
    reader = csv.reader(file)
    next(reader)  # skip the first row (presumably the header; confirm)

    # stage=2 selects the col_mapping2 variables.
    nested_for_loop(2, reader, results)

# Writes the collected rows to output_path2; write_to_csv also empties `results`.
write_to_csv (output_path2, results)

For fun: Size vs JSD (Single stage)

In [21]:
# Pearson correlation between TRAIN_SIZE and TRAIN_JSD, using only the rows
# whose column 1 is the string '0'.
size_idx = col_mapping1['TRAIN_SIZE']
jsd_idx = col_mapping1['TRAIN_JSD']

x, y = [], []  # x: size, y: jsd
with open(input_path1, 'r') as file:
  reader = csv.reader(file)
  next(reader)  # skip the first row
  for row in reader:
    if row[1] == '0':
      x.append(float(row[size_idx]))
      y.append(float(row[jsd_idx]))

pearsonr(x, y)

PearsonRResult(statistic=-0.05634358070211624, pvalue=0.46023935055373405)

Size1 vs JSD1 (Two stage)

In [19]:
# Pearson correlation between TRAIN1_SIZE and TRAIN1_JSD over every data row.
size1_idx = col_mapping2['TRAIN1_SIZE']
jsd1_idx = col_mapping2['TRAIN1_JSD']

x, y = [], []  # x: size1, y: jsd1
with open(input_path2, 'r') as file:
  reader = csv.reader(file)
  next(reader)  # skip the first row
  for row in reader:
    x.append(float(row[size1_idx]))
    y.append(float(row[jsd1_idx]))

pearsonr(x, y)

PearsonRResult(statistic=0.08559280436265042, pvalue=0.05064613004766375)

In [20]:
# Pearson correlation between TRAIN2_SIZE and TRAIN2_JSD over every data row.
size2_idx = col_mapping2['TRAIN2_SIZE']
jsd2_idx = col_mapping2['TRAIN2_JSD']

x, y = [], []  # x: size2, y: jsd2
with open(input_path2, 'r') as file:
  reader = csv.reader(file)
  next(reader)  # skip the first row
  for row in reader:
    x.append(float(row[size2_idx]))
    y.append(float(row[jsd2_idx]))

pearsonr(x, y)

PearsonRResult(statistic=-0.0600808197288106, pvalue=0.17048927767052108)

Misc

In [None]:
    # # SIZE1 * SIZE2
    # x = []
    # file.seek(0)
    # next(reader)
    # x = []
    # for row in reader:
    #   x_val = float(row[col_mapping['TRAIN1_SIZE']]) * float(row[col_mapping['TRAIN2_SIZE']])
    #   x.append(x_val)

    # calculate_and_write(x, y, 'SIZE1 * SIZE2', reader)

    # # JSD1 * JSD2
    # x = []
    # file.seek(0)
    # next(reader)
    # x = []
    # for row in reader:
    #   x_val = float(row[col_mapping['TRAIN1_JSD']]) * float(row[col_mapping['TRAIN2_JSD']])
    #   x.append(x_val)

    # calculate_and_write(x, y, 'JSD1 * JSD2', reader)

    # # SIZE1 / JSD1
    # x = []
    # file.seek(0)
    # next(reader)
    # x = []
    # for row in reader:
    #   x_val = float(row[col_mapping['TRAIN1_SIZE']]) * float(row[col_mapping['TRAIN1_JSD']])
    #   x.append(x_val)

    # calculate_and_write(x, y, 'SIZE1 / JSD1', reader)

    # # SIZE2 / JSD2
    # x = []
    # file.seek(0)
    # next(reader)
    # x = []
    # for row in reader:
    #   x_val = float(row[col_mapping['TRAIN2_SIZE']]) * float(row[col_mapping['TRAIN2_JSD']])
    #   x.append(x_val)

    # calculate_and_write(x, y, 'SIZE2 / JSD2', reader)

    # # JSD1 / SIZE1
    # x = []
    # file.seek(0)
    # next(reader)
    # x = []
    # for row in reader:
    #   x_val = float(row[col_mapping['TRAIN1_JSD']]) * float(row[col_mapping['TRAIN1_SIZE']])
    #   x.append(x_val)

    # calculate_and_write(x, y, 'JSD1/ SIZE1', reader)

    # # JSD2 / SIZE2
    # x = []
    # file.seek(0)
    # next(reader)
    # x = []
    # for row in reader:
    #   x_val = float(row[col_mapping['TRAIN2_JSD']]) * float(row[col_mapping['TRAIN2_SIZE']])
    #   x.append(x_val)

    # calculate_and_write(x, y, 'JSD2/ SIZE2', reader)

    # # Lang/ combined size
    # for var in col_mapping:
    #     if (col_mapping[var] < 8):
    #       continue
    #     file.seek(0)
    #     next(reader)
    #     x = []
    #     for row in reader:
    #       combined_size = float(row[col_mapping['TRAIN1_SIZE']]) + float(row[col_mapping['TRAIN2_SIZE']])
    #       x_val = float(row[col_mapping[var]]) / combined_size
    #       x.append(x_val)

    #     calculate_and_write(x, y, var + '/ SIZES', reader)