# Setup

In [None]:
from google.colab import auth
from google.auth import default
import gspread
import pandas as pd
import nltk
import math
import re
import typing as T
from nltk.corpus import stopwords
from itertools import product
import os

In [None]:
# Mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Authenticate the Colab user, fetch the default Google credentials and
# build the gspread client `gc` from them.
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

Mounted at /content/drive


# JS

In [None]:
# punctuation = [l.strip() for l in open('data/punctuation.txt').readlines()]
# Download the NLTK stopword corpus and keep the English stopword list in the
# module-level constant STOPWORDS.
nltk.download('stopwords')
STOPWORDS = stopwords.words('english')

# Add in punctuation, if desired (extend the STOPWORDS list, not the
# `stopwords` corpus reader imported from nltk.corpus)
# STOPWORDS += punctuation

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def isNumber(num: str) -> bool:
    """Report whether a piece of text parses as a floating-point number
    Args:
        num (str): piece of text
    Returns:
        bool: True iff ``float(num)`` succeeds; False when the
            conversion raises ValueError
    """
    try:
        float(num)
    except ValueError:
        # Not parseable as a float
        return False
    else:
        return True

# Patterns are compiled once and written as raw strings so that the regex
# escapes (\d, \w, \() are not interpreted as (invalid) string escapes.
_TIME_PATTERNS = (
    re.compile(r"[\d]+(pm|am)$"),           # e.g. 5pm
    re.compile(r"[\d]+:[\d]+(pm|am)?$"),    # e.g. 10:30 or 10:30am
)
_WORD_PATTERN = re.compile(r"\(?(\w+)\)?$")  # word with optional parentheses


def computeFreqDistribution(doc: str, stopwords: bool = False) -> nltk.FreqDist:
    """
    Computes the frequency of each normalized word in a document

    The document is split on whitespace. Each token is lower-cased and has
    leading/trailing ``. , ? ! " '`` characters stripped, then consolidated:
        * any token ``float()`` accepts (this includes e.g. "nan" and
          "inf") is counted as ``<NUMBER>``
        * clock times such as "5pm", "10:30" or "10:30am" are counted
          as ``<TIME>``
        * a word with an optional opening and/or closing parenthesis,
          e.g. "(word)", is counted as the bare word
        * any other token is counted as-is
    Tokens that are empty after normalization are dropped.
    Args:
        doc (str): string containing the entire document
        stopwords (bool): boolean flag indicating whether or not
            to remove the stopwords from the sentence. True indicates
            to remove the stopwords (the entries of the module-level
            STOPWORDS list).
    Returns:
        nltk.FreqDist: frequency distribution
    """
    # Build the exclusion set once so each membership test is O(1).
    excluded = set(STOPWORDS) if stopwords else set()

    consolidated_tokens = []
    for token in nltk.regexp_tokenize(doc, r'\S+'):
        w = token.lower().strip('.,?!"\'')
        if isNumber(w):
            w = "<NUMBER>"
        elif any(pattern.match(w) for pattern in _TIME_PATTERNS):
            w = "<TIME>"
        elif (m := _WORD_PATTERN.match(w)):
            w = m.group(1)

        # Filtering happens after consolidation, so "(the)" is removed as
        # the stopword "the" when stopword removal is enabled.
        if w != "" and w not in excluded:
            consolidated_tokens.append(w)

    return nltk.FreqDist(consolidated_tokens)

def computeUnigramDistribution(doc: str, n_words: T.Optional[int] = None, stopwords: bool = False) -> T.Tuple[dict, float]:
    """
    Computes the relative frequencies (i.e., probs) of the most common unigrams
        in a document
    Args:
        doc (str): string containing the entire document
        n_words (int, optional): Number of most common words to consider.
            Defaults to None, which keeps every word.
        stopwords: boolean flag indicating whether or not
            to remove the stopwords from the sentence. True indicates
            to remove the stopwords.
    Returns:
        dict: relative frequencies of the form dist[word] = prob, where the
            probabilities are normalized over the selected words only
        float: total number of occurrences of the selected words (the
            normalization constant)
    """
    fd = computeFreqDistribution(doc, stopwords)
    # most_common() orders by frequency; slicing fd.keys() would instead keep
    # the first-seen words, because the distribution is insertion-ordered.
    selected = fd.most_common(n_words)
    N = float(sum(count for _, count in selected))
    # With an empty selection the comprehension never divides, so N == 0.0
    # is safe here.
    dist = {word: count / N for word, count in selected}
    return (dist, N)

# Idea: Average prob of each word in both dist, include word if not present in dist1
def mergeDistributionJS(dist1: dict, dist2: dict) -> dict:
    """
    Builds the mixture distribution used in the JS divergence
    Args:
        dist1 (dict): probability distribution of the form dist1[word] = prob
        dist2 (dict): probability distribution of the form dist2[word] = prob
    Returns:
        dict: mixture over every word of either input, where each word maps
            to half its probability in dist1 plus half its probability in
            dist2 (a word missing from one input contributes nothing for it)
    """
    # Start from half of every dist1 probability ...
    mixture = {word: 0.5 * prob for word, prob in dist1.items()}
    # ... then fold in half of every dist2 probability.
    for word, prob in dist2.items():
        mixture[word] = mixture.get(word, 0.0) + 0.5 * prob
    return mixture

# Idea: Sum each probability in P weighted by the log ratio of P to M
def KLDivergence(P: dict, M: dict, log_base: float = math.e) -> float:
    r"""
    Computes the KL divergence for two distributions
        KL(P||M) = \sum_{x \in X}[p(x) * \log(p(x)/m(x))]
    Terms with p(x) = 0 contribute 0 (the usual 0 * log 0 = 0 convention).
    Args:
        P (dict): probability distribution of words
        M (dict): probability distribution of words; must contain every
            word of P that has non-zero probability
        log_base (float): Base value to use for log.
            Defaults to Euler's constant
    Returns:
        float: KL divergence of two distributions
    Raises:
        KeyError: if a word with p(x) > 0 is missing from M
        ZeroDivisionError: if m(x) is 0 for a word with p(x) > 0
    """
    div = 0.0
    for key, p in P.items():
        if p == 0:
            # math.log(0) would raise ValueError; the term is 0 by convention.
            continue
        div += p * math.log(p / M[key], log_base)
    return div

def JSDivergence(doc1: str, doc2: str, num_words: T.Optional[int] = None, log_base: float = math.e, stopwords: bool = False) -> float:
    """
    Calculates the JS Divergence value for two corpora
        JS(P||Q) = 1/2 * KL(P||M) + 1/2 * KL(Q||M), with M = 1/2 * (P + Q)
    The value lies between 0 and log_base(2); with the default natural
    log the upper bound is ln 2 (about 0.693).
    Args:
        doc1 (str): string containing the entire document
        doc2 (str): string containing the entire document
        num_words (int): number of most frequent words to
            consider. Defaults to all words.
        log_base (float): Base value to use for log.
            Defaults to Euler's constant
        stopwords (bool): boolean flag indicating whether or not
            to remove the stopwords from the sentence. True indicates
            to remove the stopwords.
    Returns:
        float: the JS divergence of the two corpora
    """
    P, _ = computeUnigramDistribution(doc1, num_words, stopwords)
    Q, _ = computeUnigramDistribution(doc2, num_words, stopwords)
    M = mergeDistributionJS(P, Q)
    # KLDivergence already takes its logarithms in `log_base`, so no further
    # base conversion is applied (dividing by ln(log_base) again would only be
    # a no-op for base e and would distort the result for any other base).
    return 0.5 * KLDivergence(P, M, log_base) + 0.5 * KLDivergence(Q, M, log_base)

# Single Domain

## Finding JSD

In [None]:
# Language codes to evaluate (keys of lang_map / gov_map / bible_map).
langs = ['gu', 'hi', 'ka', 'si', 'ta']
# Training-set identifiers; resolved to directory names by get_dataset_name.
trains = ['cc_align', 'pmo/gov', 'bible']
# Test-set identifiers; resolved to directory names by get_dataset_name.
tests = ['flores', 'bible', 'pmo/gov']
# Training-set sizes; each is used as a directory name in the train path,
# except '0k', which the JSD loop special-cases without reading any file.
train_sizes = ['0k', '1k', '10k', '25k', '50k', '100k']
# Label recorded in the 'test set size' column of the results.
test_size = '1k'

In [None]:
# Language code -> language directory name under the dataset root.
lang_map = {
    'gu': 'Gujarati',
    'hi': 'Hindi',
    'ka': 'Kannada',
    'si': 'Sinhala',
    'ta': 'Tamil'
}
# Language code -> directory name used for the 'pmo/gov' dataset.
gov_map = {
    'gu': 'PrimeMinisterCorpus',
    'hi': 'PrimeMinisterCorpus',
    'ka': 'PrimeMinisterCorpus',
    'si': 'government',
    'ta': 'government'
}
# Language code -> directory name used for the 'bible' dataset
# (note that 'si' uses a different directory than the other languages).
bible_map = {
    'gu': 'new_bible_g1',
    'hi': 'new_bible_g1',
    'ka': 'new_bible_g1',
    'si': 'new_bible_g2',
    'ta': 'new_bible_g1'
}

In [None]:
def get_dataset_name(lang, dataset):
  """Translate a dataset identifier into its directory name for a language.

  Args:
    lang: language code used to index bible_map / gov_map
    dataset: one of 'cc_align', 'bible', 'pmo/gov' or 'flores'
  Returns:
    The directory name, or None when the identifier is not recognized.
  """
  if dataset == 'cc_align':
    return 'cc_aligned'
  if dataset == 'bible':
    return bible_map[lang]
  if dataset == 'pmo/gov':
    return gov_map[lang]
  if dataset == 'flores':
    return 'flores'
  # Unrecognized identifiers fall through.
  return None

In [None]:
# Compute the JSD between the English side of every (train set, train size)
# and every test set, per language, collecting one row per combination.
df = pd.DataFrame(columns=['language', 'train set', 'train set size',
                           'test set', 'test set size', 'JSD'])
for lang, train, train_size, test in product(langs, trains, train_sizes, tests):
  if train_size == '0k':
    # No training file is read for '0k'; a fixed placeholder is recorded.
    # NOTE(review): JSDivergence is called below with its default natural-log
    # base, for which the divergence cannot exceed ln 2 (~0.693) - confirm
    # that 1 is the intended placeholder scale.
    jsd = 1
  else:
    path = 'drive/MyDrive/PerfPred/Dataset/' + lang_map[lang] + '/'
    train_path = path + 'train/' + get_dataset_name(lang, train) + '/' + train_size + '/train-en_XX.txt'
    test_path = path + 'test/' + get_dataset_name(lang, test) + '/test-en_XX.txt'
    try:
      # Context managers close the handles even if reading fails.
      with open(train_path, 'r', encoding='utf-8') as train_file:
        train_doc = train_file.read()
      with open(test_path, 'r', encoding='utf-8') as test_file:
        test_doc = test_file.read()
    except FileNotFoundError:
      # Combinations without data on disk are skipped entirely.
      continue
    jsd = JSDivergence(train_doc, test_doc, stopwords = True)

  row = {'language': lang,
         'train set': train,
         'train set size': train_size,
         'test set': test,
         'test set size': test_size,
         'JSD': jsd}

  df.loc[len(df.index)] = row
  print(lang, train, train_size, test)

gu cc_align 0k flores
gu cc_align 0k bible
gu cc_align 0k pmo/gov
gu cc_align 25k flores
gu cc_align 25k bible
gu cc_align 25k pmo/gov
gu cc_align 100k flores
gu cc_align 100k bible
gu cc_align 100k pmo/gov
gu pmo/gov 0k flores
gu pmo/gov 0k bible
gu pmo/gov 0k pmo/gov
gu pmo/gov 1k flores
gu pmo/gov 1k bible
gu pmo/gov 1k pmo/gov
gu pmo/gov 10k flores
gu pmo/gov 10k bible
gu pmo/gov 10k pmo/gov
gu pmo/gov 25k flores
gu pmo/gov 25k bible
gu pmo/gov 25k pmo/gov
gu bible 0k flores
gu bible 0k bible
gu bible 0k pmo/gov
gu bible 1k flores
gu bible 1k bible
gu bible 1k pmo/gov
gu bible 10k flores
gu bible 10k bible
gu bible 10k pmo/gov
gu bible 25k flores
gu bible 25k bible
gu bible 25k pmo/gov
hi cc_align 0k flores
hi cc_align 0k bible
hi cc_align 0k pmo/gov
hi cc_align 25k flores
hi cc_align 25k bible
hi cc_align 25k pmo/gov
hi cc_align 100k flores
hi cc_align 100k bible
hi cc_align 100k pmo/gov
hi pmo/gov 0k flores
hi pmo/gov 0k bible
hi pmo/gov 0k pmo/gov
hi pmo/gov 1k flores
hi pmo/gov

In [None]:
df

Unnamed: 0,language,train set,train set size,test set,test set size,JSD
0,gu,cc_align,0k,flores,1k,1.000000
1,gu,cc_align,0k,bible,1k,1.000000
2,gu,cc_align,0k,pmo/gov,1k,1.000000
3,gu,cc_align,25k,flores,1k,0.372864
4,gu,cc_align,25k,bible,1k,0.546171
...,...,...,...,...,...,...
169,ta,bible,10k,bible,1k,0.097009
170,ta,bible,10k,pmo/gov,1k,0.569990
171,ta,bible,25k,flores,1k,0.472301
172,ta,bible,25k,bible,1k,0.091791


In [None]:
# Pivot the long-format results into one row per (train set, train size,
# test set) with one JSD column per language, ready for the spreadsheet.
gsheet_df = pd.DataFrame(columns=['train set', 'train set size', 'test set',
                                  'test set size', 'ka', 'gu', 'hi', 'si', 'ta'])
for train, train_size, test in product(trains, train_sizes, tests):
  row = {
      'train set': train,
      'train set size': train_size,
      'test set': test,
      'test set size': test_size
  }
  for lang in langs:
    # Named `matches` rather than `slice` to avoid shadowing the builtin.
    matches = df[(df['train set'] == train) & (df['train set size'] == train_size) &
                 (df['test set'] == test) & (df['test set size'] == test_size) &
                 (df['language'] == lang)]
    # -1 marks a combination with no (unique) result for this language.
    row[lang] = matches['JSD'].iloc[0] if len(matches) == 1 else -1
  # Drop combinations for which no language has a result.
  if all(row[lang] == -1 for lang in langs):
    continue
  gsheet_df.loc[len(gsheet_df.index)] = row

In [None]:
# Open the worksheet at index 1 of the 'Experiment 1 Tab' spreadsheet and
# write the header row followed by every data row of gsheet_df.
worksheet = gc.open('Experiment 1 Tab').get_worksheet(1)
worksheet.update([gsheet_df.columns.values.tolist()] + gsheet_df.values.tolist())

{'spreadsheetId': '1V0fqzJOUg62M1RGGmd2LtKv0ZA76yYkY8Xo0iYaHr5w',
 'updatedRange': "'JSD-Single-Dom'!A1:I37",
 'updatedRows': 37,
 'updatedColumns': 9,
 'updatedCells': 333}

# Mixed Domain

## Finding JSD

In [None]:
# Language codes to evaluate (keys of lang_map / gov_map / bible_map).
langs = ['gu', 'hi', 'ka', 'si', 'ta']
# Each training mixture is a tuple of dataset identifiers; the mixed-domain
# loop joins their directory names with '+' to build the train path.
trains = [('cc_align', 'bible'), ('cc_align', 'pmo/gov'), ('bible', 'pmo/gov'),
          ('cc_align', 'pmo/gov', 'bible')]
# Test-set identifiers; resolved to directory names by get_dataset_name.
tests = ['pmo/gov', 'bible', 'flores']
# Size of each component of a mixture (repeated once per component in paths).
train_size = '25k'
# Label recorded in the 'test set size' column of the results.
test_size = '1k'

In [None]:
# Language code -> language directory name under the dataset root.
lang_map = {
    'gu': 'Gujarati',
    'hi': 'Hindi',
    'ka': 'Kannada',
    'si': 'Sinhala',
    'ta': 'Tamil'
}
# Language code -> directory name used for the 'pmo/gov' dataset.
gov_map = {
    'gu': 'PrimeMinisterCorpus',
    'hi': 'PrimeMinisterCorpus',
    'ka': 'PrimeMinisterCorpus',
    'si': 'government',
    'ta': 'government'
}
# Language code -> directory name used for the 'bible' dataset
# (note that 'si' uses a different directory than the other languages).
bible_map = {
    'gu': 'new_bible_g1',
    'hi': 'new_bible_g1',
    'ka': 'new_bible_g1',
    'si': 'new_bible_g2',
    'ta': 'new_bible_g1'
}

In [None]:
def get_dataset_name(lang, dataset):
  """Resolve a dataset identifier to the directory name used for `lang`.

  Args:
    lang: language code used to index bible_map / gov_map
    dataset: one of 'cc_align', 'bible', 'pmo/gov' or 'flores'
  Returns:
    The directory name; None if the identifier matches none of the above.
  """
  name = None
  if dataset == 'cc_align':
    name = 'cc_aligned'
  elif dataset == 'bible':
    name = bible_map[lang]
  elif dataset == 'pmo/gov':
    name = gov_map[lang]
  elif dataset == 'flores':
    name = 'flores'
  return name

In [None]:
# Compute the JSD between the English side of every mixed training set and
# every test set, per language, collecting one row per combination.
df = pd.DataFrame(columns=['language', 'train set', 'train set size',
                           'test set', 'test set size', 'JSD'])
for lang, train, test in product(langs, trains, tests):
  path = 'drive/MyDrive/PerfPred/Dataset/' + lang_map[lang] + '/'
  # Mixture directories are named "<dataset>+<dataset>[+...]" with a matching
  # "<size>+<size>[+...]" sub-directory.
  train_name = '+'.join([get_dataset_name(lang, data) for data in train])
  train_size_name = '+'.join([train_size] * len(train))
  train_path = path + 'train/mixed/' + train_name + '/' + train_size_name + '/train-en_XX.txt'
  test_path = path + 'test/' + get_dataset_name(lang, test) + '/test-en_XX.txt'
  try:
    # Context managers close the handles even if reading fails.
    with open(train_path, 'r', encoding='utf-8') as train_file:
      train_doc = train_file.read()
    with open(test_path, 'r', encoding='utf-8') as test_file:
      test_doc = test_file.read()
  except FileNotFoundError:
    # -1 marks a combination whose files are missing.
    jsd = -1
  else:
    jsd = JSDivergence(train_doc, test_doc, stopwords = True)

  # 'train set' keeps the tuple itself; the pivot step filters on it.
  row = {'language': lang,
         'train set': train,
         'train set size': train_size,
         'test set': test,
         'test set size': test_size,
         'JSD': jsd}

  df.loc[len(df.index)] = row

## Writing to GSheet

In [None]:
# Authenticate the Colab user again and rebuild the gspread client `gc`
# from the default Google credentials.
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

In [None]:
# Pivot the long-format results into one row per (train mixture, test set)
# with one JSD column per language, ready for the spreadsheet.
gsheet_df = pd.DataFrame(columns=['train set', 'train set size', 'test set',
                                  'test set size', 'ka', 'gu', 'hi', 'si', 'ta'])
for train, test in product(trains, tests):
  # The sheet shows the mixture and its sizes as '+'-joined strings.
  row = {
      'train set': '+'.join(train),
      'train set size': '+'.join([train_size] * len(train)),
      'test set': test,
      'test set size': test_size
  }
  for lang in langs:
    # Named `matches` rather than `slice` to avoid shadowing the builtin.
    # The filter uses the raw tuple / size stored in df, not the joined
    # display strings above.
    matches = df[(df['train set'] == train) & (df['train set size'] == train_size) &
                 (df['test set'] == test) & (df['test set size'] == test_size) &
                 (df['language'] == lang)]
    # -1 marks a combination with no (unique) result for this language.
    row[lang] = matches['JSD'].iloc[0] if len(matches) == 1 else -1
  gsheet_df.loc[len(gsheet_df.index)] = row

In [None]:
# Open the worksheet at index 0 of the 'Experiment 1 Data' spreadsheet and
# write the header row followed by every data row of gsheet_df.
worksheet = gc.open('Experiment 1 Data').get_worksheet(0)
worksheet.update([gsheet_df.columns.values.tolist()] + gsheet_df.values.tolist())

{'spreadsheetId': '1awrtFTqiNVT4hbbQlD2ZLB5mPj0BqkCAE4RgahtrtgQ',
 'updatedRange': "'Mixed Domain'!A1:I13",
 'updatedRows': 13,
 'updatedColumns': 9,
 'updatedCells': 117}