# Few-shot Learning with Multilingual Language Models (XGLM)

In [None]:
!pip install transformers
!pip install datasets

In [None]:
# Switch the working directory: later cells load 'x_story_cloze.py' and write
# the *.tsv result files using relative paths.
# NOTE(review): hard-coded absolute path; presumably requires Google Drive to be
# mounted at /content/drive first — confirm.
%cd /content/drive/MyDrive/PhD Julen Etxaniz/phd/datasets/XStoryCloze

## Introduction

In this work, we train a family of multilingual generative language models, dubbed XGLM, on a balanced corpus covering a diverse set of languages, and study their few- and zero-shot learning capabilities in a wide range of tasks. Our largest model with 7.5 billion parameters sets new state of the art in few-shot learning on more than 20 representative languages, outperforming GPT-3 of comparable size in multilingual commonsense reasoning (+7.4 accuracy points for 0-shot, +9.4 for 4-shot) and natural language inference (+5.4 for 0-shot, +5.4 for 4-shot). We have included a [model card](model_card.md) of XGLM for transparency and accountability.



## Data and Languages
XGLM models are trained on a new multilingual corpus extracted from CommonCrawl (CC100-XL), a significantly larger multilingual dataset covering 68 Common Crawl (CC) snapshots (from [Summer 2013](http://commoncrawl.org/2013/11/new-crawl-data-available/) to [March/April 2020](https://commoncrawl.org/2020/04/march-april-2020-crawl-archive-now-available/)), consisting of 134 languages. The detailed languages and data statistics are reported in the paper (Table A.1).



## Pre-trained models

Model | Layers | Model Dim | FFN Dim | Languages | Download
---|---|---|---|---|---
`XGLM 564M` | 24 | 1024 | 4096 | trained on 30 languages|  [xglm.564M.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xglm/xglm.564M.tar.gz)
`XGLM 1.7B` | 24 | 2048 | 8192 | trained on 30 languages|  [xglm.1.7B.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xglm/xglm.1.7B.tar.gz)
`XGLM 2.9B` | 48 | 2048 | 8192 | trained on 30 languages|  [xglm.2.9B.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xglm/xglm.2.9B.tar.gz)
`XGLM 7.5B` | 32 | 4096 | 16384 | trained on 30 languages|  [xglm.7.5B.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xglm/xglm.7.5B.tar.gz)
`XGLM 4.5B` | 48 | 2048 | 16384 | trained on 134 languages|  [xglm.4.5B.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xglm/xglm.4.5B.tar.gz)

## Evaluation



In [None]:
from transformers import XGLMTokenizer, XGLMForCausalLM

# Tokenizer and model are loaded from the same checkpoint.
checkpoint = "facebook/xglm-564M"
tokenizer = XGLMTokenizer.from_pretrained(checkpoint)
model = XGLMForCausalLM.from_pretrained(checkpoint)
# Switch to evaluation mode, then move the model to the GPU.
model.eval().cuda()

### XCOPA

In [None]:
from datasets import load_dataset

# Language codes of the XCOPA configurations to evaluate.
langs_xcopa = ["et", "ht", "it", "id", "qu", "sw", "zh", "ta", "th", "tr", "vi"]

# One dataset object per language, keyed by language code; later cells index
# each entry by split name (e.g. "validation", "test").
xcopa = {lang: load_dataset("xcopa", lang) for lang in langs_xcopa}

In [None]:
# Show the first "et" validation record to see the fields each example carries.
xcopa["et"]["validation"][0]

{'premise': 'Mees keeras kraani lahti.',
 'choice1': 'Tualett täitus veega.',
 'choice2': 'Tilast voolas vett.',
 'question': 'effect',
 'label': 1,
 'idx': 0,
 'changed': False}

In [None]:
import torch
import torch.nn.functional as F
from tqdm import tqdm
import pandas as pd

def get_logprobs(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    input_ids, output_ids = inputs["input_ids"], inputs["input_ids"][:, 1:]
    outputs = model(**inputs, labels=input_ids)
    logits = outputs.logits
    logprobs = torch.gather(F.log_softmax(logits, dim=2), 2, output_ids.unsqueeze(2))
    return logprobs

def XCOPA_eval(prompt, alternative1, alternative2):
    """Zero-shot Choice of Plausible Alternatives (COPA) prediction.

    Each alternative is appended to ``prompt`` on a new line and the summed
    token log-probabilities of the two resulting texts are compared.

    Returns:
        0 if the first alternative scores strictly higher, otherwise 1
        (ties therefore go to the second alternative).
    """
    scores = [
        get_logprobs(prompt + "\n" + candidate).sum()
        for candidate in (alternative1, alternative2)
    ]
    first_is_more_plausible = scores[0] > scores[1]
    if first_is_more_plausible:
        return 0
    return 1

# Example ids and gold labels are taken from the "et" test split.
# NOTE(review): assumes every language's test split is row-aligned with "et" — confirm.
results_xcopa = {
    "idx": xcopa["et"]["test"]["idx"],
    "label": xcopa["et"]["test"]["label"],
}
for lang in langs_xcopa:
    # Iterate the split directly (no enumerate wrapper) so tqdm can read its
    # length and show a real progress bar, labelled with the language.
    results_xcopa[lang] = [
        XCOPA_eval(example["premise"], example["choice1"], example["choice2"])
        for example in tqdm(xcopa[lang]["test"], desc=f"XCOPA {lang}")
    ]

500it [00:54,  9.15it/s]
500it [00:50,  9.83it/s]
500it [00:50,  9.84it/s]
500it [00:50,  9.95it/s]
500it [00:51,  9.74it/s]
500it [00:50,  9.82it/s]
500it [00:50,  9.82it/s]
500it [00:55,  9.01it/s]
500it [00:50,  9.89it/s]
500it [00:50,  9.84it/s]
500it [00:50,  9.84it/s]


In [None]:
# `to_csv` returns None when given a path, so build the frame first and write
# it in a separate step; otherwise `results_xcopa_df` would be bound to None.
results_xcopa_df = pd.DataFrame(results_xcopa)
results_xcopa_df.to_csv("XCOPA_xglm-564M.tsv", sep="\t", index=False)

In [None]:
# Reload the saved tab-separated predictions so scoring can start from the file.
results_xcopa_df = pd.read_csv("XCOPA_xglm-564M.tsv", sep="\t")

In [None]:
# Display the reloaded predictions: one row per example, one column per language.
results_xcopa_df

Unnamed: 0,idx,label,et,ht,it,id,qu,sw,zh,ta,th,tr,vi
0,0,0,1,1,1,1,1,1,1,1,1,1,1
1,1,0,1,1,1,1,1,1,1,1,1,1,1
2,2,1,0,0,0,1,0,0,0,0,1,0,0
3,3,0,0,0,0,0,0,0,0,0,0,0,0
4,4,0,1,0,1,0,0,0,1,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,495,1,1,1,1,1,1,0,0,0,1,1,1
496,496,1,0,0,0,0,0,0,1,0,0,0,1
497,497,0,0,1,0,0,0,1,1,0,1,1,0
498,498,1,0,1,1,1,1,0,1,1,1,0,1


In [None]:
# Per-language accuracy (percent, 2 decimals): share of rows whose prediction
# equals the gold label.
gold = results_xcopa_df["label"]
accuracy = {}
for lang in langs_xcopa:
    n_correct = int((gold == results_xcopa_df[lang]).sum())
    accuracy[lang] = round(n_correct / len(gold) * 100, 2)

accuracy

{'et': 52.4,
 'ht': 54.2,
 'it': 52.0,
 'id': 55.8,
 'qu': 49.2,
 'sw': 52.8,
 'zh': 53.2,
 'ta': 54.4,
 'th': 55.6,
 'tr': 53.0,
 'vi': 55.8}

### XStoryCloze

In [None]:
from datasets import load_dataset

# Language codes of the XStoryCloze configurations to evaluate.
langs_xstory = ["en", "ru", "zh", "es", "ar", "hi", "id", "te", "sw", "eu", "my"]

# Pre-create one slot per language, then fill each from the local loader script.
# A plain for-loop is used on purpose: it leaves `lang` bound at notebook scope
# after the cell runs.
x_story_cloze = dict.fromkeys(langs_xstory)
for lang in langs_xstory:
    x_story_cloze[lang] = load_dataset('x_story_cloze.py', lang)

In [None]:
# Show the first "en" train record to see the fields each example carries.
x_story_cloze["en"]["train"][0]

{'story_id': '138d5bfb-05cc-41e3-bf2c-fa85ebad14e2',
 'input_sentence_1': 'Rick grew up in a troubled household.',
 'input_sentence_2': 'He never found good support in family, and turned to gangs.',
 'input_sentence_3': "It wasn't long before Rick got shot in a robbery.",
 'input_sentence_4': 'The incident caused him to turn a new leaf.',
 'sentence_quiz1': 'He is happy now.',
 'sentence_quiz2': 'He joined a gang.',
 'answer_right_ending': 1}

In [None]:
import torch
import torch.nn.functional as F
from tqdm import tqdm
import pandas as pd

def get_logprobs(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    input_ids, output_ids = inputs["input_ids"], inputs["input_ids"][:, 1:]
    outputs = model(**inputs, labels=input_ids)
    logits = outputs.logits
    logprobs = torch.gather(F.log_softmax(logits, dim=2), 2, output_ids.unsqueeze(2))
    return logprobs

def XStoryCloze_eval(prompt, alternative1, alternative2):
    """Zero-shot story-ending prediction.

    Each candidate ending is appended to ``prompt`` on a new line and the
    summed token log-probabilities of the two resulting texts are compared.

    Returns:
        1 if the first ending scores strictly higher, otherwise 2
        (ties therefore go to the second ending).
    """
    scores = [
        get_logprobs(prompt + "\n" + ending).sum()
        for ending in (alternative1, alternative2)
    ]
    first_is_more_likely = scores[0] > scores[1]
    if first_is_more_likely:
        return 1
    return 2

# Gold endings come from the "en" eval split. The row ids are derived from that
# same split instead of from a `lang` variable left over from an earlier cell.
# NOTE(review): assumes every language's eval split is row-aligned with "en" — confirm.
eval_labels = x_story_cloze["en"]["eval"]["answer_right_ending"]
results_xstory = {
    "idx": list(range(len(eval_labels))),
    "label": eval_labels,
}
for lang in langs_xstory:
    predictions = []
    # Iterate the split directly so tqdm can read its length and show a real
    # progress bar, labelled with the language.
    for example in tqdm(x_story_cloze[lang]["eval"], desc=f"XStoryCloze {lang}"):
        # The four context sentences, joined by single spaces, form the prompt.
        input_sentences = " ".join(
            example[f"input_sentence_{i}"] for i in range(1, 5)
        )
        predictions.append(
            XStoryCloze_eval(
                input_sentences, example["sentence_quiz1"], example["sentence_quiz2"]
            )
        )
    results_xstory[lang] = predictions

In [None]:
# `to_csv` returns None when given a path, so build the frame first and write
# it in a separate step; otherwise `results_xstory_df` would be bound to None.
results_xstory_df = pd.DataFrame(results_xstory)
results_xstory_df.to_csv("XStoryCloze_xglm-564M.tsv", sep="\t", index=False)

In [None]:
# Reload the saved tab-separated predictions so scoring can start from the file.
results_xstory_df = pd.read_csv("XStoryCloze_xglm-564M.tsv", sep="\t")

In [None]:
# Per-language accuracy (percent, 1 decimal): share of rows whose predicted
# ending equals the gold ending.
gold = results_xstory_df["label"]
accuracy = {}
for lang in langs_xstory:
    n_correct = int((gold == results_xstory_df[lang]).sum())
    accuracy[lang] = round(n_correct / len(gold) * 100, 1)

accuracy

{'en': 60.0,
 'ru': 55.9,
 'zh': 53.1,
 'es': 54.3,
 'ar': 49.6,
 'hi': 52.2,
 'id': 54.1,
 'te': 55.9,
 'sw': 53.3,
 'eu': 53.1,
 'my': 51.6}