In [23]:
import pandas as pd
from math import log, floor
from nltk.corpus import stopwords
import nltk
from ktrain import text
import matplotlib.pyplot as plt

<IPython.core.display.Javascript object>

### Labels (examples in english and spanish)

In [10]:
labels = [
    "common words",
    "geography",
    "health",
    "culture",
    "art",
    "social science",
    "language",
    "general reference",
    "history",
    "event",
    "human activity",
    "mathematic",
    "software",
    "science",
    "philosophy",
    "religion",
    "society",
    "technology",
    "city",
    "animal",
    "name",
    "game",
    "engineer",
    "number",
    "location",
    "political",
]
labels = [
    "palabras",
    "geografia",
    "salud",
    "cultura",
    "arte",
    "ciencias sociales",
    "lenguaje",
    "referencia general",
    "historia",
    "evento",
    "actividad",
    "matematica",
    "software",
    "ciencia",
    "filosofia",
    "religion",
    "sociedad",
    "tecnologia",
    "ciudad",
    "animal",
    "nombre",
    "juego",
    "ingeniero",
    "numero",
    "ubicacion",
    "politica",
]

<IPython.core.display.Javascript object>

### Select and load the model

In [None]:
model_name = "joeddav/xlm-roberta-large-xnli"
zsl = text.ZeroShotClassifier(model_name)

### Load and clean the data (example in spanish)

In [14]:
df = pd.read_csv("es.csv", header=None)
df.columns = ["word", "count"]
df = df[df["count"] > 0]
df["word"] = df["word"].str.strip("'")
df = df[(df["word"].str.len() > 2)]
df = df.sort_values(["count"], ascending=False)
df["word"] = df["word"].str.capitalize()
df.head()

Unnamed: 0,word,count
14066,Los,28535
6704,Del,23912
18944,Que,22543
18006,Por,18949
13488,Las,18168


<IPython.core.display.Javascript object>

### Translation of categories to english (example spanish)

In [15]:
names_cat = {
    "evento": "event",
    "palabras comunes": "common words",
    "geografia": "geography",
    "salud": "health",
    "cultura": "culture",
    "arte": "art",
    "ciencias sociales": "social science",
    "lenguage": "language",
    "referencia general": "general reference",
    "historia": "history",
    "actividad humana": "human activity",
    "matematicas": "mathematic",
    "software": "software",
    "ciencia": "science",
    "filosofia": "philosophy",
    "religion": "religion",
    "sociedad": "society",
    "tecnologia": "technology",
    "ciudad": "city",
    "animal": "animal",
    "nombre": "name",
    "game": "game",
    "ingenieria": "engineer",
    "numero": "number",
    "locacion": "location",
    "politica": "political",
}

<IPython.core.display.Javascript object>

### Get top of the prediction

In [16]:
def get_top_class(row):
    classification = zsl.predict(
        row, labels=labels, include_labels=True, batch_size=32, multilabel=True
    )
    classification.sort(key=lambda tup: tup[1])
    classification1 = classification[-1][0] if classification[-1][1] > 0.3 else None
    return classification1

<IPython.core.display.Javascript object>

In [17]:
%%time
df['classification'] = df['word'].apply(get_top_class)
df = pd.concat([pd.DataFrame(df[col].values.tolist()) for col in df.columns], axis=1, ignore_index=True)
df.columns = ['word', 'count', 'classification1']
df.head()

CPU times: user 29min 29s, sys: 9min 53s, total: 39min 23s
Wall time: 35min 19s


Unnamed: 0,word,count,classification1
0,Los,28535,ciudad
1,Del,23912,politica
2,Que,22543,sociedad
3,Por,18949,animal
4,Las,18168,ciudad


<IPython.core.display.Javascript object>

### Map class to english

In [19]:
df["classification1"] = df["classification1"].map(names_cat)
df.head()

Unnamed: 0,word,count,classification1
0,Los,28535,city
1,Del,23912,political
2,Que,22543,society
3,Por,18949,animal
4,Las,18168,city


<IPython.core.display.Javascript object>

### Generate samples per category

In [20]:
import random

df_class = (
    df.groupby("classification1", as_index=False)
    .agg({"count": "nunique"})
    .sort_values(["count"], ascending=False)
)
df_class.rename(
    columns={"classification1": "Category", "count": "Number of Occurrences ES"},
    inplace=True,
)
results = {}
for tag in list(df["classification1"].unique()):
    try:
        results[tag] = (list(df[df["classification1"] == tag]["word"].values))[:3]
    except:
        if tag is not None:
            results[tag] = random.sample(
                list(df[df["classification1"] == tag]["word"].unique()), 1
            )
df_class["Samples EN"] = df_class["Category"].map(results)
df_class = df_class[df_class["Number of Occurrences ES"] > 0]
df_class["Samples EN"] = [", ".join(map(str, l)) for l in df_class["Samples EN"]]
# df_class['Category'] = df_class['Category'].map(names_cat)
df_class

Unnamed: 0,Category,Number of Occurrences ES,Samples EN
12,political,272,"Del, Con, Pero"
0,animal,241,"Por, Para, Este"
5,general reference,228,"Como, Sin, Sobre"
9,name,206,"Dos, Son, Tres"
16,society,188,"Que, Más, Muy"
4,event,155,"Fue, Esto, Cuando"
2,city,145,"Los, Las, Ciudad"
3,culture,125,"Canción, Canciones, Manera"
7,health,109,"Forma, San, Vida"
13,religion,109,"Padre, Iglesia, Santa"


<IPython.core.display.Javascript object>

### Ocurrences per category

In [21]:
df_occur = df.groupby(["classification1"], as_index=False).sum()
# df_occur['classification1'] = df_occur['classification1'].map(names_cat)
df_occur.rename(
    columns={"count": "Occurrence-vi", "classification1": "Category"}, inplace=True
)
df_occur.head()

Unnamed: 0,Category,Occurrence-vi
0,animal,165211
1,art,12348
2,city,87370
3,culture,22431
4,event,57888


<IPython.core.display.Javascript object>

### Create table for the paper

In [24]:
df_en = pd.read_csv("ZS_en.csv")
df_es = pd.read_csv("ZS_es.csv")
df_ar = pd.read_csv("ZS_ar.csv")
df_ar.columns = ["Category", "Occurrence-ar", "NoW-ar", "Samples-ar"]
df_fi = pd.merge(df_en, df_es, on=["Category"], how="left")
df_fi = pd.merge(df_fi, df_ar, on=["Category"], how="left")
df_fi["Occurrence-ar"] = df_fi["Occurrence-ar"].fillna(0)
df_fi["NoW-ar"] = df_fi["NoW-ar"].fillna(0)
df_fi["Samples-ar"] = df_fi["Samples-ar"].fillna("")
df_fi.sort_values(["NoW-en"], ascending=False, inplace=True)
df_fi.rename(
    columns={
        "Occurrence-en": "OccurrEn",
        "Occurrence-es": "OccurrEs",
        "Occurrence-ar": "OccurrAr",
        "POS Tag": "POSTag",
        "NoW-en": "NoWEn",
        "NoW-es": "NoWEs",
        "NoW-ar": "NoWAr",
        "Samples-es": "SamplesEs",
        "Samples-ar": "SamplesAr",
        "Samples-en": "SamplesEn",
    },
    inplace=True,
)


def human_format(number):
    if number == 0:
        return number
    else:
        units = ["", "K", "M", "G", "T", "P"]
        k = 1000.0
        magnitude = int(floor(log(number, k)))
        return "%.0f%s" % (number / k ** magnitude, units[magnitude])


df_fi["OccurrEn"] = df_fi["OccurrEn"].apply(human_format)
df_fi["NoWEn"] = df_fi["NoWEn"].apply(human_format)
df_fi["OccurrEs"] = df_fi["OccurrEs"].apply(human_format)
df_fi["NoWEs"] = df_fi["NoWEs"].apply(human_format)
df_fi["OccurrAr"] = df_fi["OccurrAr"].apply(human_format)
df_fi["NoWAr"] = df_fi["NoWAr"].apply(human_format)
print(
    df_fi[["Category", "OccurrAr", "NoWAr", "SamplesAr"]].head(15).to_latex(index=None)
)

\begin{tabular}{llll}
\toprule
          Category & OccurrAr & NoWAr &                    SamplesAr \\
\midrule
             event &       2K &    30 &            ي قال, حصل, يحبها \\
    human activity &       1K &    28 &       ستغادر, المشتري, لنغني \\
          location &      333 &    14 &   باءمكانه, باءمكانك, القلعة \\
              name &       1K &    24 &           انا, ساءبقى, اءدري \\
            animal &       1K &    25 &           فيه, تفاحة, باللون \\
            number &      493 &    17 &            عشرة, واحد, ثلاثة \\
 general reference &      185 &     8 &        طريقة, القانون, التعب \\
      common words &      206 &     7 &            عديدة, تمطر, مخطئ \\
              city &       49 &     4 &     اءيطاليا, بيكاسو, البلدة \\
        technology &       92 &     5 &   التلفاز, السيارات, الاءخير \\
           culture &       57 &     4 &         الصينية, بمهارة, فوق \\
          language &       98 &     7 &  اللغة, بالاءنجليزية, ياباني \\
              game &    

<IPython.core.display.Javascript object>