In [1]:
import pandas as pd
from nltk.corpus import stopwords
import nltk
import numpy as np
from tqdm import tqdm
import stanza
from collections import Counter
import matplotlib.pyplot as plt
import random

## Load the data

In [2]:
# Read the word-frequency list. The file has no header row, so its two
# columns are labelled explicitly.
df = pd.read_csv("es.csv", header=None)
df.columns = ["word", "count"]

# Clean in one chain so that each step produces a fresh frame; assigning a
# column onto a frame that was just filtered is the pattern that triggers
# pandas' SettingWithCopyWarning.
df = (
    df.loc[df["count"] > 0]
    # Drop apostrophes wrapped around the word.
    .assign(word=lambda frame: frame["word"].str.strip("'"))
    # Keep only words longer than two characters.
    .loc[lambda frame: frame["word"].str.len() > 2]
    # Most frequent words first.
    .sort_values(["count"], ascending=False)
)
df.head()

Unnamed: 0,word,count
14066,los,28535
6704,del,23912
18944,que,22543
18006,por,18949
13488,las,18168


## Load the model 

In [3]:
# Spanish Stanza pipeline restricted to the tokenizer and the POS tagger.
nlp = stanza.Pipeline(lang="es", processors="tokenize,pos")

2021-09-07 18:40:51 INFO: Loading these models for language: es (Spanish):
| Processor | Package |
-----------------------
| tokenize  | ancora  |
| mwt       | ancora  |
| pos       | ancora  |

2021-09-07 18:40:51 INFO: Use device: gpu
2021-09-07 18:40:51 INFO: Loading: tokenize
2021-09-07 18:40:56 INFO: Loading: mwt
2021-09-07 18:40:56 INFO: Loading: pos
2021-09-07 18:40:57 INFO: Done loading processors!


## Classify each sample

In [4]:
def get_tag_individual(word):
    """Return the POS tag the ``nlp`` pipeline assigns to a single word.

    The word is passed on its own to the module-level ``nlp`` pipeline and the
    ``'upos'`` field of the first entry of the first sentence is returned.

    Parameters
    ----------
    word : str
        The word to tag.

    Returns
    -------
    str
        The ``'upos'`` label, or the sentinel string ``'error'`` when the
        pipeline raises or the first entry has no ``'upos'`` field.
    """
    try:
        doc = nlp(word)
        # NOTE(review): presumably words that Stanza expands into several
        # syntactic words (e.g. verb + clitic) put a token-level entry without
        # 'upos' first and therefore end up as 'error' -- confirm against
        # Stanza's Sentence.to_dict() before relying on the 'error' bucket.
        return doc.sentences[0].to_dict()[0]['upos']
    except Exception:
        # Best effort: one failing word must not abort tagging of the whole
        # column. Only Exception is caught so that KeyboardInterrupt and
        # SystemExit still stop a long-running apply.
        return 'error'
# Tag every word on its own and store the label next to it, then preview
# the first rows.
df['POS Tag'] = df['word'].map(get_tag_individual)
df.head()

Unnamed: 0,word,count,POS Tag
14066,los,28535,PRON
6704,del,23912,PROPN
18944,que,22543,SCONJ
18006,por,18949,ADP
13488,las,18168,PRON


## Classify entities and show samples

In [5]:
# Display names for the POS tag codes stored in df['POS Tag'].
names_entities = {
    "ADJ": "Adjective",
    "ADV": "Adverb",
    "NOUN": "Noun",
    "VERB": "Verb",
    "PROPN": "Proper noun",
    "INTJ": "Interjection",
    "ADP": "Adposition",
    "AUX": "Auxiliary",
    "CCONJ": "Coordinating Conjunction",
    "DET": "Determiner",
    "NUM": "Numeral",
    "PART": "Particle",
    "PRON": "Pronoun",
    "SCONJ": "Subordinating Conjunction",
    "PUNCT": "Punctuation",
    "SYM": "Symbols",
    "X": "Other",
}
names = pd.DataFrame(names_entities, index=[0]).T.reset_index()

# Group once and reuse the grouping for both the counts and the samples,
# instead of re-filtering the whole frame for every tag.
words_by_tag = df.groupby('POS Tag')['word']

# Number of distinct words per tag, one row per tag.
unique_counts = words_by_tag.nunique()
tags_1 = unique_counts.to_dict()
df_tags_ = unique_counts.reset_index()
df_tags_.columns = ['POS Tag', 'Number of Words']

# First three words of each tag in the row order of `df`; `df` was sorted by
# descending count when it was loaded, so these are the most frequent ones.
results = {tag: list(words.values)[:3] for tag, words in words_by_tag}
df_tags_['Samples'] = df_tags_['POS Tag'].map(results)
df_tags_['Samples'] = [', '.join(map(str, sample)) for sample in df_tags_['Samples']]

# Swap tag codes for display names. Codes that are not keys of
# names_entities (for example the 'error' sentinel of get_tag_individual)
# become NaN and show up as an unlabelled row.
df_tags_['POS Tag'] = df_tags_['POS Tag'].map(names_entities)
df_tags_ = df_tags_.sort_values(['Number of Words'], ascending=False)
df_tags_ = df_tags_[df_tags_['Number of Words'] > 0]
# NOTE(review): the '-vi' suffix looks like a leftover from another language's
# notebook (this one reads es.csv and the paper table expects '-es' columns).
# Left unchanged here because other cells may reference these names -- confirm.
df_tags_.columns = ['POS Tag', 'NoW-vi', 'Samples-vi']
df_tags_

Unnamed: 0,POS Tag,NoW-vi,Samples-vi
7,Noun,12516,"parte, ciudad, años"
0,Adjective,5644,"gran, primera, mismo"
14,Verb,4654,"tiene, encuentra, puede"
10,Proper noun,725,"del, juan, york"
2,Adverb,378,"más, muy, además"
15,,238,"carlos, hacerlo, charles"
9,Pronoun,117,"los, las, una"
3,Auxiliary,98,"fue, son, está"
8,Numeral,42,"dos, tres, cuatro"
6,Interjection,31,"hey, joder, adiós"


### Occurrences per category

In [6]:
# Total occurrences per POS tag. Only 'count' is summed: a bare .sum() would
# also try to aggregate the string column 'word', which older pandas silently
# dropped and newer pandas no longer does.
df_occur = df.groupby(['POS Tag'], as_index=False)['count'].sum()
# Tag codes -> display names; codes missing from names_entities become NaN.
df_occur['POS Tag'] = df_occur['POS Tag'].map(names_entities)
# NOTE(review): '-vi' looks like a leftover suffix in this Spanish notebook;
# kept as-is because other cells may reference the name -- confirm.
df_occur = df_occur.rename(columns={'count': 'Occurrence-vi'})
df_occur

Unnamed: 0,POS Tag,Occurrence-vi
0,Adjective,179294
1,Adposition,61468
2,Adverb,56298
3,Auxiliary,54309
4,Coordinating Conjunction,3312
5,Determiner,10079
6,Interjection,1828
7,Noun,490946
8,Numeral,28919
9,Pronoun,121235


### Create table for the paper

In [7]:
import pandas as pd
from math import log, floor


def human_format(number):
    """Abbreviate a number with a thousands suffix, e.g. ``490946 -> '491K'``.

    The value is scaled by powers of 1000, rounded to zero decimals and
    suffixed with one of ``'', 'K', 'M', 'G', 'T', 'P'``.

    Parameters
    ----------
    number : int or float
        The value to format.

    Returns
    -------
    str, or ``number`` itself
        The abbreviated text. Zero and NaN are returned unchanged (not as
        strings). Negative values keep a leading ``'-'``. Magnitudes below 1
        use no suffix and magnitudes beyond ``'P'`` stay expressed in ``'P'``.
    """
    # Zero and NaN (the only value that is not equal to itself) pass through.
    if number == 0 or number != number:
        return number

    units = ['', 'K', 'M', 'G', 'T', 'P']
    k = 1000.0
    top = len(units) - 1
    sign = '-' if number < 0 else ''
    value = abs(number)

    # Pick the unit by comparing against exact powers of 1000 rather than
    # taking a floating-point logarithm, and never leave the units table.
    magnitude = 0
    while magnitude < top and value >= k ** (magnitude + 1):
        magnitude += 1
    scaled = value / k ** magnitude

    # A value that would round up to 1000 of the current unit belongs to the
    # next unit (999999 is '1M', not '1000K').
    if scaled >= k - 0.5 and magnitude < top:
        magnitude += 1
        scaled = value / k ** magnitude

    return '%s%.0f%s' % (sign, scaled, units[magnitude])
# Render floats without decimals. The fully qualified option name is used
# because the short 'precision' pattern no longer resolves to a single option
# in recent pandas releases.
pd.set_option('display.precision', 0)

# Per-language summary tables.
df_en = pd.read_csv('POST_en.csv')
df_es = pd.read_csv('POST_es.csv')
df_ar = pd.read_csv('POST_ar.csv')

# English is the reference table: left joins keep every row of df_en.
df_fi = pd.merge(df_en, df_es, on=['POS Tag'], how='left')
df_fi = pd.merge(df_fi, df_ar, on=['POS Tag'], how='left')

# Rows without an Arabic match get zero counts and an empty sample list.
df_fi['Occurrence-ar'] = df_fi['Occurrence-ar'].fillna(0)
df_fi['NoW-ar'] = df_fi['NoW-ar'].fillna(0)
df_fi['Samples-ar'] = df_fi['Samples-ar'].fillna('')

df_fi = df_fi.sort_values(['NoW-en'], ascending=False)
# Column names without spaces or hyphens for the LaTeX header.
df_fi = df_fi.rename(columns={'Occurrence-en':'OccurrEn', 'Occurrence-es':'OccurrEs',
                              'Occurrence-ar':'OccurrAr', 'POS Tag':'POSTag', 'NoW-en':'NoWEn',
                              'NoW-es':'NoWEs', 'NoW-ar':'NoWAr', 'Samples-es':'SamplesEs',
                              'Samples-ar':'SamplesAr', 'Samples-en':'SamplesEn'})

# Abbreviate every numeric column (e.g. 490946 -> '491K').
for column in ['OccurrEn', 'NoWEn', 'OccurrEs', 'NoWEs', 'OccurrAr', 'NoWAr']:
    df_fi[column] = df_fi[column].apply(human_format)

df_fi = df_fi.round()
# index=False is the documented way to omit the row index from the table.
print(df_fi.to_latex(index=False))

\begin{tabular}{llllllllll}
\toprule
       POSTag & OccurrEn & NoWEn &           SamplesEn & OccurrEs & NoWEs &                SamplesEs & OccurrAr & NoWAr &               SamplesAr \\
\midrule
         Noun &       2M &   19K &      boy, time, man &     491K &   13K &      parte, ciudad, años &       3K &   236 &    شيء, الكتاب, السيارة \\
  Proper noun &     382K &   10K &     i'm, i'll, i've &      38K &   725 &          del, juan, york &        0 &     0 &                         \\
         Verb &       1M &    6K &      are, have, had &     151K &    5K &  tiene, encuentra, puede &       2K &   102 &          كان, كنت, كانت \\
    Adjective &     628K &    4K &   other, more, many &     179K &    6K &     gran, primera, mismo &      643 &    51 &   الكثير, جميلة, القليل \\
       Adverb &     683K &    1K &  there, also, about &      56K &   378 &         más, muy, además &       74 &     2 &               كذلك, فقط \\
      Pronoun &     644K &    97 &      you, that, his &    