# Detect Languages

In [1]:
from utils import *
from load_data import *
from process_data import *
from create_embeddings import *
from split_data import *
from create_model import *
from evaluate_model import *
from run_to_excel import *

import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, DistilBertTokenizer, BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, RobertaTokenizer, RobertaModel, XLMRobertaModel, AutoTokenizer
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
import re
import string
from gensim.models.fasttext import FastText
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from pysentimiento.preprocessing import preprocess_tweet


### Preprocessing

In [2]:
# Load data
# Dataset used for this run. Paths used so far:
#   "data/BBDD_SeAcabo.csv"
#   "AMI_IBEREVAL2018/es_AMI_TrainingSet_NEW.csv"
data = "data/BBDD_SeAcabo.csv"
df = load_data(data)

# Data preprocessing: run pysentimiento's `preprocess_tweet` with lang="es" over every
# tweet in `full_text`, then hand the frame to the project's `process_data` step.
# (Keyword arguments given to Series.apply are forwarded to the function on each call.)
df['full_text'] = df['full_text'].apply(preprocess_tweet, lang="es")
df = process_data(df)



### LangDetect

In [3]:
import langdetect as ld

# langdetect's algorithm is non-deterministic by default, which makes short or
# ambiguous tweets flip between languages across runs. Fixing the seed before the
# first detection makes the results reproducible.
ld.DetectorFactory.seed = 0

# Bucket for rows whose language could not be determined (for example text with no
# usable features). It is not a language code langdetect can return, so it cannot
# collide with a real detection.
UNDETECTED_LANG = "unknown"

# lang_dict maps each detected language code to the positional row indices in `df`
# that were assigned to it.
lang_dict = {}

for i, text in enumerate(df['full_text']):
    print(f"df.iloc[i]['full_text']={text!r}")
    try:
        # detect_langs returns candidates ordered by probability; keep the best one.
        lang_code = ld.detect_langs(text)[0].lang
    except (ld.LangDetectException, TypeError, IndexError) as err:
        # Previously a failure silently reused the language of the preceding row
        # (or raised NameError on the very first row). File the row under an
        # explicit bucket instead so the counts are not skewed.
        #   LangDetectException: no detectable features in the text
        #   TypeError: the value is not a string (e.g. a missing value)
        #   IndexError: no candidate was returned
        print(f"language detection failed for row {i}: {err!r}")
        lang_code = UNDETECTED_LANG

    lang_dict.setdefault(lang_code, []).append(i)

# lang_counts maps each language code to the number of rows assigned to it.
lang_counts = {code: len(rows) for code, rows in lang_dict.items()}

df.iloc[i]['full_text']='@usuario TODA ESPAÑA ESTÁ CONTIGO url'
df.iloc[i]['full_text']='@usuario Espero que te llegue todo el arrope que queremos transmitirte y la admiración por vuestra valentía.'
df.iloc[i]['full_text']='@usuario No estás sola emoji corazón morado emoji'
df.iloc[i]['full_text']='@usuario @usuario Shes a legend .'
df.iloc[i]['full_text']='@usuario Dilo, reina url'
df.iloc[i]['full_text']='@usuario Dilo reina, mano dura'
df.iloc[i]['full_text']='@usuario Las mujeres víctimas de acoso y violencia, NUEVAMENTE, teniendo que ser ELLAS quienes se van de los espacios en los que merecen estar, en lugar de que quienes se vayan sean los agresores.\nUn abrazo y mucha fuerza para ti y todas las jugadoras  emoji corazón morado emoji  emoji manos en oración tono de piel claro emoji  Gracias por no callar.'
df.iloc[i]['full_text']='@usuario tic tac rubi url'
df.iloc[i]['full_text']='@usuario url'
df.iloc[i]['full_text']='@usuario url'
df.iloc[i]['full_text']='@usuario Todos contigo

In [6]:
# Display the mapping from language to the list of row positions assigned to it.
lang_dict

{'en': [0,
  3,
  7,
  15,
  21,
  22,
  24,
  34,
  39,
  40,
  47,
  50,
  53,
  60,
  64,
  69,
  73,
  77,
  81,
  89,
  96,
  104,
  127,
  132,
  137,
  139,
  142,
  143,
  144,
  150,
  160,
  164,
  166,
  170,
  206,
  218,
  220,
  221,
  222,
  223,
  228,
  230,
  245,
  254,
  256,
  261,
  262,
  263,
  266,
  267,
  268,
  269,
  270,
  271,
  273,
  275,
  276,
  277,
  279,
  280,
  288,
  291,
  294,
  295,
  296,
  299,
  304,
  312,
  316,
  321,
  322,
  323,
  327,
  330,
  332,
  337,
  339,
  341,
  346,
  347,
  349,
  352,
  469,
  511,
  529,
  584,
  585,
  601,
  697,
  722,
  728,
  744,
  755,
  773,
  872,
  877,
  902,
  944,
  960,
  970,
  975,
  979,
  982,
  1009,
  1030,
  1065,
  1077,
  1098,
  1166,
  1167,
  1168,
  1171,
  1172,
  1174,
  1175,
  1176,
  1177,
  1178,
  1179,
  1180,
  1181,
  1182,
  1183,
  1184,
  1185,
  1186,
  1188,
  1189,
  1190,
  1191,
  1193,
  1195,
  1196,
  1197,
  1198,
  1200,
  1201,
  1202,
  1203,
  1204,
 

In [7]:
# Display the number of rows assigned to each language.
lang_counts

{'en': 361,
 'es': 1219,
 'sl': 84,
 'pt': 216,
 'it': 314,
 'de': 60,
 'ca': 128,
 'nl': 15,
 'sq': 1,
 'et': 72,
 'fi': 30,
 'tl': 46,
 'sw': 3,
 'id': 6,
 'tr': 1,
 'lt': 1,
 'cy': 1,
 'hr': 2,
 'ro': 7,
 'da': 1,
 'hu': 1,
 'so': 3,
 'sk': 1}

In [8]:
# Spot-check the text of a single tweet, selected by position (row 767).
# NOTE(review): the recorded output below shows raw @handles, whereas the texts printed
# by the detection loop show '@usuario' placeholders -- possibly this cell was run against
# a differently preprocessed `df`; confirm after a fresh Restart & Run All.
df.iloc[767]['full_text']

'@catacoll2001 @barnacitizen @Jennihermoso Con Ustedes las jugadoras siempre !!'

### BART (zero-shot)

In [None]:
from transformers import pipeline

# Zero-shot classifier: each tweet is scored against the candidate label sentences
# below and the best-scoring label is taken as the tweet's language.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
candidate_labels = ["Text is in english", "Text is in spanish", "Text is in italian", "Text is in arabic", "Text is in catalan", "Text is in euskera", "Text is in galician"]

# Bucket for rows the classifier could not process; deliberately not one of the
# candidate labels so failures stay visible in the counts.
UNCLASSIFIED_LABEL = "unclassified"

# lang_dict maps each predicted label to the positional row indices in `df`
# that were assigned to it.
lang_dict = {}

for i, text in enumerate(df['full_text']):
    print(f"df.iloc[i]['full_text']={text!r}")
    try:
        lang_result = classifier(text, candidate_labels)
        # The pipeline returns labels ordered from most to least likely.
        top_label = lang_result["labels"][0]
    except Exception as err:
        # Previously a bare `except: pass` left the label of the preceding row (or a
        # value left over from an earlier cell) in place, silently misfiling the row,
        # and also swallowed KeyboardInterrupt so the loop could not be stopped.
        # `Exception` keeps the best-effort behaviour while letting interrupts through.
        print(f"classification failed for row {i}: {err!r}")
        top_label = UNCLASSIFIED_LABEL
    else:
        print(f"lang={top_label!r}")

    lang_dict.setdefault(top_label, []).append(i)

# lang_counts maps each label to the number of rows assigned to it.
lang_counts = {label: len(rows) for label, rows in lang_dict.items()}
    






In [None]:
# Display the number of rows assigned to each label.
lang_counts