In [1]:
import fasttext
import pandas as pd
from langdetect import detect, LangDetectException
from transformers import pipeline
from huggingface_hub import hf_hub_download

In [2]:
# Fetch the FastText language-identification weights from the Hugging Face Hub
# and load them into a FastText model object.
model_path = hf_hub_download(
    repo_id="facebook/fasttext-language-identification",
    filename="model.bin",
)
fasttext_model = fasttext.load_model(model_path)



In [None]:
# Build a text-classification pipeline around the XLM-V language-ID checkpoint.
model_id = "juliensimon/xlm-v-base-language-id"
xlm_pipe = pipeline(task="text-classification", model=model_id)

Downloading model.safetensors:   0%|          | 0.00/3.11G [00:00<?, ?B/s]

In [None]:
# Load the cleaned dataset from the CSV file into the DataFrame `df`.
# Later cells read its `name` column to detect each track's language.
df = pd.read_csv("datasets/spotify_after_cleaning.csv")

In [None]:
def detect_language_from_fast_text(text):
    """Return the language code FastText predicts for ``text``.

    The top predicted label is reduced to a short code by removing the
    ``__label__`` prefix and keeping only the part before the first
    underscore (presumably labels look like ``__label__eng_Latn``, which
    would yield ``"eng"`` -- confirm against the model's label set).

    Args:
        text: The text to classify, e.g. a track name.

    Returns:
        The extracted language code, or ``"unknown"`` when ``text`` is not a
        string (such as NaN from a missing value) or contains only whitespace.
    """
    # Missing values coming out of pandas are floats (NaN), not strings; the
    # model cannot score them, so report them like the langdetect helper does.
    if not isinstance(text, str):
        return "unknown"

    # FastText's predict() handles one line at a time and raises ValueError if
    # the text contains a newline, so collapse every whitespace run to a space.
    single_line = " ".join(text.split())
    if not single_line:
        return "unknown"

    value = fasttext_model.predict(single_line)
    return value[0][0].split("__label__")[-1].split("_")[0]

In [None]:
def detect_language_from_xlm(text):
    """Return the language label the XLM pipeline predicts for ``text``.

    Args:
        text: The text to classify, e.g. a track name.

    Returns:
        The ``"label"`` field of the first prediction returned by
        ``xlm_pipe``, or ``"unknown"`` when ``text`` is not a string (such as
        NaN from a missing value) or contains only whitespace.
    """
    # Non-string values (NaN for missing names) cannot be tokenized, and a
    # blank string carries no signal; report both the same way the langdetect
    # helper reports undetectable text.
    if not isinstance(text, str) or not text.strip():
        return "unknown"

    result = xlm_pipe(text)
    return result[0]["label"]

In [None]:
def detect_language_from_langdetect(text):
    """Return the language code langdetect predicts for ``text``.

    Args:
        text: The text to classify, e.g. a track name.

    Returns:
        The value returned by ``langdetect.detect``, or ``"unknown"`` when
        ``text`` is not a string (such as NaN from a missing value) or when
        langdetect raises ``LangDetectException``.
    """
    # detect() only works on strings; a NaN/None name would otherwise raise an
    # error that the LangDetectException handler below does not catch.
    if not isinstance(text, str):
        return "unknown"

    # Titles such as "5,4,3,2,1" were observed to be a problem for langdetect
    # (presumably because they contain no letters to analyse), hence the
    # fallback value instead of letting the exception propagate.
    # NOTE(review): langdetect's documentation describes its results as
    # non-deterministic unless DetectorFactory.seed is set -- consider seeding
    # it once at import time for reproducible counts.
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"

In [None]:
# Run each detector over the track names and store its prediction in a
# dedicated column (one column per detector).
track_names = df['name']
for column, detector in (
    ('lang_langdetect', detect_language_from_langdetect),
    ('lang_xlm', detect_language_from_xlm),
    ('lang_fasttext', detect_language_from_fast_text),
):
    df[column] = track_names.apply(detector)

In [None]:
# Preview the first rows of `df`, including the detected-language columns
# added by the previous cell.
df.head()

In [None]:
# Report how many rows (tracks) the dataset contains.
print(f"Total tracks: {len(df)}")

In [None]:
# Tally how many tracks each detector labelled as English. Every detector uses
# its own label for English, so the value compared against differs per column.

# LangDetect: 'en'
count_langdetect_en = len(df.loc[df['lang_langdetect'] == 'en'])

# XLM: 'English'
count_xlm_en = len(df.loc[df['lang_xlm'] == 'English'])

# FastText: 'eng'
count_fasttext_en = len(df.loc[df['lang_fasttext'] == 'eng'])

print(f"LangDetect model detected {count_langdetect_en} English songs.")
print(f"XLM model detected {count_xlm_en} English songs.")
print(f"FastText model detected {count_fasttext_en} English songs.")

In [None]:
# Keep only the tracks the XLM detector classified as English, remove the
# helper language columns, renumber the rows, and write the result to a CSV file.
df_eng = (
    df[df['lang_xlm'] == 'English']
    .drop(['lang_xlm', 'lang_langdetect', 'lang_fasttext'], axis=1)
    .reset_index(drop=True)
)
df_eng.to_csv("english_tracks.csv", index=False)
