In [62]:
import fasttext
import pandas as pd
from langdetect import detect, LangDetectException
from transformers import pipeline
import fasttext
from huggingface_hub import hf_hub_download

In [10]:
!pip install fasttext

Collecting fasttext
  Using cached fasttext-0.9.2.tar.gz (68 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py): started
  Building wheel for fasttext (setup.py): finished with status 'error'
  Running setup.py clean for fasttext
Failed to build fasttext


  error: subprocess-exited-with-error
  
  python setup.py bdist_wheel did not run successfully.
  exit code: 1
  
  [33 lines of output]
  !!
  
          ********************************************************************************
          Usage of dash-separated 'description-file' will not be supported in future
          versions. Please use the underscore name 'description_file' instead.
  
          By 2023-Sep-26, you need to update your project and remove deprecated calls
          or your builds will no longer be supported.
  
          See https://setuptools.pypa.io/en/latest/userguide/declarative_config.html for details.
          ********************************************************************************
  
  !!
    opt = self.warn_dash_deprecation(opt, section)
  running bdist_wheel
  running build
  running build_py
  creating build
  creating build\lib.win-amd64-cpython-38
  creating build\lib.win-amd64-cpython-38\fasttext
  copying python\fasttext_module\fastt

In [50]:
# Download the FastText language-identification weights from the Hugging Face
# Hub, then load them as the model used by detect_language_from_fast_text.
model_path = hf_hub_download(
    repo_id="facebook/fasttext-language-identification",
    filename="model.bin",
)
fasttext_model = fasttext.load_model(model_path)



In [51]:
# Build the text-classification pipeline for the XLM-V language-id checkpoint;
# detect_language_from_xlm calls it once per track name.
model_id = "juliensimon/xlm-v-base-language-id"
xlm_pipe = pipeline(task="text-classification", model=model_id)

In [52]:
# Read the cleaned track table from the working directory into `df`.
df = pd.read_csv(filepath_or_buffer="after_cleaning.csv")

In [53]:
def detect_language_from_fast_text(text):
    """Return the language code FastText predicts for ``text``.

    The top label from the module-level ``fasttext_model`` is reduced to its
    language part: the ``__label__`` prefix is stripped and everything after
    the first underscore is dropped, so a label such as ``__label__eng_Latn``
    becomes ``eng``.

    Args:
        text: The string to classify (a track name in this notebook).

    Returns:
        The language code of the top prediction, or ``"unknown"`` when the
        model returns no label at all (for example for empty text).
    """
    # fastText's predict() handles one line at a time and raises ValueError on
    # embedded newlines, so flatten line breaks before predicting.
    single_line = text.replace("\n", " ")
    value = fasttext_model.predict(single_line)
    labels = value[0]
    # No prediction means there is nothing to index; mirror the "unknown"
    # fallback used by the langdetect helper instead of raising IndexError.
    if len(labels) == 0:
        return "unknown"
    return labels[0].split("__label__")[-1].split("_")[0]

In [54]:
def detect_language_from_xlm(text):
    """Return the label of the first prediction the XLM pipeline gives for ``text``.

    Args:
        text: The string to classify (a track name in this notebook).

    Returns:
        The ``"label"`` value of the first entry returned by the module-level
        ``xlm_pipe`` (e.g. ``"English"``).
    """
    predictions = xlm_pipe(text)
    top_prediction = predictions[0]
    return top_prediction["label"]

In [60]:
def detect_language_from_langdetect(text):
    """Return the langdetect language code for ``text``, or ``"unknown"``.

    Args:
        text: The string to classify (a track name in this notebook).

    Returns:
        Whatever ``detect`` returns for ``text``; ``"unknown"`` when ``detect``
        raises ``LangDetectException``.
    """
    try:
        language_code = detect(text)
    except LangDetectException:
        # Titles such as "5,4,3,2,1" were noted as problematic for langdetect;
        # detection failures are mapped to a sentinel instead of propagating.
        language_code = "unknown"
    return language_code

In [63]:
# Tag every track name with the language reported by each of the three
# detectors, one new column per detector. Nothing is filtered in this cell.
name_detectors = {
    'lang_langdetect': detect_language_from_langdetect,
    'lang_xlm': detect_language_from_xlm,
    'lang_fasttext': detect_language_from_fast_text,
}
for lang_column, detector in name_detectors.items():
    df[lang_column] = df['name'].apply(detector)

In [64]:
# Preview the first five rows, including the three new language columns.
df.head(n=5)

Unnamed: 0,id,name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity,category_name,lang_langdetect,lang_xlm,lang_fasttext
0,087fGGq27BXXpxKAlbBwNM,Rhythm Is A Dancer,0.273,0.794,225882,0.745,0.000414,9.0,0.143,-11.367,0.0,0.037,124.249,4.0,0.706,32,relax,cy,English,eng
1,0CsM8VGDi38kusMv3pxyj1,True,0.459,0.742,329146,0.533,0.148,0.0,0.064,-9.098,1.0,0.0288,97.395,4.0,0.391,73,relax,vi,English,eng
2,0DiWol3AO6WpXZgp0goxAV,One More Time,0.0194,0.613,320357,0.697,0.0,2.0,0.332,-8.618,1.0,0.133,122.746,4.0,0.476,80,relax,en,English,eng
3,0LzeivEHO16a8eBQGlpVkE,Love Shack,0.11,0.704,262586,0.828,0.0,5.0,0.747,-7.304,0.0,0.0514,133.496,4.0,0.866,61,relax,en,English,spa
4,0MHXrqn909p0LRTPsNsGEi,Move on Up,0.417,0.538,165789,0.969,0.00154,11.0,0.119,-5.626,0.0,0.0906,138.652,4.0,0.928,73,relax,en,English,fin


In [68]:
# Print the number of rows (tracks) currently held in df.
print(f"Total tracks: {len(df)}")

Total tracks: 15211


In [82]:
# Tally the rows each detector tagged as English. The detectors spell the label
# differently: 'en' (LangDetect), 'English' (XLM) and 'eng' (FastText).
count_langdetect_en = int((df['lang_langdetect'] == 'en').sum())
count_xlm_en = int((df['lang_xlm'] == 'English').sum())
count_fasttext_en = int((df['lang_fasttext'] == 'eng').sum())

print(f"LangDetect model detected {count_langdetect_en} English songs.")
print(f"XLM model detected {count_xlm_en} English songs.")
print(f"FastText model detected {count_fasttext_en} English songs.")

LangDetect model detected 6109 English songs.
XLM model detected 9309 English songs.
FastText model detected 7201 English songs.


In [85]:
# Keep only the rows the XLM detector labelled English, drop the helper
# language columns, renumber the index, and write the result to CSV.
df_eng = (
    df[df['lang_xlm'] == 'English']
    .drop(columns=['lang_xlm', 'lang_langdetect', 'lang_fasttext'])
    .reset_index(drop=True)
)
df_eng.to_csv("english_tracks.csv", index=False)
