In [2]:
import re 
import nltk
import string
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk.tokenize import word_tokenize
# from sklearn.preprocessing import FunctionTransformer
# from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.pipeline import make_pipeline, make_union
# from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
import stylecloud
import matplotlib.pyplot as plt
# Render inline matplotlib figures as SVG.
%config InlineBackend.figure_format = 'svg'
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
import warnings
# NOTE(review): this silences every warning for the whole session, not just known-noisy ones.
warnings.filterwarnings('ignore')


In [3]:
# Load the lyrics table; the cleaning functions below read and overwrite its 'lyrics' column.
lyricdf = pd.read_csv('data_lyrics.csv')

In [15]:
# lowercasing
# lambda x: x.lower()

# remove characters
def remove_punct(text):
    """Return ``text`` with every character of the punctuation class deleted.

    The class covers the ASCII punctuation marks listed in the pattern plus the
    em dash; any other character (including typographic quotes) is kept.
    """
    punct_class = re.compile(r'[`\-=~!@#$%^&*()\_+—\[\]{};\'\\:"|<,./<>?]')
    return punct_class.sub('', text)

# stemming
def words_stemmer(words, type="PorterStemmer", lang="english", encoding="utf8"):
    """Tokenize ``words`` and stem each token with the requested NLTK stemmer.

    Parameters
    ----------
    words : str
        Text to tokenize with ``nltk.word_tokenize``.
    type : str or False
        One of "PorterStemmer", "LancasterStemmer" or "SnowballStemmer".
    lang : str
        Language passed to ``SnowballStemmer``; ignored by the other stemmers.
    encoding : str
        Accepted but not used by this function.

    Returns
    -------
    str or list
        The stemmed tokens joined by single spaces. When ``type`` is not one of
        the three supported names, the *list* of unstemmed tokens is returned
        instead (note the different return type).
    """
    tokens = nltk.word_tokenize(words)

    if type == "PorterStemmer":
        stemmer = PorterStemmer()
    elif type == "LancasterStemmer":
        stemmer = LancasterStemmer()
    elif type == "SnowballStemmer":
        stemmer = SnowballStemmer(lang)
    else:
        # Unsupported (or False) stemmer name: hand back the raw token list.
        return tokens

    return " ".join(stemmer.stem(token) for token in tokens)

def find_pos(word):
    """Return the WordNet part-of-speech letter for ``word``.

    The word is tokenized and tagged on its own with ``nltk.pos_tag``; only the
    tag of the first token is used. The first letter of that tag is mapped as
    follows, with everything else falling back to noun:

    * 'J' (JJ, JJR, JJS)  -> 'a' (adjective)
    * 'R' (RB, RBR, RBS)  -> 'r' (adverb)
    * 'V' (VB, VBD, ...)  -> 'v' (verb)
    * anything else       -> 'n' (noun)
    """
    first_tag = nltk.pos_tag(nltk.word_tokenize(word))[0][1]
    tag_initial = first_tag.lower()[0]
    wordnet_letter = {'j': 'a', 'r': 'r', 'v': 'v'}
    return wordnet_letter.get(tag_initial, 'n')

# Lemmatize every token of a text string and return the result as one string
def words_lemmatizer(text, encoding="utf8"):
    """Lemmatize each token of ``text`` and return them joined by single spaces.

    Every token is tagged individually through ``find_pos`` and the resulting
    WordNet POS letter is passed to ``WordNetLemmatizer.lemmatize``.
    ``encoding`` is accepted but not used.
    """
    tokens = nltk.word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    return " ".join(
        lemmatizer.lemmatize(token, find_pos(token)) for token in tokens
    )


# Drop independent numbers (not alphanumeric) in sentences
def remove_numbers(text):
    """Drop the space-separated pieces of ``text`` that are purely numeric.

    The text is split on single spaces, pieces for which ``str.isnumeric()`` is
    true are discarded, and the remainder is re-joined with single spaces.
    Alphanumeric pieces such as ``b22`` are kept.
    """
    kept_pieces = []
    for piece in text.split(" "):
        if piece.isnumeric():
            continue
        kept_pieces.append(piece)
    return " ".join(kept_pieces)

def remove_repeat_char(text):
    """Collapse any run of three or more identical word characters to one."""
    long_run = re.compile(r"(\w)\1{2,}")
    # Replace the whole run with its first (captured) character.
    return long_run.sub(lambda run: run.group(1), text)

# remove stop words
def remove_stopwords(text, lang='english'):
    """Remove the NLTK stopwords of ``lang`` from ``text``.

    Parameters
    ----------
    text : str
        Text to filter; it is split with ``nltk.wordpunct_tokenize``.
    lang : str
        Name of the NLTK stopword list to use (default ``'english'``).

    Returns
    -------
    str
        The surviving tokens joined by single spaces. Tokens are compared
        case-insensitively (via ``lower()``) but emitted unchanged.
    """
    words = nltk.wordpunct_tokenize(text)
    # stopwords.words() returns a list; testing every token against a list is a
    # linear scan per token. A set gives the same membership result in O(1).
    lang_stopwords = frozenset(stopwords.words(lang))
    return " ".join(w for w in words if w.lower() not in lang_stopwords)

def preprocess_punct(text):
    """Translate Chinese (full-width) punctuation in ``text`` to ASCII equivalents.

    Each character of the Chinese string below is replaced by the character at
    the same position in the ASCII string; all other characters are unchanged.
    """
    chinese_marks = u'，。！？【】（）《》“”‘’'
    ascii_marks = u',.!?[]()<>""\'\''
    return text.translate(str.maketrans(chinese_marks, ascii_marks))

def remove_standalone_alphabets(text):
    """Drop tokens of ``text`` that are a single stray letter.

    The text is split with ``nltk.wordpunct_tokenize``; a token is removed when
    its lower-cased form is one of the letters below. 'a', 'i' and 'u' are
    deliberately kept so sentences stay complete. Survivors are joined by
    single spaces.
    """
    stray_letters = set('bcdefghjklmnopqrstvwxyz')
    survivors = []
    for token in nltk.wordpunct_tokenize(text):
        if token.lower() in stray_letters:
            continue
        survivors.append(token)
    return " ".join(survivors)

def remove_html_tag(text):
    """Delete every ``<...>`` span from ``text``.

    Matching is non-greedy and stays on a single line, since ``.`` does not
    match a newline here.
    """
    return re.sub(r'<.*?>', r'', text)

def remove_emoji(text):
    """Delete emoji and pictographic symbols from ``text``.

    The previous pattern contained the single range U+24C2..U+1F251, which also
    spans CJK ideographs, kana, Hangul and full-width punctuation. It therefore
    erased non-emoji text in those scripts, and it removed Chinese punctuation
    before ``preprocess_punct`` could translate it. The range is replaced by the
    emoji-bearing blocks it was meant to cover, so ordinary text is left alone.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        ``text`` without the code points listed in the pattern below.
    """
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2"             # circled letter M
        u"\U00002600-\U000026FF"  # miscellaneous symbols
        u"\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (stars, squares)
        u"\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs
        u"\U0000FE0F"             # variation selector-16 left behind by emoji
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r'', text)

In [16]:
# Preview the lyrics column as loaded (the output below is still uncleaned text).
lyricdf.lyrics

0        SCENE.—A great terrace in the Palace of Herod,...
1        Investigation of the Ferguson Police Departmen...
2        (10)    Comme là on est où est on est on là. D...
3        The Whitechapel Murder.        During yesterda...
4        Le roi Renaud de guerre revient Portant ses tr...
                               ...                        
93622    Quando eu morrer me enterre na Lapinha Quando ...
93623    It felt so easy in the morning sun When love w...
93624    [Part 1 - "Loser" by Beck] Soy un perdedor! I'...
93625    [Verse 1] Winterlude, Winterlude, oh darling W...
93626    [Letra de "Chuper Amigos"]  [Intro] Empezamos ...
Name: lyrics, Length: 93627, dtype: object

In [5]:
def text_cleaning(df):
    """Clean ``df['lyrics']`` in place for topic modelling and return ``df``.

    Each lyric goes through these steps, in this order:

    1. remove URLs (scheme-prefixed, then ``www.``-style);
    2. remove HTML tags and emoji;
    3. translate Chinese punctuation to ASCII;
    4. drop ``[...]`` (replaced by a space), ``(...)`` and ``{...}`` spans;
    5. lower-case, strip punctuation, standalone numbers and stopwords;
    6. replace every character outside ``A-Za-z0-9`` with a space;
    7. drop standalone letters and collapse whitespace.

    The ``lyrics`` column of the passed frame is overwritten.
    """
    def _clean_one(lyric):
        # URLs: scheme-prefixed first, then bare www-style links
        lyric = re.sub(r'(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]', '', lyric)
        lyric = re.sub(r'https?://\S+|www\.\S+', '', lyric)

        # markup, emoji, then Chinese -> English punctuation
        lyric = preprocess_punct(remove_emoji(remove_html_tag(lyric)))

        # bracketed section headers and parenthesised/braced asides
        lyric = re.sub('(\\[(.*?)])', ' ', lyric)
        lyric = re.sub('(\\(.*?)\\)', '', lyric)
        lyric = re.sub('(\\{.*?)\\}', '', lyric)

        # lower-case, then punctuation -> numbers -> stopwords
        lyric = remove_stopwords(remove_numbers(remove_punct(lyric.lower())))

        # keep only ASCII letters and digits, so symbols like √∑´¥ç≈ are removed too
        lyric = re.sub('([^A-Za-z0-9])', ' ', lyric)

        # standalone letters (spelled-out "l! o! v! e!" style lines are still an open problem)
        lyric = remove_standalone_alphabets(lyric)

        # collapse superfluous whitespace
        return ' '.join(lyric.split())

    df['lyrics'] = df['lyrics'].apply(_clean_one)
    return df

In [6]:
%%time
# Clean the lyrics; text_cleaning overwrites lyricdf['lyrics'], so the loaded text is gone after this cell.
# The recorded run took about 21 minutes of wall time.
text_cleaning(lyricdf)
lyricdf

CPU times: user 19min 11s, sys: 1min 14s, total: 20min 25s
Wall time: 20min 51s


Unnamed: 0,id,song,album,artist,acousticness_x,danceability_x,duration_ms_x,energy_x,instrumentalness_x,key_x,liveness_x,loudness_x,mode_x,speechiness_x,tempo_x,time_signature,valence_x,album_id,date,date_year,duration_ms_d,com_f_x,acousticness_y,artists,danceability_y,duration_ms_y,energy_y,explicit,instrumentalness_y,key_y,liveness_y,loudness_y,mode_y,name,popularity,release_date,speechiness_y,tempo_y,valence_y,year,release_date_year,artists_d,com_f_y,popularity_dummy,lyrics
0,1WsU2DCFFNbYgJU17zpLIe,,,,,,,,,,,,,,,,,,,,,,0.7110,['Lefty Frizzell'],0.794,155693,0.216,0,0.000000,2,0.3130,-13.160,1,"Moonlight, Darling and You",0,1953,0.0317,110.370,0.580,1953,1953,Lefty Frizzell,"Lefty Frizzell155693Moonlight, Darling and You...",0,scenea great terrace palace herod set banqueti...
1,1ZhSKUZoemAh7uzuYoWIwr,,,,,,,,,,,,,,,,,,,,,,0.4970,['Unspecified'],0.310,175413,0.764,0,0.000958,7,0.2800,-10.685,1,Bach: Great Prelude in G Major,0,1953-01-01,0.0307,179.617,0.884,1953,1953,Unspecified,Unspecified175413Bach: Great Prelude in G Majo...,0,investigation ferguson police department unite...
2,1Vzfi6QyWfVDtsYSaO6Imi,,,,,,,,,,,,,,,,,,,,,,0.9140,['Yves Montand'],0.380,189920,0.465,0,0.000009,5,0.1110,-10.560,1,La grande cité,0,1953,0.0420,95.418,0.438,1953,1953,Yves Montand,Yves Montand189920La grande cité0.9140.380.465...,0,comme est est est de tomtittot titoutomtotalit...
3,1gc3NBlpqD1mvoC0T3SOe7,,,,,,,,,,,,,,,,,,,,,,0.7140,['George Lewis'],0.489,161623,0.510,0,0.000043,5,0.1180,-8.527,1,Down by the Riverside,0,1953-04-20,0.0382,98.646,0.883,1953,1953,George Lewis,George Lewis161623Down by the Riverside0.71400...,0,whitechapel murder yesterday several arrests m...
4,1dCOTwNlRg2HLJKJuWj8nN,,,,,,,,,,,,,,,,,,,,,,0.9700,['Yves Montand'],0.435,274907,0.109,0,0.000000,2,0.4210,-16.072,0,Le roi renaud de guerre revient,0,1953,0.0598,88.224,0.351,1953,1953,Yves Montand,Yves Montand274907Le roi renaud de guerre revi...,0,le roi renaud de guerre revient portant ses tr...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93622,6h4Yp7LPev3PNZvMrFCzC0,Lapinha,Fool On The Hill,Sergio Mendes,0.7630,0.658,188373.0,0.455,0.000000,2.0,0.0827,-15.270,1.0,0.0464,90.961,4.0,0.963,2xzotT2hYczm1r71dtkpYS,1968-01-01,1968.0,188373.0,Sergio Mendes188373Lapinha0.7630.6579999999999...,0.7500,"[""Sergio Mendes & Brasil '66""]",0.659,188373,0.457,0,0.000001,2,0.0823,-15.447,1,Lapinha,25,1968-01-01,0.0414,90.615,0.964,1968,1968,Sergio Mendes & Brasil '66,Sergio Mendes & Brasil '66188373Lapinha0.750.6...,1,quando eu morrer enterre na lapinha quando eu ...
93623,6h5w4ez470ZaXq6sD2FozJ,Didn't Know It Was Love,Too Hot To Sleep,Survivor,0.1220,0.613,262800.0,0.683,0.000000,2.0,0.2410,-11.156,1.0,0.0332,128.416,4.0,0.637,1vDCPR1K9O81nzU2bRWYNE,1988,1988.0,262800.0,Survivor262800Didn't Know It Was Love0.1220.61...,0.1220,['Survivor'],0.613,262800,0.683,0,0.000000,2,0.2410,-11.156,1,Didn't Know It Was Love,34,1988-09-25,0.0332,128.416,0.637,1988,1988,Survivor,Survivor262800Didn't Know It Was Love0.1220.61...,1,felt easy morning sun love season run lookin t...
93624,6h77X5H4dP9l2jzG92vVv0,The Alternative Polka,Bad Hair Day,"""Weird Al"" Yankovic",0.0914,0.393,290907.0,0.878,0.000000,4.0,0.3190,-7.519,0.0,0.0558,176.646,4.0,0.898,0Jlz2oUJcRROhY8MFMp609,1996-12-31,1996.0,290907.0,"""Weird Al"" Yankovic290907The Alternative Polka...",0.0914,"['""Weird Al"" Yankovic']",0.393,290907,0.878,0,0.000000,4,0.3190,-7.519,0,The Alternative Polka,35,1996-12-31,0.0558,176.646,0.898,1996,1996,"""Weird Al"" Yankovic","""Weird Al"" Yankovic290907The Alternative Polka...",1,soy un perdedor im loser baby dont kill everyb...
93625,6h8laPks98zPizQ5qahy40,Winterlude,New Morning,Bob Dylan,0.9100,0.329,142947.0,0.389,0.013100,0.0,0.1140,-12.425,1.0,0.0384,50.166,4.0,0.492,48efaobqOTbvnlxbETstey,1970-10-21,1970.0,142947.0,Bob Dylan142947Winterlude0.910.328999999999999...,0.9100,['Bob Dylan'],0.329,142947,0.389,0,0.013100,0,0.1140,-12.425,1,Winterlude,39,1970-10-21,0.0384,50.166,0.492,1970,1970,Bob Dylan,Bob Dylan142947Winterlude0.910.3290.38900.0131...,1,winterlude winterlude oh darling winterlude ro...


In [17]:
def text_cleaning_complexity(df):
    """Lightly clean ``df['lyrics']`` in place for readability scoring and return ``df``.

    Compared with ``text_cleaning``, punctuation, stopwords and non-ASCII
    characters are kept. Each lyric goes through these steps, in this order:

    1. remove URLs (scheme-prefixed, then ``www.``-style);
    2. remove HTML tags and emoji;
    3. translate Chinese punctuation to ASCII;
    4. drop ``[...]``, ``(...)`` and ``{...}`` spans;
    5. lower-case and remove standalone numbers;
    6. drop standalone letters and collapse whitespace.

    The ``lyrics`` column of the passed frame is overwritten.
    """
    def _clean_one(lyric):
        # URLs: scheme-prefixed first, then bare www-style links
        lyric = re.sub(r'(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]', '', lyric)
        lyric = re.sub(r'https?://\S+|www\.\S+', '', lyric)

        # markup, emoji, then Chinese -> English punctuation
        lyric = preprocess_punct(remove_emoji(remove_html_tag(lyric)))

        # bracketed section headers and parenthesised/braced asides
        lyric = re.sub('(\\[(.*?)])', '', lyric)
        lyric = re.sub('(\\(.*?)\\)', '', lyric)
        lyric = re.sub('(\\{.*?)\\}', '', lyric)

        # lower-case and drop standalone numbers (punctuation is kept on purpose)
        lyric = remove_numbers(lyric.lower())

        # standalone letters (spelled-out "l! o! v! e!" style lines are still an open problem)
        lyric = remove_standalone_alphabets(lyric)

        # collapse superfluous whitespace
        return ' '.join(lyric.split())

    df['lyrics'] = df['lyrics'].apply(_clean_one)
    return df

In [18]:
%%time
# Apply the lighter cleaning used for readability scoring; overwrites lyricdf['lyrics'].
# NOTE(review): the output below still has punctuation and stopwords, so lyricdf was presumably
# reloaded from CSV before this ran; in a top-to-bottom run it would receive text already
# processed by text_cleaning. Confirm the intended input.
text_cleaning_complexity(lyricdf)
lyricdf

CPU times: user 10min 9s, sys: 59.8 s, total: 11min 8s
Wall time: 11min 22s


Unnamed: 0,id,song,album,artist,acousticness_x,danceability_x,duration_ms_x,energy_x,instrumentalness_x,key_x,liveness_x,loudness_x,mode_x,speechiness_x,tempo_x,time_signature,valence_x,album_id,date,date_year,duration_ms_d,com_f_x,acousticness_y,artists,danceability_y,duration_ms_y,energy_y,explicit,instrumentalness_y,key_y,liveness_y,loudness_y,mode_y,name,popularity,release_date,speechiness_y,tempo_y,valence_y,year,release_date_year,artists_d,com_f_y,popularity_dummy,lyrics
0,1WsU2DCFFNbYgJU17zpLIe,,,,,,,,,,,,,,,,,,,,,,0.7110,['Lefty Frizzell'],0.794,155693,0.216,0,0.000000,2,0.3130,-13.160,1,"Moonlight, Darling and You",0,1953,0.0317,110.370,0.580,1953,1953,Lefty Frizzell,"Lefty Frizzell155693Moonlight, Darling and You...",0,scene .— a great terrace in the palace of hero...
1,1ZhSKUZoemAh7uzuYoWIwr,,,,,,,,,,,,,,,,,,,,,,0.4970,['Unspecified'],0.310,175413,0.764,0,0.000958,7,0.2800,-10.685,1,Bach: Great Prelude in G Major,0,1953-01-01,0.0307,179.617,0.884,1953,1953,Unspecified,Unspecified175413Bach: Great Prelude in G Majo...,0,investigation of the ferguson police departmen...
2,1Vzfi6QyWfVDtsYSaO6Imi,,,,,,,,,,,,,,,,,,,,,,0.9140,['Yves Montand'],0.380,189920,0.465,0,0.000009,5,0.1110,-10.560,1,La grande cité,0,1953,0.0420,95.418,0.438,1953,1953,Yves Montand,Yves Montand189920La grande cité0.9140.380.465...,0,comme là on est où est on est on là . de tomti...
3,1gc3NBlpqD1mvoC0T3SOe7,,,,,,,,,,,,,,,,,,,,,,0.7140,['George Lewis'],0.489,161623,0.510,0,0.000043,5,0.1180,-8.527,1,Down by the Riverside,0,1953-04-20,0.0382,98.646,0.883,1953,1953,George Lewis,George Lewis161623Down by the Riverside0.71400...,0,the whitechapel murder . during yesterday seve...
4,1dCOTwNlRg2HLJKJuWj8nN,,,,,,,,,,,,,,,,,,,,,,0.9700,['Yves Montand'],0.435,274907,0.109,0,0.000000,2,0.4210,-16.072,0,Le roi renaud de guerre revient,0,1953,0.0598,88.224,0.351,1953,1953,Yves Montand,Yves Montand274907Le roi renaud de guerre revi...,0,le roi renaud de guerre revient portant ses tr...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93622,6h4Yp7LPev3PNZvMrFCzC0,Lapinha,Fool On The Hill,Sergio Mendes,0.7630,0.658,188373.0,0.455,0.000000,2.0,0.0827,-15.270,1.0,0.0464,90.961,4.0,0.963,2xzotT2hYczm1r71dtkpYS,1968-01-01,1968.0,188373.0,Sergio Mendes188373Lapinha0.7630.6579999999999...,0.7500,"[""Sergio Mendes & Brasil '66""]",0.659,188373,0.457,0,0.000001,2,0.0823,-15.447,1,Lapinha,25,1968-01-01,0.0414,90.615,0.964,1968,1968,Sergio Mendes & Brasil '66,Sergio Mendes & Brasil '66188373Lapinha0.750.6...,1,quando eu morrer me enterre na lapinha quando ...
93623,6h5w4ez470ZaXq6sD2FozJ,Didn't Know It Was Love,Too Hot To Sleep,Survivor,0.1220,0.613,262800.0,0.683,0.000000,2.0,0.2410,-11.156,1.0,0.0332,128.416,4.0,0.637,1vDCPR1K9O81nzU2bRWYNE,1988,1988.0,262800.0,Survivor262800Didn't Know It Was Love0.1220.61...,0.1220,['Survivor'],0.613,262800,0.683,0,0.000000,2,0.2410,-11.156,1,Didn't Know It Was Love,34,1988-09-25,0.0332,128.416,0.637,1988,1988,Survivor,Survivor262800Didn't Know It Was Love0.1220.61...,1,it felt so easy in the morning sun when love w...
93624,6h77X5H4dP9l2jzG92vVv0,The Alternative Polka,Bad Hair Day,"""Weird Al"" Yankovic",0.0914,0.393,290907.0,0.878,0.000000,4.0,0.3190,-7.519,0.0,0.0558,176.646,4.0,0.898,0Jlz2oUJcRROhY8MFMp609,1996-12-31,1996.0,290907.0,"""Weird Al"" Yankovic290907The Alternative Polka...",0.0914,"['""Weird Al"" Yankovic']",0.393,290907,0.878,0,0.000000,4,0.3190,-7.519,0,The Alternative Polka,35,1996-12-31,0.0558,176.646,0.898,1996,1996,"""Weird Al"" Yankovic","""Weird Al"" Yankovic290907The Alternative Polka...",1,"soy un perdedor ! i ' a loser , baby ! so why ..."
93625,6h8laPks98zPizQ5qahy40,Winterlude,New Morning,Bob Dylan,0.9100,0.329,142947.0,0.389,0.013100,0.0,0.1140,-12.425,1.0,0.0384,50.166,4.0,0.492,48efaobqOTbvnlxbETstey,1970-10-21,1970.0,142947.0,Bob Dylan142947Winterlude0.910.328999999999999...,0.9100,['Bob Dylan'],0.329,142947,0.389,0,0.013100,0,0.1140,-12.425,1,Winterlude,39,1970-10-21,0.0384,50.166,0.492,1970,1970,Bob Dylan,Bob Dylan142947Winterlude0.910.3290.38900.0131...,1,"winterlude , winterlude , oh darling winterlud..."


In [24]:
import fkscore
from tqdm import tqdm
# Readability statistics and score for the first lyric.
text = lyricdf.loc[0, 'lyrics']
f = fkscore.fkscore(text)
f.stats, f.score

({'num_sentences': 1366, 'num_words': 14936, 'num_syllables': 14780},
 {'readability': 112.02, 'grade': '5th Grade'})

In [54]:
%%time
# Time a single fkscore call on row 2 to estimate the cost of scoring the whole frame.
text = lyricdf.loc[2, 'lyrics']
f = fkscore.fkscore(text)
(f.stats, f.score)

CPU times: user 599 ms, sys: 10 ms, total: 609 ms
Wall time: 611 ms


({'num_sentences': 1207, 'num_words': 20099, 'num_syllables': 22887},
 {'readability': 93.598, 'grade': '5th Grade'})

In [38]:
# Store the stats dict as a string in the first cell of `a`.
# NOTE(review): no cell above defines `a`; the only visible assignment (a = te[11:], further down)
# makes it a str, which has no .iloc. This cell relies on hidden kernel state.
a.iloc[0, 0] = str(f.stats)

In [64]:
%%time
for i in tqdm(range(len(lyricdf))):
    lyricdf.loc[i, 'stats_dict_tuple'] = str((fkscore.fkscore(lyricdf.loc[i, 'lyrics']).stats, fkscore.fkscore(lyricdf.loc[i, 'lyrics']).score))

KeyError: 'error'

In [None]:
%%time
# Inspect the stringified (stats, score) column written by the loop above.
lyricdf['stats_dict_tuple']

In [7]:
# TODO: accents still need stripping, and stopword removal and lemmatization still need to run after translation.

# Persist the current state of lyricdf.
lyricdf.to_csv('basic_cleansing_0407_lda.csv')

In [None]:
# Accent stripping can hopefully be handled by the translation step.

In [None]:
# Test translation twice: once before preprocessing and once after.

In [12]:
# Read the whole sample file. The context manager closes the handle, which the previous
# bare open() left open; `f` stays bound afterwards, but to a closed file.
with open("TextBlob.txt", "r") as f:
    te = f.read()

In [17]:
# Drop the first 11 characters of the file contents.
# NOTE(review): magic offset; presumably skips a fixed prefix in TextBlob.txt. Confirm.
a = te[11:]

In [18]:
# Wrap the sample text in a one-row frame with a 'lyrics' column so text_cleaning can be applied to it.
dfte = pd.DataFrame()
dfte.loc[0, 'lyrics'] = a

In [19]:
# Run the full cleaning pipeline on the sample (overwrites dfte['lyrics']) and show the result.
text_cleaning(dfte)
dfte

Unnamed: 0,lyrics
0,like tomtittot titoutomtotalitarien unde ubi b...


In [20]:
# NOTE(review): imports are repeated/scattered here; time, tqdm, detect and textblob.exceptions
# are not used in the cells visible in this notebook.
import pandas as pd
import time
from tqdm import tqdm
from textblob import TextBlob
from langdetect import detect
import textblob.exceptions
# Translate the cleaned sample to English; the source language is hard-coded as French.
blob = TextBlob(dfte.loc[0, 'lyrics']).translate(from_lang='fr',to='en')

In [22]:
# Retrieve the translated text as a plain string (split and re-join on whitespace)
# and preview its first 200 characters.
' '.join(blob.split())[:200]

'like tomtittot titoutomtotalitarien unde ubi broad hairy face ireland que viencra disgrace lead else keep money find pint portere place done says high guard cis honestly stone fields chop fatty meat s'

In [3]:
# Load the lyric feature table from the project folder.
# NOTE(review): absolute, user-specific path; this only runs on the author's machine and exposes
# a local account name. Other cells read 'lyric_features.csv' relatively.
lyrics93627 = pd.read_csv('/Users/Amphetamine/Desktop/OneDrive - National University of Singapore/BT5153 Spotify Project/lyric_features.csv')

In [4]:
# Inspect the row at position 41045 of the feature table.
lyrics93627.iloc[41045, :]

Unnamed: 0                                                             41045
id                                                    4fdaOQs7w18AgDHK0XNsAX
song                                                                     NaN
album                                                                    NaN
artist                                                                   NaN
acousticness_x                                                           NaN
danceability_x                                                           NaN
duration_ms_x                                                            NaN
energy_x                                                                 NaN
instrumentalness_x                                                       NaN
key_x                                                                    NaN
liveness_x                                                               NaN
loudness_x                                                               NaN

In [5]:
# Load the lyrics prepared for the complexity analysis.
# NOTE(review): absolute, user-specific path; not portable to another machine.
com = pd.read_csv('/Users/Amphetamine/Desktop/OneDrive - National University of Singapore/BT5153 Spotify Project/lyricdf_for_complexity.csv')

In [6]:
# Inspect the same position (41045) in the complexity frame for comparison.
com.iloc[41045, :]

Unnamed: 0                                                        41045
id                                               4fdaOQs7w18AgDHK0XNsAX
song                                                                NaN
album                                                               NaN
artist                                                              NaN
acousticness_x                                                      NaN
danceability_x                                                      NaN
duration_ms_x                                                       NaN
energy_x                                                            NaN
instrumentalness_x                                                  NaN
key_x                                                               NaN
liveness_x                                                          NaN
loudness_x                                                          NaN
mode_x                                                          

In [43]:
# Reload the feature table, using the first CSV column as the index.
fea = pd.read_csv('lyric_features.csv', index_col = 0)

In [44]:
# Compare two neighbouring rows. In the output below the Language value ('en' / 'es') does not
# match the language of the lyrics shown on the same row.
fea.iloc[41045: 41047, :]

Unnamed: 0,id,song,album,artist,acousticness_x,danceability_x,duration_ms_x,energy_x,instrumentalness_x,key_x,liveness_x,loudness_x,mode_x,speechiness_x,tempo_x,time_signature,valence_x,album_id,date,date_year,duration_ms_d,com_f_x,acousticness_y,artists,danceability_y,duration_ms_y,energy_y,explicit,instrumentalness_y,key_y,liveness_y,loudness_y,mode_y,name,popularity,release_date,speechiness_y,tempo_y,valence_y,year,release_date_year,artists_d,com_f_y,popularity_dummy,lyrics,stats_dict_tuple,num_sentences,num_words,num_syllables,readability,grade,richness,Language,neg,neu,pos,compound,compound_vader_polarity
41045,4fdaOQs7w18AgDHK0XNsAX,,,,,,,,,,,,,,,,,,,,,,0.571,['Jorge Valente'],0.638,152280,0.339,0,0.00104,4,0.0876,-10.973,1,Amorosa,11,1961,0.0368,122.228,0.541,1961,1961,Jorge Valente,Jorge Valente152280Amorosa0.57100000000000010....,0,buenas habíamos visto la vez pasada como una c...,"({'num_sentences': 1, 'num_words': 9073, 'num_...",1.0,9073.0,15864.0,-9150.182,College Graduate,0.112421,en,0.08,0.755,0.165,0.9591,1.0
41046,4hviK3ix9tdsJMPFjmQFQz,,,,,,,,,,,,,,,,,,,,,,0.876,['The Shirelles'],0.468,152173,0.384,0,0.0,10,0.115,-8.469,1,I Saw A Tear,11,1961-02-01,0.039,117.477,0.39,1961,1961,The Shirelles,The Shirelles152173I Saw A Tear0.8759999999999...,0,1 . like a rolling stone by bob dylan 2 . sati...,"({'num_sentences': 523, 'num_words': 4816, 'nu...",523.0,4816.0,4859.0,112.133,5th Grade,0.277829,es,0.028,0.955,0.016,-0.9965,-1.0


In [11]:
import fkscore
from tqdm import tqdm
# Re-score row 41045: the output shows it counted as a single sentence of 9073 words,
# which goes with the extreme negative readability value.
text = fea.loc[41045, 'lyrics']
f = fkscore.fkscore(text)
f.stats, f.score

({'num_sentences': 1, 'num_words': 9073, 'num_syllables': 15864},
 {'readability': -9150.182, 'grade': 'College Graduate'})

In [45]:
# Load the language and VADER sentiment columns that were saved without the lyrics text.
without = pd.read_csv('lyrics_language_sentiment_withoutlyrics.csv', index_col=0)

In [46]:
# Display the language/sentiment frame.
without

Unnamed: 0,Language,neg,neu,pos,compound,compound_vader_polarity
0,en,0.100,0.782,0.118,0.9997,1.0
1,en,0.125,0.802,0.073,-1.0000,-1.0
2,fr,0.019,0.970,0.011,-0.9968,-1.0
3,en,0.083,0.869,0.048,-0.9996,-1.0
4,fr,0.000,0.979,0.021,0.7293,1.0
...,...,...,...,...,...,...
93624,pt,0.000,1.000,0.000,0.0000,0.0
93625,en,0.108,0.728,0.164,0.9471,1.0
93626,en,0.116,0.777,0.107,0.6269,1.0
93627,en,0.062,0.802,0.137,0.9253,1.0


In [47]:
# Keep every row of `without` except the ones labelled 40638 and 42199.
cdf = without.loc[~without.index.isin([40638, 42199])]
cdf

Unnamed: 0,Language,neg,neu,pos,compound,compound_vader_polarity
0,en,0.100,0.782,0.118,0.9997,1.0
1,en,0.125,0.802,0.073,-1.0000,-1.0
2,fr,0.019,0.970,0.011,-0.9968,-1.0
3,en,0.083,0.869,0.048,-0.9996,-1.0
4,fr,0.000,0.979,0.021,0.7293,1.0
...,...,...,...,...,...,...
93624,pt,0.000,1.000,0.000,0.0000,0.0
93625,en,0.108,0.728,0.164,0.9471,1.0
93626,en,0.116,0.777,0.107,0.6269,1.0
93627,en,0.062,0.802,0.137,0.9253,1.0


In [48]:
# Renumber the remaining rows from 0 so positions after the dropped labels shift up.
cdf = cdf.reset_index(drop = True)

In [49]:
# List fea's columns; the language/sentiment columns of cdf are already present here.
fea.columns

Index(['id', 'song', 'album', 'artist', 'acousticness_x', 'danceability_x',
       'duration_ms_x', 'energy_x', 'instrumentalness_x', 'key_x',
       'liveness_x', 'loudness_x', 'mode_x', 'speechiness_x', 'tempo_x',
       'time_signature', 'valence_x', 'album_id', 'date', 'date_year',
       'duration_ms_d', 'com_f_x', 'acousticness_y', 'artists',
       'danceability_y', 'duration_ms_y', 'energy_y', 'explicit',
       'instrumentalness_y', 'key_y', 'liveness_y', 'loudness_y', 'mode_y',
       'name', 'popularity', 'release_date', 'speechiness_y', 'tempo_y',
       'valence_y', 'year', 'release_date_year', 'artists_d', 'com_f_y',
       'popularity_dummy', 'lyrics', 'stats_dict_tuple', 'num_sentences',
       'num_words', 'num_syllables', 'readability', 'grade', 'richness',
       'Language', 'neg', 'neu', 'pos', 'compound', 'compound_vader_polarity'],
      dtype='object')

In [51]:
# Count positions where the two indexes hold the same label (93627 in the recorded run).
(fea.index == cdf.index).sum()

93627

In [52]:
# Overwrite fea's language/sentiment columns with the values from cdf, one column at a time.
# Relies on fea and cdf sharing the same index (checked with (fea.index == cdf.index).sum()).
# Presumably this repairs the row misalignment of those columns; confirm.
for i in cdf.columns:
    fea[i] = cdf[i]

In [53]:
# Save the feature table after the language/sentiment columns were replaced.
fea.to_csv('dbqprofjinjin.csv')

In [31]:
# Count missing values per column of the language/sentiment frame.
without.isna().sum()

Language                   0
neg                        0
neu                        0
pos                        0
compound                   0
compound_vader_polarity    0
dtype: int64

In [34]:
# Row count of the feature table, to compare with cdf.
len(fea)

93627

In [35]:
# Row count of the filtered language/sentiment frame; should equal len(fea).
len(cdf)

93627

In [36]:
# Inspect fea's index labels before aligning it with cdf.
fea.index

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            93617, 93618, 93619, 93620, 93621, 93622, 93623, 93624, 93625,
            93626],
           dtype='int64', length=93627)

In [38]:
# Give fea the same index as cdf so the column copy below lines rows up by position.
fea.index = cdf.index

In [39]:
# Overwrite fea's language/sentiment columns with the values from cdf, one column at a time.
# NOTE(review): same loop as the cell that wrote 'dbqprofjinjin.csv'; the execution counts
# suggest this one ran first. Keep only one of the two.
for i in cdf.columns:
    fea[i] = cdf[i]

In [41]:
# Save the feature table after the language/sentiment columns were replaced.
fea.to_csv('dbqprofjin.csv')

In [42]:
# Spot-check position 41045 after the column replacement.
fea.iloc[41045, :]

id                                                    4fdaOQs7w18AgDHK0XNsAX
song                                                                     NaN
album                                                                    NaN
artist                                                                   NaN
acousticness_x                                                           NaN
danceability_x                                                           NaN
duration_ms_x                                                            NaN
energy_x                                                                 NaN
instrumentalness_x                                                       NaN
key_x                                                                    NaN
liveness_x                                                               NaN
loudness_x                                                               NaN
mode_x                                                                   NaN