In [1]:
import pandas as pd

In [2]:
# Read SongCSV_filtered.csv into the `train` DataFrame; the cells below
# work on its 'Lyrics' column.
train = pd.read_csv(filepath_or_buffer='SongCSV_filtered.csv')

In [3]:
# Normalise case: every whitespace-separated token is lower-cased and the
# tokens are re-joined with single spaces. Each cell is passed through str()
# first, so non-string values are stringified rather than raising.
def _to_lowercase(lyric):
    """Return `lyric` lower-cased, with runs of whitespace collapsed to one space."""
    tokens = str(lyric).split()
    return " ".join([token.lower() for token in tokens])

train['Lyrics'] = train['Lyrics'].apply(_to_lowercase)
train['Lyrics'].head()

0                                         instrumental
1    it's time that i rain on your parade watch as ...
2    you who's coming up the stairs, shouting- i’m ...
3    life is like a merry go round painted horses r...
4                                         instrumental
Name: Lyrics, dtype: object

In [4]:
# Remove punctuation that does not add meaning to the song: every character
# that is neither a word character nor whitespace is dropped.
#
# The pattern is a raw string so that `\w` and `\s` reach the regex engine
# instead of being read as (invalid) Python string escapes, and `regex=True`
# is passed explicitly because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the lyrics unchanged.
train['Lyrics'] = train['Lyrics'].str.replace(r'[^\w\s]', '', regex=True)
train['Lyrics'].head()

0                                         instrumental
1    its time that i rain on your parade watch as a...
2    you whos coming up the stairs shouting im comi...
3    life is like a merry go round painted horses r...
4                                         instrumental
Name: Lyrics, dtype: object

In [5]:
# Remove English stop words (NLTK list) from every lyric.
from nltk.corpus import stopwords

stop = stopwords.words('english')
# Testing `token not in stop` against the list scans it for every token of
# every lyric; a frozenset gives the same answers with constant-time lookups.
_stop_lookup = frozenset(stop)

# NOTE(review): punctuation has already been stripped from the lyrics at this
# point, so any stop-list entry that contains an apostrophe (if the list has
# such entries) can never match a token here -- confirm this is intended.
def _drop_stop_words(lyric):
    """Return `lyric` without the tokens that appear in the stop list.

    The value is stringified first, split on whitespace, filtered, and
    re-joined with single spaces.
    """
    return " ".join(token for token in str(lyric).split() if token not in _stop_lookup)

train['Lyrics'] = train['Lyrics'].apply(_drop_stop_words)
train['Lyrics'].head()

0                                         instrumental
1    time rain parade watch hopes explode landmines...
2    whos coming stairs shouting im coming dying li...
3    life like merry go round painted horses riding...
4                                         instrumental
Name: Lyrics, dtype: object

In [6]:
# #Top ten most occurring words in lyrics...
# freq = pd.Series(' '.join(train['Lyrics']).split()).value_counts()[:10]
# freq

In [7]:
# #Removal of those common words
# freq = list(freq.index)
# train['Lyrics'] = train['Lyrics'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
# train['Lyrics'].head()

In [8]:
# Rare words: the last ten entries of the word-frequency table built over
# all lyrics. These words occur so rarely that they carry little meaning.
all_tokens = ' '.join(train['Lyrics']).split()
token_counts = pd.Series(all_tokens).value_counts()
freq = token_counts[-10:]
freq

autres        1
reporting     1
binningham    1
menneet       1
likkle        1
dueña         1
namesake      1
oprah         1
earrings      1
capisci       1
dtype: int64

In [9]:
# Remove the rare words collected in `freq` from every lyric.
#
# `freq` is rebound from the frequency Series to a plain list of its words,
# as before. The isinstance guard keeps this cell re-runnable: on a second
# run `freq` is already that list, and `list(freq.index)` would raise because
# `list.index` is a method, not an iterable of labels.
if isinstance(freq, pd.Series):
    freq = list(freq.index)
# Set copy used only for fast membership tests; `freq` itself stays a list.
_rare_lookup = set(freq)

def _drop_rare_words(lyric):
    """Return `lyric` without the tokens listed in `freq`, re-joined with single spaces."""
    return " ".join(token for token in lyric.split() if token not in _rare_lookup)

train['Lyrics'] = train['Lyrics'].apply(_drop_rare_words)
train['Lyrics'].head()

0                                         instrumental
1    time rain parade watch hopes explode landmines...
2    whos coming stairs shouting im coming dying li...
3    life like merry go round painted horses riding...
4                                         instrumental
Name: Lyrics, dtype: object

In [10]:
# Spelling correction: each lyric is wrapped in a TextBlob, corrected, and
# converted back to a plain string.
from textblob import TextBlob

def _fix_spelling(lyric):
    """Return the result of TextBlob's `correct()` for one lyric as a str."""
    corrected_blob = TextBlob(lyric).correct()
    return str(corrected_blob)

train['Lyrics'] = train['Lyrics'].apply(_fix_spelling)

In [11]:
# Preview the first five lyrics after the spelling-correction step.
train['Lyrics'].head(5)

0                                         instrumental
1    time rain parade watch hopes explode landmines...
2    who coming stairs shouting in coming dying lik...
3    life like merry go round painted horses riding...
4                                         instrumental
Name: Lyrics, dtype: object

In [12]:
# #Stemming is basically removing the 'ly' or 'ing' from the end of the words....
# from nltk.stem import PorterStemmer
# st = PorterStemmer()
# train['Lyrics'] = train['Lyrics'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
# train['Lyrics'].head()

In [14]:
# Lemmatization: every whitespace-separated token of a lyric is replaced by
# the result of TextBlob's Word.lemmatize(). Used here instead of stemming.
from textblob import Word

def _lemmatize_lyric(lyric):
    """Lemmatize each token of `lyric` and re-join the results with single spaces."""
    lemmas = []
    for token in lyric.split():
        lemmas.append(Word(token).lemmatize())
    return " ".join(lemmas)

train['Lyrics'] = train['Lyrics'].apply(_lemmatize_lyric)
train['Lyrics'].head()

0                                         instrumental
1    time rain parade watch hope explode landmines ...
2    who coming stair shouting in coming dying like...
3    life like merry go round painted horse riding ...
4                                         instrumental
Name: Lyrics, dtype: object

In [15]:
# Wrap the processed 'Lyrics' column in its own DataFrame for export.
df = pd.DataFrame(data=train['Lyrics'])

In [16]:
# Preview the first five rows of the export frame.
df.head(5)

Unnamed: 0,Lyrics
0,instrumental
1,time rain parade watch hope explode landmines ...
2,who coming stair shouting in coming dying like...
3,life like merry go round painted horse riding ...
4,instrumental


In [17]:
# Write the processed lyrics to disk; index=False leaves the row index out of
# the file.
df.to_csv(path_or_buf='pre_processed_lyrics(new).csv', index=False)