# Cleaning raw data to get ready for modeling

In [44]:
import pandas as pd

In [45]:
# Read train.csv into a dataframe
train_csv = "/Users/sa17/Desktop/Music-Recommendation-Algorithm/data/train.csv"
df = pd.read_csv(train_csv)

# Preview the first rows
df.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,communication,obscene,music,movement/places,light/visual perceptions,family/spiritual,sadness,feelings,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.263751,0.000598,0.039288,0.000598,0.000598,0.000598,0.380299,0.117175,sadness,1.0
1,4,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.118034,0.001284,0.212681,0.051124,0.001284,0.001284,world/life,1.0
2,6,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.250668,0.00277,0.323794,0.00277,0.00277,0.00277,0.00277,0.225422,music,1.0
3,10,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.001548,0.001548,0.001548,0.12925,0.001548,0.001548,0.225889,0.001548,romantic,1.0
4,12,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,...,0.00135,0.00135,0.00135,0.00135,0.00135,0.029755,0.0688,0.00135,romantic,1.0


In [46]:
# Create a new dataframe without the unnecessary columns
columns = [
    "violence", "world/life", "night/time", "romantic",
    "obscene", "music", "sadness", "feelings",
    "len", "age",
]

# Label-based selection of every row for the listed columns
new_df = df.loc[:, columns]

new_df

Unnamed: 0,violence,world/life,night/time,romantic,obscene,music,sadness,feelings,len,age
0,0.063746,0.000598,0.000598,0.017104,0.000598,0.039288,0.380299,0.117175,95,1.000000
1,0.096777,0.443435,0.001284,0.001284,0.001284,0.118034,0.001284,0.001284,51,1.000000
2,0.002770,0.002770,0.002770,0.158564,0.002770,0.323794,0.002770,0.225422,24,1.000000
3,0.001548,0.001548,0.001548,0.411536,0.001548,0.001548,0.225889,0.001548,54,1.000000
4,0.001350,0.417772,0.001350,0.463430,0.001350,0.001350,0.068800,0.001350,48,1.000000
...,...,...,...,...,...,...,...,...,...,...
28357,0.001350,0.001350,0.001350,0.001350,0.391651,0.001350,0.065664,0.001350,78,0.014286
28358,0.001284,0.035338,0.001284,0.066324,0.318910,0.058152,0.001284,0.001284,67,0.014286
28359,0.154302,0.168988,0.001504,0.035401,0.356685,0.001504,0.001504,0.001504,77,0.014286
28360,0.001196,0.001196,0.001196,0.001196,0.492434,0.103614,0.001196,0.001196,67,0.014286


Removed all categorical columns (artist_name, track_name, genre, lyrics, topic), as only numeric values are needed to perform the modeling.

Removed dating, shake the audience, family/gospel, communication, movement/places, light/visual perceptions and family/spiritual, as each of these features falls into one of the 8 song topics, which are:
violence, world/life, night/time, romantic, obscene, music, sadness, feelings.

Removed release_date, as age is a sufficient value to show the relationship between time and each feature.



In [47]:
# Remove fully duplicated rows, keeping the first occurrence of each
new_df = new_df.drop_duplicates(keep="first")

new_df

Unnamed: 0,violence,world/life,night/time,romantic,obscene,music,sadness,feelings,len,age
0,0.063746,0.000598,0.000598,0.017104,0.000598,0.039288,0.380299,0.117175,95,1.000000
1,0.096777,0.443435,0.001284,0.001284,0.001284,0.118034,0.001284,0.001284,51,1.000000
2,0.002770,0.002770,0.002770,0.158564,0.002770,0.323794,0.002770,0.225422,24,1.000000
3,0.001548,0.001548,0.001548,0.411536,0.001548,0.001548,0.225889,0.001548,54,1.000000
4,0.001350,0.417772,0.001350,0.463430,0.001350,0.001350,0.068800,0.001350,48,1.000000
...,...,...,...,...,...,...,...,...,...,...
28357,0.001350,0.001350,0.001350,0.001350,0.391651,0.001350,0.065664,0.001350,78,0.014286
28358,0.001284,0.035338,0.001284,0.066324,0.318910,0.058152,0.001284,0.001284,67,0.014286
28359,0.154302,0.168988,0.001504,0.035401,0.356685,0.001504,0.001504,0.001504,77,0.014286
28360,0.001196,0.001196,0.001196,0.001196,0.492434,0.103614,0.001196,0.001196,67,0.014286


Only 4 rows were lost after dropping duplicates.

In [48]:
# Create a function to remove outliers 
# Columns that the outlier filter is applied to
column = [
    "violence",
    "world/life",
    "night/time",
    "romantic",
    "obscene",
    "music",
    "sadness",
    "feelings",
    "len",
    "age",
]

def remove_outliers(df, cols, factor=1.5):
    """Drop rows whose values fall outside the IQR fences of the given columns.

    Columns are processed one after another, in the order given. For each
    column the quartiles are computed on the rows that survived the filters
    of the previous columns, so the fences (and therefore the result) depend
    on the order of ``cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first and never modified.
    cols : iterable of str
        Labels of the columns to filter on.
    factor : float, default 1.5
        Multiple of the inter-quartile range added above Q3 and subtracted
        below Q1 to obtain the fences. The default reproduces the previously
        hard-coded 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The rows lying within ``[Q1 - factor * IQR, Q3 + factor * IQR]``
        (bounds inclusive) for every column in ``cols``. Original index
        labels are kept. Rows with a missing value in a filtered column are
        dropped, because comparisons against NaN evaluate to False.

    Raises
    ------
    KeyError
        If a label in ``cols`` is not a column of ``df``.
    """
    clean_df = df.copy()

    for col in cols:
        # Quartiles are taken from the rows that are still present
        Q1 = clean_df[col].quantile(0.25)
        Q3 = clean_df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - factor * IQR
        upper_bound = Q3 + factor * IQR

        # Filter based on the current column (both bounds inclusive)
        clean_df = clean_df[clean_df[col].between(lower_bound, upper_bound)]

    return clean_df

# Run the outlier filter over the selected columns
process_df = remove_outliers(df=new_df, cols=column)

# Report the frame dimensions before and after filtering
print(f"Original shape: {new_df.shape}")
print(f"New shape after removing outliers: {process_df.shape}")

process_df

Original shape: (28358, 10)
New shape after removing outliers: (13971, 10)


Unnamed: 0,violence,world/life,night/time,romantic,obscene,music,sadness,feelings,len,age
1,0.096777,0.443435,0.001284,0.001284,0.001284,0.118034,0.001284,0.001284,51,1.000000
5,0.420685,0.001053,0.074078,0.001053,0.001053,0.001053,0.128292,0.001053,98,1.000000
9,0.102548,0.001120,0.053944,0.001120,0.001120,0.001120,0.630507,0.001120,61,1.000000
12,0.244358,0.083570,0.000627,0.000627,0.458984,0.000627,0.000627,0.032581,173,0.985714
13,0.000786,0.150691,0.000786,0.000786,0.000786,0.173311,0.445469,0.045097,73,0.985714
...,...,...,...,...,...,...,...,...,...,...
28356,0.001224,0.105172,0.001224,0.100856,0.325359,0.001224,0.001224,0.001224,61,0.014286
28357,0.001350,0.001350,0.001350,0.001350,0.391651,0.001350,0.065664,0.001350,78,0.014286
28358,0.001284,0.035338,0.001284,0.066324,0.318910,0.058152,0.001284,0.001284,67,0.014286
28359,0.154302,0.168988,0.001504,0.035401,0.356685,0.001504,0.001504,0.001504,77,0.014286


In [49]:
# Count missing values in each column
process_df.isna().sum()

violence      0
world/life    0
night/time    0
romantic      0
obscene       0
music         0
sadness       0
feelings      0
len           0
age           0
dtype: int64

In [50]:
# Write the cleaned frame to CSV without the index column
process_df.to_csv(
    "/Users/sa17/Desktop/Music-Recommendation-Algorithm/data/cleantrain.csv",
    index=False,
)