# **Exercise:** *Data Processing*
### July 18, 2022

In [175]:
# Imports
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from config import ml_songs_data
from datetime import datetime
from datetime import timedelta
pd.options.mode.chained_assignment = None

# Load the raw song dataset from the path supplied by the project config
raw_song_data = pd.read_csv(filepath_or_buffer=ml_songs_data)

# Summarise column dtypes and non-null counts to spot missing data
raw_song_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1994 entries, 0 to 1993
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Index                   1994 non-null   int64  
 1   Title                   1991 non-null   object 
 2   Artist                  1986 non-null   object 
 3   Top Genre               1986 non-null   object 
 4   Year                    1987 non-null   float64
 5   Month                   1994 non-null   object 
 6   Beats Per Minute (BPM)  1985 non-null   float64
 7   Energy                  1986 non-null   float64
 8   Danceability            970 non-null    float64
 9   Loudness (dB)           1987 non-null   float64
 10  Liveness                1986 non-null   float64
 11  Valence                 960 non-null    float64
 12  Length (Duration)       1985 non-null   object 
 13  Acousticness            1986 non-null   float64
 14  Speechiness             1990 non-null   

In [176]:
# Remove columns and rows in which at least half of the values are missing
essential_song_data = raw_song_data.copy()

# Remove columns where at least 50% of the entries are missing.
# isna().sum() gives the number of missing values per column.
missing_per_column = essential_song_data.isna().sum()
remove_cols = missing_per_column[
    missing_per_column >= len(essential_song_data) / 2
].index.tolist()
essential_song_data = essential_song_data.drop(columns=remove_cols)

# Remove rows where at least 50% of the remaining columns are missing.
# The rows are collected by index *label* (not position), because
# DataFrame.drop(index=...) interprets its argument as labels; this keeps
# the step correct even if the frame does not carry a default RangeIndex.
missing_per_row = essential_song_data.isna().sum(axis=1)
remove_rows = missing_per_row[
    missing_per_row >= len(essential_song_data.columns) / 2
].index.tolist()
essential_song_data = essential_song_data.drop(index=remove_rows)

In [177]:
# Fix data hygiene typing issues
hygienic_song_data = essential_song_data.copy()

# Remove Title, Artist, and Index because they cannot easily be passed to an ML algorithm
hygienic_song_data = hygienic_song_data.drop(columns=['Title', 'Artist', 'Index'])

# Convert month abbreviations into ints
month_dict = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
hygienic_song_data['Month'] = hygienic_song_data['Month'].replace(month_dict).astype(int)

# Remove thousands separators from Length (Duration) and convert to numbers.
# Assigning the whole column (instead of chained `df[col][mask] = ...`) is
# required for the write to reach the frame reliably, and pd.to_numeric leaves
# missing entries as NaN so they can be imputed below.
hygienic_song_data['Length (Duration)'] = pd.to_numeric(
    hygienic_song_data['Length (Duration)'].str.replace(',', '', regex=False)
)

# Pass all numeric data through mean imputation.
# The result is rebuilt as a DataFrame with the original index and column
# labels so it lines up with the categorical part when concatenated.
imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
numeric_data = hygienic_song_data.drop(columns='Top Genre')
numeric_data = pd.DataFrame(
    imputer_mean.fit_transform(numeric_data),
    columns=numeric_data.columns,
    index=numeric_data.index,
)

# Pass all categorical data through mode imputation, then one-hot encode
imputer_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
categorical_data = hygienic_song_data[['Top Genre']]
categorical_data = pd.DataFrame(
    imputer_mode.fit_transform(categorical_data),
    columns=categorical_data.columns,
    index=categorical_data.index,
)
categorical_data = pd.get_dummies(categorical_data, drop_first=True)

# Put data back together
hygienic_song_data = pd.concat([numeric_data, categorical_data], axis=1)

# Convert year and month columns into age (in 365-day years).
# Year may be a fractional mean-imputed value; astype(int) truncates it, and
# every song is treated as released on the first day of its month.
# A single reference timestamp is used so all rows share the same "now".
release_dates = pd.to_datetime(pd.DataFrame({
    'year': hygienic_song_data['Year'].astype(int),
    'month': hygienic_song_data['Month'].astype(int),
    'day': 1,
}))
reference_time = pd.Timestamp(datetime.now())
hygienic_song_data['Age'] = (reference_time - release_dates) / timedelta(days=365)
hygienic_song_data = hygienic_song_data.drop(columns=['Year', 'Month'])

In [178]:
# Find outliers
def findOutliers(column):
    """Return the positions of values more than three standard deviations from the mean.

    Parameters
    ----------
    column
        A pandas Series (the notebook passes DataFrame columns) providing
        ``mean()`` and ``std()``.

    Returns
    -------
    list of int
        Zero-based positions (not index labels) of the values that fall
        strictly below ``mean - 3 * std`` or strictly above ``mean + 3 * std``.
    """
    center = column.mean()
    spread = column.std()

    # The acceptance band is the same for every element, so compute it once
    lower_bound = center - 3 * spread
    upper_bound = center + 3 * spread

    # NaN values compare False on both sides and are therefore never reported
    return [
        position
        for position, value in enumerate(column)
        if value < lower_bound or value > upper_bound
    ]

In [179]:
# Report the mean, standard deviation and outlier count of every column
for name in hygienic_song_data.columns:
    series = hygienic_song_data[name]
    print(f'  Column: {name}')
    print(f'    Mean: {series.mean()}')
    print(f'     STD: {series.std()}')
    outlier_positions = findOutliers(series)
    print(f'Outliers: {len(outlier_positions)}')
    print()

  Column: Beats Per Minute (BPM)
    Mean: 120.23449319213313
     STD: 28.618272515054493
Outliers: 4

  Column: Energy
    Mean: 59.64429868819375
     STD: 22.149824507895488
Outliers: 0

  Column: Loudness (dB)
    Mean: -9.016145307769929
     STD: 3.64864167473393
Outliers: 22

  Column: Liveness
    Mean: 19.019667170953102
     STD: 16.737749503095667
Outliers: 57

  Column: Length (Duration)
    Mean: 262.61654894046416
     STD: 93.72513158581434
Outliers: 26

  Column: Acousticness
    Mean: 28.916834677419356
     STD: 29.0141742284192
Outliers: 0

  Column: Speechiness
    Mean: 4.991939546599496
     STD: 4.406994156747299
Outliers: 42

  Column: Popularity
    Mean: 59.51183879093199
     STD: 14.3630854123838
Outliers: 9

  Column: Top Genre_acoustic pop
    Mean: 0.002014098690835851
     STD: 0.04484478467966559
Outliers: 4

  Column: Top Genre_adult standards
    Mean: 0.06092648539778449
     STD: 0.23925566278511493
Outliers: 121

  Column: Top Genre_afropop
    Me

In [180]:
# Persist the cleaned dataset as CSV (the DataFrame index is written as the first column)
hygienic_song_data.to_csv(path_or_buf='data/song_data_clean.csv')