In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer

# --- Load --------------------------------------------------------------------
# NOTE(review): absolute, machine-specific path; the cell only runs where this
# exact location exists.
df = pd.read_csv(
    "C:\\Users\\hp\\Documents\\100 days\\Hit Potential\\spotify-2023.csv",
    encoding='ISO-8859-1',
)

# --- Clean -------------------------------------------------------------------
# Keep complete rows only, discard the mostly-zero instrumentalness column when
# it is present, and end with an explicit copy so the column assignments below
# do not trigger SettingWithCopyWarning.
new_df = (
    df.dropna()
      .drop(columns='instrumentalness_%', errors='ignore')
      .copy()
)

# --- Measure skewness --------------------------------------------------------
# Only int64 / float64 columns take part in the skewness check.
num_cols = new_df.select_dtypes(include=['int64', 'float64']).columns
skew_vals = new_df[num_cols].skew()

# A column is treated as highly skewed when |skew| is above 0.75.
exceeds_threshold = skew_vals.abs() > 0.75
high_skew_cols = skew_vals[exceeds_threshold].index

print("Highly skewed columns:")
print(skew_vals[high_skew_cols].sort_values(ascending=False))

# --- Transform ---------------------------------------------------------------
# Yeo-Johnson is used because it accepts zeros and negative values.
# NOTE(review): in the recorded run `released_year_yj` still shows a skew of
# -3.2139 and the output contains numpy warnings raised from its multiply/sum
# code (presumably numeric overflow; confirm). Rescaling that column before
# fitting may be worth trying.
pt = PowerTransformer(method='yeo-johnson')
transformed_data = pt.fit_transform(new_df[high_skew_cols])

# Attach every transformed column to the frame under "<original name>_yj".
yj_names = [f"{source_name}_yj" for source_name in high_skew_cols]
for yj_name, yj_values in zip(yj_names, transformed_data.T):
    new_df[yj_name] = yj_values

# Skewness of the transformed values, computed on a standalone frame.
transformed_df = pd.DataFrame(transformed_data, columns=yj_names)
new_skew_vals = transformed_df.skew()

print("\nSkewness after Yeo-Johnson:")
for yj_name, yj_skew in new_skew_vals.items():
    print(f"{yj_name}: {yj_skew:.4f}")



Highly skewed columns:
in_deezer_charts        3.424164
in_spotify_playlists    3.156998
in_spotify_charts       2.426236
in_apple_playlists      2.377990
artist_count            2.262566
liveness_%              2.134825
speechiness_%           1.834621
in_apple_charts         1.108591
acousticness_%          0.972033
released_year          -4.451921
dtype: float64

Skewness after Yeo-Johnson:
artist_count_yj: 0.5095
released_year_yj: -3.2139
in_spotify_playlists_yj: 0.0014
in_spotify_charts_yj: 0.1577
in_apple_playlists_yj: -0.0221
in_apple_charts_yj: -0.1860
in_deezer_charts_yj: 0.6186
acousticness_%_yj: -0.1022
liveness_%_yj: 0.0453
speechiness_%_yj: 0.2147


  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


In [4]:
# Summarise the column dtypes and non-null counts of the working frame.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 817 entries, 0 to 952
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   track_name               817 non-null    object 
 1   artist(s)_name           817 non-null    object 
 2   artist_count             817 non-null    int64  
 3   released_year            817 non-null    int64  
 4   released_month           817 non-null    int64  
 5   released_day             817 non-null    int64  
 6   in_spotify_playlists     817 non-null    int64  
 7   in_spotify_charts        817 non-null    int64  
 8   streams                  817 non-null    object 
 9   in_apple_playlists       817 non-null    int64  
 10  in_apple_charts          817 non-null    int64  
 11  in_deezer_playlists      817 non-null    object 
 12  in_deezer_charts         817 non-null    int64  
 13  in_shazam_charts         817 non-null    object 
 14  bpm                      817 no

In [None]:
# Count-like columns that were read as text (dtype `object`) and therefore
# need cleaning before they can be used numerically.
cols_to_fix = ['streams', 'in_deezer_playlists', 'in_shazam_charts']

# No `.astype(float)` here on purpose: one `streams` entry is not a number at
# all, so a blanket cast raises ValueError and stops a top-to-bottom run.
# The columns are converted further down, after the malformed row has been
# located and removed.


In [8]:
# For every text-typed count column, print the rows whose value still fails
# `str.isnumeric()` once commas and surrounding whitespace are removed.
for column_name in cols_to_fix:
    raw_values = new_df[column_name]
    looks_numeric = (
        raw_values.str.replace(',', '', regex=False)
                  .str.strip()
                  .str.isnumeric()
    )
    print(f"\n--- Problematic entries in: {column_name} ---")
    print(raw_values[~looks_numeric])



--- Problematic entries in: streams ---
574    BPM110KeyAModeMajorDanceability53Valence75Ener...
Name: streams, dtype: object

--- Problematic entries in: in_deezer_playlists ---
Series([], Name: in_deezer_playlists, dtype: object)

--- Problematic entries in: in_shazam_charts ---
Series([], Name: in_shazam_charts, dtype: object)


In [9]:
# The `streams` value at index label 574 is a run-together dump of other
# fields ("BPM110KeyAModeMajor...") rather than a stream count, so the whole
# row is discarded.
MALFORMED_ROW_INDEX = 574

# `errors="ignore"` keeps the cell re-runnable: on a second run the label is
# already gone and the frame is returned unchanged instead of raising KeyError.
new_df = new_df.drop(index=MALFORMED_ROW_INDEX, errors="ignore")


In [11]:
# Text-typed count columns to turn into floats.
cols_to_fix = ['streams', 'in_deezer_playlists', 'in_shazam_charts']

# Strip the thousands separators and cast each column, then swap all of the
# converted columns into the frame in a single step.
converted_columns = {
    column_name: new_df[column_name].str.replace(',', '', regex=False).astype(float)
    for column_name in cols_to_fix
}
new_df = new_df.assign(**converted_columns)


In [18]:
# Columns removed from the working frame, in place.
cols_to_drop = [
    # free-text identifiers
    'track_name', 'artist(s)_name',
    # raw columns that now have a `_yj` counterpart
    'artist_count', 'released_year',
    # release month / day
    'released_month', 'released_day',
    # remaining raw columns that now have a `_yj` counterpart
    'in_spotify_playlists', 'in_spotify_charts',
    'in_apple_playlists', 'in_apple_charts',
    'in_deezer_charts',
    'acousticness_%', 'liveness_%', 'speechiness_%',
]

new_df.drop(cols_to_drop, axis=1, inplace=True)



In [19]:
# Re-check column dtypes and non-null counts after the raw columns were dropped.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 816 entries, 0 to 952
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   streams                  816 non-null    float64
 1   in_deezer_playlists      816 non-null    float64
 2   in_shazam_charts         816 non-null    float64
 3   bpm                      816 non-null    int64  
 4   key                      816 non-null    object 
 5   mode                     816 non-null    object 
 6   danceability_%           816 non-null    int64  
 7   valence_%                816 non-null    int64  
 8   energy_%                 816 non-null    int64  
 9   artist_count_yj          816 non-null    float64
 10  released_year_yj         816 non-null    float64
 11  in_spotify_playlists_yj  816 non-null    float64
 12  in_spotify_charts_yj     816 non-null    float64
 13  in_apple_playlists_yj    816 non-null    float64
 14  in_apple_charts_yj       816 no

In [21]:
# Preview the first ten rows of the remaining columns.
new_df.head(10)

Unnamed: 0,streams,in_deezer_playlists,in_shazam_charts,bpm,key,mode,danceability_%,valence_%,energy_%,artist_count_yj,released_year_yj,in_spotify_playlists_yj,in_spotify_charts_yj,in_apple_playlists_yj,in_apple_charts_yj,in_deezer_charts_yj,acousticness_%_yj,liveness_%_yj,speechiness_%_yj
0,141381703.0,45.0,826.0,125,B,Major,80,89,83,1.059796,0.540029,-1.004174,1.883885,0.222782,2.133135,1.641666,0.537989,-1.052816,-0.8179
1,133716286.0,58.0,382.0,92,C#,Major,71,61,74,-0.798629,0.540029,-0.248818,1.456137,0.310832,1.277389,1.698506,-0.624832,-0.599916,-0.8179
2,140003974.0,91.0,949.0,138,F,Major,51,32,53,-0.798629,0.540029,-0.289824,1.79105,0.877002,1.83591,1.698506,0.010411,1.210881,-0.111764
3,800840817.0,125.0,548.0,170,A,Major,55,58,72,-0.798629,-0.061705,1.010627,1.746352,1.064202,1.83591,1.674619,-0.321895,-0.416335,1.03811
4,303236322.0,87.0,425.0,144,A,Minor,65,23,80,-0.798629,0.540029,0.323332,1.473283,0.778858,1.334651,1.708134,-0.142824,-0.416335,-0.111764
5,183706234.0,88.0,946.0,141,C#,Major,92,66,58,1.059796,0.540029,0.051187,1.711167,0.585691,1.870384,1.724073,0.101913,-1.052816,1.42432
6,725980112.0,43.0,418.0,148,F,Minor,67,83,76,1.059796,0.540029,0.312915,1.473283,0.039144,1.920788,1.687448,0.975837,-1.052816,-1.391334
7,58149378.0,30.0,194.0,100,F,Major,67,26,71,-0.798629,0.540029,-0.806074,1.409338,-0.19231,0.928613,1.687448,0.709478,-0.416335,-0.8179
8,95217315.0,48.0,953.0,130,C#,Minor,85,22,62,-0.798629,0.540029,-0.47577,1.67625,0.493425,1.853237,1.659567,-0.258707,1.079081,0.469863
9,553634067.0,66.0,339.0,170,D,Minor,81,56,48,1.059796,0.540029,0.278713,1.419191,0.327481,1.13721,1.687448,0.186603,-1.052816,1.626032


In [22]:
# Integer-encode the two categorical columns.
#
# The lookups go through `dict.get(label, label)` instead of passing the dict
# straight to `Series.map`: a dict-based map turns every value that is not a
# key into NaN, so running this cell a second time (when the columns already
# hold integers) would silently wipe both columns. With the fallback, a re-run
# is a no-op and any label missing from the mapping stays visible as-is.

# Binary encoding for 'mode'
mode_mapping = {'Major': 1, 'Minor': 0}
new_df['mode'] = new_df['mode'].map(lambda label: mode_mapping.get(label, label))

# Ordinal encoding for 'key' (pitch classes C..B -> 0..11)
# NOTE(review): pitch classes are cyclic (B sits next to C), so this integer
# order may mislead distance-based models; one-hot encoding is an alternative.
key_mapping = {'C': 0, 'C#': 1, 'D': 2, 'D#': 3, 'E': 4,
               'F': 5, 'F#': 6, 'G': 7, 'G#': 8, 'A': 9,
               'A#': 10, 'B': 11}
new_df['key'] = new_df['key'].map(lambda label: key_mapping.get(label, label))


In [23]:
# Spot-check the encoded `key` and `mode` columns.
new_df[['key', 'mode']].head()


Unnamed: 0,key,mode
0,11,1
1,1,1
2,5,1
3,9,1
4,9,0
