In [27]:
import matplotlib
import pandas as pd
import torch
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from torch import nn

import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn import metrics
import random
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


dtype_dict = {
    'song_id': str,
    'song_name': str,
    'song_popularity': float,
    'explicit': bool,
    'song_type': str,
    'track_number': float,
    'num_artists': float,
    'num_available_markets': float,
    'release_date': str,  # Assuming it's a date, change to appropriate type if needed
    'duration_ms': float,
    'key': float,
    'mode': float,
    'time_signature': float,
    'acousticness': float,
    'danceability': float,
    'energy': float,
    'instrumentalness': float,
    'liveness': float,
    'loudness': float,
    'speechiness': float,
    'valence': float,
    'tempo': float,
    'hit': float,
    'nr_artists': float,
    'artist1_id': str,  # evtl ersätzen mit eintweder haswert oder count
    'artist2_id': str,
    'eigencentrality_x': float,
    'name_x': str,
    'eccentricity_x': float,
    'degree_x': float,
    'clustering_x': float,
    'closnesscentrality_x': float,
    'weighted degree_x': float,
    'betweenesscentrality_x': float,
    'Cluster_x': float,
    'eigencentrality_y': float,
    'name_y': str,
    'eccentricity_y': float,
    'degree_y': float,
    'clustering_y': float,
    'closnesscentrality_y': float,
    'weighted degree_y': float,
    'betweenesscentrality_y': float,
    'Cluster_y': float
}

data = pd.read_csv("data_superstar_v1_0.csv", delimiter=",", dtype=dtype_dict, na_values=[''])
data['date'] = pd.to_datetime(data['release_date'])
data.sort_values(by="date", inplace=True)

In [28]:
data

Unnamed: 0,song_id,song_name,song_popularity,explicit,song_type,track_number,num_artists,num_available_markets,release_date,duration_ms,...,superstar_v4_y,superstar_v5_x,superstar_v5_y,success_rate_x,success_rate_y,hits_in_past_x,hits_in_past_y,superstar_x,superstar_y,date
0,7kXUEJmfvRXbzxOC0pHQgb,I Can't Believe I'm Losing You,5.0,False,Solo,12.0,1.0,79.0,1995-01-01,162667.0,...,1.0,1.0,1.0,0.0,0.0,0,0,0,0,1995-01-01
3659,1qWiKzVrG0GQd1qbgKCsSF,I've Grown Accustomed To Her Face,14.0,False,Solo,2.0,1.0,79.0,1995-01-01,349120.0,...,1.0,1.0,1.0,0.0,0.0,0,0,0,0,1995-01-01
3658,1oNYa5jcL10TbXQSPRgbsk,Come Back To Me,16.0,False,Solo,5.0,1.0,2.0,1995-01-01,206093.0,...,1.0,1.0,1.0,0.0,0.0,0,0,0,0,1995-01-01
3657,7A131DrpfbWAfNJLrxikwf,Top Of The Stairs,26.0,False,Solo,4.0,1.0,2.0,1995-01-01,271307.0,...,1.0,1.0,1.0,0.0,0.0,0,0,0,0,1995-01-01
3656,4SRjOJm5AjO3fxXpZSTEBb,Never Crossed My Mind,20.0,False,Solo,3.0,1.0,2.0,1995-01-01,242027.0,...,1.0,1.0,1.0,0.0,0.0,0,0,0,0,1995-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
911001,1EeuFwQ7RhTNlPg69LGUxM,All the Kids on the Street,5.0,False,Solo,6.0,1.0,79.0,2019-08-23,192401.0,...,1.0,1.0,1.0,0.0,0.0,0,0,0,0,2019-08-23
911000,2O9CikWfjkYAgGCBws5Pl4,Baby I'll Give It to You,1.0,False,Solo,25.0,1.0,76.0,2019-08-23,194907.0,...,1.0,1.0,1.0,0.0,0.0,0,0,0,0,2019-08-23
911025,2CZ46u5NGXQFXke3PMCuAK,Today I Started Loving You Again,3.0,False,Solo,15.0,1.0,78.0,2019-08-23,172147.0,...,1.0,1.0,1.0,0.0,0.0,0,0,0,0,2019-08-23
911012,6zhg8TnVKi90ITDzFKwPr0,Swingtown - Alternate Version,26.0,False,Solo,5.0,1.0,79.0,2019-08-23,205707.0,...,1.0,1.0,1.0,0.0,0.0,0,0,0,0,2019-08-23


In [29]:
data.columns

Index(['song_id', 'song_name', 'song_popularity', 'explicit', 'song_type',
       'track_number', 'num_artists', 'num_available_markets', 'release_date',
       'duration_ms', 'key', 'mode', 'time_signature', 'acousticness',
       'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness',
       'speechiness', 'valence', 'tempo', 'hit', 'artist1_id', 'artist2_id',
       'name_x', 'betweenesscentrality_x', 'closnesscentrality_x',
       'clustering_x', 'degree_x', 'eccentricity_x', 'eigencentrality_x',
       'weighted degree_x', 'pagerank_x', 'Cluster_x', 'profile_x', 'name_y',
       'betweenesscentrality_y', 'closnesscentrality_y', 'clustering_y',
       'degree_y', 'eccentricity_y', 'eigencentrality_y', 'weighted degree_y',
       'pagerank_y', 'Cluster_y', 'profile_y', 'artist1_num', 'artist2_num',
       'years_on_charts', 'superstar_v1_x', 'superstar_v1_y', 'superstar_v2_x',
       'superstar_v2_y', 'superstar_v3_x', 'superstar_v3_y', 'superstar_v4_x',
       'supers

In [30]:

# List of columns to keep
columns_to_keep = ['explicit', 'track_number', 'num_artists', 'num_available_markets', 'release_date',
                   'duration_ms', 'key', 'mode', 'time_signature', 'acousticness',
                   'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness',
                   'speechiness', 'valence', 'tempo', "date", "years_on_charts",               #removeyoc
                   'hit', "artist1_num", "artist2_num", 'betweenesscentrality_y', 'closnesscentrality_y', 'clustering_y', 'Cluster_y',
                   'eccentricity_y', 'eigencentrality_y', 'weighted degree_y', "profile_y"]  # , "superstar_v1_x", "superstar_x"]                              #Collaboration Profile == CLuster????
#  'release_date', 'betweenesscentrality_x', 'closnesscentrality_x', 'clustering_x', 'Cluster_x',
# 'eccentricity_x', 'eigencentrality_x', 'weighted degree_x', "profile_x",
# 'betweenesscentrality_y', 'closnesscentrality_y', 'clustering_y', 'Cluster_y',
# 'eccentricity_y', 'eigencentrality_y', 'weighted degree_y', "profile_y", "hit"]                              #Collaboration Profile == CLuster????

# Drop columns not in the list
data = data[columns_to_keep]
data

Unnamed: 0,explicit,track_number,num_artists,num_available_markets,release_date,duration_ms,key,mode,time_signature,acousticness,...,artist1_num,artist2_num,betweenesscentrality_y,closnesscentrality_y,clustering_y,Cluster_y,eccentricity_y,eigencentrality_y,weighted degree_y,profile_y
0,False,12.0,1.0,79.0,1995-01-01,162667.0,10.0,1.0,4.0,0.690000,...,16718.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
3659,False,2.0,1.0,79.0,1995-01-01,349120.0,1.0,0.0,4.0,0.297000,...,12974.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
3658,False,5.0,1.0,2.0,1995-01-01,206093.0,2.0,1.0,4.0,0.105000,...,1409.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
3657,False,4.0,1.0,2.0,1995-01-01,271307.0,7.0,1.0,4.0,0.027200,...,1409.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
3656,False,3.0,1.0,2.0,1995-01-01,242027.0,7.0,1.0,4.0,0.025800,...,1409.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
911001,False,6.0,1.0,79.0,2019-08-23,192401.0,2.0,1.0,4.0,0.000465,...,15672.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
911000,False,25.0,1.0,76.0,2019-08-23,194907.0,8.0,1.0,4.0,0.038800,...,20835.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
911025,False,15.0,1.0,78.0,2019-08-23,172147.0,1.0,1.0,4.0,0.727000,...,10227.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
911012,False,5.0,1.0,79.0,2019-08-23,205707.0,9.0,1.0,4.0,0.224000,...,3970.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match


In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 911027 entries, 0 to 911026
Data columns (total 31 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   explicit                911027 non-null  bool          
 1   track_number            911027 non-null  float64       
 2   num_artists             911027 non-null  float64       
 3   num_available_markets   911027 non-null  float64       
 4   release_date            911027 non-null  object        
 5   duration_ms             911027 non-null  float64       
 6   key                     911027 non-null  float64       
 7   mode                    911027 non-null  float64       
 8   time_signature          911027 non-null  float64       
 9   acousticness            911027 non-null  float64       
 10  danceability            911027 non-null  float64       
 11  energy                  911027 non-null  float64       
 12  instrumentalness        911027 non-

In [32]:
data.describe()

Unnamed: 0,track_number,num_artists,num_available_markets,duration_ms,key,mode,time_signature,acousticness,danceability,energy,...,hit,artist1_num,artist2_num,betweenesscentrality_y,closnesscentrality_y,clustering_y,Cluster_y,eccentricity_y,eigencentrality_y,weighted degree_y
count,911027.0,911027.0,911027.0,911027.0,911027.0,911027.0,911027.0,911027.0,911027.0,911027.0,...,911027.0,911027.0,911027.0,911027.0,911027.0,911027.0,911027.0,911027.0,911027.0,911027.0
mean,8.686955,1.082063,70.515338,229492.2,5.21087,0.705957,3.906517,0.327322,0.584527,0.55954,...,0.013127,13395.535816,1084.664435,0.003873,0.025505,0.002779,-0.847665,0.809033,0.010028,57.630872
min,1.0,1.0,0.0,1672.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0
25%,3.0,1.0,78.0,183500.0,2.0,0.0,4.0,0.0261,0.485,0.381,...,0.0,7040.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0
50%,7.0,1.0,79.0,220240.0,5.0,1.0,4.0,0.2,0.599,0.564,...,0.0,13278.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0
75%,12.0,1.0,79.0,259107.0,8.0,1.0,4.0,0.608,0.701,0.751,...,0.0,19732.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0
max,125.0,2.0,79.0,6032273.0,11.0,1.0,5.0,0.996,0.992,1.0,...,1.0,26698.0,26700.0,1.0,1.0,1.0,5.0,21.0,0.850651,37432.0
std,6.891276,0.274461,22.634266,95846.08,3.534452,0.45444,0.404534,0.329362,0.164942,0.237606,...,0.113818,7701.751995,4252.894432,0.056594,0.113227,0.035996,0.628339,2.998431,0.073218,981.946546


In [33]:

def find_min_max(df):
    # Select only numeric columns
    numeric_cols = df.select_dtypes(include=['number'])

    # Find max and min values for each numeric column
    min_max_values = {}
    for col in numeric_cols.columns:
        min_value = df[col].min()
        max_value = df[col].max()
        min_max_values[col] = {'min': min_value, 'max': max_value}

    return min_max_values


min_max_val = find_min_max(data)

y = data["hit"]
X = data.drop(columns=["hit"])
X, y, min_max_val

(        explicit  track_number  num_artists  num_available_markets  \
 0          False          12.0          1.0                   79.0   
 3659       False           2.0          1.0                   79.0   
 3658       False           5.0          1.0                    2.0   
 3657       False           4.0          1.0                    2.0   
 3656       False           3.0          1.0                    2.0   
 ...          ...           ...          ...                    ...   
 911001     False           6.0          1.0                   79.0   
 911000     False          25.0          1.0                   76.0   
 911025     False          15.0          1.0                   78.0   
 911012     False           5.0          1.0                   79.0   
 911026     False          27.0          1.0                   79.0   
 
        release_date  duration_ms   key  mode  time_signature  acousticness  \
 0        1995-01-01     162667.0  10.0   1.0             4.0      

In [34]:

def preprocess(df, min_max_values, exclude_cols=None):
    missing_numerical = df.select_dtypes(include=['number']).isnull().sum()
    # Fill missing values with mean for each numeric attribute
    imputer = SimpleImputer(strategy='mean')
    df_filled = df.copy()
    for col in missing_numerical.index:
        if missing_numerical[col] > 0:
            df_filled[col] = imputer.fit_transform(df[[col]])

    # Normalize numerical features into [0, 1] range with MinMaxScaler
    if exclude_cols:
        numerical_cols = df_filled.select_dtypes(include=['number']).columns.difference(exclude_cols)
    else:
        numerical_cols = df_filled.select_dtypes(include=['number']).columns

    # print("numerical columns:", numerical_cols)

    for column_name in numerical_cols:
        df_filled[column_name] = (df_filled[column_name] - min_max_values[column_name]["min"]) / (
                min_max_values[column_name]["max"] - min_max_values[column_name]["min"])

    df_normalized = pd.DataFrame(df_filled, columns=numerical_cols)

    # One-hot encode categorical features
    encoder = OneHotEncoder(handle_unknown='ignore')
    if exclude_cols:
        categorical_cols = df.select_dtypes(include=['object']).columns.difference(exclude_cols)
    else:
        categorical_cols = df.select_dtypes(include=['object']).columns
    df_encoded = encoder.fit_transform(df[categorical_cols])

    # print(categorical_cols)

    # Convert the sparse matrix to dense array
    df_encoded_dense = df_encoded.toarray()

    # Concatenate numerical and encoded categorical features
    df_processed = np.hstack([df_normalized.values, df_encoded_dense])

    return df_processed


# Assuming X is your feature dataset and y is your target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)#random_state=42), stratify=y_scaled, shuffle=True) # try to do with ordered by date results are terrible:(, ..collab prof is missing
#split_day = X["date"].iloc[-1] - pd.DateOffset(years=1)
#X_train = X[(X["date"] < split_day)].copy()

#X_test = X[(X["date"] >= split_day)].copy()
#sep_index = X_train.shape[0]
#y_train = y.iloc[:sep_index].copy()
#y_test = y.iloc[sep_index:].copy()

print("######TRAIN TEST SPLIT DONE######")

######TRAIN TEST SPLIT DONE######


In [35]:
jsklajsa

NameError: name 'jsklajsa' is not defined

In [42]:
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

In [43]:
X_train

Unnamed: 0,explicit,track_number,num_artists,num_available_markets,release_date,duration_ms,key,mode,time_signature,acousticness,...,artist1_num,artist2_num,betweenesscentrality_y,closnesscentrality_y,clustering_y,Cluster_y,eccentricity_y,eigencentrality_y,weighted degree_y,profile_y
0,False,12.0,1.0,79.0,1995-01-01,162667.0,10.0,1.0,4.0,0.690000,...,16718.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
1,False,2.0,1.0,79.0,1995-01-01,349120.0,1.0,0.0,4.0,0.297000,...,12974.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
2,False,5.0,1.0,2.0,1995-01-01,206093.0,2.0,1.0,4.0,0.105000,...,1409.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
3,False,4.0,1.0,2.0,1995-01-01,271307.0,7.0,1.0,4.0,0.027200,...,1409.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
4,False,3.0,1.0,2.0,1995-01-01,242027.0,7.0,1.0,4.0,0.025800,...,1409.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
683265,False,20.0,1.0,79.0,2015-08-19,149844.0,7.0,1.0,4.0,0.691000,...,8626.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
683266,True,2.0,1.0,79.0,2015-08-19,171102.0,8.0,0.0,4.0,0.008460,...,11095.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
683267,True,1.0,1.0,79.0,2015-08-19,171102.0,1.0,0.0,4.0,0.091900,...,11095.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
683268,False,3.0,1.0,79.0,2015-08-19,319671.0,9.0,0.0,4.0,0.019900,...,6092.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match


In [44]:
y_train[(y_train == 1)]

700       1.0
706       1.0
730       1.0
737       1.0
753       1.0
         ... 
681096    1.0
681305    1.0
681621    1.0
681984    1.0
683156    1.0
Name: hit, Length: 9833, dtype: float64

In [45]:
X_train_c = X_train.iloc[700: 700+10]
X_train_c

Unnamed: 0,explicit,track_number,num_artists,num_available_markets,release_date,duration_ms,key,mode,time_signature,acousticness,...,artist1_num,artist2_num,betweenesscentrality_y,closnesscentrality_y,clustering_y,Cluster_y,eccentricity_y,eigencentrality_y,weighted degree_y,profile_y
700,False,8.0,1.0,76.0,1995-01-01,130067.0,4.0,1.0,4.0,0.514,...,11790.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
701,False,10.0,1.0,79.0,1995-01-01,309973.0,7.0,1.0,4.0,0.00391,...,54.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
702,False,5.0,2.0,0.0,1995-01-01,552733.0,4.0,1.0,4.0,0.096,...,11522.0,24396.0,0.000188,0.190583,0.0,0.0,11.0,0.000467,4.0,1A 2A 3A 4A
703,False,2.0,1.0,79.0,1995-01-01,176000.0,0.0,1.0,4.0,0.174,...,17210.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
704,False,6.0,1.0,79.0,1995-01-01,503973.0,5.0,0.0,4.0,0.00214,...,17210.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
705,False,34.0,1.0,78.0,1995-01-01,250067.0,0.0,1.0,4.0,0.0433,...,9759.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
706,False,19.0,1.0,59.0,1995-01-01,196173.0,9.0,1.0,4.0,0.384,...,16641.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
707,False,10.0,1.0,79.0,1995-01-01,290627.0,5.0,1.0,4.0,0.0245,...,17210.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
708,False,5.0,1.0,79.0,1995-01-01,137933.0,10.0,1.0,4.0,0.499,...,10232.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,No Match
709,False,6.0,2.0,79.0,1995-01-01,145293.0,8.0,1.0,4.0,0.656,...,10232.0,394.0,0.000693,0.162914,0.066667,0.0,12.0,1.3e-05,128.0,1A 2A 3A 4A


In [46]:
y_train_c = y_train.iloc[700: 700+10]
y_train_c

700    1.0
701    0.0
702    0.0
703    0.0
704    0.0
705    0.0
706    1.0
707    0.0
708    0.0
709    0.0
Name: hit, dtype: float64

In [None]:
memmemme

In [None]:

def upsampling(X_train, y_train):
    # Convert y_train to a numpy array
    # y_train = y_train.to_numpy()
    X_train = X_train.to_numpy()

    # Count the number of samples in each class
    class_counts = np.bincount(y_train.flatten().astype(int))
    print("nr of samples in each class: ", class_counts)
    max_count = class_counts.max()

    # Find indices of positive instances
    positive_indices = np.where(y_train.flatten() == 1)[0]
    print("positive indices: ", positive_indices)
    
    # Calculate how many times to duplicate positive samples
    difference = max_count - class_counts[1]

    # Randomly select indices from positive instances
    random_indices = np.random.choice(positive_indices, size=difference, replace=True)
    
    # Get rows corresponding to positive instances and concat them
    rows_to_duplicate = np.vstack([X_train[idx] for idx in random_indices])

    # Stack duplicated rows with the original matrix
    X_train_upsampled = np.vstack([X_train, rows_to_duplicate])

    # Create an array of shape (x, 1) with all elements as 1
    rows_of_ones = np.ones((difference, 1))

    # Append rows_of_ones to original_array
    y_train_upsampled = np.concatenate((y_train, rows_of_ones), axis=0)

    print("######UPSAMPLING DONE######")
    return X_train_upsampled, y_train_upsampled


y_reshaped = y_train.values.reshape(-1, 1)
# print(X_train.shape)
# print(y_reshaped.shape)
X_train_upsampled, y_train_upsampled = upsampling(X_train=X_train, y_train=y_reshaped)
# Assuming X_train, X_test, y_train, y_test are your training and testing data
# print("X_train_up type:", type(X_train_upsampled))
# print("y_train_up type:", type(y_train_upsampled))
# print("X_train_up shape:", X_train_upsampled.shape)
# print("y_train_up shape:", y_train_upsampled.shape)
# print(type(X_test))
# print(type(y_test))
X_train_upsampled

In [None]:

# Count occurrences of each unique value
unique_values, counts = np.unique(y_train_upsampled, return_counts=True)

# Create a dictionary to store the counts of each value
value_counts = dict(zip(unique_values, counts))

print("Value counts:", value_counts)

# Convert arrays to DataFrames
X_train_upsampled_df = pd.DataFrame(X_train_upsampled, columns=X_train.columns)
y_train_upsampled_df = pd.DataFrame(y_train_upsampled, columns=['hit'])

# Concatenate y_train_upsampled as an extra column to X_train_upsampled_df
X_train_upsampled_with_y = pd.concat([X_train_upsampled_df, y_train_upsampled_df], axis=1)
X_train_upsampled_with_y['date'] = pd.to_datetime(X_train_upsampled_with_y['release_date'])
X_train_upsampled_with_y.sort_values(by="date", inplace=True)
X_train_upsampled_with_y.drop(columns=["release_date", "date"], inplace=True)

# print(X_train_upsampled_with_y.head())
# prepro:
y_train_upsampled_ordered = X_train_upsampled_with_y["hit"]
X_train_upsampled_ordered = X_train_upsampled_with_y.drop(columns="hit")

In [None]:
# Define data types for each column
dtype_dict = {
    'explicit': bool,
    'track_number': float,
    'num_artists': float,
    'num_available_markets': float,
    'duration_ms': float,
    'key': float,
    'mode': float,
    'time_signature': float,
    'acousticness': float,
    'danceability': float,
    'energy': float,
    'instrumentalness': float,
    'liveness': float,
    'loudness': float,
    'speechiness': float,
    'valence': float,
    'tempo': float,
    'years_on_charts': float,
    "artist1_num": float,
    "artist2_num": float,
    'betweenesscentrality_y': float,
    'closnesscentrality_y': float,
    'clustering_y': float,
    'Cluster_y': str,
    'eccentricity_y': float,
    'eigencentrality_y': float,
    'weighted degree_y': float,
    'profile_y': str,
}


# Use astype method to cast columns to the specified data types
X_train_upsampled_ordered = X_train_upsampled_ordered.astype(dtype_dict)
X_test.drop(columns=["release_date", "date"], inplace=True)
X_test = X_test.astype(dtype_dict)

y_train_upsampled_ordered_reshaped = y_train_upsampled_ordered.values.reshape(-1, 1)
y_test_reshaped = y_test.values.reshape(-1, 1)

sep_index = X_train_upsampled_ordered.shape[0]
concatenated_df = pd.concat([X_train_upsampled_ordered, X_test])
print(concatenated_df.columns)
data_prepro = preprocess(concatenated_df, min_max_val)
X_train_upsampled_prepro = data_prepro[:sep_index]
X_test_prepro = data_prepro[sep_index:]

print("######PREPROCESSING DONE######")