In [1]:
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.feature_selection import VarianceThreshold, RFE
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score

In [2]:
# Load the music dataset into a DataFrame. The path is relative, so it is
# resolved against the directory the notebook kernel is running in.
df = pd.read_csv('data/music_data.csv')

In [3]:
# Print the column names, non-null counts and dtypes of df.
# memory_usage='deep' makes pandas inspect object (string) columns to report
# their actual memory consumption rather than a shallow estimate.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34995 entries, 0 to 34994
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             34995 non-null  int64  
 1   age                 34995 non-null  int64  
 2   education           34995 non-null  object 
 3   gender              34995 non-null  object 
 4   name                34995 non-null  object 
 5   country             34995 non-null  object 
 6   music               34995 non-null  object 
 7   artist_name         34995 non-null  object 
 8   featured_artists    34995 non-null  object 
 9   genre               34995 non-null  object 
 10  plays               34995 non-null  int64  
 11  artiste_popularity  34995 non-null  float64
 12  audio_popularity    34995 non-null  float64
 13  music_acousticness  34995 non-null  float64
 14  danceability        34995 non-null  float64
 15  energy              34995 non-null  float64
 16  key 

In [4]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Separate the target column ('music') from the features, leaving df itself untouched
X = df.drop(columns=['music'])
y = df['music'].copy()

# Columns stored as strings / categoricals are one-hot encoded; every other column is scaled
non_numeric_dtypes = ['object', 'category']
categorical_cols = X.select_dtypes(include=non_numeric_dtypes).columns
numeric_cols = X.select_dtypes(exclude=non_numeric_dtypes).columns

# Column-wise preprocessing: StandardScaler on the numeric columns,
# OneHotEncoder on the categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(), categorical_cols),
    ]
)

# Fit the preprocessing on the feature table and build the model matrix
X_transformed = preprocessor.fit_transform(X)

# Cross-validation to find optimal number of components
def pca_cross_val_score(n_components):
    """Return the mean 5-fold cross-validated score of PCA + logistic regression.

    PCA and the classifier are wrapped in a single Pipeline so that
    ``cross_val_score`` re-fits PCA on the training rows of every fold.
    Fitting PCA once on all of ``X_transformed`` beforehand would let the
    held-out rows influence the projection and bias the score upwards.

    Reads the module-level ``X_transformed`` (feature matrix) and ``y``
    (target) defined earlier in the notebook.

    Parameters
    ----------
    n_components : int
        Number of principal components to keep. Must not exceed
        min(rows in a training fold, number of features).

    Returns
    -------
    float
        Mean of the five per-fold scores, using the classifier's default
        scorer (accuracy for LogisticRegression).
    """
    model = Pipeline(steps=[
        # random_state pins the result when PCA picks its randomized solver
        ('pca', PCA(n_components=n_components, random_state=42)),
        ('clf', LogisticRegression()),
    ])
    return cross_val_score(model, X_transformed, y, cv=5).mean()

# Component counts to evaluate. PCA rejects n_components larger than
# min(n_samples, n_features), and after one-hot encoding the feature count can
# exceed the row count, so the sweep is capped instead of running to shape[1].
# pca_cross_val_score uses 5-fold CV; capping at the smallest possible training
# fold (n_samples - ceil(n_samples / 5)) keeps every count valid whether PCA is
# fitted on all rows or only on a training fold.
n_samples, n_features = X_transformed.shape
smallest_train_fold = n_samples - (n_samples + 4) // 5
component_counts = range(1, min(n_features, smallest_train_fold) + 1)

scores = [pca_cross_val_score(k) for k in component_counts]

# Plotting cross-validation scores against the same component counts that were evaluated
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_counts, scores, marker='o', linestyle='--')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cross-Validated Score')
ax.set_title('Cross-Validation Score vs. Number of Components')
ax.grid()
plt.show()




In [11]:
# import pandas as pd
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder, StandardScaler

# # Assuming df is your DataFrame and 'music' is the target variable
# X = df.drop('music', axis=1)
# y = df['music']

# # Identify categorical columns (example method, adjust based on your data)
# categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# # Define a transformer that applies OneHotEncoder to the categorical columns and StandardScaler to the numeric columns
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), X.select_dtypes(exclude=['object', 'category']).columns),
#         ('cat', OneHotEncoder(), categorical_cols)
#     ])

# # Apply the transformations
# X_transformed = preprocessor.fit_transform(X)

# # Proceed with PCA on X_transformed
# pca = PCA().fit(X_transformed)

# # Plotting the cumulative variance explained
# plt.figure(figsize=(10, 6))
# plt.plot(np.cumsum(pca.explained_variance_ratio_), marker='o', linestyle='--')
# plt.xlabel('Number of Components')
# plt.ylabel('Cumulative Explained Variance')
# plt.title('Explained Variance vs. Number of Components')
# plt.grid()
# plt.show()


In [8]:
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import silhouette_score

# # Identify numerical and categorical columns based on dtypes
# numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
# categorical_cols = ["education", "gender", "country", "explicit"]

# # Define the preprocessing for numerical features: scaling and handling skewness
# numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler()),
#     ('variance_threshold', VarianceThreshold(threshold=0.1)),
#     ('pca', PCA(n_components=0.95))  # Adjust based on explained variance
# ])

# # Define the preprocessing for categorical features: encoding
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# # Combine preprocessing steps
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, numerical_cols),
#         ('cat', categorical_transformer, categorical_cols)
#     ])

# # Define the logistic regression model for RFE
# logistic_model = LogisticRegression(max_iter=500)  # Increase max_iter for convergence

# # Create a pipeline that does preprocessing and RFE, then clustering
# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('rfe', RFE(logistic_model, n_features_to_select=10)),  # Adjust n_features_to_select as needed
#     ('clusterer', KMeans(n_clusters=10, random_state=42))  # Adjust n_clusters as needed
# ])

# # Split the data into features and target
# X = df.drop('music', axis=1)
# y = df['music']

# # Split the data into training and testing sets
# X_train, X_test, y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=42)

# # Fit the pipeline on the training data
# pipeline.fit(X_train,y_train)

# # Function to determine the optimal number of clusters
# def optimal_number_of_clusters(X, max_k):
#     iters = range(2, max_k+1, 2)
#     s = []
#     for k in iters:
#         kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
#         s.append(silhouette_score(X, kmeans.labels_))

#     f, ax = plt.subplots(1, 1)
#     ax.plot(iters, s, marker='o')
#     ax.set_xlabel('Cluster Centers')
#     ax.set_xticks(iters)
#     ax.set_xticklabels(iters)
#     ax.set_ylabel('Silhouette Score')
#     ax.set_title('Silhouette Scores for Different Cluster Centers')
#     plt.show()

# # Apply preprocessing only to the data for optimal cluster calculation
# X_preprocessed = preprocessor.fit_transform(X_train,y_train)
# optimal_number_of_clusters(X_preprocessed, 20)  # Adjust max_k as needed

# # Re-fit the pipeline with the optimal number of clusters found
# optimal_clusters = 10  # Update this based on silhouette score plot
# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('rfe', RFE(logistic_model, n_features_to_select=10)),  # Adjust n_features_to_select as needed
#     ('clusterer', KMeans(n_clusters=optimal_clusters, random_state=42))
# ])

# # Fit the pipeline on the training data again with optimal clusters
# pipeline.fit(X_train,y_train)

# # Transform the test data and predict the clusters
# test_clusters = pipeline.predict(X_test)

# # Evaluate clustering with silhouette score on the test data
# X_test_preprocessed = preprocessor.transform(X_test)
# test_silhouette_score = silhouette_score(X_test_preprocessed, test_clusters)
# print(f'Silhouette Score on Test Data: {test_silhouette_score}')


TypeError: RFE.fit() missing 1 required positional argument: 'y'