In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
# from imblearn.over_sampling import SMOTE
# synthetic minority oversampling technique for imbalanced data


In [6]:
df = pd.read_csv('data/music_data.csv')

In [7]:

numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = ["education", "gender", "country", "explicit"]
text_features = ['music','artist_name', 'featured_artists']


# Remove the target variable from numerical features
numerical_cols.remove('ratings')  # Assuming 'plays' is the target variable


# Define preprocessing for features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

text_transformer = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=500)),
    ('svd', TruncatedSVD(n_components=80))  # Reducing dimensions
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        # ('music_tfidf', text_transformer,'music'),
        # ('artist_tfidf', text_transformer, 'artist_name'),
        # ('featured_artists_tfidf', text_transformer, 'featured_artists')
    
    ]
)

X = df.drop(["music","ratings"], axis=1)
y = df['ratings']

# Apply preprocessing to features
X_processed = preprocessor.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('rfe', RFE(estimator=RandomForestRegressor(n_estimators=500, random_state=42), n_features_to_select=10)),
    ('rf', RandomForestRegressor(n_estimators=500, random_state=42))
])

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
error = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {error}')


feature_importances = pipeline.named_steps['rf'].feature_importances_

feature_importances_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)

print(feature_importances_df)

plt.figure(figsize=(10, 6))
plt.barh(range(len(feature_importances_df)), feature_importances_df['Importance'])
plt.yticks(range(len(feature_importances_df)), feature_importances_df['Feature'])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importances')
plt.show()

ValueError: Specifying the columns using strings is only supported for dataframes.

In [6]:
X_train_df = pd.DataFrame(X_train)  # Convert numpy array back to DataFrame if needed
X_train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,0.238734,0.313433,0.001911,0.043934,0.616826,0.601308,0.905902,0.363636,0.052688,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
1,0.263706,0.343284,0.030009,0.147427,0.838854,0.80737,0.141007,0.272727,0.957183,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,0.447297,0.014925,0.001366,0.168433,0.651925,0.850641,0.930854,0.454545,0.613005,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,0.015971,0.686567,0.001494,0.351175,0.790788,0.706059,0.439479,0.454545,0.947426,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0.086525,0.089552,0.000772,0.244357,0.527193,0.272543,0.622771,0.090909,0.942431,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0


## PCA ANALYSIS

In [8]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Assuming df is your DataFrame and 'music' is the target variable
X = df.drop('music', axis=1)
y = df['music']

# Identify categorical columns (example method, adjust based on your data)
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Define a transformer that applies OneHotEncoder to the categorical columns and StandardScaler to the numeric columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.select_dtypes(exclude=['object', 'category']).columns),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

# Apply the transformations
X_transformed = preprocessor.fit_transform(X)

# Proceed with PCA on X_transformed
pca = PCA().fit(X_transformed)

# Plotting the cumulative variance explained
plt.figure(figsize=(10, 6))
plt.plot(np.cumsum(pca.explained_variance_ratio_), marker='o', linestyle='--')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance vs. Number of Components')
plt.grid()
plt.show()


In [9]:
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import silhouette_score

# # Identify numerical and categorical columns based on dtypes
# numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
# categorical_cols = ["education", "gender", "country", "explicit"]

# # Define the preprocessing for numerical features: scaling and handling skewness
# numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler()),
#     ('variance_threshold', VarianceThreshold(threshold=0.1)),
#     ('pca', PCA(n_components=0.95))  # Adjust based on explained variance
# ])

# # Define the preprocessing for categorical features: encoding
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# # Combine preprocessing steps
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, numerical_cols),
#         ('cat', categorical_transformer, categorical_cols)
#     ])

# # Define the logistic regression model for RFE
# logistic_model = LogisticRegression(max_iter=500)  # Increase max_iter for convergence

# # Create a pipeline that does preprocessing and RFE, then clustering
# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('rfe', RFE(logistic_model, n_features_to_select=10)),  # Adjust n_features_to_select as needed
#     ('clusterer', KMeans(n_clusters=10, random_state=42))  # Adjust n_clusters as needed
# ])

# # Split the data into features and target
# X = df.drop('music', axis=1)
# y = df['music']

# # Split the data into training and testing sets
# X_train, X_test, y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=42)

# # Fit the pipeline on the training data
# pipeline.fit(X_train,y_train)

# # Function to determine the optimal number of clusters
# def optimal_number_of_clusters(X, max_k):
#     iters = range(2, max_k+1, 2)
#     s = []
#     for k in iters:
#         kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
#         s.append(silhouette_score(X, kmeans.labels_))

#     f, ax = plt.subplots(1, 1)
#     ax.plot(iters, s, marker='o')
#     ax.set_xlabel('Cluster Centers')
#     ax.set_xticks(iters)
#     ax.set_xticklabels(iters)
#     ax.set_ylabel('Silhouette Score')
#     ax.set_title('Silhouette Scores for Different Cluster Centers')
#     plt.show()

# # Apply preprocessing only to the data for optimal cluster calculation
# X_preprocessed = preprocessor.fit_transform(X_train,y_train)
# optimal_number_of_clusters(X_preprocessed, 20)  # Adjust max_k as needed

# # Re-fit the pipeline with the optimal number of clusters found
# optimal_clusters = 10  # Update this based on silhouette score plot
# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('rfe', RFE(logistic_model, n_features_to_select=10)),  # Adjust n_features_to_select as needed
#     ('clusterer', KMeans(n_clusters=optimal_clusters, random_state=42))
# ])

# # Fit the pipeline on the training data again with optimal clusters
# pipeline.fit(X_train,y_train)

# # Transform the test data and predict the clusters
# test_clusters = pipeline.predict(test_df)

# # Evaluate clustering with silhouette score on the test data
# X_test_preprocessed = preprocessor.transform(test_df)
# test_silhouette_score = silhouette_score(X_test_preprocessed, test_clusters)
# print(f'Silhouette Score on Test Data: {test_silhouette_score}')
