In [65]:
#Import necessary libraries
import pandas as pd
import numpy as np

In [66]:
import os
os.chdir('C:/Users/Kshitij/Documents/Projects/ML/Music_Recommender')

# Then your existing imports
import time
import sys
import pandas as pd
import inspect
from src.recommendation_engine import search_songs_database

print("search_songs_database function signature:")
print(inspect.signature(search_songs_database))

search_songs_database function signature:
(query, limit=10)


In [67]:
# Load your dataset
file_path = "C:/Users/Kshitij/Documents/Projects/ML/Music_Recommender/data/MusicDataSet.csv"

try:
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully!")
    print(f"Dataset shape: {df.shape}")  # Shows (rows, columns)
    
    # Display first few rows
    print("\nFirst 5 rows:")
    display(df.head())
    
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
except Exception as e:
    print(f"Error: {e}")

Dataset loaded successfully!
Dataset shape: (114000, 21)

First 5 rows:


Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [68]:
df.dtypes

Unnamed: 0            int64
track_id             object
artists              object
album_name           object
track_name           object
popularity            int64
duration_ms           int64
explicit               bool
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
track_genre          object
dtype: object

In [69]:
#Basic Data Overview
print(" MUSIC DATASET - BASIC OVERVIEW")
print("=" * 60)

print(f"Dataset Shape: {df.shape} ({df.shape[0]:,} songs, {df.shape[1]} features)")
print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print("\n KEY COLUMNS IDENTIFIED:")
print(" Track Info: track_id, track_name, artists, album_name, track_genre")
print("Audio Features: danceability, energy, tempo, valence, acousticness, etc.")
print(" Metadata: popularity, duration_ms, explicit, key, mode")

 MUSIC DATASET - BASIC OVERVIEW
Dataset Shape: (114000, 21) (114,000 songs, 21 features)
Memory Usage: 49.66 MB

 KEY COLUMNS IDENTIFIED:
 Track Info: track_id, track_name, artists, album_name, track_genre
Audio Features: danceability, energy, tempo, valence, acousticness, etc.
 Metadata: popularity, duration_ms, explicit, key, mode


In [70]:
# Data Check
print(" DATA QUALITY CHECK")
print("=" * 50)

# Remove unnecessary column
if 'Unnamed: 0' in df.columns:
    df.drop('Unnamed: 0', axis=1, inplace=True)
    print(" Removed 'Unnamed: 0' column")

# Check for missing values
print("\nMissing Values:")
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100
missing_info = pd.DataFrame({
    'Missing Count': missing_data,
    'Missing Percentage': missing_percent
})
print(missing_info[missing_info['Missing Count'] > 0])

if missing_data.sum() == 0:
    print("No missing values found!")

# Check for duplicates
print(f"\nDuplicate Songs (by track_id): {df['track_id'].duplicated().sum()}")
print(f"Duplicate Rows (complete): {df.duplicated().sum()}")

# Check for empty strings
empty_strings = {}
for col in ['artists', 'album_name', 'track_name', 'track_genre']:
    empty_count = (df[col] == '').sum()
    if empty_count > 0:
        empty_strings[col] = empty_count

if empty_strings:
    print(f"\nEmpty strings found: {empty_strings}")
else:
    print("No empty strings in text columns")

 DATA QUALITY CHECK
 Removed 'Unnamed: 0' column

Missing Values:
            Missing Count  Missing Percentage
artists                 1            0.000877
album_name              1            0.000877
track_name              1            0.000877

Duplicate Songs (by track_id): 24259
Duplicate Rows (complete): 450
No empty strings in text columns


In [71]:
# Cell 5: Music Metadata Analysis
print(" MUSIC METADATA ANALYSIS")
print("=" * 50)

print(" Popularity Distribution:")
print(f"   Min: {df['popularity'].min()}, Max: {df['popularity'].max()}")
print(f"   Average: {df['popularity'].mean():.1f}, Median: {df['popularity'].median()}")

print(f"\n  Duration Analysis:")
df['duration_min'] = df['duration_ms'] / 60000
print(f"   Shortest: {df['duration_min'].min():.1f} min")
print(f"   Longest: {df['duration_min'].max():.1f} min") 
print(f"   Average: {df['duration_min'].mean():.1f} min")

print(f"\n Explicit Content: {df['explicit'].sum()} songs ({df['explicit'].mean()*100:.1f}%)")

print(f"\n Key Distribution (0=C, 1=C#, etc.):")
print(df['key'].value_counts().sort_index())

print(f"\n Mode (0=Minor, 1=Major):")
print(df['mode'].value_counts())

print(f"\n Time Signature:")
print(df['time_signature'].value_counts().sort_index())

 MUSIC METADATA ANALYSIS
 Popularity Distribution:
   Min: 0, Max: 100
   Average: 33.2, Median: 35.0

  Duration Analysis:
   Shortest: 0.0 min
   Longest: 87.3 min
   Average: 3.8 min

 Explicit Content: 9747 songs (8.6%)

 Key Distribution (0=C, 1=C#, etc.):
key
0     13061
1     10772
2     11644
3      3570
4      9008
5      9368
6      7921
7     13245
8      7360
9     11313
10     7456
11     9282
Name: count, dtype: int64

 Mode (0=Minor, 1=Major):
mode
1    72681
0    41319
Name: count, dtype: int64

 Time Signature:
time_signature
0       163
1       973
3      9195
4    101843
5      1826
Name: count, dtype: int64


In [72]:
# Cell 6: Audio Features Analysis
print(" AUDIO FEATURES ANALYSIS")
print("=" * 50)

audio_features = ['danceability', 'energy', 'loudness', 'speechiness', 
                  'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

print("Audio Features Summary:")
for feature in audio_features:
    print(f"\n {feature.capitalize()}:")
    print(f"   Range: {df[feature].min():.3f} to {df[feature].max():.3f}")
    print(f"   Mean: {df[feature].mean():.3f} | Std: {df[feature].std():.3f}")
    
# Check for potential outliers using IQR
print("\n Potential Outliers (using IQR):")
for feature in audio_features:
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[feature] < lower_bound) | (df[feature] > upper_bound)]
    if len(outliers) > 0:
        print(f"   {feature}: {len(outliers)} potential outliers")

 AUDIO FEATURES ANALYSIS
Audio Features Summary:

 Danceability:
   Range: 0.000 to 0.985
   Mean: 0.567 | Std: 0.174

 Energy:
   Range: 0.000 to 1.000
   Mean: 0.641 | Std: 0.252

 Loudness:
   Range: -49.531 to 4.532
   Mean: -8.259 | Std: 5.029

 Speechiness:
   Range: 0.000 to 0.965
   Mean: 0.085 | Std: 0.106

 Acousticness:
   Range: 0.000 to 0.996
   Mean: 0.315 | Std: 0.333

 Instrumentalness:
   Range: 0.000 to 1.000
   Mean: 0.156 | Std: 0.310

 Liveness:
   Range: 0.000 to 1.000
   Mean: 0.214 | Std: 0.190

 Valence:
   Range: 0.000 to 0.995
   Mean: 0.474 | Std: 0.259

 Tempo:
   Range: 0.000 to 243.372
   Mean: 122.148 | Std: 29.978

 Potential Outliers (using IQR):
   danceability: 620 potential outliers
   loudness: 6173 potential outliers
   speechiness: 13211 potential outliers
   instrumentalness: 25246 potential outliers
   liveness: 8642 potential outliers
   tempo: 617 potential outliers


In [73]:
# Cell 7: Genre and Artist Analysis
print(" GENRE AND ARTIST ANALYSIS")
print("=" * 50)

print(f" Total Unique Genres: {df['track_genre'].nunique()}")
print(f" Total Unique Artists: {df['artists'].nunique()}")
print(f" Total Unique Albums: {df['album_name'].nunique()}")

print("\n Top 10 Most Common Genres:")
top_genres = df['track_genre'].value_counts().head(10)
print(top_genres)

print("\nüëë Top 10 Artists with Most Songs:")
# Handle multiple artists in one field
all_artists = df['artists'].str.split(';').explode().str.strip()
top_artists = all_artists.value_counts().head(10)
print(top_artists)

print(f"\nüìÄ Average songs per genre: {df['track_genre'].value_counts().mean():.1f}")
print(f"üé§ Average songs per artist: {all_artists.value_counts().mean():.1f}")

 GENRE AND ARTIST ANALYSIS
 Total Unique Genres: 114
 Total Unique Artists: 31437
 Total Unique Albums: 46589

 Top 10 Most Common Genres:
track_genre
acoustic       1000
afrobeat       1000
alt-rock       1000
alternative    1000
ambient        1000
anime          1000
black-metal    1000
bluegrass      1000
blues          1000
brazil         1000
Name: count, dtype: int64

üëë Top 10 Artists with Most Songs:
artists
J Balvin                   510
Bad Bunny                  416
Daddy Yankee               375
Wolfgang Amadeus Mozart    354
Feid                       348
George Jones               343
Pritam                     333
Arijit Singh               305
ILLENIUM                   298
The Beatles                280
Name: count, dtype: int64

üìÄ Average songs per genre: 1000.0
üé§ Average songs per artist: 5.3


In [74]:
# Cell 8: Advanced Correlation Analysis
print(" AUDIO FEATURES CORRELATION ANALYSIS")
print("=" * 50)

# Select only numerical audio features for correlation
numerical_features = ['popularity', 'danceability', 'energy', 'loudness', 
                      'speechiness', 'acousticness', 'instrumentalness', 
                      'liveness', 'valence', 'tempo', 'duration_min']

correlation_matrix = df[numerical_features].corr()

print("Top Correlations (Absolute Value > 0.3):")
corr_pairs = correlation_matrix.unstack().sort_values(key=abs, ascending=False)
corr_pairs = corr_pairs[corr_pairs != 1.0]  # Remove self-correlations
print(corr_pairs[abs(corr_pairs) > 0.3].head(10))

 AUDIO FEATURES CORRELATION ANALYSIS
Top Correlations (Absolute Value > 0.3):
energy            loudness            0.761690
loudness          energy              0.761690
energy            acousticness       -0.733906
acousticness      energy             -0.733906
                  loudness           -0.589803
loudness          acousticness       -0.589803
danceability      valence             0.477341
valence           danceability        0.477341
instrumentalness  loudness           -0.433477
loudness          instrumentalness   -0.433477
dtype: float64


In [75]:
# Cell 9: Data Quality Summary & Recommendations
print(" DATA QUALITY SUMMARY & RECOMMENDATIONS")
print("=" * 60)

print(" DATASET STRENGTHS:")
print("‚Ä¢ Complete audio features for content-based recommendation")
print("‚Ä¢ Rich metadata (artists, genres, popularity)")
print("‚Ä¢ No missing values")
print("‚Ä¢ Good variety of genres and artists")

print("\n RECOMMENDATIONS FOR RECOMMENDATION SYSTEM:")
print("1. CONTENT-BASED: Use audio features (danceability, energy, tempo, etc.)")
print("2. GENRE-BASED: Use track_genre for genre-specific recommendations") 
print("3. POPULARITY: Use popularity score to boost relevant recommendations")
print("4. TEXT-BASED: Combine artists + track_name + genre for text similarity")

print("\n SUGGESTED FEATURES FOR RECOMMENDATION:")
feature_groups = {
    'Mood Features': ['danceability', 'energy', 'valence', 'tempo'],
    'Audio Properties': ['acousticness', 'instrumentalness', 'liveness', 'speechiness'],
    'Technical Features': ['loudness', 'key', 'mode', 'time_signature'],
    'Metadata': ['popularity', 'duration_min', 'explicit']
}

for group, features in feature_groups.items():
    print(f"   {group}: {', '.join(features)}")

 DATA QUALITY SUMMARY & RECOMMENDATIONS
 DATASET STRENGTHS:
‚Ä¢ Complete audio features for content-based recommendation
‚Ä¢ Rich metadata (artists, genres, popularity)
‚Ä¢ No missing values
‚Ä¢ Good variety of genres and artists

 RECOMMENDATIONS FOR RECOMMENDATION SYSTEM:
1. CONTENT-BASED: Use audio features (danceability, energy, tempo, etc.)
2. GENRE-BASED: Use track_genre for genre-specific recommendations
3. POPULARITY: Use popularity score to boost relevant recommendations
4. TEXT-BASED: Combine artists + track_name + genre for text similarity

 SUGGESTED FEATURES FOR RECOMMENDATION:
   Mood Features: danceability, energy, valence, tempo
   Audio Properties: acousticness, instrumentalness, liveness, speechiness
   Technical Features: loudness, key, mode, time_signature
   Metadata: popularity, duration_min, explicit


In [76]:
# Cell 10: Data Cleaning & Preparation
print(" DATA CLEANING & PREPARATION")
print("=" * 50)

# 1. Check for and remove duplicates
initial_count = len(df)
df = df.drop_duplicates(subset=['track_id'])  # Remove duplicate songs
final_count = len(df)
print(f"1. Removed {initial_count - final_count} duplicate songs")
print(f"   Final dataset: {final_count} unique songs")

# 2. Clean text data
print("\n2. Text Data Cleaning:")
df['artists_clean'] = df['artists'].str.strip()
df['track_name_clean'] = df['track_name'].str.strip()
df['album_name_clean'] = df['album_name'].str.strip()
df['track_genre_clean'] = df['track_genre'].str.strip()

print("    Cleaned text fields (removed extra spaces)")

# 3. Create useful derived features
print("\n3. Creating Derived Features:")
# Convert duration to minutes
df['duration_min'] = df['duration_ms'] / 60000

# Create decade from popularity (for grouping)
df['popularity_group'] = pd.cut(df['popularity'], bins=[0, 30, 60, 80, 100], 
                                labels=['Low', 'Medium', 'High', 'Very High'])

print("    Created duration_min and popularity_group")

print(f"\n Final dataset shape: {df.shape}")
display(df[['track_name_clean', 'artists_clean', 'track_genre_clean', 'popularity', 'duration_min']].head(3))

 DATA CLEANING & PREPARATION
1. Removed 24259 duplicate songs
   Final dataset: 89741 unique songs

2. Text Data Cleaning:
    Cleaned text fields (removed extra spaces)

3. Creating Derived Features:
    Created duration_min and popularity_group

 Final dataset shape: (89741, 26)


Unnamed: 0,track_name_clean,artists_clean,track_genre_clean,popularity,duration_min
0,Comedy,Gen Hoshino,acoustic,73,3.844433
1,Ghost - Acoustic,Ben Woodward,acoustic,55,2.4935
2,To Begin Again,Ingrid Michaelson;ZAYN,acoustic,57,3.513767


In [77]:
# Cell 11: Feature Analysis for Recommendation
print(" FEATURE ANALYSIS FOR RECOMMENDATION SYSTEM")
print("=" * 60)

# Define feature groups for different recommendation strategies
audio_features = ['danceability', 'energy', 'valence', 'tempo', 
                  'acousticness', 'instrumentalness', 'liveness', 'speechiness']

mood_features = ['danceability', 'energy', 'valence', 'tempo']
technical_features = ['loudness', 'key', 'mode', 'time_signature']

print("1. Audio Features Distribution (for Content-Based Filtering):")
for feature in audio_features:
    print(f"   {feature}: {df[feature].min():.3f} to {df[feature].max():.3f}")

print(f"\n2. Genre Distribution:")
print(f"   Total genres: {df['track_genre_clean'].nunique()}")
print(f"   Songs per genre: {df['track_genre_clean'].value_counts().mean():.1f} (avg)")

print(f"\n3. Artist Distribution:")
print(f"   Total artists: {df['artists_clean'].nunique()}")
print(f"   Songs per artist: {df['artists_clean'].value_counts().mean():.1f} (avg)")

print("\n RECOMMENDATION STRATEGIES AVAILABLE:")
strategies = {
    "Content-Based (Audio Features)": "Recommend similar songs based on audio characteristics",
    "Genre-Based": "Recommend songs from the same genre",
    "Popularity-Based": "Recommend trending/popular songs", 
    "Mood-Based": "Recommend songs with similar mood (danceability, energy, valence)"
}

for strategy, description in strategies.items():
    print(f"   ‚Ä¢ {strategy}: {description}")

 FEATURE ANALYSIS FOR RECOMMENDATION SYSTEM
1. Audio Features Distribution (for Content-Based Filtering):
   danceability: 0.000 to 0.985
   energy: 0.000 to 1.000
   valence: 0.000 to 0.995
   tempo: 0.000 to 243.372
   acousticness: 0.000 to 0.996
   instrumentalness: 0.000 to 1.000
   liveness: 0.000 to 1.000
   speechiness: 0.000 to 0.965

2. Genre Distribution:
   Total genres: 113
   Songs per genre: 794.2 (avg)

3. Artist Distribution:
   Total artists: 31437
   Songs per artist: 2.9 (avg)

 RECOMMENDATION STRATEGIES AVAILABLE:
   ‚Ä¢ Content-Based (Audio Features): Recommend similar songs based on audio characteristics
   ‚Ä¢ Genre-Based: Recommend songs from the same genre
   ‚Ä¢ Popularity-Based: Recommend trending/popular songs
   ‚Ä¢ Mood-Based: Recommend songs with similar mood (danceability, energy, valence)


In [78]:
# Cell 12: Feature Scaling Preparation
print(" FEATURE SCALING ANALYSIS")
print("=" * 50)

from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Analyze which features need scaling
print("Features that need scaling (wide range or different units):")
scaling_candidates = []

for feature in ['tempo', 'loudness', 'duration_min', 'duration_ms']:
    current_range = df[feature].max() - df[feature].min()
    print(f"   {feature}: range = {current_range:.2f}")
    scaling_candidates.append(feature)

print(f"\nAudio features (0-1 scale, may not need scaling):")
for feature in ['danceability', 'energy', 'speechiness', 'acousticness', 
                'instrumentalness', 'liveness', 'valence']:
    current_range = df[feature].max() - df[feature].min()
    print(f"   {feature}: range = {current_range:.3f}")

print("\n Scaling Strategy:")
print("   ‚Ä¢ MinMaxScaler for audio features (preserve 0-1 range)")
print("   ‚Ä¢ StandardScaler for tempo, loudness, duration")
print("   ‚Ä¢ No scaling needed for binary/categorical features")

 FEATURE SCALING ANALYSIS
Features that need scaling (wide range or different units):
   tempo: range = 243.37
   loudness: range = 54.06
   duration_min: range = 87.29
   duration_ms: range = 5237295.00

Audio features (0-1 scale, may not need scaling):
   danceability: range = 0.985
   energy: range = 1.000
   speechiness: range = 0.965
   acousticness: range = 0.996
   instrumentalness: range = 1.000
   liveness: range = 1.000
   valence: range = 0.995

 Scaling Strategy:
   ‚Ä¢ MinMaxScaler for audio features (preserve 0-1 range)
   ‚Ä¢ StandardScaler for tempo, loudness, duration
   ‚Ä¢ No scaling needed for binary/categorical features


In [79]:
# Cell 13: Create Combined Features for Text-Based Similarity
print(" CREATING TEXT-BASED SIMILARITY FEATURES")
print("=" * 50)

# Create combined text features for text-based recommendation
df['artist_genre_combined'] = df['artists_clean'] + " " + df['track_genre_clean']
df['song_info_combined'] = df['track_name_clean'] + " " + df['artists_clean'] + " " + df['track_genre_clean']

print(" Created combined text features:")
print(f"   ‚Ä¢ artist_genre_combined: artists + genre")
print(f"   ‚Ä¢ song_info_combined: track + artists + genre")

print("\nSample of combined features:")
sample_texts = df[['track_name_clean', 'artist_genre_combined', 'song_info_combined']].head(3)
for idx, row in sample_texts.iterrows():
    print(f"    {row['track_name_clean']}")
    print(f"     ‚Üí Artist+Genre: {row['artist_genre_combined'][:50]}...")
    print(f"     ‚Üí Full Info: {row['song_info_combined'][:60]}...")
    print()

 CREATING TEXT-BASED SIMILARITY FEATURES
 Created combined text features:
   ‚Ä¢ artist_genre_combined: artists + genre
   ‚Ä¢ song_info_combined: track + artists + genre

Sample of combined features:
    Comedy
     ‚Üí Artist+Genre: Gen Hoshino acoustic...
     ‚Üí Full Info: Comedy Gen Hoshino acoustic...

    Ghost - Acoustic
     ‚Üí Artist+Genre: Ben Woodward acoustic...
     ‚Üí Full Info: Ghost - Acoustic Ben Woodward acoustic...

    To Begin Again
     ‚Üí Artist+Genre: Ingrid Michaelson;ZAYN acoustic...
     ‚Üí Full Info: To Begin Again Ingrid Michaelson;ZAYN acoustic...



In [80]:
# Cell 14: Final Data Overview Before Modeling
print(" FINAL DATA OVERVIEW - READY FOR MODELING")
print("=" * 60)

print(" FEATURES AVAILABLE FOR RECOMMENDATION:")

feature_categories = {
    "Audio Characteristics": ['danceability', 'energy', 'valence', 'tempo', 
                             'acousticness', 'instrumentalness', 'liveness', 'speechiness'],
    "Technical Properties": ['loudness', 'key', 'mode', 'time_signature'],
    "Metadata": ['popularity', 'duration_min', 'explicit'],
    "Text Features": ['artist_genre_combined', 'song_info_combined', 'track_genre_clean'],
    "Identifiers": ['track_id', 'track_name_clean', 'artists_clean', 'album_name_clean']
}

for category, features in feature_categories.items():
    print(f"\n {category} ({len(features)} features):")
    print(f"   {', '.join(features)}")

print(f"\n Final Dataset Summary:")
print(f"   ‚Ä¢ Total songs: {len(df):,}")
print(f"   ‚Ä¢ Total genres: {df['track_genre_clean'].nunique()}")
print(f"   ‚Ä¢ Total artists: {df['artists_clean'].nunique()}")
print(f"   ‚Ä¢ Audio features: {len(audio_features)}")
print(f"   ‚Ä¢ Ready for multiple recommendation approaches!")

 FINAL DATA OVERVIEW - READY FOR MODELING
 FEATURES AVAILABLE FOR RECOMMENDATION:

 Audio Characteristics (8 features):
   danceability, energy, valence, tempo, acousticness, instrumentalness, liveness, speechiness

 Technical Properties (4 features):
   loudness, key, mode, time_signature

 Metadata (3 features):
   popularity, duration_min, explicit

 Text Features (3 features):
   artist_genre_combined, song_info_combined, track_genre_clean

 Identifiers (4 features):
   track_id, track_name_clean, artists_clean, album_name_clean

 Final Dataset Summary:
   ‚Ä¢ Total songs: 89,741
   ‚Ä¢ Total genres: 113
   ‚Ä¢ Total artists: 31437
   ‚Ä¢ Audio features: 8
   ‚Ä¢ Ready for multiple recommendation approaches!


In [81]:
# Cell 15: Prepare Features for Content-Based Recommendation
print(" PREPARING FOR CONTENT-BASED RECOMMENDATION")
print("=" * 60)

# Select audio features for content-based filtering
audio_features = ['danceability', 'energy', 'valence', 'tempo', 
                  'acousticness', 'instrumentalness', 'liveness', 'speechiness', 'loudness']

print("Selected Audio Features for Recommendation:")
print(audio_features)

# Create feature matrix
feature_matrix = df[audio_features]
print(f"\nFeature Matrix Shape: {feature_matrix.shape}")
print("First 3 rows of feature matrix:")
display(feature_matrix.head(3))

# Check feature ranges
print("\nFeature Ranges (before scaling):")
for feature in audio_features:
    print(f"   {feature}: {feature_matrix[feature].min():.3f} to {feature_matrix[feature].max():.3f}")

 PREPARING FOR CONTENT-BASED RECOMMENDATION
Selected Audio Features for Recommendation:
['danceability', 'energy', 'valence', 'tempo', 'acousticness', 'instrumentalness', 'liveness', 'speechiness', 'loudness']

Feature Matrix Shape: (89741, 9)
First 3 rows of feature matrix:


Unnamed: 0,danceability,energy,valence,tempo,acousticness,instrumentalness,liveness,speechiness,loudness
0,0.676,0.461,0.715,87.917,0.0322,1e-06,0.358,0.143,-6.746
1,0.42,0.166,0.267,77.489,0.924,6e-06,0.101,0.0763,-17.235
2,0.438,0.359,0.12,76.332,0.21,0.0,0.117,0.0557,-9.734



Feature Ranges (before scaling):
   danceability: 0.000 to 0.985
   energy: 0.000 to 1.000
   valence: 0.000 to 0.995
   tempo: 0.000 to 243.372
   acousticness: 0.000 to 0.996
   instrumentalness: 0.000 to 1.000
   liveness: 0.000 to 1.000
   speechiness: 0.000 to 0.965
   loudness: -49.531 to 4.532


In [82]:
# Cell 16: Feature Scaling
print(" APPLYING FEATURE SCALING")
print("=" * 50)

from sklearn.preprocessing import StandardScaler

# Initialize scaler
scaler = StandardScaler()

# Scale the features (especially important for tempo and loudness)
feature_matrix_scaled = scaler.fit_transform(feature_matrix)
feature_matrix_scaled = pd.DataFrame(feature_matrix_scaled, columns=audio_features, index=df.index)

print(" Features scaled successfully!")
print("\nFeature Ranges (after scaling):")
for feature in audio_features:
    print(f"   {feature}: {feature_matrix_scaled[feature].min():.3f} to {feature_matrix_scaled[feature].max():.3f}")

print("\nFirst 3 rows of scaled features:")
display(feature_matrix_scaled.head(3))

 APPLYING FEATURE SCALING
 Features scaled successfully!

Feature Ranges (after scaling):
   danceability: -3.182 to 2.393
   energy: -2.473 to 1.425
   valence: -1.786 to 1.999
   tempo: -4.053 to 4.028
   acousticness: -0.970 to 1.974
   instrumentalness: -0.535 to 2.552
   liveness: -1.113 to 4.018
   speechiness: -0.772 to 7.747
   loudness: -7.858 to 2.496

First 3 rows of scaled features:


Unnamed: 0,danceability,energy,valence,tempo,acousticness,instrumentalness,liveness,speechiness,loudness
0,0.64426,-0.675976,0.934036,-1.133609,-0.875177,-0.535478,0.723666,0.490464,0.335731
1,-0.804604,-1.825609,-0.77028,-1.479854,1.760797,-0.535464,-0.595072,-0.098361,-1.673094
2,-0.702731,-1.073476,-1.329508,-1.518271,-0.349638,-0.535481,-0.512971,-0.280217,-0.236523


In [83]:
# Cell 17: ULTRA-EFFICIENT RECOMMENDATION SYSTEM
print(" ULTRA-EFFICIENT RECOMMENDATION SYSTEM")
print("=" * 50)

from sklearn.neighbors import NearestNeighbors
import time

print("Using Minimal Features for Maximum Efficiency:")
# Select only the most important audio features (reduces dimensionality)
essential_features = ['danceability', 'energy', 'valence', 'tempo', 'acousticness']
print(f"Essential features: {essential_features}")

# Use minimal preprocessing - only what's absolutely necessary
feature_matrix_minimal = df[essential_features].copy()

# Simple scaling (much faster than StandardScaler)
for feature in ['tempo']:
    feature_matrix_minimal[feature] = (feature_matrix_minimal[feature] - feature_matrix_minimal[feature].mean()) / feature_matrix_minimal[feature].std()

print(f"Feature matrix shape: {feature_matrix_minimal.shape}")
print(" Minimal features prepared")

# Build efficient NearestNeighbors modelprint("\nBuilding efficient NearestNeighbors model...")
start_time = time.time()

# Use ball_tree algorithm which is very memory efficient
nn_model = NearestNeighbors(
    n_neighbors=21,  # Small number for efficiency (20 recommendations + itself)
    algorithm='ball_tree',  # Memory efficient
    metric='euclidean'  # Faster than cosine
)

nn_model.fit(feature_matrix_minimal)
build_time = time.time() - start_time

print(f" Efficient model built in {build_time:.2f} seconds")
print(f"   ‚Ä¢ Features: {len(essential_features)} (reduced from {len(audio_features)})")
print(f"   ‚Ä¢ Algorithm: ball_tree (memory efficient)")
print(f"   ‚Ä¢ Metric: euclidean (faster than cosine)")
print(f"   ‚Ä¢ Model size: Very small (stores tree structure only)")

 ULTRA-EFFICIENT RECOMMENDATION SYSTEM
Using Minimal Features for Maximum Efficiency:
Essential features: ['danceability', 'energy', 'valence', 'tempo', 'acousticness']
Feature matrix shape: (89741, 5)
 Minimal features prepared
 Efficient model built in 0.30 seconds
   ‚Ä¢ Features: 5 (reduced from 9)
   ‚Ä¢ Algorithm: ball_tree (memory efficient)
   ‚Ä¢ Metric: euclidean (faster than cosine)
   ‚Ä¢ Model size: Very small (stores tree structure only)


In [84]:
# Cell 18: MINIMAL MEMORY RECOMMENDATION FUNCTION
print(" MINIMAL MEMORY RECOMMENDATION FUNCTION")
print("=" * 50)

def get_minimal_recommendations(track_name, nn_model=nn_model, df=df, 
                               feature_matrix=feature_matrix_minimal, num_recommendations=10):
    """
    Ultra-efficient recommendation function with minimal memory usage
    Computes neighbors on-the-fly without storing large matrices
    """
    start_time = time.time()
    
    try:
        # Find the song index
        song_idx = df[df['track_name_clean'] == track_name].index[0]
        
        # Get the feature vector for this song
        song_features = feature_matrix.iloc[song_idx:song_idx+1]
        
        # Find nearest neighbors (computed on-the-fly)
        distances, indices = nn_model.kneighbors(song_features, n_neighbors=num_recommendations + 1)
        
        # Convert to recommendations (skip the first one - same song)
        recommendations = []
        for i in range(1, len(indices[0])):
            neighbor_idx = indices[0][i]
            distance = distances[0][i]
            
            # Convert distance to similarity score (inverse relationship)
            similarity_score = max(0, 1 - (distance / distances[0].max()))
            
            neighbor_data = df.iloc[neighbor_idx]
            recommendations.append({
                'track_name': neighbor_data['track_name_clean'],
                'artists': neighbor_data['artists_clean'],
                'genre': neighbor_data['track_genre_clean'],
                'popularity': neighbor_data['popularity'],
                'similarity_score': round(similarity_score, 3)
            })
        
        response_time = time.time() - start_time
        
        result_df = pd.DataFrame(recommendations)
        return result_df, response_time
    
    except IndexError:
        return f"Song '{track_name}' not found. Please check the spelling.", 0
    except Exception as e:
        return f"Error: {str(e)}", 0

print(" Minimal memory function created!")
print("   ‚Ä¢ No large matrices stored")
print("   ‚Ä¢ Computes neighbors on-demand")
print("   ‚Ä¢ Very small memory footprint")
print("   ‚Ä¢ Fast response times")

 MINIMAL MEMORY RECOMMENDATION FUNCTION
 Minimal memory function created!
   ‚Ä¢ No large matrices stored
   ‚Ä¢ Computes neighbors on-demand
   ‚Ä¢ Very small memory footprint
   ‚Ä¢ Fast response times


In [85]:
# Cell 19 (FIXED): TEST ULTRA-EFFICIENT SYSTEM
print(" TESTING ULTRA-EFFICIENT SYSTEM")
print("=" * 50)

# First, let's fix the recommendation function
def get_minimal_recommendations_fixed(track_name, nn_model=nn_model, df=df, 
                                    feature_matrix=feature_matrix_minimal, num_recommendations=10):
    """
    Fixed version of the minimal recommendations function
    """
    start_time = time.time()
    
    try:
        # Find the song index
        song_idx = df[df['track_name_clean'] == track_name].index[0]
        
        # Get the feature vector for this song
        song_features = feature_matrix.iloc[song_idx:song_idx+1]
        
        # Make sure we have valid features
        if song_features.isnull().any().any():
            return "Error: Song has missing feature data", 0
        
        # Find nearest neighbors
        distances, indices = nn_model.kneighbors(song_features, n_neighbors=num_recommendations + 1)
        
        # Convert to recommendations (skip the first one - same song)
        recommendations = []
        for i in range(1, len(indices[0])):
            neighbor_idx = indices[0][i]
            distance = distances[0][i]
            
            # Better similarity calculation
            max_distance = distances[0][1:].max() if len(distances[0]) > 1 else 1
            if max_distance > 0:
                similarity_score = max(0, 1 - (distance / max_distance))
            else:
                similarity_score = 0.5  # Default if all distances are same
            
            neighbor_data = df.iloc[neighbor_idx]
            recommendations.append({
                'track_name': neighbor_data['track_name_clean'],
                'artists': neighbor_data['artists_clean'],
                'genre': neighbor_data['track_genre_clean'],
                'popularity': neighbor_data['popularity'],
                'similarity_score': round(similarity_score, 3)
            })
        
        response_time = time.time() - start_time
        return pd.DataFrame(recommendations), response_time
    
    except IndexError:
        return f"Song '{track_name}' not found in dataset.", 0
    except Exception as e:
        return f"Error: {str(e)}", 0

print(" Fixed recommendation function created!")

# Test with sample songs (make sure they exist in the dataset)
print("\nFinding valid test songs...")
valid_songs = []
attempts = 0
while len(valid_songs) < 3 and attempts < 10:
    potential_song = df.sample(1)['track_name_clean'].iloc[0]
    song_idx = df[df['track_name_clean'] == potential_song].index[0]
    song_features = feature_matrix_minimal.iloc[song_idx:song_idx+1]
    
    # Check if song has valid features
    if not song_features.isnull().any().any():
        valid_songs.append(potential_song)
    attempts += 1

print(f"Found {len(valid_songs)} valid test songs")

print("\nPerformance Test Results:\n")
total_time = 0
successful_tests = 0

for i, song in enumerate(valid_songs, 1):
    print(f" TEST {i}: '{song}'")
    print("-" * 40)
    
    recommendations, response_time = get_minimal_recommendations_fixed(song, num_recommendations=5)
    
    if isinstance(recommendations, pd.DataFrame) and not recommendations.empty:
        total_time += response_time
        successful_tests += 1
        
        print(f"    Response time: {response_time*1000:.1f} ms")
        print(f"    Top 3 recommendations:")
        
        for idx, row in recommendations.head(3).iterrows():
            print(f"      ‚Ä¢ {row['similarity_score']:.3f} - {row['track_name']}")
            print(f"         {row['artists']} |  {row['genre']}")
        
        print()
    else:
        print(f"    {recommendations}")
        print()

if successful_tests > 0:
    avg_time = total_time / successful_tests
    print(f" PERFORMANCE SUMMARY:")
    print(f"   ‚Ä¢ Average response time: {avg_time*1000:.1f} ms")
    print(f"   ‚Ä¢ Successful tests: {successful_tests}/3")
    print(f"   ‚Ä¢ Memory usage: Minimal (no large matrices)")
    print(f"   ‚Ä¢ Scalability: Excellent")
else:
    print(" No successful tests - there may be data quality issues")

 TESTING ULTRA-EFFICIENT SYSTEM
 Fixed recommendation function created!

Finding valid test songs...
Found 3 valid test songs

Performance Test Results:

 TEST 1: 'Golden Time Of Day - Single Edit/Remastered 2004'
----------------------------------------
    Response time: 24.8 ms
    Top 3 recommendations:
      ‚Ä¢ 0.228 - Nature Boy
         Nat King Cole |  jazz
      ‚Ä¢ 0.197 - Eu Me Rendo
         Gabriel Guedes de Almeida |  gospel
      ‚Ä¢ 0.150 - The Moon Is a Harsh Mistress
         Josh Groban |  opera

 TEST 2: 'L'amour, l'amour, l'amour'
----------------------------------------
    Response time: 27.0 ms
    Top 3 recommendations:
      ‚Ä¢ 0.525 - NEW ME
         LiSA |  anime
      ‚Ä¢ 0.280 - How Did You Love
         Shinedown |  grunge
      ‚Ä¢ 0.212 - How Did You Love
         Shinedown |  grunge

 TEST 3: 'Khamoshi'
----------------------------------------
    Response time: 73.5 ms
    Top 3 recommendations:
      ‚Ä¢ 1.000 - NUMB
         XXXTENTACION |  emo
  

In [86]:
# Cell 22 (FIXED): DATA QUALITY CHECK FOR FEATURES
print(" DATA QUALITY CHECK FOR FEATURES")
print("=" * 50)

print("Checking for missing values in essential features:")
missing_counts = feature_matrix_minimal.isnull().sum()
print(missing_counts)

print(f"\nSongs with complete feature data: {len(feature_matrix_minimal.dropna())}/{len(feature_matrix_minimal)}")

if missing_counts.sum() > 0:
    print("\n Some songs have missing features. Let's clean the data...")
    # Remove songs with missing features
    valid_indices = feature_matrix_minimal.dropna().index
    df_clean = df.loc[valid_indices].copy()
    feature_matrix_clean = feature_matrix_minimal.loc[valid_indices].copy()
    
    print(f"Cleaned dataset: {len(df_clean)} songs (removed {len(df) - len(df_clean)} songs with missing data)")
    
    # Rebuild the model with clean data
    print("\nRebuilding model with clean data...")
    nn_model_clean = NearestNeighbors(n_neighbors=21, algorithm='ball_tree', metric='euclidean')
    nn_model_clean.fit(feature_matrix_clean)
    print(" Clean model built!")
    
    # Update our variables without global declaration
    df = df_clean
    feature_matrix_minimal = feature_matrix_clean
    nn_model = nn_model_clean
else:
    print(" All songs have complete feature data!")

 DATA QUALITY CHECK FOR FEATURES
Checking for missing values in essential features:
danceability    0
energy          0
valence         0
tempo           0
acousticness    0
dtype: int64

Songs with complete feature data: 89741/89741
 All songs have complete feature data!


In [87]:
# Cell 23: FINAL ROBUST RECOMMENDATION FUNCTION
print("FINAL ROBUST RECOMMENDATION FUNCTION")
print("=" * 50)

def get_recommendations_final(track_name, num_recommendations=10):
    """
    Final robust version that handles all edge cases
    """
    try:
        # Check if song exists
        if track_name not in df['track_name_clean'].values:
            similar_names = df[df['track_name_clean'].str.contains(track_name, case=False, na=False)]
            if len(similar_names) > 0:
                suggestion = similar_names.iloc[0]['track_name_clean']
                return f"Song not found. Did you mean: '{suggestion}'?"
            else:
                return "Song not found in dataset."
        
        # Get song index and features
        song_idx = df[df['track_name_clean'] == track_name].index[0]
        song_features = feature_matrix_minimal.iloc[song_idx:song_idx+1]
        
        # Get recommendations
        distances, indices = nn_model.kneighbors(song_features, n_neighbors=num_recommendations + 1)
        
        recommendations = []
        for i in range(1, len(indices[0])):
            neighbor_idx = indices[0][i]
            distance = distances[0][i]
            
            # Normalize similarity score
            max_dist = distances[0][1:].max() if len(distances[0]) > 1 else 1
            similarity = max(0.1, 1 - (distance / max_dist)) if max_dist > 0 else 0.5
            
            neighbor_data = df.iloc[neighbor_idx]
            recommendations.append({
                'track_name': neighbor_data['track_name_clean'],
                'artists': neighbor_data['artists_clean'],
                'genre': neighbor_data['track_genre_clean'],
                'popularity': neighbor_data['popularity'],
                'similarity_score': round(similarity, 3)
            })
        
        return pd.DataFrame(recommendations)
    
    except Exception as e:
        return f"Error getting recommendations: {str(e)}"

print(" Final robust function created!")
print("   ‚Ä¢ Handles missing songs")
print("   ‚Ä¢ Suggests similar names")
print("   ‚Ä¢ Robust similarity calculation")
print("   ‚Ä¢ Error handling")

# Test the final function
print("\n FINAL TEST:")
test_song = df.sample(1)['track_name_clean'].iloc[0]
print(f"Testing with: '{test_song}'")
result = get_recommendations_final(test_song, 3)

if isinstance(result, pd.DataFrame):
    print(" Success! Recommendations:")
    for idx, row in result.iterrows():
        print(f"   ‚Ä¢ {row['similarity_score']:.3f} - {row['track_name']} by {row['artists']}")
else:
    print(f" {result}")

FINAL ROBUST RECOMMENDATION FUNCTION
 Final robust function created!
   ‚Ä¢ Handles missing songs
   ‚Ä¢ Suggests similar names
   ‚Ä¢ Robust similarity calculation
   ‚Ä¢ Error handling

 FINAL TEST:
Testing with: 'Black is Magnified'
 Success! Recommendations:
   ‚Ä¢ 0.377 - ÁÑ°ÈáçÂäõ„ÉÄ„É≥„Çπ by SILENT SIREN
   ‚Ä¢ 0.100 - Cretino by Babas√≥nicos
   ‚Ä¢ 0.100 - Korken knallen by Nancy Franck


In [88]:
# Cell 24: SAVE THE RECOMMENDATION SYSTEM
print(" SAVING THE RECOMMENDATION SYSTEM")
print("=" * 50)

import pickle
import os

# Create a models directory if it doesn't exist
if not os.path.exists('../models'):
    os.makedirs('../models')

# Save all necessary components
save_components = {
    'nn_model': nn_model,
    'feature_matrix_minimal': feature_matrix_minimal,
    'df': df,
    'essential_features': essential_features
}

with open('../models/recommendation_system.pkl', 'wb') as f:
    pickle.dump(save_components, f)

print(" Recommendation system saved!")
print("   ‚Ä¢ File: '../models/recommendation_system.pkl'")
print("   ‚Ä¢ Components saved:")
print("     - NearestNeighbors model")
print("   - Feature matrix")
print("     - Cleaned dataset")
print("     - Essential features list")

# Also save the robust recommendation function as a Python file for Streamlit
function_code = '''
import pandas as pd
import pickle

def load_recommendation_system():
    """Load the pre-trained recommendation system"""
    with open('models/recommendation_system.pkl', 'rb') as f:
        components = pickle.load(f)
    return components

def get_recommendations(track_name, num_recommendations=10):
    """
    Main recommendation function for Streamlit app
    """
    # Load components
    components = load_recommendation_system()
    nn_model = components['nn_model']
    feature_matrix_minimal = components['feature_matrix_minimal']
    df = components['df']
    
    try:
        # Check if song exists
        if track_name not in df['track_name_clean'].values:
            similar_names = df[df['track_name_clean'].str.contains(track_name, case=False, na=False)]
            if len(similar_names) > 0:
                suggestion = similar_names.iloc[0]['track_name_clean']
                return f"Song not found. Did you mean: '{suggestion}'?"
            else:
                return "Song not found in dataset."
        
        # Get song index and features
        song_idx = df[df['track_name_clean'] == track_name].index[0]
        song_features = feature_matrix_minimal.iloc[song_idx:song_idx+1]
        
        # Get recommendations
        distances, indices = nn_model.kneighbors(song_features, n_neighbors=num_recommendations + 1)
        
        recommendations = []
        for i in range(1, len(indices[0])):
            neighbor_idx = indices[0][i]
            distance = distances[0][i]
            
            # Normalize similarity score
            max_dist = distances[0][1:].max() if len(distances[0]) > 1 else 1
            similarity = max(0.1, 1 - (distance / max_dist)) if max_dist > 0 else 0.5
            
            neighbor_data = df.iloc[neighbor_idx]
            recommendations.append({
                'track_name': neighbor_data['track_name_clean'],
                'artists': neighbor_data['artists_clean'],
                'genre': neighbor_data['track_genre_clean'],
                'popularity': neighbor_data['popularity'],
                'similarity_score': round(similarity, 3)
            })
        
        return pd.DataFrame(recommendations)
    
    except Exception as e:
        return f"Error getting recommendations: {str(e)}"
'''

# Save the function to a Python file
with open('C:/Users/Kshitij/Documents/Projects/ML/Music_Recommender/src/recommendation_engine.py', 'w') as f:
    f.write(function_code)

print(" Recommendation engine saved!")
print("   ‚Ä¢ File: '../src/recommendation_engine.py'")

 SAVING THE RECOMMENDATION SYSTEM
 Recommendation system saved!
   ‚Ä¢ File: '../models/recommendation_system.pkl'
   ‚Ä¢ Components saved:
     - NearestNeighbors model
   - Feature matrix
     - Cleaned dataset
     - Essential features list
 Recommendation engine saved!
   ‚Ä¢ File: '../src/recommendation_engine.py'


In [89]:
# Cell 25 (UPDATED): CREATE REQUIREMENTS FILE
print(" UPDATING REQUIREMENTS.TXT")
print("=" * 50)

requirements = """streamlit==1.28.0
pandas==2.1.0
numpy==1.24.0
scikit-learn==1.3.0
plotly==5.15.0
pickle-mixin==1.0.2
"""

with open('../requirements.txt', 'w') as f:
    f.write(requirements)

print(" requirements.txt updated!")
print("   ‚Ä¢ Added plotly dependency")

 UPDATING REQUIREMENTS.TXT
 requirements.txt updated!
   ‚Ä¢ Added plotly dependency


In [90]:
# Cell 26 (Fixed): CREATE STREAMLIT APP
print(" CREATING STREAMLIT APP")
print("=" * 50)

streamlit_code = '''
import streamlit as st
import pandas as pd
import sys
import os

# Add the src directory to Python path
sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))

try:
    from recommendation_engine import get_recommendations, load_recommendation_system
except ImportError:
    st.error("Could not import recommendation engine. Please make sure 'src/recommendation_engine.py' exists.")
    st.stop()

# Page configuration
st.set_page_config(
    page_title="Music Recommendation System",
    page_icon=":musical_note:",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS
st.markdown("""
    <style>
    .main-header {
        font-size: 3rem;
        color: #1DB954;
        text-align: center;
        margin-bottom: 2rem;
    }
    .song-card {
        padding: 1rem;
        border-radius: 10px;
        border: 1px solid #ddd;
        margin: 0.5rem 0;
        background-color: #f9f9f9;
    }
    .similarity-high { color: #1DB954; font-weight: bold; }
    .similarity-med { color: #FFA500; font-weight: bold; }
    .similarity-low { color: #FF4B4B; font-weight: bold; }
    </style>
    """, unsafe_allow_html=True)

# Header
st.markdown('<h1 class="main-header">Music Recommendation System</h1>', unsafe_allow_html=True)

# Sidebar
st.sidebar.title("About")
st.sidebar.info(
    "This system recommends songs based on audio features like "
    "danceability, energy, tempo, and more. "
    "Just enter a song you like and discover new music!"
)

# Load data once (cached for performance)
@st.cache_data
def load_data():
    components = load_recommendation_system()
    if isinstance(components, str):  # Error message
        st.error(components)
        st.stop()
    return components['df']

@st.cache_data
def search_songs(query, df, max_results=15):
    """Search songs based on query"""
    if not query or len(query) < 1:
        return pd.DataFrame()
    
    # Search in track names (case insensitive)
    mask = df['track_name_clean'].str.contains(query, case=False, na=False)
    results = df[mask][['track_name_clean', 'artists_clean', 'track_genre_clean', 'popularity']]
    
    return results.head(max_results)

# Load the data
df = load_data()

# Initialize session state
if 'selected_song' not in st.session_state:
    st.session_state.selected_song = None
if 'search_query' not in st.session_state:
    st.session_state.search_query = ""

# Main content
col1, col2 = st.columns([1, 2])

with col1:
    st.subheader("Find Your Next Favorite Song")
    
    # Search box
    search_query = st.text_input(
        "Search for a song:",
        value=st.session_state.search_query,
        placeholder="Type a song name...",
        key="search_input"
    )
    
    # Update session state
    st.session_state.search_query = search_query
    
    # Show search results in real-time
    if search_query and len(search_query) >= 1:
        search_results = search_songs(search_query, df, max_results=10)
        
        if not search_results.empty:
            st.write(f"Found {len(search_results)} songs:")
            
            # Display search results as clickable items
            for idx, row in search_results.iterrows():
                col_a, col_b = st.columns([3, 1])
                
                with col_a:
                    st.write(f"**{row['track_name_clean']}**")
                    st.write(f"*{row['artists_clean']}*")
                
                with col_b:
                    if st.button("Select", key=f"select_{idx}"):
                        st.session_state.selected_song = row['track_name_clean']
                        st.rerun()
                
                st.divider()
        else:
            st.info("No songs found. Try a different search term.")
    else:
        # Show popular songs when no search
        st.write("Popular songs to try:")
        popular_songs = df.nlargest(5, 'popularity')[['track_name_clean', 'artists_clean']]
        for _, song in popular_songs.iterrows():
            if st.button(f"**{song['track_name_clean']}** - {song['artists_clean']}", 
                        key=f"popular_{song['track_name_clean']}", 
                        use_container_width=True):
                st.session_state.selected_song = song['track_name_clean']
                st.rerun()
    
    # Show currently selected song
    if st.session_state.selected_song:
        selected_song_data = df[df['track_name_clean'] == st.session_state.selected_song].iloc[0]
        st.success(f"Selected: {st.session_state.selected_song}")
        st.write(f"Artist: {selected_song_data['artists_clean']}")
        
        # Number of recommendations
        num_recommendations = st.slider(
            "Number of recommendations:",
            min_value=5,
            max_value=20,
            value=10,
            help="How many similar songs would you like to discover?"
        )
        
        # Get recommendations button
        if st.button("Get Recommendations", type="primary", use_container_width=True):
            with st.spinner("Finding similar songs..."):
                recommendations = get_recommendations(st.session_state.selected_song, num_recommendations)
            
            if isinstance(recommendations, pd.DataFrame):
                st.session_state.recommendations = recommendations
                st.session_state.show_recommendations = True
            else:
                st.error(recommendations)
                st.session_state.show_recommendations = False
    
    # Clear selection button
    if st.session_state.selected_song:
        if st.button("Clear Selection", use_container_width=True):
            st.session_state.selected_song = None
            st.session_state.show_recommendations = False
            st.session_state.recommendations = None
            st.rerun()

with col2:
    st.subheader("Song Information")
    
    if st.session_state.selected_song:
        # Display input song info
        input_song_data = df[df['track_name_clean'] == st.session_state.selected_song].iloc[0]
        
        st.write(f"**Song:** {input_song_data['track_name_clean']}")
        st.write(f"**Artist:** {input_song_data['artists_clean']}")
        st.write(f"**Genre:** {input_song_data['track_genre_clean']}")
        st.write(f"**Popularity:** {input_song_data['popularity']}/100")
        
        # Audio features visualization
        st.subheader("Audio Features")
        features = ['danceability', 'energy', 'valence', 'acousticness']
        
        # Show progress bars for each feature
        for feature in features:
            value = input_song_data[feature]
            st.write(f"**{feature.title()}:** {value:.2f}")
            st.progress(value)
            
    else:
        st.info("Start typing a song name in the search box or click a popular song to see information here!")

# Display recommendations
if ('show_recommendations' in st.session_state and 
    st.session_state.show_recommendations and 
    'recommendations' in st.session_state and 
    not st.session_state.recommendations.empty):
    
    st.subheader("Recommended Songs")
    st.write(f"Similar to **{st.session_state.selected_song}**:")
    
    recommendations = st.session_state.recommendations
    
    # Display each recommendation
    for idx, row in recommendations.iterrows():
        # Determine similarity color
        if row['similarity_score'] > 0.7:
            sim_class = "similarity-high"
        elif row['similarity_score'] > 0.4:
            sim_class = "similarity-med"
        else:
            sim_class = "similarity-low"
        
        with st.container():
            col_a, col_b, col_c = st.columns([3, 2, 1])
            
            with col_a:
                st.write(f"**{row['track_name']}**")
                st.write(f"*{row['artists']}*")
            
            with col_b:
                st.write(f"`{row['genre']}`")
                st.write(f"Popularity: {row['popularity']}")
            
            with col_c:
                st.markdown(f'<p class="{sim_class}">Similarity: {row["similarity_score"]}</p>', unsafe_allow_html=True)
            
            st.divider()

# Quick search examples in sidebar
st.sidebar.subheader("Quick Search")
example_searches = ["love", "dance", "rock", "happy", "sad", "chill"]
for example in example_searches:
    if st.sidebar.button(example, key=f"example_{example}"):
        st.session_state.search_query = example
        st.rerun()

# Footer
st.markdown("---")
st.markdown(
    "Built with Streamlit | "
    "Music data from Spotify | "
    "Recommendations based on audio features"
)
'''

# Write with UTF-8 encoding to handle special characters
with open('../app.py', 'w', encoding='utf-8') as f:
    f.write(streamlit_code)

print(" Streamlit app created!")
print("   ‚Ä¢ File: '../app.py'")
print("   ‚Ä¢ Features: Search bar interface, real-time recommendations")

 CREATING STREAMLIT APP
 Streamlit app created!
   ‚Ä¢ File: '../app.py'
   ‚Ä¢ Features: Search bar interface, real-time recommendations


In [91]:
# Cell 27: FINAL PROJECT STRUCTURE
print(" FINAL PROJECT STRUCTURE")
print("=" * 50)

import os

def check_project_structure():
    base_path = ".."
    expected_structure = {
        'app.py': 'Streamlit application',
        'requirements.txt': 'Python dependencies',
        'data/MusicDataSet.csv': 'Original dataset',
        'models/recommendation_system.pkl': 'Trained model',
        'src/recommendation_engine.py': 'Recommendation engine',
        'notebooks/': 'Jupyter notebooks'
    }
    
    print("Project Structure:")
    print("-" * 40)
    
    all_good = True
    for path, description in expected_structure.items():
        full_path = os.path.join(base_path, path)
        if os.path.exists(full_path):
            status = "Yes"
        else:
            status = "No"
            all_good = False
        
        print(f"{status} {path:40} {description}")
    
    print("-" * 40)
    if all_good:
        print(" All files are ready for deployment!")
    else:
        print("  Some files are missing.")
    
    return all_good

# Check structure
is_ready = check_project_structure()

if is_ready:
    print("\nüöÄ DEPLOYMENT INSTRUCTIONS:")
    print("1. Open terminal in your project folder")
    print("2. Activate your virtual environment")
    print("3. Run: streamlit run app.py")
    print("4. Your app will open in browser at http://localhost:8501")

 FINAL PROJECT STRUCTURE
Project Structure:
----------------------------------------
Yes app.py                                   Streamlit application
Yes requirements.txt                         Python dependencies
No data/MusicDataSet.csv                    Original dataset
Yes models/recommendation_system.pkl         Trained model
No src/recommendation_engine.py             Recommendation engine
No notebooks/                               Jupyter notebooks
----------------------------------------
  Some files are missing.


In [92]:
import os

# Check the file directly
file_path = r'C:\Users\Kshitij\Documents\PROJECTS\ML\Music_Recommender\src\recommendation_engine.py'

print(f"Checking file: {file_path}")
print("=" * 50)

if os.path.exists(file_path):
    print("‚úÖ File exists!")
    
    # Read the file content
    with open(file_path, 'r') as f:
        content = f.read()
    
    # Check for specific functions
    if 'def setup_music_database' in content:
        print("‚úÖ setup_music_database function found in file")
    else:
        print("‚ùå setup_music_database function NOT found in file")
        
    if 'def search_songs_database' in content:
        print("‚úÖ search_songs_database function found in file")
    else:
        print("‚ùå search_songs_database function NOT found in file")
        
    if 'import sqlite3' in content:
        print("‚úÖ sqlite3 import found")
    else:
        print("‚ùå sqlite3 import NOT found")
        
    # Show the first 20 lines of the file
    
        
else:
    print("‚ùå File does not exist!")

Checking file: C:\Users\Kshitij\Documents\PROJECTS\ML\Music_Recommender\src\recommendation_engine.py
‚úÖ File exists!
‚ùå setup_music_database function NOT found in file
‚ùå search_songs_database function NOT found in file
‚ùå sqlite3 import NOT found


In [93]:
import os

# Try different possible locations
possible_paths = [
    r'C:\Users\Kshitij\Documents\Projects\ML\Music_Recommender\src\recommendation_engine.py',
    r'C:\Users\Kshitij\Documents\PROJECT\ML\Music_Recommender\src\recommendation_engine.py', 
    r'C:\Users\Kshitij\Documents\ML\Music_Recommender\src\recommendation_engine.py',
    './src/recommendation_engine.py',
    '../src/recommendation_engine.py'
]

print("üîç Searching for recommendation_engine.py...")
print("=" * 50)

found_path = None
for path in possible_paths:
    if os.path.exists(path):
        found_path = path
        print(f"‚úÖ FOUND: {path}")
        break
    else:
        print(f"‚ùå NOT FOUND: {path}")

if found_path:
    print(f"\nüéØ Using path: {found_path}")
else:
    print("\n‚ùå File not found in any common locations.")
    print("Let's search your entire Documents folder...")
    
    # Search recursively in Documents folder
    for root, dirs, files in os.walk(r'C:\Users\Kshitij\Documents'):
        if 'recommendation_engine.py' in files:
            full_path = os.path.join(root, 'recommendation_engine.py')
            print(f"‚úÖ FOUND: {full_path}")
            found_path = full_path
            break
    
    if not found_path:
        print("‚ùå File not found anywhere in Documents folder.")

üîç Searching for recommendation_engine.py...
‚úÖ FOUND: C:\Users\Kshitij\Documents\Projects\ML\Music_Recommender\src\recommendation_engine.py

üéØ Using path: C:\Users\Kshitij\Documents\Projects\ML\Music_Recommender\src\recommendation_engine.py


In [94]:
# Replace with the correct path from Step 1
file_path = r'C:\Users\Kshitij\Documents\Projects\ML\Music_Recommender\src\recommendation_engine.py'  # Use the correct path from above

try:
    # Read current content
    with open(file_path, 'r') as f:
        current_content = f.read()

    # Check what needs to be added
    needs_sqlite = 'import sqlite3' not in current_content
    needs_database_functions = 'def search_songs_database' not in current_content

    if needs_sqlite or needs_database_functions:
        print("üõ†Ô∏è Adding missing functions to recommendation_engine.py...")
        
        # Split into lines
        lines = current_content.split('\n')
        
        # Add imports if missing
        if needs_sqlite:
            # Find where to insert imports (after existing imports)
            for i, line in enumerate(lines):
                if line.startswith('import') or line.startswith('from'):
                    continue
                else:
                    # Insert our imports here
                    lines.insert(i, 'import sqlite3')
                    lines.insert(i+1, 'import os')
                    break
        
        # Add database functions if missing
        if needs_database_functions:
            database_functions = '''
def setup_music_database():
    """Setup SQLite database for music data"""
    components = load_recommendation_system()
    df = components['df']
    
    conn = sqlite3.connect('music_data.db')
    df.to_sql('songs', conn, if_exists='replace', index=False)
    
    # Create index for faster searches
    conn.execute("CREATE INDEX IF NOT EXISTS idx_track_name ON songs(track_name_clean)")
    conn.execute("CREATE INDEX IF NOT EXISTS idx_popularity ON songs(popularity)")
    
    conn.close()
    return "Database setup complete"

def search_songs_database(query, limit=10):
    """Search songs using SQLite database - auto-initializes if needed"""
    # Check if database exists, if not create it
    if not os.path.exists('music_data.db'):
        print("Initializing music database for the first time...")
        setup_music_database()
    
    conn = sqlite3.connect('music_data.db')
    
    # Use parameterized query to prevent SQL injection
    result = pd.read_sql("""
        SELECT * FROM songs 
        WHERE track_name_clean LIKE ? 
        ORDER BY popularity DESC 
        LIMIT ?
    """, conn, params=[f'%{query}%', limit])
    
    conn.close()
    return result

def get_mood_based_recommendations(target_mood, num_recommendations=10):
    """Get songs based on mood (happy, sad, energetic, calm)"""
    components = load_recommendation_system()
    df = components['df']
    
    mood_profiles = {
        'happy': {'valence': 0.7, 'energy': 0.6, 'danceability': 0.6},
        'sad': {'valence': 0.3, 'energy': 0.3, 'danceability': 0.3},
        'energetic': {'energy': 0.8, 'danceability': 0.7, 'valence': 0.6},
        'calm': {'energy': 0.2, 'acousticness': 0.7, 'valence': 0.5}
    }
    
    if target_mood in mood_profiles:
        target_profile = mood_profiles[target_mood]
        scores = []
        
        for idx, row in df.iterrows():
            score = 0
            for feature, target_value in target_profile.items():
                score += 1 - abs(row[feature] - target_value)
            scores.append(score)
        
        df_temp = df.copy()
        df_temp['mood_score'] = scores
        return df_temp.nlargest(num_recommendations, 'mood_score')[['track_name_clean', 'artists_clean', 'mood_score']]
    
    return pd.DataFrame()

def create_playlist_from_seeds(seed_songs, playlist_length=20):
    """Create a playlist from multiple seed songs"""
    all_recommendations = []
    
    for song in seed_songs:
        recs = get_recommendations(song, playlist_length//len(seed_songs))
        if isinstance(recs, pd.DataFrame):
            all_recommendations.append(recs)
    
    if all_recommendations:
        combined = pd.concat(all_recommendations)
        # Remove duplicates and sort by similarity
        combined = combined.drop_duplicates('track_name').nlargest(playlist_length, 'similarity_score')
        return combined
    
    return pd.DataFrame()

def explain_recommendation(original_song, recommended_song):
    """Explain why a song was recommended"""
    components = load_recommendation_system()
    df = components['df']
    
    orig_data = df[df['track_name_clean'] == original_song].iloc[0]
    rec_data = df[df['track_name_clean'] == recommended_song].iloc[0]
    
    similarities = []
    for feature in ['danceability', 'energy', 'valence', 'tempo']:
        similarity = 1 - abs(orig_data[feature] - rec_data[feature])
        if similarity > 0.7:
            similarities.append(f"{feature} ({similarity:.1%} similar)")
    
    return f"Recommended because of similar: {', '.join(similarities)}" if similarities else "Similar in overall audio characteristics"
'''
            
            # Insert before get_recommendations function
            for i, line in enumerate(lines):
                if 'def get_recommendations' in line:
                    lines.insert(i, database_functions)
                    break
        
        # Write the updated content back
        with open(file_path, 'w') as f:
            f.write('\n'.join(lines))
        
        print("‚úÖ File updated successfully!")
    else:
        print("‚úÖ All functions are already in the file!")
        
except FileNotFoundError:
    print(f"‚ùå File not found at: {file_path}")
    print("Please check the path from Step 1 and update the file_path variable.")
except Exception as e:
    print(f"‚ùå Error: {e}")

üõ†Ô∏è Adding missing functions to recommendation_engine.py...
‚úÖ File updated successfully!


In [95]:
import os

file_path = r'C:\Users\Kshitij\Documents\Projects\ML\Music_Recommender\src\recommendation_engine.py'

print("üîç Checking ACTUAL content of recommendation_engine.py:")
print("=" * 60)

if os.path.exists(file_path):
    with open(file_path, 'r') as f:
        content = f.read()
    
    print(f"File exists: {file_path}")
    print(f"File size: {len(content)} characters")
    
    # Show all function definitions
    print("\nüìã ALL FUNCTIONS IN FILE:")
    lines = content.split('\n')
    function_count = 0
    for line in lines:
        if line.strip().startswith('def '):
            function_count += 1
            print(f"  {function_count}. {line.strip()}")
    
    print(f"\nTotal functions found: {function_count}")
    
    # Check specifically for our functions
    print("\nüîç SPECIFIC FUNCTION CHECK:")
    target_functions = ['search_songs_database', 'setup_music_database', 'load_recommendation_system', 'get_recommendations']
    for func in target_functions:
        if f'def {func}' in content:
            print(f"  ‚úÖ {func} - FOUND")
        else:
            print(f"  ‚ùå {func} - MISSING")
    
    # Show first 30 lines to see the structure
    print(f"\nüìÑ FIRST 30 LINES OF FILE:")
    print("-" * 40)
    for i, line in enumerate(lines[:30], 1):
        print(f"{i:2}: {line}")
        
else:
    print(f"‚ùå File not found: {file_path}")

üîç Checking ACTUAL content of recommendation_engine.py:
File exists: C:\Users\Kshitij\Documents\Projects\ML\Music_Recommender\src\recommendation_engine.py
File size: 5832 characters

üìã ALL FUNCTIONS IN FILE:
  1. def load_recommendation_system():
  2. def setup_music_database():
  3. def search_songs_database(query, limit=10):
  4. def get_mood_based_recommendations(target_mood, num_recommendations=10):
  5. def create_playlist_from_seeds(seed_songs, playlist_length=20):
  6. def explain_recommendation(original_song, recommended_song):
  7. def get_recommendations(track_name, num_recommendations=10):

Total functions found: 7

üîç SPECIFIC FUNCTION CHECK:
  ‚úÖ search_songs_database - FOUND
  ‚úÖ setup_music_database - FOUND
  ‚úÖ load_recommendation_system - FOUND
  ‚úÖ get_recommendations - FOUND

üìÑ FIRST 30 LINES OF FILE:
----------------------------------------
 1: import sqlite3
 2: import os
 3: 
 4: import pandas as pd
 5: import pickle
 6: 
 7: def load_recommendation_

In [96]:
# If functions are missing, run this to completely replace the file


print("üõ†Ô∏è Creating COMPLETE recommendation_engine.py...")

complete_content = '''import pandas as pd
import pickle
import sqlite3
import os

def load_recommendation_system():
    """Load the pre-trained recommendation system"""
    with open('models/recommendation_system.pkl', 'rb') as f:
        components = pickle.load(f)
    return components

def setup_music_database():
    """Setup SQLite database for music data"""
    components = load_recommendation_system()
    df = components['df']
    
    conn = sqlite3.connect('music_data.db')
    df.to_sql('songs', conn, if_exists='replace', index=False)
    
    # Create index for faster searches
    conn.execute("CREATE INDEX IF NOT EXISTS idx_track_name ON songs(track_name_clean)")
    conn.execute("CREATE INDEX IF NOT EXISTS idx_popularity ON songs(popularity)")
    
    conn.close()
    return "Database setup complete"

def search_songs_database(query, limit=10):
    """Search songs using SQLite database - auto-initializes if needed"""
    # Check if database exists, if not create it
    if not os.path.exists('music_data.db'):
        print("Initializing music database for the first time...")
        setup_music_database()
    
    conn = sqlite3.connect('music_data.db')
    
    # Use parameterized query to prevent SQL injection
    result = pd.read_sql("""
        SELECT * FROM songs 
        WHERE track_name_clean LIKE ? 
        ORDER BY popularity DESC 
        LIMIT ?
    """, conn, params=[f'%{query}%', limit])
    
    conn.close()
    return result

def get_mood_based_recommendations(target_mood, num_recommendations=10):
    """Get songs based on mood (happy, sad, energetic, calm)"""
    components = load_recommendation_system()
    df = components['df']
    
    mood_profiles = {
        'happy': {'valence': 0.7, 'energy': 0.6, 'danceability': 0.6},
        'sad': {'valence': 0.3, 'energy': 0.3, 'danceability': 0.3},
        'energetic': {'energy': 0.8, 'danceability': 0.7, 'valence': 0.6},
        'calm': {'energy': 0.2, 'acousticness': 0.7, 'valence': 0.5}
    }
    
    if target_mood in mood_profiles:
        target_profile = mood_profiles[target_mood]
        scores = []
        
        for idx, row in df.iterrows():
            score = 0
            for feature, target_value in target_profile.items():
                score += 1 - abs(row[feature] - target_value)
            scores.append(score)
        
        df_temp = df.copy()
        df_temp['mood_score'] = scores
        return df_temp.nlargest(num_recommendations, 'mood_score')[['track_name_clean', 'artists_clean', 'mood_score']]
    
    return pd.DataFrame()

def create_playlist_from_seeds(seed_songs, playlist_length=20):
    """Create a playlist from multiple seed songs"""
    all_recommendations = []
    
    for song in seed_songs:
        recs = get_recommendations(song, playlist_length//len(seed_songs))
        if isinstance(recs, pd.DataFrame):
            all_recommendations.append(recs)
    
    if all_recommendations:
        combined = pd.concat(all_recommendations)
        # Remove duplicates and sort by similarity
        combined = combined.drop_duplicates('track_name').nlargest(playlist_length, 'similarity_score')
        return combined
    
    return pd.DataFrame()

def explain_recommendation(original_song, recommended_song):
    """Explain why a song was recommended"""
    components = load_recommendation_system()
    df = components['df']
    
    orig_data = df[df['track_name_clean'] == original_song].iloc[0]
    rec_data = df[df['track_name_clean'] == recommended_song].iloc[0]
    
    similarities = []
    for feature in ['danceability', 'energy', 'valence', 'tempo']:
        similarity = 1 - abs(orig_data[feature] - rec_data[feature])
        if similarity > 0.7:
            similarities.append(f"{feature} ({similarity:.1%} similar)")
    
    return f"Recommended because of similar: {', '.join(similarities)}" if similarities else "Similar in overall audio characteristics"

def get_recommendations(track_name, num_recommendations=10):
    """
    Main recommendation function for Streamlit app
    """
    # Load components
    components = load_recommendation_system()
    nn_model = components['nn_model']
    feature_matrix_minimal = components['feature_matrix_minimal']
    df = components['df']

    try:
        # Check if song exists
        if track_name not in df['track_name_clean'].values:
            similar_names = df[df['track_name_clean'].str.contains(track_name, case=False, na=False)]
            if len(similar_names) > 0:
                suggestion = similar_names.iloc[0]['track_name_clean']
                return f"Song not found. Did you mean: '{suggestion}'?"
            else:
                return "Song not found in dataset."

        # Get song index and features
        song_idx = df[df['track_name_clean'] == track_name].index[0]
        song_features = feature_matrix_minimal.iloc[song_idx:song_idx+1]

        # Get recommendations
        distances, indices = nn_model.kneighbors(song_features, n_neighbors=num_recommendations + 1)

        recommendations = []
        for i in range(1, len(indices[0])):
            neighbor_idx = indices[0][i]
            distance = distances[0][i]

            # Normalize similarity score
            max_dist = distances[0][1:].max() if len(distances[0]) > 1 else 1
            similarity = max(0.1, 1 - (distance / max_dist)) if max_dist > 0 else 0.5

            neighbor_data = df.iloc[neighbor_idx]
            recommendations.append({
                'track_name': neighbor_data['track_name_clean'],
                'artists': neighbor_data['artists_clean'],
                'genre': neighbor_data['track_genre_clean'],
                'popularity': neighbor_data['popularity'],
                'similarity_score': round(similarity, 3)
            })

        return pd.DataFrame(recommendations)

    except Exception as e:
        return f"Error getting recommendations: {str(e)}"
'''

# Create directory if it doesn't exist
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Write the complete file
with open(file_path, 'w') as f:
    f.write(complete_content)

print(f"‚úÖ Created COMPLETE file: {file_path}")
print("All functions including search_songs_database have been added!")

üõ†Ô∏è Creating COMPLETE recommendation_engine.py...
‚úÖ Created COMPLETE file: C:\Users\Kshitij\Documents\Projects\ML\Music_Recommender\src\recommendation_engine.py
All functions including search_songs_database have been added!


In [97]:
# After creating the file, test the import
import sys
import os

# Clear any cached imports
if 'src.recommendation_engine' in sys.modules:
    del sys.modules['src.recommendation_engine']

sys.path.append(os.path.abspath('..'))

try:
    from src.recommendation_engine import search_songs_database, load_recommendation_system
    print("‚úÖ SUCCESS: Import worked!")
    print("Available functions:")
    
    # List all available functions
    import src.recommendation_engine as engine
    functions = [f for f in dir(engine) if not f.startswith('_')]
    for func in functions:
        print(f"  - {func}")
        
except ImportError as e:
    print(f"‚ùå Import failed: {e}")

‚úÖ SUCCESS: Import worked!
Available functions:
  - create_playlist_from_seeds
  - explain_recommendation
  - get_mood_based_recommendations
  - get_recommendations
  - load_recommendation_system
  - os
  - pd
  - pickle
  - search_songs_database
  - setup_music_database
  - sqlite3


In [98]:
import time
import sys
import os
import pandas as pd

# Add the parent directory to Python path (ALTERNATIVE - forward slashes)
sys.path.append('C:/Users/Kshitij/Documents/Projects/ML/Music_Recommender')

# Now import should work
from src.recommendation_engine import search_songs_database, load_recommendation_system

print("‚úÖ Import successful!")

# Load data for the original search function
components = load_recommendation_system()
df = components['df']

# Define the original search function (from your app.py)
def search_songs(query, df, max_results=15):
    """Search songs based on query"""
    if not query or len(query) < 1:
        return pd.DataFrame()

    # Search in track names (case insensitive)
    mask = df['track_name_clean'].str.contains(query, case=False, na=False)
    results = df[mask][['track_name_clean', 'artists_clean', 'track_genre_clean', 'popularity']]

    return results.head(max_results)

# Test performance comparison
query = "love"

print("üîç Performance Comparison:")
print("=" * 50)

# Original search
start_time = time.time()
original_results = search_songs(query, df, 10)
original_time = time.time() - start_time
print(f"Original search: {original_time:.3f} seconds")
print(f"Found {len(original_results)} songs")

# Database search  
start_time = time.time()
db_results = search_songs_database(query, 10)
db_time = time.time() - start_time
print(f"Database search: {db_time:.3f} seconds")
print(f"Found {len(db_results)} songs")

if original_time > 0 and db_time > 0:
    speed_improvement = original_time / db_time
    print(f"Speed improvement: {speed_improvement:.1f}x faster")
    
    if speed_improvement > 1:
        print("üéâ Database search is FASTER!")
    else:
        print("‚ö†Ô∏è  Database search might need optimization")
else:
    print("Cannot calculate speed improvement (zero time)")

# Show sample results
print(f"\nüìä Sample results comparison:")
print("-" * 40)

print("Original search (first 3 results):")
if not original_results.empty:
    for i, (_, row) in enumerate(original_results.head(3).iterrows()):
        print(f"  {i+1}. {row['track_name_clean']} - {row['artists_clean']}")

print("\nDatabase search (first 3 results):")
if not db_results.empty:
    for i, (_, row) in enumerate(db_results.head(3).iterrows()):
        print(f"  {i+1}. {row['track_name_clean']} - {row['artists_clean']}")

‚úÖ Import successful!
üîç Performance Comparison:
Original search: 0.256 seconds
Found 10 songs
Database search: 0.079 seconds
Found 10 songs
Speed improvement: 3.2x faster
üéâ Database search is FASTER!

üìä Sample results comparison:
----------------------------------------
Original search (first 3 results):
  1. Can't Help Falling In Love - Kina Grannis
  2. Falling in Love at a Coffee Shop - Landon Pigg
  3. ily (i love you baby) - Andrew Foy;Renee Foy

Database search (first 3 results):
  1. Another Love - Tom Odell
  2. lovely (with Khalid) - Billie Eilish;Khalid
  3. I Love You So - The Walters


In [99]:
import pickle
import os

# Check what's in the pickle file
try:
    with open('models/recommendation_system.pkl', 'rb') as f:
        components = pickle.load(f)
    
    print("Keys found in pickle file:")
    for key in components.keys():
        print(f" - {key}")
        
    print(f"\nDataFrame shape: {components['df'].shape if 'df' in components else 'No df key'}")
    
except Exception as e:
    print(f"Error: {e}")

Keys found in pickle file:
 - nn_model
 - feature_matrix_minimal
 - df
 - essential_features

DataFrame shape: (89741, 28)
