# Last.fm User Exploratory Data Analysis

**Note (TODO):** Explore a bivariate analysis between each artist's play count and their location (this requires merging in the location data); consider investigating outliers; and add more interpretation and commentary later to make the notebook more professional.

## Setup and Load Dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [None]:
# load the user listening data from the local CSV export
# NOTE: this is an absolute, machine-specific path; update it before running elsewhere
file_path = r'C:\Users\radia\Documents\4th year\w_23\dsc180b_w23\filtered_user_df.csv'
user_df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# remove the two 'Unnamed' columns in place
# (presumably leftover index columns from earlier CSV round-trips; confirm)
user_df.drop(labels=['Unnamed: 0', 'Unnamed: 0.1'], axis='columns', inplace=True)

In [None]:
# preview the first five rows of the cleaned dataframe
user_df.head(n=5)

In [None]:
# quick structural overview: dimensions, column dtypes, and missing values per column
df_shape = user_df.shape
df_dtypes = user_df.dtypes
df_missing = user_df.isna().sum()

print(f'DataFrame Shape:\n{df_shape}\n')

print(f'DataFrame Column Types:\n{df_dtypes}\n')

print(f'Number of Missing Values:\n{df_missing}')

## Numerical Analysis

### Play Count

In [None]:
# summary statistics (count, mean, std, min, quartiles, max) of play count
play_counts = user_df['play_count']
play_counts.describe()

In [None]:
# histogram of play count, drawn through the object-oriented Axes API
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(user_df['play_count'], bins=100, edgecolor='k', alpha=0.7)

# log-scale the frequencies; non-positive values (e.g. empty bins) are clipped
ax.set_yscale('log', nonpositive='clip')

ax.set_title('Distribution of Play Count')
ax.set_xlabel('Play Count')
ax.set_ylabel('Frequency (Log Scale)')

# show x tick labels as whole numbers with thousands separators
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))

plt.show()

## Categorical Analysis

### User

In [None]:
# get number of unique users
unique_user_count = user_df["user"].nunique()
print(f'Number of Unique Users:\n{unique_user_count}')

In [None]:
# sum play counts per user, then turn the result into a two-column dataframe
# (user, total_play_count) for usage analysis
plays_by_user = user_df.groupby('user')['play_count'].sum()
user_totalplay = plays_by_user.reset_index(name='total_play_count')

In [None]:
# the 10 users with the highest total play count
user_totalplay.nlargest(n=10, columns='total_play_count')

In [None]:
# the 10 users with the lowest total play count
user_totalplay.nsmallest(n=10, columns='total_play_count')

In [None]:
# histogram of per-user total play count, drawn through the object-oriented Axes API
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(user_totalplay['total_play_count'], bins=100, edgecolor='k', alpha=0.7)

# log-scale the frequencies; non-positive values (e.g. empty bins) are clipped
ax.set_yscale('log', nonpositive='clip')

ax.set_title('Distribution of Total Play Count')
ax.set_xlabel('Total Play Count')
ax.set_ylabel('Frequency (Log Scale)')

# show x tick labels as whole numbers with thousands separators
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))

plt.show()

### Artist

In [None]:
# count the distinct artist names in the dataset
unique_artist_count = user_df["artist_name"].nunique()
print(f'Number of Unique Artists:\n{unique_artist_count}')

In [None]:
# sum play counts per artist, then turn the result into a two-column dataframe
# (artist_name, total_play_count) for popularity analysis
plays_by_artist = user_df.groupby('artist_name')['play_count'].sum()
artist_totalplay = plays_by_artist.reset_index(name='total_play_count')

In [None]:
# the 10 artists with the highest total play count
artist_totalplay.nlargest(n=10, columns='total_play_count')

In [None]:
# bottom 10 artists: the 10 rows with the lowest total play count
# (nsmallest, not nlargest, which would repeat the top-10 table)
artist_totalplay.nsmallest(10, 'total_play_count')

In [None]:
# bar chart of the 30 artists with the highest total play count
# NOTE: exploratory view; its usefulness is uncertain, so it may be tweaked later
top_artists = artist_totalplay.nlargest(30, 'total_play_count')
bar_positions = range(len(top_artists))

fig, ax = plt.subplots(figsize=(13, 6))
ax.bar(bar_positions, top_artists['total_play_count'], align='center', edgecolor='k', alpha=0.7)

# log-scale the totals; non-positive values are clipped
ax.set_yscale('log', nonpositive='clip')

ax.set_title('Top 30 Artists by Total Play Count')
ax.set_xlabel('Artist')
ax.set_ylabel('Total Play Count (Log Scale)')

# one tick per bar, labelled with the artist name and rotated for legibility
ax.set_xticks(bar_positions)
ax.set_xticklabels(top_artists['artist_name'], rotation=60)

plt.show()

## Bivariate Analysis