## Read in our data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
%matplotlib inline

In [None]:
#Load the track table, then keep only the rows whose 'lang' value is 'en' (English songs)
df = pd.read_csv('/Users/joshuakowal/DSCI 303 Assignments/spotify_hits/data/tracks_w_lang.csv')
df = df.loc[df['lang'] == 'en']

#Preview the first few English-language rows
df.head()

## Feature Generation
### Perform PCA on Each Decade and then do KMeans Clustering to identify groups of songs

In [None]:
#Helper function to normalize all the columns of a matrix

def normalize_cols(X, m = None, s = None):
    """
    Z-score normalizes every column of the 2-D matrix X.

    Parameters:
    X - 2-D numeric array of shape (nrows, ncols)
    m - optional per-column means to subtract; when None they are computed
        from X. Pass the training-set means to scale a test set consistently.
    s - optional per-column standard deviations to divide by; when None they
        are computed from X (population std, ddof = 0).

    Returns:
    X_n - float array with the same shape as X holding the z-scored columns
    ms - list of the per-column means that were used
    ss - list of the per-column standard deviations that were used

    A column whose standard deviation is 0 is divided by 1 instead, so it
    comes out as zeros (when centred on its own mean) rather than NaN/inf.
    The substituted 1 is what is reported in ss.
    """
    X = np.asarray(X, dtype = float)
    # Unpacking the shape also rejects anything that is not 2-D
    _, ncols = X.shape

    if m is None:
        means = X.mean(axis = 0)
    else:
        means = np.asarray(m, dtype = float)[:ncols]
    if s is None:
        stds = X.std(axis = 0)
    else:
        stds = np.asarray(s, dtype = float)[:ncols]

    # Avoid division by zero for constant columns
    stds = np.where(stds == 0, 1.0, stds)

    X_n = (X - means) / stds
    return X_n, means.tolist(), stds.tolist()

In [None]:
summary = df.describe()

# Floor the earliest year to the start of its decade. round(year, -1) rounds to
# the NEAREST ten (half-to-even), so e.g. 1926 would become 1930 and the rows
# from 1926-1929 would never be visited.
start_dec = int(summary.at['min', 'year']) // 10 * 10

# PCA_and_KMeans iterates range(start_dec, last_dec, 10), so last_dec acts as an
# exclusive upper bound. Use the decade boundary just past the latest year so
# the final (possibly partial) decade is processed regardless of the last digit
# of the maximum year.
last_dec = int(summary.at['max', 'year']) // 10 * 10 + 10

def PCA_and_KMeans(df, start_dec, last_dec, var_threshold = 0.95,
                   cluster_counts = (10, 12, 14, 16), random_state = None):
    """
    Runs PCA followed by K-Means clustering separately on each decade of df.

    For every decade start in range(start_dec, last_dec, 10) (so last_dec is an
    exclusive upper bound) the function:

    1. Selects the rows whose 'year' lies in [dec, dec + 10), drops the
    'explicit', 'mode' and 'year' columns, and separates 'popularity' as the
    target so that it is NOT part of the feature matrix.

    2. Z-score normalizes the remaining numeric columns with normalize_cols.

    3. Fits PCA with an increasing number of principal components (starting
    at 3) until the retained components explain at least var_threshold of the
    variance, and keeps that model and its transformed data.

    4. Fits K-Means on the PCA-transformed data once for every value in
    cluster_counts.

    Decades that contain no rows are skipped (no key is created for them), and
    a cluster count larger than the number of rows in a decade is skipped for
    that decade.

    Parameters:
    df - dataframe with at least the columns 'year', 'explicit', 'mode' and 'popularity'
    start_dec - first decade start year to process (e.g. 1920)
    last_dec - exclusive upper bound on decade start years
    var_threshold - fraction of variance the PCA components must explain
    cluster_counts - iterable of K-Means cluster counts to try
    random_state - forwarded to KMeans; None keeps the default random seeding

    Returns:
    pcas - {decade_key: (pca_model, transformed_X, target)}
    kmeans - {decade_key: {n_clusters: (transformed_X, pred_labels, centroids)}}
    dfs - {decade_key: [feature_df, normalized_X, target, col_means, col_stds]}
    where decade_key is the decade start year followed by "'s", e.g. "1990's".
    """
    pcas = {}
    kmeans = {}
    dfs = {}
    for dec in range(start_dec, last_dec, 10):
        key = str(dec) + '\'s'

        in_decade = (df['year'] >= dec) & (df['year'] < (dec + 10))
        decade_df = df[in_decade].drop(labels = ['explicit', 'mode', 'year'], axis = 1)
        if decade_df.empty:
            # Nothing to fit for this decade
            continue

        #Isolate a target variable, which is popularity.
        #drop() returns a new frame, so the result must be assigned; otherwise
        #the target leaks into the features used for PCA and clustering.
        target = decade_df['popularity']
        decade_df = decade_df.drop('popularity', axis = 1)

        X = decade_df.select_dtypes(include = 'number').to_numpy()
        X_norm, prev_mean, prev_std = normalize_cols(X)
        dfs[key] = [decade_df, X_norm, target, prev_mean, prev_std]

        #Do PCA until we reach the variance threshold. PCA cannot use more
        #components than min(n_samples, n_features), so stop there at the latest.
        max_comp = min(X_norm.shape)
        for n_comp in range(min(3, max_comp), max_comp + 1):
            pca = PCA(n_components = n_comp)
            X_new = pca.fit_transform(X_norm)
            if np.sum(pca.explained_variance_ratio_) >= var_threshold:
                break
        pcas[key] = (pca, X_new, target)

        #Now that we have X_new, let's do KMeans Clustering
        mods = {}
        for n_clusters in cluster_counts:
            if n_clusters > X_new.shape[0]:
                # KMeans needs at least as many samples as clusters
                continue
            km = KMeans(n_clusters = n_clusters, random_state = random_state)
            y_pred = km.fit_predict(X_new)
            mods[n_clusters] = (X_new, y_pred, km.cluster_centers_)
        kmeans[key] = mods
    return pcas, kmeans, dfs

In [None]:
#Run the per-decade PCA + KMeans pipeline over the English-language tracks
pca_dict, kmeans_dict, dataframes = PCA_and_KMeans(
    df = df,
    start_dec = start_dec,
    last_dec = last_dec,
)

In [None]:
#Summary statistics of KMeans: how many songs landed in each cluster,
#reported per decade and per number of clusters
for decade, models in kmeans_dict.items():
    print('Results of KMeans for the', decade)
    for n_clusters, fit in models.items():
        # fit[1] holds the predicted cluster label of every song
        labels, sizes = np.unique(fit[1], return_counts = True)
        cluster_sizes = dict(zip(labels, sizes))
        print('For', n_clusters, 'clusters:', cluster_sizes)
    print('\n')

In [None]:
#Write out the new dataframes split by decade
#TODO: the export step was never written. The loop had no body, which is a
#syntax error, so a placeholder keeps the cell runnable until it is implemented.
for key1, value1 in dataframes.items():
    pass

## Read in a Couple Other Datasets for Comparison

In [None]:
#Load the data_w_genres.csv table and preview its first ten rows
df2 = pd.read_csv('/Users/joshuakowal/DSCI 303 Assignments/spotify_hits/data/data_w_genres.csv')
df2.head(n = 10)

In [None]:
#Load the data_by_genres.csv table and preview its first ten rows
df3 = pd.read_csv('/Users/joshuakowal/DSCI 303 Assignments/spotify_hits/data/data_by_genres.csv')
df3.head(n = 10)

In [None]:
#Display summary statistics for the table loaded from data_by_genres.csv
df3.describe()

## Linear Regression with the Large Dataset

### Using 2000-2015 for the training data and 2016-2019 as the test data

In [None]:
from sklearn.linear_model import LinearRegression

df_numeric = df.select_dtypes(include = 'number')
years = df_numeric['year']

#Train on 2000-2015 and test on 2016-2019 (both ranges inclusive), matching
#the split named in the section heading.
train = df_numeric[(years >= 2000) & (years <= 2015)]
test = df_numeric[(years >= 2016) & (years <= 2019)]

#DataFrame.drop returns a NEW frame, so its result has to be kept. Without the
#assignment, 'popularity' (the target) and 'year'/'explicit'/'mode' stay in the
#predictors and the target leaks into the model.
non_features = ['explicit', 'mode', 'year', 'popularity']
y = train['popularity'].to_numpy()
X = train.drop(labels = non_features, axis = 1).to_numpy()
testy = test['popularity'].to_numpy()
testX = test.drop(labels = non_features, axis = 1).to_numpy()

#Scale the test set with the TRAINING means and standard deviations
normX, mu, sig = normalize_cols(X)
norm_testX = normalize_cols(testX, mu, sig)[0]

#Make our Linear Regression model and fit with X and y
reg = LinearRegression()
reg.fit(normX, y)
yhat = reg.predict(norm_testX)
r2 = reg.score(norm_testX, testy)
#Mean squared error = average squared residual (np.linalg.norm would give the
#L2 norm of the residual vector, which grows with the number of test rows)
mse = np.mean((yhat - testy) ** 2)
print("Model score:", r2)
print("MSE with Test:", mse)

This is a really bad model for our purposes.