# Training SVM classifier 

In [1]:
import pandas as pd
import numpy as np
import json

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

### Train SVM without normalisation

In [13]:
# Train SVM without normalisation

# Import data and filter data for four radio stations
df_features=pd.read_csv('df_features.csv')
df_meta=pd.read_csv('df_meta.csv')
filtered_names = df_meta.loc[df_meta['Name'].isin(['bbc_6music', 'bbc_radio_three', 'bbc_radio_one'])]

# Split data into training and test sets
index = range(len(df_features.index))
X_train, X_test, y_train, y_test =train_test_split(df_features.iloc[filtered_names.index], filtered_names.index, test_size=0.4, random_state=42)

X = X_train
# Training set lables
Y = filtered_names["Name"][y_train]

# Fit SVM model
clf = svm.SVC(decision_function_shape='ovo')
clf.fit(X, Y) 

# Predict lables and calculate accuracy
predicted = clf.predict(X_test)
class_accuracy =accuracy_score(filtered_names["Name"][y_test], predicted)
print("{:f}%".format(class_accuracy*100))

34.394066%


### Training SVM with normalised data

In [2]:
# Load data and filter data for four radio stations
df_features=pd.read_csv('df_features.csv')
df_meta=pd.read_csv('df_meta.csv')
filtered_names = df_meta.loc[df_meta['Name'].isin(['bbc_6music', 'bbc_radio_three','bbc_radio_one','bbc_radio_two'])]

In [3]:
# Split data into training and test sets
X_all = df_features.iloc[filtered_names.index]
X_train, X_test, y_train, y_test =train_test_split(X_all, filtered_names.index, test_size=0.4, random_state=42)

In [4]:
# Train SVM with normalised data

# Normalise data
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Get original lables for training and test data sets (Radio stations)
y_train_labels = filtered_names["Name"][y_train]
y_test_labels = filtered_names["Name"][y_test]

# Fit SVM model
clf = svm.SVC(decision_function_shape='ovr',  )
clf.fit(X_train_std, y_train_labels) 
predicted_labels = clf.predict(X_test_std)

# Determine accuracy
class_accuracy =accuracy_score(y_test_labels, predicted_labels)
# Output
print("{:f}%".format(class_accuracy*100))

96.947347%


In [5]:
# Predict radio labels for BBC Introducing Kent music

# Load and normalise data
df_kent_features=pd.read_csv('df_kent_features.csv')
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(df_kent_features)

# Fit the trained model to the data set
y_train_labels = filtered_names["Name"][y_train]
# Calculate probabilities for each lable
clf = svm.SVC(decision_function_shape='ovr', probability=True)
clf.fit(X_train_std, y_train_labels) 
predicted_labels = clf.predict(X_test_std)
predicted_labels

array(['bbc_6music', 'bbc_6music', 'bbc_6music', ..., 'bbc_radio_three',
       'bbc_6music', 'bbc_6music'], dtype=object)

In [6]:
# Predicted labels
labels =  pd.DataFrame(predicted_labels,columns = ['Label'])
labels.groupby('Label').count().sort_values("Label")

KeyError: 'Label'

In [17]:
labels =  pd.DataFrame(predicted_labels,columns = ['Label'])
plt.hist(labels.groupby('Label').count().sort_values("Artist", ascending=False)['Artist'])
plt.title("Radio 3 music by show")
plt.xlabel('Number of songs per show')
plt.ylabel('Number of shows')

KeyError: 'Artist'

In [7]:
kent_probabilities = clf.predict_proba(X_test_std)

In [8]:
kent_meta = pd.read_csv('df_kent_meta.csv')

In [9]:
# Combine probability dataframe with track id dataframe
kent_prob_df = pd.DataFrame(kent_probabilities,columns = 
                            ['Radio_6music','Radio_one','Radio_three', 'Radio_two'])
concant_df = pd.concat([kent_meta, kent_prob_df], axis=1)

In [10]:
%matplotlib inline

In [11]:
comb_df= pd.concat([concant_df['Track_ID'],labels['Label']], axis=1)
comb_df.groupby('Label').count().sort_values("Track_ID", ascending=False)

Unnamed: 0_level_0,Track_ID
Label,Unnamed: 1_level_1
bbc_6music,1387
bbc_radio_three,191
bbc_radio_one,145


In [12]:
# Find songs by Track ID
data = pd.read_csv("/Users/kristing/Documents/AI_playlists/webapp/BBC_Introducing_Music/kent.csv",
                   names=['Artist','Artist_id', 'Title', 'Track_id','Upload_date'])
df = (pd.DataFrame(data))
df.loc[df['Track_id']=='1000047']

Unnamed: 0,Artist,Artist_id,Title,Track_id,Upload_date
490,Cacao,274836,Mangoes,1000047,15/06/2016


In [13]:
str_list=best_list['Track_ID'].apply(str)
str_list

NameError: name 'best_list' is not defined

In [14]:
# Find 10 highest probabilties 
best_list=concant_df.nlargest(10, 'Radio_two')
str_list=best_list['Track_ID'].apply(str)
output_data=df.loc[df['Track_id'].isin(str_list)]
output_data = output_data.drop('Upload_date', 1)
output_data=output_data.to_json(orient='records')
output_data

'[{"Artist":"Phoebe Peek","Artist_id":"177080","Title":"Scoping","Track_id":"1027721"},{"Artist":"Kristofer","Artist_id":"293903","Title":"Will You","Track_id":"1000765"},{"Artist":"Alex DJ Brown","Artist_id":"157198","Title":"Dunno ","Track_id":"989749"},{"Artist":"Elliot Galvin Trio","Artist_id":"278662","Title":"Blop","Track_id":"984449"},{"Artist":"Izzy B Uke","Artist_id":"287581","Title":"Lucked Out","Track_id":"974097"},{"Artist":"Aidan Regnaud","Artist_id":"276272","Title":"Hard to Please","Track_id":"952160"},{"Artist":"Ryan Fox","Artist_id":"41478","Title":"Pairs in Nature","Track_id":"927376"},{"Artist":"Phoebe Peek","Artist_id":"177080","Title":"In Too Deep","Track_id":"910030"},{"Artist":"Just2 Harp Duo","Artist_id":"226468","Title":"Pie Jesu and Titanic Theme Song  Medley ","Track_id":"905712"},{"Artist":"charsoul, sullyman, gemini, charlie sullivan","Artist_id":"266110","Title":"devils surprise","Track_id":"887492"}]'

In [19]:
with open('/Users/kristing/Documents/AI_playlists/webapp/BBC_Introducing_Music/radio2.json', 'w') as f:
    f.write(output_data)

In [21]:
len(predicted_labels)

1723

In [22]:
len(y_test_labels)

9762

In [18]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test_labels, predicted_labels)

ValueError: Found input variables with inconsistent numbers of samples: [9762, 1723]

In [16]:
import matplotlib.pyplot as plt
import itertools
%matplotlib inline
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


In [17]:
# plt.figure(figsize=(12, 10))
plot_confusion_matrix(cm, set(list(predicted_labels)), normalize=True)

NameError: name 'cm' is not defined

In [41]:
predicted

array(['bbc_radio_two', 'bbc_radio_two', 'bbc_radio_three', ...,
       'bbc_radio_two', 'bbc_radio_one', 'bbc_radio_two'], dtype=object)