<h1>Classifiers</h1>

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

<h2>Importing, Cleaning, Splitting</h2>

In [43]:
spotify2000 = pd.read_csv('Spotify-2000.csv', thousands = ',')
musicdata = spotify2000.drop(columns = ['Index', 'Title', 'Artist', 'Top Genre']) #Data is imported to musicdata without the Index

mxmh_df = pd.read_csv('mxmh_survey_results.csv')

#removing columns for simplicity
mxmh = mxmh_df.drop(columns = ['Timestamp', 'Permissions', 'BPM', 'Music effects'])
mxmh = mxmh.drop(mxmh.filter(regex = 'Frequency').columns, axis = 1) #dropping every 'Frequency' column

In [44]:
#function to turn all string values to unbiased integer values
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis = 1)
    res = res.drop([feature_to_encode], axis = 1)
    return(res)

In [46]:
#Converting all Yes-or-No questions to boolean

mxmh['Age'] = mxmh['Age'].fillna(mxmh['Age'].median())

mxmh['While working'] = mxmh['While working'].ffill().replace(('Yes', 'No'), (1, 0)).astype(bool)

mxmh['Instrumentalist'] = mxmh['Instrumentalist'].ffill().replace(('Yes', 'No'), (1, 0)).astype(bool)
mxmh['Composer'] = mxmh['Composer'].ffill().replace(('Yes', 'No'), (1, 0)).astype(bool)
mxmh['Foreign languages'] = mxmh['Foreign languages'].ffill().replace(('Yes', 'No'), (1, 0)).astype(bool)
mxmh['Exploratory'] = mxmh['Exploratory'].replace(('Yes', 'No'), (1, 0)).astype(bool)

In [47]:
mxmh['Primary streaming service'] = mxmh['Primary streaming service'].bfill()

#One-Hot Encoding all string values for the ML algorithm to use
mxmh = encode_and_bind(mxmh, 'Fav genre')
mxmh = encode_and_bind(mxmh, 'Primary streaming service')

In [53]:
mxmh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 736 entries, 0 to 735
Data columns (total 33 columns):
 #   Column                                                       Non-Null Count  Dtype  
---  ------                                                       --------------  -----  
 0   Age                                                          736 non-null    float64
 1   Hours per day                                                736 non-null    float64
 2   While working                                                736 non-null    bool   
 3   Instrumentalist                                              736 non-null    bool   
 4   Composer                                                     736 non-null    bool   
 5   Exploratory                                                  736 non-null    bool   
 6   Foreign languages                                            736 non-null    bool   
 7   Anxiety                                                      736 non-null    flo

Dataset should now be cleaned: duplicate entries removed, missing values checked etc.

In [49]:
predictionvalue = 'Age'
#X = mxmh.drop(mxmh.filter(regex = 'Fav genre').columns, axis = 1)
X = mxmh.drop(columns = predictionvalue) #Input set, given to the algorithm for prediction, named X conventionally
y = mxmh[predictionvalue] #Output set, what the algorithm should predict, named y conventionally

70-80% of the dataset should be allocated to training.

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2) #data is split into train and test set

<h2>Decision Tree Classifiers</h2>

In [51]:
model_tree = DecisionTreeClassifier()
model_tree.fit(X_train, y_train) #passing the training set
pred_tree = model_tree.predict(X_test) #passing the input set for testing

In [52]:
score_tree = accuracy_score(y_test, pred_tree)
print('Accuracy:', score_tree) #printing the accuracy of the Decision Tree

Accuracy: 0.060810810810810814


<h2>Random Forest Classifier</h2>

In [23]:
model_forest = RandomForestClassifier()
model_forest.fit(X_train, y_train)
pred_forest = model_forest.predict(X_test)

In [24]:
score_forest = accuracy_score(y_test, pred_forest)
print('Accuracy:', score_forest)

Accuracy: 0.8648648648648649


<h2>Support Vector Machine</h2>

In [25]:
vector_clf = svm.SVC(kernel = 'poly')
vector_clf.fit(X_train, y_train)
pred_vector = vector_clf.predict(X_test)

In [26]:
score_vector = accuracy_score(y_test, pred_vector)
print('Accuracy:', score_vector)

Accuracy: 0.8648648648648649


<h2>Naive Bayes</h2>

In [27]:
model_naive = GaussianNB()
model_naive.fit(X_train, y_train)
pred_naive = model_naive.predict(X_test)

In [28]:
score_naive = accuracy_score(y_test, pred_vector)
print('Accuracy:', score_naive)

Accuracy: 0.8648648648648649


<h2>Multi-Layer Perceptron</h2>

In [32]:
model_mlp = MLPClassifier(solver = 'lbfgs', alpha = 1e-5, hidden_layer_sizes = (6,), random_state = 1, max_iter = 1000)
model_mlp.fit(X_train, y_train)
pred_mlp = model_mlp.predict(X_test)

In [33]:
score_mlp = accuracy_score(y_test, pred_mlp)
print('Accuracy:', score_mlp)

Accuracy: 0.8648648648648649
