In [23]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import RepeatedStratifiedKFold
from tqdm.notebook import tqdm


In [25]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [26]:
!pip install optuna



In [27]:
import optuna

In [28]:
train = pd.read_csv('/content/drive/MyDrive/project9/PCA/train_pca.csv')
train_labels = pd.read_csv('/content/drive/MyDrive/project9/music30s_trainlabel.csv')
test =pd.read_csv('/content/drive/MyDrive/project9/PCA/test_pca.csv')


In [29]:
train_labels =train_labels.iloc[:,1:]
train_labels.head(3)


Unnamed: 0,label
0,reggae
1,reggae
2,country


In [30]:
display(train.head(3))
display(test.head(3))

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30,PC31,PC32
0,-6.042859,3.360133,-0.580582,0.481972,2.052833,-1.309454,-0.927022,0.684981,0.490648,-0.438662,...,0.282041,0.624148,-0.086557,0.092189,-0.450715,-0.007269,-0.221397,0.534964,0.70029,-0.457552
1,-1.924472,-0.237534,-0.794313,-1.140835,-1.996988,-0.246758,-0.689759,2.738715,-0.320075,0.084416,...,0.578843,0.500192,-0.083044,0.277533,0.561664,1.073356,-0.761747,0.25593,0.204644,-0.195949
2,-1.809602,0.782968,1.921737,1.549557,0.894004,-0.9312,-0.953658,0.44126,-1.012534,-1.201357,...,0.782811,0.413996,-1.089124,-0.339099,-1.117547,-0.278725,-0.071495,0.067914,0.222783,0.558211


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30,PC31,PC32
0,-6.548205,-3.115204,-1.24047,-0.232519,-0.842417,-0.99101,0.96934,1.014227,1.127184,0.122532,...,0.436175,-0.539889,-0.548403,0.340316,-0.366454,0.545709,1.16112,0.065227,0.127051,0.131474
1,3.794001,-2.954257,7.572803,3.322598,-7.63203,-3.405628,-0.87895,-0.529037,1.441828,0.988991,...,0.65949,0.113366,0.995181,0.595825,-0.077484,-0.76683,1.266615,-0.225434,-0.65804,-0.819949
2,-2.089484,4.504425,-4.964538,-0.380636,-2.585731,1.882099,-0.009821,-0.850664,-0.574026,-0.61182,...,0.703774,1.404571,-0.091778,1.045946,0.452793,0.248331,0.294834,-0.466831,0.541625,0.845241


In [31]:
cols = [c for c in train.columns]

In [32]:
train = pd.concat([train,train_labels],axis=1)

In [33]:
train.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC24,PC25,PC26,PC27,PC28,PC29,PC30,PC31,PC32,label
0,-6.042859,3.360133,-0.580582,0.481972,2.052833,-1.309454,-0.927022,0.684981,0.490648,-0.438662,...,0.624148,-0.086557,0.092189,-0.450715,-0.007269,-0.221397,0.534964,0.70029,-0.457552,reggae
1,-1.924472,-0.237534,-0.794313,-1.140835,-1.996988,-0.246758,-0.689759,2.738715,-0.320075,0.084416,...,0.500192,-0.083044,0.277533,0.561664,1.073356,-0.761747,0.25593,0.204644,-0.195949,reggae
2,-1.809602,0.782968,1.921737,1.549557,0.894004,-0.9312,-0.953658,0.44126,-1.012534,-1.201357,...,0.413996,-1.089124,-0.339099,-1.117547,-0.278725,-0.071495,0.067914,0.222783,0.558211,country
3,0.422664,0.443629,-2.206801,2.090353,-0.798949,1.458579,-0.923127,0.087803,0.092352,-0.83942,...,-0.261078,-0.979358,-0.468351,0.587211,0.11102,-0.170387,0.178367,0.866886,-0.58006,blues
4,5.398688,-1.05961,-1.807378,1.35582,0.388511,-1.194312,0.22261,1.751299,-1.001092,-0.228982,...,-0.097138,-0.141619,-0.048795,0.314327,0.084201,-0.048339,-0.230336,0.180659,0.161392,metal


In [34]:

X = train.drop('label',axis=1)
y=train['label']

In [35]:
target_column = 'label'

In [36]:
features = [c for c in train.columns if c not in [target_column]]

In [37]:
## going by the optuna way:
import optuna
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
# Which hyperparameters to tune: https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

def objective(trial):

    # -- Tune estimator algorithm
    #suggest_int(name, low, high[, step, log])
    C = trial.suggest_float("C", 0,100)
    #algorithm{‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’}, default=’auto’
    gamma = trial.suggest_categorical("gamma",['auto', 'scale'])
    kernel = trial.suggest_categorical("kernel", ['linear', 'poly', 'rbf','sigmoid'])
    svc = SVC(C=C, gamma=gamma, kernel=kernel)
        
    # -- Cross-validate the features reduced by dimensionality reduction methods
    rskfold = RepeatedStratifiedKFold(n_splits=5, n_repeats=3)
    score = cross_val_score(svc, X,y, scoring='accuracy', cv=rskfold, n_jobs = -1, error_score = 'raise')
    score = score.mean()
    return score


In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, timeout = 4*60*60)

[32m[I 2022-04-22 11:50:20,053][0m A new study created in memory with name: no-name-f6accd09-5ca6-4ec2-b657-d883c47c77a5[0m
[32m[I 2022-04-22 11:50:20,568][0m Trial 0 finished with value: 0.39740740740740743 and parameters: {'C': 26.38731083259205, 'gamma': 'auto', 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.39740740740740743.[0m
[32m[I 2022-04-22 11:50:23,689][0m Trial 1 finished with value: 0.6888888888888888 and parameters: {'C': 16.645062255927535, 'gamma': 'auto', 'kernel': 'linear'}. Best is trial 1 with value: 0.6888888888888888.[0m
[32m[I 2022-04-22 11:50:24,697][0m Trial 2 finished with value: 0.7592592592592593 and parameters: {'C': 36.57922930644185, 'gamma': 'auto', 'kernel': 'rbf'}. Best is trial 2 with value: 0.7592592592592593.[0m
[32m[I 2022-04-22 11:50:25,189][0m Trial 3 finished with value: 0.49000000000000005 and parameters: {'C': 94.04442155991345, 'gamma': 'scale', 'kernel': 'sigmoid'}. Best is trial 2 with value: 0.7592592592592593.[0m
[32m

In [None]:
trial = study.best_trial    
print('Accuracy: {}'.format(trial.value))   


Accuracy: 0.7592592592592594


In [None]:
print("Best hyperparameters: {}".format(trial.params))


Best hyperparameters: {'C': 59.808394682620815, 'gamma': 'auto', 'kernel': 'rbf'}


In [None]:

optuna.visualization.plot_optimization_history(study)


In [38]:
svc_params = {'C': 59.808394682620815, 'gamma': 'auto', 'kernel': 'rbf'}


In [39]:

from sklearn.metrics import recall_score,precision_score,f1_score
from sklearn.metrics import classification_report
import statistics
from statistics import mean

In [40]:
%%time
N_SPLITS = 5
y_preds = []
acc= []
from sklearn.model_selection import StratifiedKFold

folds = StratifiedKFold(n_splits = N_SPLITS)

for fold, (train_id, valid_id) in enumerate(folds.split(X,y)):
  X_train = X.iloc[train_id]
  y_train = y.iloc[train_id]
  X_valid = X.iloc[valid_id]
  y_valid = y.iloc[valid_id]
  model = SVC(**svc_params)
  
  model.fit(X_train, y_train)
  valid_pred = model.predict(X_valid)

#print('fold:{}'.format(fold+1))
  #print('Validation Accuracy score : {}'.format(accuracy_score(y_valid, valid_pred)))
  #print('-'*100)
  acc.append(accuracy_score(y_valid, valid_pred))

  y_preds.append(model.predict(test))


CPU times: user 679 ms, sys: 0 ns, total: 679 ms
Wall time: 796 ms


In [41]:
print('the mean accuracy is: {}'.format(mean(acc)))


the mean accuracy is: 0.7566666666666667


In [42]:
from collections import Counter
def mode(data):
    """Return the most common data point from discrete or nominal data.
    ``mode`` assumes discrete data, and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:
        >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
        3
    This also works with nominal (non-numeric) data:
        >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
        'red'
    If there are multiple modes with same frequency, return the first one
    encountered:
        >>> mode(['red', 'red', 'green', 'blue', 'blue'])
        'red'
    If *data* is empty, ``mode``, raises StatisticsError.
    """
    pairs = Counter(iter(data)).most_common(1)
    try:
        return pairs[0][0]
    except IndexError:
        raise StatisticsError('no mode for empty data') from None




In [43]:
y_pred = pd.DataFrame(columns=['svc_pca'])
pred= []
for j in range(0,100):
    arr = [y_preds[0][j], y_preds[1][j], y_preds[2][j], y_preds[3][j],y_preds[4][j]]
    pred.append(mode(arr))

In [44]:
y_pred["svc_pca"]=pred

In [45]:
y_pred

Unnamed: 0,svc_pca
0,country
1,jazz
2,hiphop
3,reggae
4,blues
...,...
95,rock
96,disco
97,pop
98,pop


In [46]:
y_pred.to_csv('svc_pca_pred.csv', index=False)
!cp -r '/content/svc_pca_pred.csv' /content/drive/MyDrive/project9/PCA

