In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so the project files can be read and written.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import RepeatedStratifiedKFold
from tqdm.notebook import tqdm


In [3]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [None]:
!pip install optuna



In [None]:
import optuna

In [4]:
# All inputs live under one Drive project folder; keeping the location in a
# single place means it can be re-pointed without editing every path.
DATA_DIR = '/content/drive/MyDrive/project9'
FEATURE_DIR = f'{DATA_DIR}/FS_FeatSelect'

# Train / test feature tables (presumably the output of a mutual-information
# feature-selection step, judging by the file names).
train = pd.read_csv(f'{FEATURE_DIR}/train_mutual.csv')
test = pd.read_csv(f'{FEATURE_DIR}/test_mutual.csv')
# Labels that are later joined column-wise onto `train`.
train_labels = pd.read_csv(f'{DATA_DIR}/music30s_trainlabel.csv')



In [5]:
# Preview the first rows of the test feature table.
test.head()

Unnamed: 0,length,chroma_stft_mean,rms_mean,spectral_centroid_mean,spectral_bandwidth_mean,rolloff_mean,zero_crossing_rate_mean,perceptr_mean,mfcc1_mean,mfcc2_mean,...,rms_var,spectral_centroid_var,rolloff_var,zero_crossing_rate_var,harmony_var,perceptr_var,mfcc1_var,mfcc4_var,mfcc6_var,mfcc20_var
0,-0.298576,-0.648987,-1.1347,-1.172775,-0.967173,-1.282262,-0.778708,-0.701822,-1.430426,1.253362,...,-0.475092,0.345904,0.775221,0.320765,-0.919577,-1.183952,1.367246,1.632628,0.655763,1.923743
1,-0.142344,0.703835,-1.919805,-1.723546,-1.199076,-1.699029,-1.720368,0.356319,-4.080446,1.840163,...,-2.789422,-1.402396,-0.390356,-2.591164,-2.834482,-2.28205,1.383816,-1.626223,-0.855013,-1.934408
2,-0.298576,0.939176,2.349185,0.841052,0.373789,0.601497,-0.148723,0.200619,0.624079,-1.585836,...,1.280954,1.24086,0.467114,1.021164,2.446276,1.468478,0.244173,0.981665,1.624116,0.258653
3,-0.142344,-0.457404,-1.302324,-0.502459,0.924814,-0.159721,-1.45196,0.204869,-1.984323,-0.305455,...,-0.393983,0.910763,2.318024,-0.88736,-1.030686,-1.415747,1.292116,0.601168,0.859639,-2.520906
4,-0.142344,-1.037194,-1.060926,-0.080046,-0.396612,-0.098638,-0.028861,0.35539,-0.906607,-0.376684,...,-0.054747,0.851908,0.340237,0.971522,-0.706472,-0.993346,0.697116,1.114452,-0.170185,0.131906


In [6]:
# Keep only the `label` column. Selecting it by name (instead of slicing off
# the first column by position) makes this cell safe to re-run: a second run of
# a positional `iloc[:, 1:]` would drop the label column itself.
train_labels = train_labels[['label']]
train_labels.head(3)

Unnamed: 0,label
0,reggae
1,reggae
2,country


In [7]:
# Attach the labels to the feature rows. The frames must line up row for row;
# a length mismatch would otherwise be padded with NaN without any warning.
assert len(train) == len(train_labels), 'features and labels must have the same number of rows'
# Dropping an already-present `label` column first keeps the cell idempotent,
# so re-running it does not append a duplicate label column.
train = pd.concat([train.drop(columns='label', errors='ignore'), train_labels], axis=1)

In [8]:
# Preview the labelled training table (features plus the `label` column).
train.head()

Unnamed: 0,length,chroma_stft_mean,rms_mean,spectral_centroid_mean,spectral_bandwidth_mean,rolloff_mean,zero_crossing_rate_mean,perceptr_mean,mfcc1_mean,mfcc2_mean,...,spectral_centroid_var,rolloff_var,zero_crossing_rate_var,harmony_var,perceptr_var,mfcc1_var,mfcc4_var,mfcc6_var,mfcc20_var,label
0,-0.298576,0.642278,0.177059,1.233038,1.386263,1.266091,0.764469,0.314663,0.493279,-0.995018,...,1.702806,1.644748,1.464454,-0.012941,0.882548,1.052997,1.964702,1.22894,-0.091657,reggae
1,-0.298576,0.227633,-0.595372,-0.050038,-0.362538,-0.174821,0.380594,-1.493513,0.052394,0.067482,...,-0.399484,-0.375331,0.083251,-0.606148,-0.166932,-0.204898,0.54384,0.331641,0.060347,reggae
2,-0.142344,-0.050788,-0.477223,0.161218,0.571295,-0.00067,0.009428,0.450461,0.230404,-0.127913,...,0.530146,0.803409,0.242894,-0.537517,-0.078573,-0.762395,0.811495,1.013223,-1.167858,country
3,-0.142344,0.539357,0.680464,-0.445936,-0.631178,-0.478404,-0.333189,0.310167,0.508399,0.508329,...,-0.371194,-0.184576,-0.295304,0.474484,0.970578,0.030053,1.027655,0.600615,-0.321155,blues
4,-0.142344,0.925586,-0.51329,0.087587,-0.601319,-0.250903,0.792784,-1.659057,0.431285,0.169624,...,-1.037717,-0.806537,-0.647711,-0.691931,-0.20106,-1.028406,-0.974214,-0.193546,-1.377689,metal


In [9]:
# Snapshot of every column name in the labelled training table.
cols = list(train.columns)

In [10]:
# Shared experiment settings.
N_SPLITS = 5  # number of cross-validation folds
N_REPEATS = 3  # presumably the repeat count for repeated k-fold CV
EARLY_STOPPING_ROUNDS = 300  # NOTE(review): purpose not evident from the KNN workflow here — confirm or remove
VERBOSE = False  # NOTE(review): purpose not evident from the KNN workflow here — confirm or remove
SEED = 17171  # presumably the random seed for reproducible runs

In [11]:
# Name of the column in `train` that holds the class label.
target_column = 'label'

In [12]:
# Every column other than the target is treated as a model input.
features = [name for name in train.columns if name != target_column]

In [13]:
# Split the labelled table into the target vector and the feature matrix.
y = train['label']
X = train.drop(columns='label')

In [14]:
# Tune the KNN hyperparameters with Optuna.
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
# Which hyperparameters to tune: https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

def objective(trial):
    """Optuna objective: mean cross-validated accuracy of one KNN configuration.

    Samples a KNN configuration from ``trial``, scores it with repeated
    stratified k-fold cross-validation on the notebook-level ``X`` / ``y`` and
    returns the mean accuracy, which the study maximizes.

    Args:
        trial: The ``optuna`` trial object used to sample hyperparameters.

    Returns:
        The mean accuracy over all ``N_SPLITS * N_REPEATS`` validation folds.

    Raises:
        Exception: Any error raised while fitting or scoring a fold is
            propagated, because ``error_score='raise'`` is used.
    """
    # Odd neighbour counts from 1 to 41. `step` is passed by keyword because
    # newer Optuna releases deprecate passing it positionally.
    n_neighbors = trial.suggest_int("n_neighbors", 1, 41, step=2)
    weights = trial.suggest_categorical("weights", ['uniform', 'distance'])
    # NOTE(review): 'minkowski' with scikit-learn's default p=2 is the same
    # distance as 'euclidean', so two of these choices should score alike.
    metric = trial.suggest_categorical("metric", ['euclidean', 'manhattan', 'minkowski'])
    # NOTE(review): `algorithm` selects how neighbours are searched; it is
    # expected to influence speed rather than accuracy — confirm it is worth tuning.
    algorithm = trial.suggest_categorical("algorithm", ['auto', 'ball_tree', 'kd_tree', 'brute'])
    knn = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        weights=weights,
        metric=metric,
        algorithm=algorithm,
    )

    # A fixed random_state scores every trial on the same folds, so trial
    # values are directly comparable and the study is repeatable.
    rskfold = RepeatedStratifiedKFold(
        n_splits=N_SPLITS, n_repeats=N_REPEATS, random_state=SEED
    )
    scores = cross_val_score(
        knn, X, y, scoring='accuracy', cv=rskfold, n_jobs=-1, error_score='raise'
    )
    return scores.mean()


In [None]:
# Seed the sampler so the sequence of suggested configurations is repeatable
# from run to run (TPE is Optuna's default sampler for single-objective studies).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
# Stop after 50 trials or 4 hours, whichever comes first.
study.optimize(objective, n_trials=50, timeout=4 * 60 * 60)

[32m[I 2022-04-22 07:26:29,953][0m A new study created in memory with name: no-name-391cdfe1-04ae-4863-8f39-ac6d9e853cdd[0m
[32m[I 2022-04-22 07:26:30,296][0m Trial 0 finished with value: 0.6848148148148147 and parameters: {'n_neighbors': 9, 'weights': 'distance', 'metric': 'euclidean', 'algorithm': 'auto'}. Best is trial 0 with value: 0.6848148148148147.[0m
[32m[I 2022-04-22 07:26:30,564][0m Trial 1 finished with value: 0.6462962962962963 and parameters: {'n_neighbors': 27, 'weights': 'distance', 'metric': 'euclidean', 'algorithm': 'auto'}. Best is trial 0 with value: 0.6848148148148147.[0m
[32m[I 2022-04-22 07:26:30,848][0m Trial 2 finished with value: 0.7140740740740742 and parameters: {'n_neighbors': 7, 'weights': 'distance', 'metric': 'manhattan', 'algorithm': 'brute'}. Best is trial 2 with value: 0.7140740740740742.[0m
[32m[I 2022-04-22 07:26:31,030][0m Trial 3 finished with value: 0.6433333333333333 and parameters: {'n_neighbors': 25, 'weights': 'distance', 'metric

In [None]:
# Fetch the best trial found by the study and report its objective value.
trial = study.best_trial
best_accuracy = trial.value
print('Accuracy: {}'.format(best_accuracy))


Accuracy: 0.7207407407407407


In [None]:
# Report the hyperparameters of the best trial.
best_params = trial.params
print("Best hyperparameters: {}".format(best_params))


Best hyperparameters: {'n_neighbors': 5, 'weights': 'distance', 'metric': 'manhattan', 'algorithm': 'ball_tree'}


In [None]:

# Plot how the objective value evolved over the trials of the study.
optuna.visualization.plot_optimization_history(study)


In [15]:
# Best configuration reported by the Optuna study, pinned here so the
# evaluation cells below can run without repeating the search.
knn_params = {
    'n_neighbors': 5,
    'weights': 'distance',
    'metric': 'manhattan',
    'algorithm': 'ball_tree',
}

In [16]:

from sklearn.metrics import recall_score,precision_score,f1_score
from sklearn.metrics import classification_report
import statistics
from statistics import mean

In [18]:
%%time
from sklearn.model_selection import StratifiedKFold

N_SPLITS = 5
acc = []      # validation accuracy of each fold
y_preds = []  # test-set predictions made by each fold's model

folds = StratifiedKFold(n_splits=N_SPLITS)

for fold, (train_id, test_id) in enumerate(folds.split(X, y)):
    # Split the labelled data into this fold's training and validation parts.
    X_train, y_train = X.iloc[train_id], y.iloc[train_id]
    X_valid, y_valid = X.iloc[test_id], y.iloc[test_id]

    # Fit a fresh KNN with the pinned hyperparameters on the training part.
    model = KNeighborsClassifier(**knn_params).fit(X_train, y_train)

    # Score the held-out part of the fold.
    valid_pred = model.predict(X_valid)
    acc.append(accuracy_score(y_valid, valid_pred))

    # Keep this model's test predictions for the later majority vote.
    y_preds.append(model.predict(test))


CPU times: user 145 ms, sys: 1.71 ms, total: 146 ms
Wall time: 150 ms


In [19]:
# Average the per-fold validation accuracies.
mean_accuracy = mean(acc)
print('the mean accuracy is: {}'.format(mean_accuracy))


the mean accuracy is: 0.7133333333333334


In [20]:
from collections import Counter
def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data and returns a single value:

        >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
        3

    It also works with nominal (non-numeric) data:

        >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
        'red'

    If several values share the highest frequency, the first one encountered
    is returned:

        >>> mode(['red', 'red', 'green', 'blue', 'blue'])
        'red'

    Args:
        data: Any iterable of hashable values.

    Returns:
        The value that occurs most often in ``data``.

    Raises:
        statistics.StatisticsError: If ``data`` is empty.
    """
    # Imported locally: the notebook only imports `mean` from `statistics`, so
    # `StatisticsError` would otherwise be an undefined name on empty input.
    from statistics import StatisticsError

    most_common = Counter(iter(data)).most_common(1)
    if not most_common:
        raise StatisticsError('no mode for empty data')
    return most_common[0][0]




In [21]:
# Majority vote across the fold models: for each test row take the label that
# was predicted most often (on a tie, `mode` returns the first label encountered).
# Zipping the per-fold prediction arrays covers every test row and any number
# of fold models, instead of assuming exactly 100 rows and 5 models.
y_pred = pd.DataFrame(columns=['knn_label'])
pred = [mode(votes) for votes in zip(*y_preds)]

In [22]:
# Store the voted labels in the output frame's `knn_label` column.
y_pred["knn_label"]=pred

In [23]:
# Display the voted predictions.
y_pred

Unnamed: 0,knn_label
0,country
1,classical
2,hiphop
3,reggae
4,rock
...,...
95,disco
96,disco
97,pop
98,pop


In [24]:
# Write the predictions to a CSV in the current working directory (without the
# index column), then copy that file into the Drive project folder.
# NOTE(review): the copy reads from /content/, so this assumes the notebook's
# working directory is /content — confirm if run outside the default Colab setup.
y_pred.to_csv('knn_mutual_pred.csv', index=False)
!cp -r '/content/knn_mutual_pred.csv' /content/drive/MyDrive/project9/FS_FeatSelect

