In [514]:
import numpy as np
import pandas as pd

In [515]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.cluster import DBSCAN
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingClassifier

In [516]:
def perform_pca(training,testing,comp,slvr,wht):
    """Fit a PCA on ``training`` and project both matrices with it.

    Parameters
    ----------
    training : array-like
        Data the PCA is fitted on.
    testing : array-like
        Data projected with the components learned from ``training``.
    comp
        Forwarded to ``PCA(n_components=...)``.
    slvr
        Forwarded to ``PCA(svd_solver=...)``.
    wht
        Forwarded to ``PCA(whiten=...)``.

    Returns
    -------
    tuple
        ``(projected training data, projected testing data)``.
    """
    # The decomposition is learned from the training data only; the test
    # data is merely projected onto the fitted components.
    reducer = PCA(n_components=comp, whiten=wht, svd_solver=slvr).fit(training)
    return reducer.transform(training), reducer.transform(testing)

In [517]:
def perform_lda(training, target, testing, slvr, shrink, n_com):
    """Fit an LDA on ``(training, target)`` and project both matrices with it.

    Parameters
    ----------
    training : array-like
        Data the LDA is fitted on.
    target : array-like
        Class labels used to fit the LDA.
    testing : array-like
        Data projected with the fitted LDA.
    slvr
        Forwarded to ``LinearDiscriminantAnalysis(solver=...)``.
    shrink
        Forwarded to ``LinearDiscriminantAnalysis(shrinkage=...)``.
    n_com
        Accepted but not used by this function; the LDA is built without an
        explicit ``n_components``.

    Returns
    -------
    tuple
        ``(projected training data, projected testing data)``.
    """
    projector = LinearDiscriminantAnalysis(solver=slvr, shrinkage=shrink).fit(training, target)
    return projector.transform(training), projector.transform(testing)

In [518]:
def kmeans_clustering(inp, num_clusters):
    """Cluster ``inp`` with k-means and append the cluster id as a last column.

    Parameters
    ----------
    inp : array-like
        Samples to cluster.
    num_clusters : int
        Forwarded to ``KMeans(n_clusters=...)``.

    Returns
    -------
    numpy.ndarray
        ``inp`` with one extra trailing column holding each row's cluster id.
    """
    # n_init and random_state are fixed so repeated calls give the same labels.
    model = KMeans(n_clusters=num_clusters, n_init=10, random_state=0).fit(inp)
    return np.column_stack((inp, model.labels_))

In [519]:
def dbscan_clustering(inp, epsilon, min_s):
    """Cluster ``inp`` with DBSCAN and append the cluster id as a last column.

    The cluster ids are also printed to stdout.

    Parameters
    ----------
    inp : array-like
        Samples to cluster.
    epsilon
        Forwarded to ``DBSCAN(eps=...)``.
    min_s
        Forwarded to ``DBSCAN(min_samples=...)``.

    Returns
    -------
    numpy.ndarray
        ``inp`` with one extra trailing column holding each row's cluster id.
    """
    assignments = DBSCAN(eps=epsilon, min_samples=min_s).fit_predict(inp)
    augmented = np.column_stack((inp, assignments))
    print(assignments)
    return augmented

In [520]:
def optimise_clustering(inp, max_clusters=49):
    """Plot the k-means elbow curve (WCSS against number of clusters).

    A ``KMeans`` model is fitted for every cluster count from 1 up to
    ``max_clusters`` and its inertia (within-cluster sum of squares) is
    plotted so the "elbow" can be read off the figure.

    Parameters
    ----------
    inp : array-like
        Samples to cluster.
    max_clusters : int, default 49
        Largest number of clusters to try. The default reproduces the
        original 1..49 sweep. The sweep is capped at the number of samples
        in ``inp`` because k-means cannot form more clusters than samples.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Cap the sweep so small inputs no longer make KMeans raise.
    upper = min(max_clusters, np.shape(inp)[0])
    cluster_counts = range(1, upper + 1)

    wcss = [
        KMeans(n_clusters=k, n_init=10, random_state=42).fit(inp).inertia_
        for k in cluster_counts
    ]

    # The same range object drives both the fits and the x axis, so the two
    # can never drift apart.
    plt.plot(cluster_counts, wcss)
    plt.title('Elbow Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('WCSS')
    plt.show()

In [521]:
def outlier_detection(inp,labels,num,cont):
    """Drop the rows that Local Outlier Factor flags as outliers.

    Parameters
    ----------
    inp : array-like
        Feature matrix; the outlier model is fitted on this alone.
    labels : array-like
        One label per row of ``inp``; appended as the last column of the
        result so features and labels stay aligned after filtering.
    num : int
        Forwarded to ``LocalOutlierFactor(n_neighbors=...)``.
    cont
        Accepted for interface compatibility but not forwarded to the
        estimator; the estimator's default contamination setting is used.

    Returns
    -------
    numpy.ndarray
        The rows of ``column_stack((inp, labels))`` whose prediction is 1
        (inliers), in their original order. The shape is ``(0, n_columns)``
        when no row is kept.
    """
    full = np.column_stack((inp, labels))
    lof = LocalOutlierFactor(n_neighbors=num, n_jobs=-1)

    preds = lof.fit_predict(inp)

    # Select every inlier row in one pass instead of re-copying the whole
    # result with np.vstack once per row (quadratic in the number of rows).
    inliers = full[preds == 1]

    # A single stack onto an empty float array reproduces the dtype promotion
    # and the empty-result shape of the previous row-by-row construction.
    return np.vstack((np.empty((0, full.shape[1])), inliers))

In [642]:
def logistic_regression(X_train, Y_train, X_test, slvr, iterations, c, pen):
    """Train a bagged multinomial logistic regression and predict ``X_test``.

    Ten logistic-regression models are each fitted on 80% of the training
    rows, drawn without replacement, and combined by ``BaggingClassifier``.
    No feature scaling is applied inside this function.

    Parameters
    ----------
    X_train, Y_train : array-like
        Training features and labels.
    X_test : array-like
        Features to predict.
    slvr
        Forwarded to ``LogisticRegression(solver=...)``.
    iterations
        Forwarded to ``LogisticRegression(max_iter=...)``.
    c
        Forwarded to ``LogisticRegression(C=...)``.
    pen
        Forwarded to ``LogisticRegression(penalty=...)``.

    Returns
    -------
    array-like
        Predicted labels for ``X_test``.
    """
    base_model = LogisticRegression(
        multi_class='multinomial',
        solver=slvr,
        max_iter=iterations,
        C=c,
        penalty=pen,
    )
    ensemble = BaggingClassifier(
        estimator=base_model,
        n_estimators=10,
        max_samples=0.8,
        bootstrap=False,
        warm_start=False,
        n_jobs=-1,
        random_state=0,
    )
    ensemble.fit(X_train, Y_train)
    return ensemble.predict(X_test)

In [601]:
def decision_tree(X_train, Y_train, X_test, depth, split, leaf, ftrs):
    """Train a decision tree classifier and predict ``X_test``.

    Parameters
    ----------
    X_train, Y_train : array-like
        Training features and labels.
    X_test : array-like
        Features to predict.
    depth
        Forwarded to ``DecisionTreeClassifier(max_depth=...)``.
    split
        Forwarded to ``DecisionTreeClassifier(min_samples_split=...)``.
    leaf
        Forwarded to ``DecisionTreeClassifier(min_samples_leaf=...)``.
    ftrs
        Forwarded to ``DecisionTreeClassifier(max_features=...)``.

    Returns
    -------
    array-like
        Predicted labels for ``X_test``.
    """
    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split,
        min_samples_leaf=leaf,
        max_features=ftrs,
        random_state=21,  # fixed seed for repeatable trees
    )
    return tree.fit(X_train, Y_train).predict(X_test)

In [602]:
def validation(inp,lof):
    """Pick the ``n_neighbors`` value (5..49) with the best mean 5-fold F1.

    Parameters
    ----------
    inp : numpy.ndarray
        2-D array whose last column is the target and whose remaining
        columns are the features.
    lof : estimator
        Estimator whose ``n_neighbors`` is tuned. It is modified in place via
        ``set_params`` and is left at the last value tried, not the best one.

    Returns
    -------
    int
        The candidate ``n_neighbors`` with the highest mean score.

    Notes
    -----
    NOTE(review): ``scoring='f1'`` needs the estimator to produce predictions
    on held-out folds; confirm the ``lof`` passed in supports that.
    """
    candidates = range(5, 50)
    folds = KFold(n_splits=5)

    mean_scores = []
    for k in candidates:
        lof.set_params(n_neighbors=k)
        fold_scores = cross_val_score(lof, inp[:, :-1], inp[:, -1], cv=folds, scoring='f1')
        mean_scores.append(np.mean(fold_scores))

    return candidates[np.argmax(mean_scores)]

In [603]:
# Training set: 'category' is the target, 'ID' is dropped, and every
# remaining column is used as a feature.
readtrain = pd.read_csv("train.csv")

# mapping = {index: value for index,value in enumerate(readtrain['category'].unique())}

# le = LabelEncoder()
# readtrain['category'] = le.fit_transform(readtrain['category'])

labels = readtrain['category'].to_numpy()
readtrain = readtrain.drop(columns=['ID'])
data = readtrain.drop(columns=['category']).to_numpy()

# Test set: keep the IDs for the output file, use the rest as features.
testdata = pd.read_csv("test.csv")
ids = testdata['ID']
testdata = testdata.drop(columns=['ID']).to_numpy()

In [604]:
# for i in range(data.shape[0]):
#     for j in range(data.shape[1]):
#         data[i][j] = data[i][j] + data[i][j]**2 + data[i][j]**3

In [605]:
# def custom_scoring(estimator,X, Y=None):
#     return -estimator.decision_function(X).mean()

In [606]:
# lof = LocalOutlierFactor(n_jobs=-1)
# hypers = {
#     'n_neighbors':[5,10,15,20,24,30,32],
#     'contamination':[0.01,0.05,0.08,0.1,None]
# }
# gs = GridSearchCV(lof,param_grid=hypers,cv=8,scoring=custom_scoring)
# gs.fit(data,labels)

# best_lof = gs.best_estimator_

In [607]:
# preds = best_lof.fit_predict(data)
# data = data[preds!=-1]
# labels = labels[preds!=-1]

In [608]:
# Remove outlying training rows, then split the filtered matrix back into
# features (all but the last column) and labels (the last column).
full = outlier_detection(data, labels, num=8, cont=0.01)
# for i in range(15,33):
#     full = outlier_detection(data,labels,i)
data_out, labels_out = full[:, :-1], full[:, -1]

In [609]:
# Sanity check: features and labels must have the same number of rows.
print(f"{data_out.shape} {labels_out.shape}")

(1211, 4096) (1211,)


In [610]:
# pipe = Pipeline([
#     ('pca', PCA()),
#     ('lda',LinearDiscriminantAnalysis()),
#     ('logreg',LogisticRegression())
# ])

# params = {
#     'pca__n_components':[0.9,0.99,0.999],
#     'logreg__multi_class':['multinomial'],
#     'logreg__solver':['lbfgs'],
#     'logreg__max_iter':[2000],
#     'logreg__C':[5,10,100],
#     'logreg__penalty':['l1','l2']
# }

# grs = GridSearchCV(pipe,param_grid=params,cv=8)
# grs.fit(data_out,labels_out)
# best = grs.best_params_

In [611]:
# pipe = Pipeline([
#     ('pca',PCA()),
#     ('lda',LinearDiscriminantAnalysis()),
#     ('dtc',DecisionTreeClassifier())
# ])

# params = {
#     'pca__n_components':[0.99],
#     'lda__solver':['eigen'],
#     'lda__shrinkage':[None],
#     'dtc__max_depth':[10,20,30,50,None],
#     'dtc__min_samples_split':[2,5,10],
#     'dtc__min_samples_leaf':[1,2,4],
#     'dtc__max_features':['sqrt','log2',None]
# }

# grs = GridSearchCV(pipe,param_grid=params,cv=8)
# grs.fit(data_out,labels_out)
# best = grs.best_params_

In [612]:
# Reduce dimensionality with PCA first, then project the PCA scores with LDA.
data_pca, testdata_pca = perform_pca(
    training=data_out, testing=testdata, comp=0.9897, slvr='auto', wht=False
)
print(data_pca.shape)
data_lda, testdata_lda = perform_lda(
    training=data_pca, target=labels_out, testing=testdata_pca,
    slvr='eigen', shrink=None, n_com=19,
)

# data_cl = kmeans_clustering(data_lda, 20)
# testdata_cl = kmeans_clustering(testdata_lda, 20)

# data = dbscan_clustering(data, 0.001, 120)
# testdata = dbscan_clustering(testdata, 0.001, 120)

(1211, 355)


In [613]:
# gs.best_params_

In [614]:
# k_range = range(15,40)
# accuracy_scores = []
# for i in k_range:
#     full = outlier_detection(data,labels,i)
#     labels = full[:,-1]
#     data = full[:,:-1]
#     X_train, X_test, Y_train, Y_test = train_test_split(data,labels,test_size=0.25,random_state=100)
#     Y_pred = classification(X_train,Y_train,X_test,1000)
#     accuracy_scores.append(accuracy_score(Y_test,Y_pred))

# print(k_range[np.argmax(accuracy_scores)])

In [615]:
# print(k_range,"\n",accuracy_scores)

In [616]:
# X_train, X_test, Y_train, Y_test = train_test_split(data_lda,labels_out,test_size=0.25,random_state=42)
# Y_pred = logistic_regression(X_train,Y_train,X_test,1000,5,'l2')

In [617]:
# X_train, X_test, Y_train, Y_test = train_test_split(data_lda,labels_out,test_size=0.25,random_state=42)
# Y_pred = decision_tree(X_train,Y_train,X_test,None,8,1,None)

In [648]:
# Train on the full LDA-projected training set and predict the test set.
output = logistic_regression(
    X_train=data_lda, Y_train=labels_out, X_test=testdata_lda,
    slvr='lbfgs', iterations=1000, c=1, pen='l2',
)

In [649]:
# output = decision_tree(data_lda,labels_out,testdata_lda,20,10,1,None)

In [650]:
# output = le.inverse_transform(Y_pred)

In [651]:
# Pair each test ID with its predicted category and write the result to disk.
# NOTE: `output` is rebound from the predictions to a DataFrame here, so the
# prediction cell must be re-run before this cell is run a second time.
output = pd.DataFrame(data={'ID': ids, 'category': output})
output.to_csv(path_or_buf='output.csv', index=False)

In [652]:
# print(accuracy_score(Y_test,Y_pred))