In [1682]:
import numpy as np
import pandas as pd

In [1683]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.cluster import DBSCAN
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier

In [1684]:
def perform_pca(training,testing,comp,slvr,wht):
    # pca = PCA(n_components=comp,svd_solver=slvr,whiten=wht)
    pca = PCA(n_components=comp)

    inp_pca = pca.fit_transform(training)
    test_pca = pca.transform(testing)

    return inp_pca, test_pca

In [1685]:
def perform_lda(training, target, testing, slvr, shrink, n_com):
    # lda = LinearDiscriminantAnalysis(n_components=n_com,solver=slvr,store_covariance=True)
    lda = LinearDiscriminantAnalysis(n_components=n_com)
    
    transformed_data = lda.fit_transform(training,target)
    transformed_test = lda.transform(testing)
    return transformed_data, transformed_test

In [1686]:
def kmeans_clustering(inp, num_clusters):
    # optimise_clustering(inp)
    kmeans = KMeans(n_clusters = num_clusters, n_init=10, random_state=0)
    kmeans.fit(inp)
    cluster_ids = kmeans.labels_
    final = np.column_stack((inp, cluster_ids))
    return final

In [1687]:
def dbscan_clustering(inp, epsilon, min_s):
    dbscan = DBSCAN(eps = epsilon, min_samples = min_s)
    cluster_ids = dbscan.fit_predict(inp)
    final = np.column_stack((inp, cluster_ids))
    print(cluster_ids)
    return final

In [1688]:
def optimise_clustering(inp):
    wcss = []
    for i in range(1,50):
        kmeans = KMeans(n_clusters=i,n_init=10,random_state=42)
        kmeans.fit(inp)
        wcss.append(kmeans.inertia_)
    
    plt.plot(range(1, 50), wcss)
    plt.title('Elbow Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('WCSS')
    plt.show()

In [1689]:
def outlier_detection(inp,labels,num,cont,alg,met):
    full = np.column_stack((inp,labels))
    lof = LocalOutlierFactor(n_neighbors=num,algorithm=alg,metric=met)

    # best = validation(full,lof)
    # lof.set_params(n_neighbors = best)

    preds = lof.fit_predict(inp)

    return inp[preds==1], labels[preds==1]

In [1690]:
def logistic_regression(X_train, Y_train, X_test, slvr, iterations, c1, c2, c3, pen):
    # scaler = StandardScaler()
    # X_train = scaler.fit_transform(X_train)
    # X_test = scaler.transform(X_test)

    logreg1 = LogisticRegression(multi_class='multinomial',max_iter=iterations,solver=slvr,C=c1,penalty=pen)
    logreg2 = LogisticRegression(multi_class='multinomial',max_iter=iterations,solver=slvr,C=c2,penalty=pen)
    logreg3 = LogisticRegression(multi_class='multinomial',max_iter=iterations,solver=slvr,C=c3,penalty=pen)

    vf = VotingClassifier(estimators=[('lr1',logreg1),('lr2',logreg2),('lr3',logreg3)],voting='soft',weights=[2,1,1])
    vf.fit(X_train,Y_train)

    y_pred = vf.predict(X_test)
    
    return y_pred

In [1691]:
def decision_tree(X_train, Y_train, X_test, depth, split, leaf, ftrs):
    clf = DecisionTreeClassifier(max_depth=depth,min_samples_split=split,min_samples_leaf=leaf,max_features=ftrs,random_state=21)
    clf.fit(X_train,Y_train)
    y_pred = clf.predict(X_test)
    return y_pred

In [1692]:
def validation(inp,lof):
    neighbors_range = range(5,50)
    kf = KFold(n_splits=5)

    scores = np.empty(len(neighbors_range))

    for i,n_neighbors in enumerate(neighbors_range):
        lof.set_params(n_neighbors=n_neighbors)
        cv_scores = cross_val_score(lof,inp[:,:-1],inp[:,-1],cv=kf,scoring='f1')
        scores[i] = np.mean(cv_scores)
    
    best = neighbors_range[np.argmax(scores)]
    return best

In [1693]:
def correlation_analysis(inp,test,n1):
    cov_matrix = np.corrcoef(inp,rowvar=False)
    target_cov = np.abs(cov_matrix[:-1,-1])

    cov_sort = np.argsort(target_cov)[::-1]
    return inp[:, cov_sort[:n1]], test[:, cov_sort[:n1]]

In [1694]:
readtrain = pd.read_csv("train.csv")

# mapping = {index: value for index,value in enumerate(readtrain['category'].unique())}

# le = LabelEncoder()
# readtrain['category'] = le.fit_transform(readtrain['category'])

labels = readtrain['category'].to_numpy()
readtrain = readtrain.drop(['ID'],axis=1)
data = readtrain.drop(['category'],axis=1).to_numpy()

testdata = pd.read_csv("test.csv")
ids = testdata['ID']
testdata = testdata.drop(['ID'],axis=1).to_numpy()

In [1695]:
# for i in range(data.shape[0]):
#     for j in range(data.shape[1]):
#         data[i][j] = data[i][j] + data[i][j]**2 + data[i][j]**3

In [1696]:
# def custom_scoring(estimator,X, Y=None):
#     return -estimator.decision_function(X).mean()

In [1697]:
# lof = LocalOutlierFactor(n_jobs=-1)
# hypers = {
#     'n_neighbors':[5,10,15,20,24,30,32],
#     'contamination':[0.01,0.05,0.08,0.1,None]
# }
# gs = GridSearchCV(lof,param_grid=hypers,cv=8,scoring=custom_scoring)
# gs.fit(data,labels)

# best_lof = gs.best_estimator_

In [1698]:
# preds = best_lof.fit_predict(data)
# data = data[preds!=-1]
# labels = labels[preds!=-1]

In [1699]:
data_out, labels_out = outlier_detection(data,labels,5,0.01,'ball_tree','euclidean')
# for i in range(15,33):
#     full = outlier_detection(data,labels,i)

In [1700]:
print(data_out.shape,labels_out.shape)

(1205, 4096) (1205,)


In [1701]:
data_out, testdata = correlation_analysis(data_out,testdata,2000)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [1702]:
data_pca, testdata_pca = perform_pca(data_out,testdata,0.99,'auto',False)
print(data_pca.shape)
data_lda, testdata_lda = perform_lda(data_pca, labels_out, testdata_pca, 'eigen', None, 19)

# data_cl = kmeans_clustering(data_lda, 20)
# testdata_cl = kmeans_clustering(testdata_lda, 20)

# data = dbscan_clustering(data, 0.001, 120)
# testdata = dbscan_clustering(testdata, 0.001, 120)

(1205, 263)


In [1703]:
# gs.best_params_

In [1704]:
# k_range = range(15,40)
# accuracy_scores = []
# for i in k_range:
#     full = outlier_detection(data,labels,i)
#     labels = full[:,-1]
#     data = full[:,:-1]
#     X_train, X_test, Y_train, Y_test = train_test_split(data,labels,test_size=0.25,random_state=100)
#     Y_pred = classification(X_train,Y_train,X_test,1000)
#     accuracy_scores.append(accuracy_score(Y_test,Y_pred))

# print(k_range[np.argmax(accuracy_scores)])

In [1705]:
# print(k_range,"\n",accuracy_scores)

In [1706]:
# X_train, X_test, Y_train, Y_test = train_test_split(data_lda,labels_out,test_size=0.25,random_state=42)
# Y_pred = logistic_regression(X_train,Y_train,X_test,1000,5,'l2')

In [1707]:
# X_train, X_test, Y_train, Y_test = train_test_split(data_lda,labels_out,test_size=0.25,random_state=42)
# Y_pred = decision_tree(X_train,Y_train,X_test,None,8,1,None)

In [1708]:
output = logistic_regression(data_lda,labels_out,testdata_lda,'newton-cg',1000,1,1.25,1.5,'l2')

In [None]:
# output = decision_tree(data_lda,labels_out,testdata_lda,20,10,1,None)

In [None]:
# output = le.inverse_transform(Y_pred)

In [None]:
output = pd.DataFrame({'ID':ids,'category':output})
output.to_csv('output.csv',index=False)

In [None]:
# print(accuracy_score(Y_test,Y_pred))