In [523]:
import numpy as np
import pandas as pd

In [524]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.cluster import DBSCAN

In [525]:
def perform_pca(training,testing,comp):
    """Project the training and test sets with a PCA fitted on the training set.

    Args:
        training: Samples used to fit the PCA; also transformed and returned.
        testing: Samples transformed with the already-fitted PCA (never used
            for fitting).
        comp: Forwarded unchanged to ``PCA`` as ``n_components``.

    Returns:
        A tuple ``(training_projected, testing_projected)``.
    """
    reducer = PCA(n_components=comp).fit(training)
    return reducer.transform(training), reducer.transform(testing)

In [526]:
def perform_lda(training, labels, testing):
    """Project the training and test sets with an LDA fitted on the training set.

    Args:
        training: Samples used to fit the LDA; also transformed and returned.
        labels: Class label for each row of ``training``.
        testing: Samples transformed with the already-fitted LDA (never used
            for fitting).

    Returns:
        A tuple ``(training_projected, testing_projected)``.
    """
    lda = LinearDiscriminantAnalysis()
    # Fit on the `training` argument. The previous version fitted on the
    # notebook-level `data` variable, so the function silently ignored its
    # first parameter and raised NameError wherever `data` was not defined.
    lda.fit(training, labels)
    return lda.transform(training), lda.transform(testing)

In [527]:
def kmeans_clustering(inp, num_clusters):
    """Cluster ``inp`` with KMeans and append the cluster id as a last column.

    Args:
        inp: 2-D array of samples to cluster.
        num_clusters: Number of clusters passed to ``KMeans``.

    Returns:
        ``inp`` with one extra trailing column holding each row's cluster id.
        The KMeans is fitted with ``n_init=10`` and ``random_state=0``.
    """
    model = KMeans(n_clusters=num_clusters, n_init=10, random_state=0).fit(inp)
    return np.column_stack((inp, model.labels_))

In [528]:
def dbscan_clustering(inp, epsilon, min_s):
    """Cluster ``inp`` with DBSCAN and append the cluster id as a last column.

    Args:
        inp: 2-D array of samples to cluster.
        epsilon: Passed to ``DBSCAN`` as ``eps``.
        min_s: Passed to ``DBSCAN`` as ``min_samples``.

    Returns:
        ``inp`` with one extra trailing column holding the value DBSCAN's
        ``fit_predict`` returned for each row. The ids are also printed.
    """
    clusterer = DBSCAN(eps=epsilon, min_samples=min_s)
    assignments = clusterer.fit_predict(inp)
    augmented = np.column_stack((inp, assignments))
    # Printed so the assignment can be inspected while tuning eps/min_samples.
    print(assignments)
    return augmented

In [529]:
import matplotlib.pyplot as plt

def optimise_clustering(inp, max_clusters=49):
    """Plot the KMeans elbow curve (inertia against number of clusters).

    One KMeans (``n_init=10``, ``random_state=42``) is fitted for every
    cluster count from 1 to ``max_clusters`` inclusive, and its ``inertia_``
    is plotted against that count.

    Args:
        inp: 2-D array of samples to cluster.
        max_clusters: Largest cluster count to try. The default of 49 keeps
            the range that used to be hard-coded (1..49).

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Single source of truth for the x-axis, so the fitted counts and the
    # plotted counts can never drift apart.
    cluster_counts = range(1, max_clusters + 1)
    wcss = [
        KMeans(n_clusters=k, n_init=10, random_state=42).fit(inp).inertia_
        for k in cluster_counts
    ]

    plt.plot(cluster_counts, wcss)
    plt.title('Elbow Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('WCSS')
    plt.show()

In [530]:
def outlier_detection(inp,labels,num):
    """Return the rows that LocalOutlierFactor does not flag, labels attached.

    The detector is fitted on ``inp`` only; ``labels`` take no part in the
    detection and are simply carried along as the last column.

    Args:
        inp: 2-D array of feature rows.
        labels: One label per row of ``inp``.
        num: Passed to ``LocalOutlierFactor`` as ``n_neighbors``.

    Returns:
        2-D array with the columns of ``inp`` followed by the label column,
        restricted to the rows for which ``fit_predict`` returned 1.
    """
    full = np.column_stack((inp,labels))
    lof = LocalOutlierFactor(n_neighbors=num)
    preds = lof.fit_predict(inp)

    # Select all kept rows in one step instead of growing the result with one
    # np.vstack per row, which re-copied the whole array every iteration.
    kept_rows = full[preds == 1]

    # Stack onto an empty float seed, as the row-by-row build did, so the
    # result dtype is promoted the same way whenever at least one row is kept.
    return np.vstack((np.empty((0, full.shape[1])), kept_rows))

In [531]:
def logistic_regression(X_train, Y_train, X_test, iterations):
    """Fit a multinomial logistic regression and predict labels for ``X_test``.

    Args:
        X_train: Training feature matrix.
        Y_train: Training labels, one per row of ``X_train``.
        X_test: Feature matrix to predict on.
        iterations: Passed to ``LogisticRegression`` as ``max_iter``.

    Returns:
        The predicted label for each row of ``X_test``.
    """
    classifier = LogisticRegression(
        multi_class='multinomial',
        solver='lbfgs',
        max_iter=iterations,
    )
    return classifier.fit(X_train, Y_train).predict(X_test)

In [532]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

def validation(inp,lof):
    """Pick the ``n_neighbors`` value in 5..49 with the best mean 5-fold F1.

    Args:
        inp: 2-D array whose last column is the target and whose remaining
            columns are the features.
        lof: Estimator whose ``n_neighbors`` parameter is tuned. It is mutated
            through ``set_params`` and is left at the last value tried.

    Returns:
        The ``n_neighbors`` candidate with the highest mean cross-validated
        score.
    """
    # NOTE(review): cross_val_score needs an estimator it can score with 'f1';
    # confirm the `lof` object passed in supports that before relying on this.
    candidates = range(5, 50)
    folds = KFold(n_splits=5)
    features, targets = inp[:, :-1], inp[:, -1]

    mean_scores = []
    for k in candidates:
        lof.set_params(n_neighbors=k)
        fold_scores = cross_val_score(lof, features, targets, cv=folds, scoring='f1')
        mean_scores.append(np.mean(fold_scores))

    return candidates[np.argmax(mean_scores)]

In [533]:
# Training table: 'category' is the target; 'ID' is dropped from the features.
readtrain = pd.read_csv("train.csv")
labels = readtrain['category'].to_numpy()
readtrain = readtrain.drop(columns=['ID'])
data = readtrain.drop(columns=['category']).to_numpy()

# Test table: keep the IDs aside for the output file, use the rest as features.
testdata = pd.read_csv("test.csv")
ids = testdata['ID']
testdata = testdata.drop(columns=['ID']).to_numpy()

In [534]:
# Keep only the rows LOF (20 neighbours) does not flag, then split the
# stacked array back into features and the trailing label column.
full = outlier_detection(data, labels, 20)
data, labels = full[:, :-1], full[:, -1]

In [535]:
# Dimensionality reduction: PCA first, then LDA using the training labels.
# Both are fitted on the training rows and applied to the test rows.
data, testdata = perform_pca(data, testdata, 0.99)
data, testdata = perform_lda(data, labels, testdata)

# Append a cluster-id feature. The clustering is fitted once on the training
# rows and the test rows are assigned to those same centroids. Fitting a
# second, independent KMeans on the test set (as before) numbers its clusters
# independently, so id k in the test set did not mean the same thing as id k
# in the training set that the classifier learns from.
cluster_model = KMeans(n_clusters=8, n_init=10, random_state=0).fit(data)
data = np.column_stack((data, cluster_model.labels_))
testdata = np.column_stack((testdata, cluster_model.predict(testdata)))

# data = dbscan_clustering(data, 0.001, 120)
# testdata = dbscan_clustering(testdata, 0.001, 120)

In [536]:
# k_range = range(15,40)
# accuracy_scores = []
# for i in k_range:
#     full = outlier_detection(data,labels,i)
#     labels = full[:,-1]
#     data = full[:,:-1]
#     X_train, X_test, Y_train, Y_test = train_test_split(data,labels,test_size=0.25,random_state=100)
#     Y_pred = classification(X_train,Y_train,X_test,1000)
#     accuracy_scores.append(accuracy_score(Y_test,Y_pred))

# print(k_range[np.argmax(accuracy_scores)])

In [537]:
# print(k_range,"\n",accuracy_scores)

In [538]:
# X_train, X_test, Y_train, Y_test = train_test_split(data,labels,test_size=0.25,random_state=42)
# Y_pred = logistic_regression(X_train,Y_train,X_test,1000)

In [539]:
# Train on the full processed training set and predict a category per test row.
output = logistic_regression(data, labels, testdata, iterations=1000)

In [540]:
# output = le.inverse_transform(Y_pred)

In [541]:
# Pair each test ID with its predicted category and write the result to CSV.
output = pd.DataFrame(data={'ID': ids, 'category': output})
output.to_csv('output1.csv', index=False)

In [542]:
# print(accuracy_score(Y_test,Y_pred))