In [619]:
import numpy as np
import pandas as pd

In [620]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [621]:
def dimension_reduction(inp,comp):
    """Project ``inp`` onto ``comp`` principal components.

    A fresh PCA model is fitted on ``inp`` on every call and then used to
    transform that same array. The fitted model is discarded, so the
    projection learned here cannot be re-applied to another dataset.

    Parameters
    ----------
    inp : array-like
        Sample matrix handed to ``PCA.fit`` / ``PCA.transform``.
    comp
        Value forwarded unchanged as ``PCA(n_components=...)``.

    Returns
    -------
    The result of ``PCA.transform(inp)``.
    """
    # fit() returns the estimator itself, so the two steps can be chained.
    return PCA(n_components=comp).fit(inp).transform(inp)

In [622]:
def clustering(inp, num_clusters):
    """Append a KMeans cluster id to every row of ``inp``.

    KMeans is fitted on ``inp`` with ``num_clusters`` clusters, 10
    initialisations and a fixed ``random_state`` of 0.

    Parameters
    ----------
    inp : array-like
        Sample matrix to cluster.
    num_clusters
        Forwarded as ``KMeans(n_clusters=...)``.

    Returns
    -------
    numpy.ndarray
        ``inp`` with one extra trailing column holding each row's
        cluster id (``labels_`` of the fitted model).
    """
    model = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
    model.fit(inp)
    return np.column_stack((inp, model.labels_))

In [623]:
def outlier_detection(inp, labels, n_neighbors=20, contamination=0.1):
    """Drop the rows that Local Outlier Factor flags as outliers.

    LOF is fitted on the features only (``inp``); the labels are carried
    along so that features and labels stay aligned after filtering.

    Parameters
    ----------
    inp : array-like
        Feature matrix, one row per sample.
    labels : array-like
        One label per row of ``inp``.
    n_neighbors : int, default 20
        Forwarded to ``LocalOutlierFactor``.
    contamination : float, default 0.1
        Forwarded to ``LocalOutlierFactor``.

    Returns
    -------
    numpy.ndarray
        The rows of ``np.column_stack((inp, labels))`` that LOF predicted
        as inliers (prediction ``1``), with the label in the last column.
        The dtype is promoted to at least float64. The shape is
        ``(0, n_columns)`` when no row is kept.
    """
    full = np.column_stack((inp, labels))
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    preds = np.asarray(lof.fit_predict(inp))

    # Select every inlier row at once with a boolean mask. Growing the
    # result with np.vstack per row re-copies the whole array each time.
    inliers = full[preds == 1]

    # The row-by-row version started from a float64 np.empty, so its result
    # was always promoted to at least float64; keep that dtype contract.
    return inliers.astype(np.result_type(inliers.dtype, np.float64), copy=False)

In [624]:
def classification(X_train, Y_train, X_test, iterations):
    """Standardise the features, fit logistic regression, predict the test set.

    The scaler is fitted on ``X_train`` only and then applied to both
    ``X_train`` and ``X_test``.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    Y_train : array-like
        Training labels, one per row of ``X_train``.
    X_test : array-like
        Feature matrix to predict; must have the same columns as ``X_train``.
    iterations : int
        Forwarded as ``LogisticRegression(max_iter=...)``.

    Returns
    -------
    The output of ``LogisticRegression.predict`` on the scaled ``X_test``.
    """
    standardizer = StandardScaler()
    train_scaled = standardizer.fit_transform(X_train)
    test_scaled = standardizer.transform(X_test)

    # NOTE(review): `multi_class` is reported as deprecated in recent
    # scikit-learn releases - confirm against the installed version.
    model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=iterations)
    return model.fit(train_scaled, Y_train).predict(test_scaled)

In [625]:
# Load the labelled training set; it must provide 'ID' and 'category' columns.
readtrain = pd.read_csv("train.csv")

# Encode the category names as integer class codes for the classifier.
le = LabelEncoder()
readtrain['category'] = le.fit_transform(readtrain['category'])

# Code -> original category name, used later to decode the predictions.
# LabelEncoder assigns code k to the k-th entry of its sorted `classes_`
# array, not to the k-th category in order of first appearance. The lookup
# must therefore be built from `le.classes_`; enumerating
# `readtrain['category'].unique()` pairs codes with the wrong names
# whenever the file is not already sorted by category.
mapping = dict(enumerate(le.classes_))

# Split into the label vector and the feature matrix ('ID' is not a feature).
labels = readtrain['category'].to_numpy()
readtrain = readtrain.drop(['ID'],axis=1)
data = readtrain.drop(['category'],axis=1).to_numpy()

# Load the unlabelled test set, keeping the IDs for the submission file.
testdata = pd.read_csv("test.csv")
ids = testdata['ID']
testdata = testdata.drop(['ID'],axis=1).to_numpy()

In [626]:
# Fit the PCA projection on the training features ONLY and apply that same
# projection to the test features. Fitting a second, independent PCA on the
# test set yields components that do not line up with the ones the
# classifier is trained on.
pca = PCA(n_components=64).fit(data)
data = pca.transform(data)
testdata = pca.transform(testdata)

# Likewise learn the clusters on the training data and assign test rows to
# those clusters. Ids from a separately fitted KMeans are arbitrary and do
# not correspond to the training cluster ids.
kmeans = KMeans(n_clusters=19, n_init=10, random_state=0).fit(data)
data = np.column_stack((data, kmeans.labels_))
testdata = np.column_stack((testdata, kmeans.predict(testdata)))

In [627]:
# Remove the training rows flagged as outliers. outlier_detection returns the
# kept rows with the label stacked as the last column, so split them apart.
# Note: this overwrites `data` and `labels`, so re-running this cell filters
# the already-filtered data again.
full = outlier_detection(data,labels)
data, labels = full[:,:-1], full[:,-1]

In [628]:
# Optional hold-out split for local evaluation (currently disabled):
# X_train, X_test, Y_train, Y_test = train_test_split(data,labels,test_size=0.25,random_state=100)

# Train on all remaining training rows and predict the test set;
# 1000 is the solver's iteration cap.
Y_pred = classification(X_train=data, Y_train=labels, X_test=testdata, iterations=1000)

In [629]:
# Translate the predicted class codes back to category names with the fitted
# LabelEncoder, so decoding is exactly the inverse of the encoding used for
# training. The predicted codes may arrive as floats (the labels pass through
# a float array during outlier filtering), so cast them to int before they
# are used as indices into the encoder's classes.
out_array = le.inverse_transform(np.asarray(Y_pred).astype(int))
out_array = out_array.astype('str')

In [630]:
# Pair every test ID with its predicted category name and write the result
# as a two-column CSV ('ID', 'category') without the index.
output = ids.to_frame(name='ID').assign(category=out_array)
output.to_csv('output.csv',index=False)

In [631]:
# print(accuracy_score(Y_test,Y_pred))