In [510]:
import numpy as np
import pandas as pd

In [511]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [512]:
def dimension_reduction(inp,comp):
    pca = PCA(n_components=comp)

    pca.fit(inp)
    inp_pca = pca.transform(inp)

    return inp_pca

In [513]:
def clustering(inp, num_clusters):
    kmeans = KMeans(n_clusters = num_clusters, n_init=10, random_state = 0).fit(inp)
    cluster_ids = kmeans.labels_
    final = np.column_stack((inp, cluster_ids))
    return final

In [514]:
def outlier_detection(inp,labels):
    full = np.column_stack((inp,labels))
    lof = LocalOutlierFactor(n_neighbors=120,contamination=0.1)
    preds = lof.fit_predict(inp)

    out_array = np.empty((0,full.shape[1]))
    for i in range(len(preds)):
        if(preds[i]==1):
            out_array = np.vstack((out_array, full[i]))
            
    return out_array

In [515]:
def classification(X_train, Y_train, X_test, iterations):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    logreg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=iterations)
    logreg.fit(X_train, Y_train)
    y_pred = logreg.predict(X_test)
    return y_pred

In [516]:
readtrain = pd.read_csv("train.csv")

mapping = {index: value for index,value in enumerate(readtrain['category'].unique())}

le = LabelEncoder()
readtrain['category'] = le.fit_transform(readtrain['category'])

labels = readtrain['category'].to_numpy()
readtrain = readtrain.drop(['ID'],axis=1)
data = readtrain.drop(['category'],axis=1).to_numpy()

testdata = pd.read_csv("test.csv")
testdata = testdata.drop(['ID'],axis=1).to_numpy()

In [517]:
# data = dimension_reduction(data,90)
# data = clustering(data, 50)
full = outlier_detection(data,labels)
labels = full[:,-1]
data = full[:,:-1]

In [518]:
# X_train, X_test, Y_train, Y_test = train_test_split(data,labels,test_size=0.25,random_state=100)
Y_pred = classification(data,labels,testdata,10000)

In [519]:
out_array = []
for i,y in enumerate(Y_pred):
    out_array.append(mapping[y])

out_array = (np.array(out_array)).astype('str')

In [520]:
output = np.column_stack((np.arange(len(out_array)),out_array))
np.savetxt("output.csv",output,delimiter=",",header="ID,Category",fmt="%s")

In [521]:
# print(accuracy_score(Y_test,Y_pred))

In [522]:

print(out_array)

['Strawberry_Raw' 'Leeche_Ripe' 'Banana_Raw' 'Coconut_Raw' 'Papaya_Raw'
 'Leeche_Raw' 'Papaya_Ripe' 'Banana_Ripe' 'Papaya_Ripe' 'Papaya_Ripe'
 'Orange_Raw' 'Leeche_Raw' 'Strawberry_Raw' 'Strawberry_Ripe' 'Apple_Ripe'
 'Guava_Raw' 'Leeche_Ripe' 'Mango_Raw' 'Apple_Raw' 'Strawberry_Raw'
 'Guava_Ripe' 'Leeche_Raw' 'Apple_Ripe' 'Coconut_Ripe' 'Leeche_Ripe'
 'Orange_Raw' 'Orange_Ripe' 'Papaya_Raw' 'Papaya_Ripe' 'Papaya_Raw'
 'Leeche_Raw' 'Banana_Ripe' 'Coconut_Raw' 'Mango_Raw' 'Strawberry_Raw'
 'Mango_Ripe' 'Banana_Ripe' 'Leeche_Ripe' 'Leeche_Ripe' 'Banana_Raw'
 'Guava_Ripe' 'Strawberry_Ripe' 'Pomengranate_Ripe' 'Pomengranate_Raw'
 'Pomengranate_Ripe' 'Banana_Ripe' 'Pomengranate_Ripe' 'Banana_Ripe'
 'Leeche_Raw' 'Pomengranate_Raw' 'Strawberry_Ripe' 'Apple_Raw'
 'Strawberry_Raw' 'Apple_Raw' 'Banana_Ripe' 'Guava_Raw' 'Pomengranate_Raw'
 'Strawberry_Raw' 'Mango_Ripe' 'Orange_Raw' 'Leeche_Raw' 'Guava_Ripe'
 'Orange_Ripe' 'Coconut_Ripe' 'Coconut_Raw' 'Orange_Ripe' 'Strawberry_Raw'
 'Guava_Ripe' '