In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os



訓練データとテストデータの読み込み

In [None]:
# Read the training and test tables from CSV files in the ../data directory.
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

In [None]:
# Display the loaded training table for a quick visual check.
train

In [None]:
# Display the loaded test table for a quick visual check.
test

In [None]:
# Check for missing values: each line prints True when the table contains
# no null entry at all.
print(not train.isnull().values.any())
print(not test.isnull().values.any())

In [None]:
# Check the data type of every column in the training table.
print(train.dtypes)

In [None]:

# Build the lookup table used to encode the target labels: every distinct
# class name is mapped to its position (stored as a string) in the array
# returned by unique().
class_names = train['target'].unique()
map_nametovalue = {
    class_name: str(position) for position, class_name in enumerate(class_names)
}

# Reverse lookup table (encoded value -> class name) for the target labels.
# Starts as an empty placeholder; it is reassigned below.
map_valuetoname = {}
def get_swap_dict(d):
    '''Return a new dict whose keys are the values of ``d`` and vice versa.

    When several keys of ``d`` share the same value, the key that comes
    last in iteration order is the one kept in the result.
    '''
    swapped = {}
    for key, value in d.items():
        swapped[value] = key
    return swapped
# Invert the name -> value table so encoded values can be turned back into names.
map_valuetoname = get_swap_dict(map_nametovalue)



In [None]:
#訓練データ・テストデータを標準化

from sklearn import preprocessing
def scaling(df, scaler=None):
    '''Standardize the columns of ``df`` with a scikit-learn scaler.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature table to scale.
    scaler : object with a ``transform`` method, optional
        An already fitted scaler. When given, its learned statistics are
        applied to ``df`` without refitting, which lets test data be scaled
        with the statistics of the training data. When omitted, a new
        ``StandardScaler`` is fitted on ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        The scaled values, with the same column labels and the same index
        as ``df``.
    '''
    if scaler is None:
        scaler = preprocessing.StandardScaler()
        scaled_values = scaler.fit_transform(df)
    else:
        scaled_values = scaler.transform(df)
    # Keep the caller's index so the result stays aligned with label Series
    # taken from the same source frame.
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)


In [None]:
# Standardize the features. The scaler is fitted on the training features
# only and then reused for the test features, so both tables are expressed
# with the same mean/std. Fitting a separate scaler on the test set would
# shift the two tables relative to each other.
train_features = train.drop(['row_id', 'target'], axis=1)
# Select the test columns in the training column order so the scaler sees
# the same feature layout it was fitted on.
test_features = test.drop(['row_id'], axis=1)[train_features.columns]

feature_scaler = preprocessing.StandardScaler()
train_X_scaled = pd.DataFrame(
    feature_scaler.fit_transform(train_features),
    columns=train_features.columns,
    index=train_features.index,
)
test_X_scaled = pd.DataFrame(
    feature_scaler.transform(test_features),
    columns=test_features.columns,
    index=test_features.index,
)

In [None]:
# Display the scaled training features.
train_X_scaled


In [None]:
# Display the scaled test features.
test_X_scaled

In [None]:
# Encode the training labels with the name -> value lookup table.
train_Y = train['target'].map(map_nametovalue)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Project the scaled training features onto three linear discriminant axes
# and draw them as a 3-D scatter plot coloured by class.
# NOTE(review): n_components=3 presumably requires at least 4 classes and
# 3 features in the data - confirm against the dataset.
n = 2000  # number of leading training rows used to fit the projection
clf = LinearDiscriminantAnalysis(n_components=3).fit(
    train_X_scaled[:n], train_Y[:n]
    )
train_x_embedded = clf.transform(train_X_scaled)
# Show only the first rows; printing the whole array floods the cell output.
print(train_x_embedded[:5])

fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(111, projection='3d')
# train_Y holds the encoded labels as digit strings, so cast them to
# integers before using them as colour values.
ax.scatter(
    train_x_embedded[:, 0], train_x_embedded[:, 1], train_x_embedded[:, 2],
    c=train_Y.astype(int), cmap='tab20',
)
ax.set_xlabel('LD1')
ax.set_ylabel('LD2')
ax.set_zlabel('LD3')
ax.set_title('LDA projection of the training data')
plt.show()


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split identical between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_X_scaled, train_Y, test_size=0.20, random_state=42)

# 1-nearest-neighbour classifier. With a single neighbour each training row
# is normally its own nearest neighbour, so the training accuracy is not a
# meaningful quality measure; judge the model by the validation accuracy.
model = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
model.fit(X_train, y_train)

# score() can return a plain Python float, which has no .round() method.
# The built-in round() works for both plain floats and numpy scalars.
train_accuracy = round(model.score(X_train, y_train) * 100, 2)
valid_accuracy = round(model.score(X_valid, y_valid) * 100, 2)

print(
    f'--------------------------------\n| Training Accuracy   :- {train_accuracy}% |')
print(
    f'--------------------------------\n| Validation Accuracy :- {valid_accuracy}% |\n--------------------------------')


In [None]:
# Apply the fitted KNN model to the test data.
test_y_pred = model.predict(test_X_scaled)


In [None]:
# Decode the predicted values back into the original label names and wrap
# them in a one-column table named "target".
test_y_pred_decode = pd.Series(test_y_pred).map(map_valuetoname)
test_y_pred_decode_frame = test_y_pred_decode.to_frame(name="target")


In [None]:
# One-column table holding the row ids of the test data.
test_id = test['row_id'].to_frame()


In [None]:
# Put the row ids and the decoded predictions side by side.
submission = pd.concat([test_id, test_y_pred_decode_frame], axis=1)
# Create the output directory first so to_csv does not fail when it is missing.
os.makedirs("../submission", exist_ok=True)
submission.to_csv("../submission/submission.csv", index=False)
