In [1]:
# Mount Google Drive at /content/drive so later cells can read files from it.
# The google.colab module is only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Directory on the mounted Drive that holds the competition CSV files.
# Defined once so both reads stay in sync if the data is moved.
DATA_DIR = '/content/drive/MyDrive/KTaivle/3차미니프로젝트/kaggle'

train_data = pd.read_csv(f'{DATA_DIR}/train.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test.csv')

In [4]:
def add_time_features(frame: pd.DataFrame) -> pd.DataFrame:
    """Replace the raw ``timestamp`` column with hour/minute/second columns.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame containing a ``timestamp`` column and the ``Unnamed: 0`` column
        produced when the CSV was read.

    Returns
    -------
    pd.DataFrame
        A new frame with ``hour``, ``minute`` and ``second`` appended and the
        ``Unnamed: 0`` / ``timestamp`` columns removed. The input frame is not
        modified. A frame that no longer has a ``timestamp`` column is
        returned unchanged, so re-running the cell does not raise.
    """
    if 'timestamp' not in frame.columns:
        # Already processed by an earlier run of this cell.
        return frame

    stamps = pd.to_datetime(frame['timestamp'])
    return (
        frame
        .assign(
            hour=stamps.dt.hour,
            minute=stamps.dt.minute,
            second=stamps.dt.second,
        )
        .drop(columns=['Unnamed: 0', 'timestamp'])
    )


# Apply the identical transformation to both splits so their columns match.
train_data = add_time_features(train_data)
test_data = add_time_features(test_data)

In [5]:
# Summarize column dtypes and non-null counts to spot missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 10 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   A_x     90000 non-null   float64
 1   A_y     90000 non-null   float64
 2   A_z     90000 non-null   float64
 3   B_x     90000 non-null   float64
 4   B_y     90000 non-null   float64
 5   B_z     90000 non-null   float64
 6   label   100000 non-null  int64  
 7   hour    100000 non-null  int64  
 8   minute  100000 non-null  int64  
 9   second  100000 non-null  int64  
dtypes: float64(6), int64(4)
memory usage: 7.6 MB


In [6]:
# Preview the training frame, including the derived hour/minute/second columns.
train_data

Unnamed: 0,A_x,A_y,A_z,B_x,B_y,B_z,label,hour,minute,second
0,-0.259130,-0.834869,-0.485499,0.196409,,0.384934,8,0,45,54
1,0.370490,0.175042,0.122625,-0.338242,0.358245,0.126491,2,1,37,6
2,-0.257837,-0.881947,-0.391895,0.196027,0.894537,0.411221,8,0,45,33
3,-0.937753,-0.055961,0.362041,-0.929881,0.087673,0.134609,11,0,46,22
4,-0.988320,-0.190390,0.157909,-0.954669,-0.024810,-0.388420,6,0,49,56
...,...,...,...,...,...,...,...,...,...,...
99995,-0.499562,0.012127,0.365746,,-0.040284,-0.180426,10,0,25,51
99996,-0.929146,,-0.362481,-0.951160,-0.047168,-0.344213,3,1,37,53
99997,-0.940124,-0.301950,,-0.983245,-0.011691,-0.178657,6,0,1,2
99998,-1.152895,-0.149863,-0.746005,-0.945021,-0.686593,0.317497,5,0,31,54


In [7]:
# Columns whose pairwise correlation is computed; 'label' is listed first
# so it heads the printed result.
selected_columns = [
    'label',
    'A_x', 'A_y', 'A_z',
    'B_x', 'B_y', 'B_z',
    'hour', 'minute', 'second',
]
selected_data = train_data.loc[:, selected_columns]

# Full correlation matrix, then just the column describing 'label'.
correlation_matrix = selected_data.corr()
label_correlation = correlation_matrix.loc[:, 'label']

print("Label과 각 Column들 간의 상관계수:\n", label_correlation)

Label과 각 Column들 간의 상관계수:
 label     1.000000
A_x       0.165722
A_y      -0.275390
A_z       0.315315
B_x       0.207724
B_y       0.065651
B_z       0.223359
hour     -0.079200
minute    0.000107
second   -0.001501
Name: label, dtype: float64


In [8]:
# Per-feature medians taken from the training rows before any are dropped;
# used below to fill gaps in the test set.
feature_medians = train_data.drop(columns='label').median()

# Training rows containing any missing value are discarded.
train_data = train_data.dropna()

# Test rows must not be dropped: a dropped row gets no prediction and the
# remaining rows would be renumbered in the submission file. Any missing
# test values are filled with the training medians instead.
test_data = test_data.fillna(feature_medians)

In [9]:
# Re-check row count and non-null counts after removing incomplete rows.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53219 entries, 1 to 99999
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A_x     53219 non-null  float64
 1   A_y     53219 non-null  float64
 2   A_z     53219 non-null  float64
 3   B_x     53219 non-null  float64
 4   B_y     53219 non-null  float64
 5   B_z     53219 non-null  float64
 6   label   53219 non-null  int64  
 7   hour    53219 non-null  int64  
 8   minute  53219 non-null  int64  
 9   second  53219 non-null  int64  
dtypes: float64(6), int64(4)
memory usage: 4.5 MB


In [10]:
def eda_1_num(data, target):
    """Print how often each value of ``data[target]`` occurs.

    Two tables are written to stdout: the absolute counts per value, then
    the same counts normalized to proportions. Nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to summarize.
    target : str
        Name of the column to summarize.
    """
    column = data[target]
    frequency_table = column.value_counts()
    ratio_table = column.value_counts(normalize=True)

    for heading, table in (("빈도 수", frequency_table), ("빈도 비율", ratio_table)):
        print(heading)
        print(table)

# Class distribution of the training labels before any resampling.
eda_1_num(train_data, 'label')

빈도 수
2     6453
1     6188
8     6056
7     5670
10    5068
9     4891
11    4838
3     4428
4     3853
5     2913
6     2861
Name: label, dtype: int64
빈도 비율
2     0.121254
1     0.116274
8     0.113794
7     0.106541
10    0.095229
9     0.091903
11    0.090907
3     0.083203
4     0.072399
5     0.054736
6     0.053759
Name: label, dtype: float64


In [11]:
from imblearn.over_sampling import SMOTE

# Split the frame into predictors and the class column.
y_train = train_data['label']
X_train = train_data.drop(columns='label')

# Oversample the classes with SMOTE (fixed seed for reproducibility).
# NOTE(review): this runs before the train/validation split further down,
# so synthetic rows derived from validation samples can end up in the
# training part; validation scores are likely optimistic. Confirm and
# consider resampling only the training split.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

# Re-wrap the resampled output with the original column names.
X_smote_df = pd.DataFrame(X_smote, columns=X_train.columns)
y_smote_df = pd.Series(y_smote, name='label')

# Rebuild a single frame: feature columns first, 'label' last.
train_data = X_smote_df.join(y_smote_df)

In [12]:
# Inspect the training frame after oversampling.
train_data

Unnamed: 0,A_x,A_y,A_z,B_x,B_y,B_z,hour,minute,second,label
0,0.370490,0.175042,0.122625,-0.338242,0.358245,0.126491,1,37,6,2
1,-0.257837,-0.881947,-0.391895,0.196027,0.894537,0.411221,0,45,33,8
2,-0.937753,-0.055961,0.362041,-0.929881,0.087673,0.134609,0,46,22,11
3,-0.988320,-0.190390,0.157909,-0.954669,-0.024810,-0.388420,0,49,56,6
4,-0.654583,0.068285,-0.029109,-0.176341,-0.256252,-0.510816,1,34,24,2
...,...,...,...,...,...,...,...,...,...,...
70978,-1.109399,-0.095319,0.381631,-0.823535,-0.028876,0.146295,0,17,8,11
70979,-0.959333,-0.067261,0.352323,-1.038467,0.088525,0.026165,0,34,2,11
70980,-0.928065,-0.057637,0.467929,-1.253888,-0.058589,0.094800,0,12,26,11
70981,-0.768061,-0.124282,0.291681,-1.176916,0.035379,0.355506,0,19,54,11


In [13]:
# Re-check the class distribution after SMOTE oversampling.
eda_1_num(train_data, 'label')

빈도 수
2     6453
8     6453
11    6453
6     6453
4     6453
5     6453
10    6453
1     6453
3     6453
7     6453
9     6453
Name: label, dtype: int64
빈도 비율
2     0.090909
8     0.090909
11    0.090909
6     0.090909
4     0.090909
5     0.090909
10    0.090909
1     0.090909
3     0.090909
7     0.090909
9     0.090909
Name: label, dtype: float64


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier

In [16]:
# Base learners for the voting ensemble; all share random_state=42.
random_forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
extra_trees_clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
# NOTE(review): tol=20 with max_iter=100 is a very loose stopping criterion,
# presumably chosen to keep fitting fast - confirm this is intentional.
svm_clf = LinearSVC(max_iter=100, tol=20, random_state=42)
mlp_clf = MLPClassifier(random_state=42)

In [18]:
# (name, estimator) pairs in the form VotingClassifier expects.
named_estimators = [
    ("random_forest_clf", random_forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("svm_clf", svm_clf),
    ("mlp_clf", mlp_clf),
]

In [19]:
# Predictors and target taken from the (already oversampled) training frame.
y = train_data['label']
X = train_data.drop(columns='label')

# Stratified 80/20 hold-out split with a fixed seed.
# NOTE(review): train_data was oversampled with SMOTE in an earlier cell,
# so the validation part contains synthetic rows - confirm whether the
# split should happen before resampling.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [20]:
# Majority-vote ensemble over the named base learners
# ('hard' is the library default, spelled out here for the reader).
voting_clf = VotingClassifier(estimators=named_estimators, voting='hard')

voting_clf.fit(X_train, y_train)



In [22]:
# Score the ensemble on the held-out validation split.
final_predictions = voting_clf.predict(X_val)
accuracy = accuracy_score(y_val, final_predictions)

print("Accuracy : ", accuracy)
# Confusion matrix first, then the per-class precision/recall report.
for build_report in (confusion_matrix, classification_report):
    print(build_report(y_val, final_predictions))

Accuracy :  0.9679509755582165
[[1235   22    7   10    0    1    0    0    0   12    3]
 [  15 1265    0    0    0    0    0    0    0    9    2]
 [   4    2 1229   23   32    0    1    0    0    0    0]
 [   5    0  105 1181    0    0    0    0    0    0    0]
 [   0    0  145    0 1146    0    0    0    0    0    0]
 [   2    0    2    0    0 1284    0    0    0    2    0]
 [   0    0    0    0    0    0 1291    0    0    0    0]
 [   0    0    0    0    0    0    0 1291    0    0    0]
 [   0    0    1    0    0    0    0    0 1290    0    0]
 [   6    9    1    0    0    3    0    0    0 1267    4]
 [   2    0    1    0    0    0    0    0    0   24 1263]]
              precision    recall  f1-score   support

           1       0.97      0.96      0.97      1290
           2       0.97      0.98      0.98      1291
           3       0.82      0.95      0.88      1291
           4       0.97      0.91      0.94      1291
           5       0.97      0.89      0.93      1291
     

In [24]:
# Predict a label for every row currently in the test frame.
y_pred = voting_clf.predict(test_data)

# Carry the test frame's own index into the result so each prediction keeps
# the ID of the row it was made for. With a fresh 0..n-1 index, IDs would be
# silently renumbered whenever rows had been removed from test_data.
# NOTE(review): assumes the submission ID is the row position in test.csv -
# confirm against the competition's sample submission.
result_df = pd.DataFrame({'label': y_pred}, index=test_data.index)

result_df.to_csv('sample.csv', index_label='ID')