In [15]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import cross_val_score
from IPython.display import display
seed = 19
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

# MODELS
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split



In [16]:
df = pd.read_table('training_data.txt', sep='\t', skiprows=1, header=None)  # 读入txt文件，分隔符为\t
subject = df.iloc[:, 0]
activity = df.iloc[:, 1]

df = df.iloc[:, 2:]
X = df.values
y = activity.values
y[y>=7]=7

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=123)

df = pd.read_table('test_data.txt', sep='\t', skiprows=1, header=None)  # 读入txt文件，分隔符为\t
subject = df.iloc[:, 0]
df = df.iloc[:, 1:]
X_test = df.values

In [17]:
models = [
    LogisticRegression(max_iter=20000, random_state=seed),
    KNeighborsClassifier(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    SGDClassifier(random_state=seed), 
    RidgeClassifier(random_state=seed),
    AdaBoostClassifier(random_state=seed),
    ExtraTreesClassifier(random_state=seed),
    RandomForestClassifier(random_state=seed),
    BaggingClassifier(random_state=seed),
    SVC(kernel='rbf', gamma=1e-3, C=1000)
]

models_name = [
    'LogisticRegression',
    'KNeighborsClassifier',
    'LinearDiscriminantAnalysis',
    'QuadraticDiscriminantAnalysis',
    'SGDClassifier(',
    'RidgeClassifier',
    'AdaBoostClassifier',
    'ExtraTreesClassifier',
    'RandomForestClassifier',
    'BaggingClassifier',
    'SVM'
]


In [18]:
import warnings
warnings.filterwarnings("ignore")

score=[]
std = []

for i in models: 
    try: 
        history = cross_val_score(estimator=i,X=X,y= y, cv=3, scoring='accuracy')
        score.append(np.mean(history)) 
        std.append(np.std(history))       
        print(i,': DONE') 
    except: 
        score.append('Error Occured') 
        print(i,': ERROR')

LogisticRegression(max_iter=20000, random_state=19) : DONE
KNeighborsClassifier() : DONE
LinearDiscriminantAnalysis() : DONE
QuadraticDiscriminantAnalysis() : DONE
SGDClassifier(random_state=19) : DONE
RidgeClassifier(random_state=19) : DONE
AdaBoostClassifier(random_state=19) : DONE
ExtraTreesClassifier(random_state=19) : DONE
RandomForestClassifier(random_state=19) : DONE
BaggingClassifier(random_state=19) : DONE
SVC(C=1000, gamma=0.001) : DONE


In [19]:
table = pd.DataFrame(models_name) 
table.columns = ['Models'] 
table['Result'] = score
table['Deviation'] = std
table.sort_values(by=["Result"], ascending = False)

Unnamed: 0,Models,Result,Deviation
2,LinearDiscriminantAnalysis,0.952105,0.00908
10,SVM,0.94541,0.01024
5,RidgeClassifier,0.943479,0.012122
0,LogisticRegression,0.940003,0.010673
7,ExtraTreesClassifier,0.91863,0.017346
4,SGDClassifier(,0.916184,0.028312
8,RandomForestClassifier,0.905111,0.022234
1,KNeighborsClassifier,0.888889,0.010556
9,BaggingClassifier,0.862366,0.009234
3,QuadraticDiscriminantAnalysis,0.796318,0.024977


In [20]:
from sklearn.ensemble import VotingClassifier

voting = VotingClassifier(
    estimators=[('ld', LinearDiscriminantAnalysis()),
                ('svm', SVC(kernel='rbf', gamma=1e-3, C=1000, probability=True)), 
                ('log', LogisticRegression(max_iter=20000, random_state=seed)) ], voting="soft")
scores = cross_val_score(voting, X, y, scoring="accuracy", cv=4)
print(scores.mean())


0.9479893071139289


In [21]:
model = VotingClassifier(
    estimators=[('ld', LinearDiscriminantAnalysis()),
                ('svm', SVC(kernel='rbf', gamma=1e-3, C=1000, probability=True)), 
                ('log', LogisticRegression(max_iter=20000, random_state=seed)) ], voting="soft")
model.fit(X, y)


In [26]:
pred = model.predict(X_train)
count = np.sum(y_train==pred)
print(f"Train Data Accuracy: {count/len(y_train)}")

Train Data Accuracy: 0.9934009335264767


In [27]:
pred = model.predict(X_val)
count = np.sum(y_val==pred)
print(f"Validation Data Accuracy: {count/len(y_val)}")

Validation Data Accuracy: 0.9929214929214929
