In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import tree, linear_model
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
%matplotlib inline

In [2]:
rawdata=pd.read_csv('hr-analytics.csv')
rawdata.fillna(rawdata.mean())
rawdata.dropna(axis=1,inplace=True)
rawdata=pd.get_dummies(rawdata)
#rawdata

In [3]:
y=rawdata['left']
X=rawdata.drop(columns='left')
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [4]:
start_time = time.time()
clf = tree.DecisionTreeClassifier()
left_clf = clf.fit(X_train,y_train)
pred = left_clf.predict(X_test)
elapsed_time = time.time() - start_time
print('accuracy of decision tree:',accuracy_score(y_test,pred))
print('Time:',elapsed_time)

accuracy of decision tree: 0.9742222222222222
Time: 0.028782129287719727


In [5]:
start_time=time.time()
svm = SVC(kernel='linear', probability=True)
svm.fit(X_train,y_train)
pred=svm.predict(X_test)
elapsed_time=time.time()-start_time
print('accuracy of SVM:',accuracy_score(y_test,pred))
print('Time:',elapsed_time)

accuracy of SVM: 0.7717777777777778
Time: 144.56733083724976


In [6]:
start_time=time.time()
logistic_regr=linear_model.LogisticRegression()
logistic_regr.fit(X_train,y_train)
pred=logistic_regr.predict(X_test)
accuracy_score(y_test,pred)
elapsed_time=time.time()-start_time
print('accuracy of logistic_regr:',accuracy_score(y_test,pred))
print('Time:',elapsed_time)

accuracy of logistic_regr: 0.7855555555555556
Time: 0.10630011558532715


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## 模型探討：
Decision tree 較準確且執行時間較短



# Titanic:

In [7]:
rawdata=pd.read_csv('titanic_train.csv')
rawdata=rawdata.drop(['name','body','home.dest','cabin','ticket'],axis=1)
rawdata=rawdata.fillna(rawdata.mean())
#rawdata.dropna(axis=1,inplace=True)
rawdata=pd.get_dummies(rawdata)
#rawdata

In [8]:
y=rawdata['survived']
X=rawdata.drop(columns='survived')
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [None]:
start_time=time.time()
svm = SVC(kernel='linear', probability=True)
svm.fit(X_train,y_train)
pred=svm.predict(X_test)
elapsed_time=time.time()-start_time
matrix=confusion_matrix(y_test,pred)
class_names=[0,1]
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
print('f1_score of SVM:',f1_score(y_test, pred))
print('Time:',elapsed_time)

In [None]:
start_time=time.time()
forest=RandomForestClassifier(criterion='entropy',n_estimators=10,random_state=3,n_jobs=6)
forest.fit(X_train,y_train)
pred=forest.predict(X_test)
elapsed_time=time.time()-start_time
matrix=confusion_matrix(y_test,pred)
class_names=[0,1]
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
print('f1_score of Random Forest:',f1_score(y_test, pred))
print('Time:',elapsed_time)

In [None]:
start_time=time.time()
model = XGBClassifier(max_depth=2, objective='binary:logistic')
model.fit(X_train,y_train)
pred=model.predict(X_test)


elapsed_time=time.time()-start_time
matrix=confusion_matrix(y_test,pred)
class_names=[0,1]
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
print('f1_score of XGBoost:',f1_score(y_test, pred))
print('Time:',elapsed_time)

In [None]:
ori_test=pd.read_csv('titanic_test.csv')
X_test=ori_test.drop(['name','body','home.dest','cabin','ticket'],axis=1)
X_test=X_test.fillna(X_test.mean())
X_test=pd.get_dummies(X_test)
missing_cols = set( X_train.columns ) - set( X_test.columns )
# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    X_test[c] = 0
X_test = X_test[X_train.columns]
pred=svm.predict(X_test)
dic={'Name':ori_test['name'],'Survival':pred}
df=pd.DataFrame(dic)
df.to_csv('titanic_pred.csv',index=0)