# Machine Learning Project Part 3 - Feature Selection

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost

In [3]:
# Read the cleaned dataset; the first CSV column becomes the row index.
df = pd.read_csv('./dataset/cleandata.csv', index_col=[0])

# Split the frame into the target column ('income') and the remaining predictors.
y = df['income']
x = df.drop(columns='income')

In [4]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [5]:
# Standardise the features. The scaler's statistics are learned from the
# training split only and then reused, unchanged, on the test split.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

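# PCA

Reduce the standardized features to 70 principal components; every model below is trained and evaluated on this reduced representation.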

In [6]:
# Project the scaled features onto 70 principal components. The projection is
# learned from the training split only and then applied as-is to the test split.
# random_state is pinned because PCA's automatic solver selection can pick a
# randomized SVD, which would otherwise let the components vary between runs.
pca = PCA(n_components=70, random_state=0)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

### 1. Logistic Regression on PCA-transformed features

In [None]:
# Fit a seeded logistic regression on the PCA-reduced training data,
# then predict labels for the PCA-reduced test data.
logistic = LogisticRegression(random_state=0).fit(x_train_pca, y_train)
logistic_pred = logistic.predict(x_test_pca)

In [None]:
# Per-class precision / recall / F1 for the logistic regression predictions.
print(classification_report(y_true=y_test, y_pred=logistic_pred))

### 2. SVM on PCA-transformed features

In [None]:
# Linear SVM on the PCA-reduced features. random_state is fixed so repeated
# runs produce the same model (the solver may shuffle the data internally),
# in line with the seeded logistic regression, random forest and AdaBoost cells.
SVM_model = LinearSVC(random_state=0)
SVM_model.fit(x_train_pca, y_train)
SVM_pred = SVM_model.predict(x_test_pca)

In [None]:
# Per-class precision / recall / F1 for the linear SVM predictions.
print(classification_report(y_true=y_test, y_pred=SVM_pred))

### 3. Decision Tree on PCA-transformed features

In [None]:
# Decision tree on the PCA-reduced features. random_state is fixed because the
# tree builder breaks ties between equally good splits at random, so an
# unseeded tree can differ from run to run on identical data.
Dtree = DecisionTreeClassifier(random_state=0)
Dtree.fit(x_train_pca, y_train)
Dtree_pred = Dtree.predict(x_test_pca)

In [None]:
# Per-class precision / recall / F1 for the decision tree predictions.
# classification_report already returns a string, so no f-string wrapper is needed.
print(classification_report(y_test, Dtree_pred))

### 4. KNN on PCA-transformed features

In [7]:
# 1-nearest-neighbour classifier on the PCA-reduced features: each test row is
# labelled from its single closest training row.
knn = KNeighborsClassifier(n_neighbors=1).fit(x_train_pca, y_train)
knn_pred = knn.predict(x_test_pca)

In [None]:
# Per-class precision / recall / F1 for the KNN predictions.
# classification_report already returns a string, so no f-string wrapper is needed.
print(classification_report(y_test, knn_pred))

### 5. Random Forest on PCA-transformed features

In [None]:
# Random forest on the PCA-reduced features: 50 trees, built in parallel
# (n_jobs=-1), with a fixed seed so the ensemble is repeatable.
randomforest = RandomForestClassifier(
    n_estimators=50,
    n_jobs=-1,
    random_state=42,
)
randomforest.fit(x_train_pca, y_train)
randomforest_pred = randomforest.predict(x_test_pca)

In [None]:
# Per-class precision / recall / F1 for the random forest predictions.
# classification_report already returns a string, so no f-string wrapper is needed.
print(classification_report(y_test, randomforest_pred))

### 6. AdaBoost on PCA-transformed features

In [None]:
# AdaBoost on the PCA-reduced features: 100 boosting rounds over depth-3
# decision trees, with a fixed seed on the ensemble.
weak_learner = DecisionTreeClassifier(max_depth=3)
ada = AdaBoostClassifier(weak_learner, n_estimators=100, random_state=10)
ada.fit(x_train_pca, y_train)
ada_pred = ada.predict(x_test_pca)

In [None]:
# Per-class precision / recall / F1 for the AdaBoost predictions.
# classification_report already returns a string, so no f-string wrapper is needed.
print(classification_report(y_test, ada_pred))

### 7. XGBoost on PCA-transformed features

In [None]:
# XGBoost classifier with its default hyperparameters, trained on the
# PCA-reduced training data and used to predict the PCA-reduced test data.
xg = xgboost.XGBClassifier()
xg.fit(X=x_train_pca, y=y_train)
xg_pred = xg.predict(X=x_test_pca)

In [None]:
# Per-class precision / recall / F1 for the XGBoost predictions.
# classification_report already returns a string, so no f-string wrapper is needed.
print(classification_report(y_test, xg_pred))