In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
dataset = pd.read_csv("data/dataset_train.csv")


In [None]:
def encode_dataset(dataframe):
    dataframe = dataframe.drop("capital-gain", axis=1)
    dataframe = dataframe.drop("capital-loss", axis=1)
#     dataframe = dataframe.drop("native-country", axis=1)
    dataframe["workclass"] = dataframe["workclass"].map({'Private': 0.5, 'Self-emp-not-inc': 0.5,'Local-gov': 0.5, '?':0.5,'State-gov': 0.7, 'Self-emp-inc': 1, 'Federal-gov': 0.85, 'Without-pay': 0.2,'Never-worked':0.1})
    dataframe["education"] = dataframe["education"].map({'HS-grad': 0.4, 'Some-college': 0.4,'Bachelors': 0.75,'Masters': 0.75, 'Assoc-voc': 0.5, '11th': 0.3, 'Assoc-acdm': 0.5,'10th':0.3, '7th-8th': 0.3,'Prof-school':0.95, '9th': 0.3, '12th': 0.3,'Doctorate': 0.95,'5th-6th': 0.3, '1st-4th': 0.1,'Preschool': 0.1})
    dataframe["marital-status"] = dataframe["marital-status"].map({'Married-civ-spouse': 0.7, 'Never-married': 0.2, 'Divorced': 0.2,'Widowed': 0.2, 'Separated': 0.2, 'Married-spouse-absent': 0.5,'Married-AF-spouse': 0.5})
    dataframe["occupation"] = dataframe["occupation"].map({'Prof-specialty': 0.8, 'Craft-repair': 0.5, 'Exec-managerial': 0.8,'Adm-clerical': 0.4, 'Sales': 0.5, 'Other-service': 0.3,'Machine-op-inspct': 0.3, '?': 0.5,'Transport-moving': 0.5, 'Handlers-cleaners': 0.3,'Farming-fishing': 0.4,'Tech-support': 0.45,'Protective-serv': 0.45,'Priv-house-serv': 0.1, 'Armed-Forces': 0.4})
    dataframe["relationship"] = dataframe["relationship"].map({'Husband': 0.9, 'Not-in-family': 0.3,'Own-child': 0.1, 'Wife':0.9,'Other-relative': 0.3, 'Unmarried': 0.3})
    dataframe["race"] = dataframe["race"].map({'White': 0.5, 'Black': 0.3,'Asian-Pac-Islander': 0.5, 'Amer-Indian-Eskimo':0.3,'Other': 0.3})
    dataframe["gender"] = dataframe["gender"].map({'Male': 0.7, 'Female': 0.3})
    dataframe["native-country"] = dataframe["native-country"].map({'Mexico':0.3 ,'Puerto-Rico':0.3 ,'El-Salvador':0.3 ,'Dominican-Republic':0.3 ,'Columbia':0.3, 'United-States' :0.5 , '?' :0.5 , 'Cuba':0.5, 'Philippines' :0.5 , 'Germany' :0.5 , 'China' :0.5 , 'India' :0.5 , 'England' :0.5 , 'Jamaica' :0.5 , 'South' :0.5 , 'Guatemala' :0.5 , 'Vietnam' :0.5 , 'Poland' :0.5 , 'Italy' :0.5 , 'Haiti' :0.5 , 'Portugal' :0.5 , 'Japan' :0.5 , 'Peru' :0.5 , 'Taiwan' :0.5 , 'Nicaragua' :0.5 , 'Ecuador' :0.5 , 'Iran' :0.5 , 'Greece' :0.5 , 'Thailand' :0.5 , 'Trinadad&Tobago' :0.5 , 'Outlying-US(Guam-USVI-etc)' :0.5 , 'Cambodia' :0.5 , 'Ireland' :0.5 , 'Laos' :0.5 , 'France' :0.5 , 'Hong' :0.5 , 'Honduras' :0.5 , 'Scotland' :0.5 , 'Yugoslavia' :0.5 , 'Hungary' :0.5 , 'Holand-Netherlands':0.5, 'Canada':0.7})
    dataframe["income"] = dataframe["income"].map({'<=50K':0, '>50K':1})
    dataframe["age"] = (dataframe["age"]-17)/73
    dataframe["educational-num"] = (dataframe["educational-num"]-1)/15
    dataframe["hours-per-week"]  = dataframe["hours-per-week"].apply(lambda x: 1 if x > 80 else (x-1)/80)
    dataframe["fnlwgt"] = dataframe["fnlwgt"].apply(lambda x: 1 if x > 700000 else (x-12285)/787715)
    
    return dataframe

In [None]:
data_norm = encode_dataset(dataset)
X_dataset = data_norm.drop('income', axis=1)
y_dataset = data_norm['income']

In [None]:
data_norm.head()

In [None]:
pdata = pd.read_csv("data/p_test.csv")
pdata_norm = encode_dataset(pdata)
X_p = pdata_norm.drop('income', axis=1)
y_p = pdata_norm['income']

In [None]:
clf = RandomForestClassifier(n_estimators=300, max_features=5)
clf.fit(X_dataset, y_dataset)

In [None]:
y_pred = clf.predict(X_p)
print("RF accuracy: ", accuracy_score(y_p, y_pred))

In [None]:
clf = RandomForestClassifier(n_estimators=300, max_features=10)
clf.fit(X_dataset, y_dataset)
y_pred = clf.predict(X_p)
print("RF accuracy: ", accuracy_score(y_p, y_pred))

In [None]:
import numpy as np

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 150, stop = 700, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_dataset, y_dataset)

In [None]:
rf_random.best_params_

In [None]:
clf = RandomForestClassifier(n_estimators=516,min_samples_split=5,min_samples_leaf=4,max_features="sqrt",max_depth=80,bootstrap=True)
clf.fit(X_dataset, y_dataset)
y_pred = clf.predict(X_p)
print("RF accuracy: ", accuracy_score(y_p, y_pred))

In [29]:
trainset = pd.read_csv("data/trainset.csv")

In [None]:
def encode_dataset_2(dataframe):
    dataframe["workclass"] = dataframe["workclass"].map({'Private': 0, 'Self-emp-not-inc': 1,'Local-gov': 2, '?':0,'State-gov': 4, 'Self-emp-inc': 5, 'Federal-gov': 6, 'Without-pay': 7,'Never-worked':8 })
    dataframe["education"] = dataframe["education"].map({'HS-grad': 0, 'Some-college': 1,'Bachelors': 2, '?':0,'Masters': 4, 'Assoc-voc': 5, '11th': 6, 'Assoc-acdm': 7,'10th':8, '7th-8th': 9,'Prof-school':10, '9th': 10, '12th': 11,'Doctorate': 12,'5th-6th': 13, '1st-4th': 14,'Preschool': 15   })
    dataframe["marital-status"] = dataframe["marital-status"].map({'Married-civ-spouse': 0, 'Never-married': 1, 'Divorced': 2,'Widowed': 3, 'Separated': 4, 'Married-spouse-absent': 5,'Married-AF-spouse': 6 })
    dataframe["occupation"] = dataframe["occupation"].map({'Prof-specialty': 0, 'Craft-repair': 1, 'Exec-managerial': 2,'Adm-clerical': 3, 'Sales': 4, 'Other-service': 5,'Machine-op-inspct': 6, '?': 0,'Transport-moving': 8, 'Handlers-cleaners': 9,'Farming-fishing': 10,'Tech-support': 11,'Protective-serv': 12,'Priv-house-serv': 13, 'Armed-Forces': 14})
    dataframe["relationship"] = dataframe["relationship"].map({'Husband': 0, 'Not-in-family': 1,'Own-child': 2, 'Wife':3,'Other-relative': 4, 'Unmarried': 5})
    dataframe["race"] = dataframe["race"].map({'White': 0, 'Black': 1,'Asian-Pac-Islander': 2, 'Amer-Indian-Eskimo':3,'Other': 4})
    dataframe["gender"] = dataframe["gender"].map({'Male': 1, 'Female': 0})
#     dataframe["native-country"] = dataframe["native-country"].replace(['United-States', '?'], 0)
#     dataframe["native-country"] = dataframe["native-country"].replace(['Mexico', 'Philippines', 'Puerto-Rico', 'Canada', 'Germany', 'El-Salvador', 'Cuba', 'India', 'England', 'China', 'Dominican-Republic', 'Italy', 'South', 'Columbia', 'Japan', 'Jamaica', 'Poland', 'Guatemala', 'Haiti', 'Vietnam', 'Taiwan', 'Iran', 'Portugal', 'Ecuador', 'Nicaragua', 'Greece', 'Peru', 'Ireland', 'France', 'Cambodia', 'Thailand', 'Hong', 'Trinadad&Tobago', 'Honduras', 'Laos', 'Outlying-US(Guam-USVI-etc)', 'Yugoslavia', 'Hungary', 'Scotland', 'Holand-Netherlands'], 1)
    dataframe["native-country"] = dataframe["native-country"].map({'Mexico':3 ,'Puerto-Rico':3 ,'El-Salvador':3 ,'Dominican-Republic':3 ,'Columbia':3, 'United-States' :0 , '?' :0 , 'Cuba':1, 'Philippines' :1 , 'Germany' :1 , 'China' :1 , 'India' :1 , 'England' :1 , 'Jamaica' :1 , 'South' :1 , 'Guatemala' :1 , 'Vietnam' :1 , 'Poland' :1 , 'Italy' :1 , 'Haiti' :1 , 'Portugal' :1 , 'Japan' :1 , 'Peru' :1 , 'Taiwan' :1 , 'Nicaragua' :1 , 'Ecuador' :1 , 'Iran' :1 , 'Greece' :1 , 'Thailand' :1, 'Trinadad&Tobago' :1 , 'Outlying-US(Guam-USVI-etc)' :1 , 'Cambodia' :1 , 'Ireland': 1 , 'Laos' :1 , 'France' :2 , 'Hong' :2 , 'Honduras' :2 , 'Scotland' :2 , 'Yugoslavia' :2 , 'Hungary' :2 , 'Holand-Netherlands':2, 'Canada':3})
    dataframe["income"] = dataframe["income"].map({'<=50K':0, '>50K':1})
    return dataframe

In [None]:
trainset_norm = encode_dataset_2(trainset)
X_train = trainset_norm.drop('income', axis=1)
y_train = trainset_norm['income']

In [None]:
import numpy as np

In [32]:
X_train.isnull().any()

age                False
workclass          False
fnlwgt             False
education          False
educational-num    False
marital-status     False
occupation         False
relationship       False
race               False
gender             False
capital-gain       False
capital-loss       False
hours-per-week     False
native-country     False
dtype: bool

In [33]:
testset = pd.read_csv("data/testset.csv")
testset_norm = encode_dataset_2(testset)
X_test = testset_norm.drop('income', axis=1)
y_test = testset_norm['income']

In [34]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 600, num = 300)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 210, num = 200)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16, 17, 18, 19, 20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16, 17, 18, 19, 20]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [35]:
X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,21,0,34816,5,11,0,3,3,0,0,0,0,12,0
1,42,1,221172,0,9,2,1,1,0,1,0,0,40,0
2,68,0,286869,9,4,3,0,1,0,0,0,1668,40,0
3,30,0,348592,0,9,0,6,0,0,1,0,0,44,0
4,30,0,94235,2,13,0,0,0,0,1,0,1977,50,0


In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 200, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
rf_random.best_params_

In [None]:
clf = RandomForestClassifier(n_estimators=271,min_samples_split=2,min_samples_leaf=4,max_features="auto",max_depth=40,bootstrap=True)
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)
print("RF2 accuracy: ", accuracy_score(y_test, y_pred))

In [21]:
p_data = pd.read_csv("data/p_test.csv")
p_norm = encode_dataset_2(p_data)
X_p = p_data.drop('income', axis=1)
y_p = p_data['income']

In [22]:
y_ppred = clf.predict(X_p)
print("RF3 accuracy: ", accuracy_score(y_p, y_ppred))

RF3 accuracy:  0.8671307196232982


In [None]:
dataset_2 = pd.read_csv("data/dataset_train.csv")
dts_norm = encode_dataset_2(dataset_2)
X_dts = dts_norm.drop('income', axis=1)
y_dts = dts_norm['income']

In [None]:
clf = RandomForestClassifier(n_estimators=271,min_samples_split=2,min_samples_leaf=4,max_features="auto",max_depth=40,bootstrap=True)
clf.fit(X_dts, y_dts)

In [None]:
y_ppred = clf.predict(X_p)
print("RF3 accuracy: ", accuracy_score(y_p, y_ppred))

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 600, num = 300)]
# Number of features to consider at every split
max_features = [4,5,6,7,8,9,10,11,12,13,14]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 210, num = 200)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16, 17, 18, 19, 20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16, 17, 18, 19, 20]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# KHO MAU LUON NE
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_dts, y_dts)

In [None]:
rf_random.best_params_

In [None]:
clf = RandomForestClassifier(n_estimators=314,min_samples_split=17,min_samples_leaf=2,max_features="sqrt",max_depth=16,bootstrap=True)
clf.fit(X_dts, y_dts)

In [None]:
y_ppred = clf.predict(X_p)
print("RF4 accuracy: ", accuracy_score(y_p, y_ppred))

In [None]:
def write_to_file(list_data, filename):
    list_data = ["<=50K" if item ==0 else ">50K" for item in list_data]
    with open(filename, mode='wt', encoding='utf-8') as myfile:
        myfile.write('\n'.join(str(line) for line in list_data))

In [None]:
write_to_file(y_ppred, "linhph_03.txt")