In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('dataset/adult.data', engine='python', delimiter=', ',encoding='latin1', header=None)
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'class']  
df.drop(labels=['fnlwgt', 'education_num'], axis=1, inplace=True)
df = df[df.workclass != '?']
df = df[df.occupation != '?']
df = df[df.native_country != '?']
df.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
mappings = {}
mappings['workclass'] = {'Private':0, 'Self-emp-not-inc':1, 'Self-emp-inc':1, 'Federal-gov':2, 'Local-gov':2, 'State-gov':2, 'Without-pay':3, 'Never-worked':3}
mappings['education'] = {'Preschool':0, '1st-4th':0, '5th-6th':1, '7th-8th': 1, '9th':1, '10th':1, '11th':1, '12th':1,'HS-grad':2, 'Prof-school':3, 'Assoc-acdm':4, 'Assoc-voc':4,'Some-college':5, 'Bachelors':6,  'Masters':7, 'Doctorate':8 }
mappings['marital_status'] = {'Married-civ-spouse':0, 'Divorced':1, 'Never-married':2, 'Separated':3, 'Widowed':4, 'Married-spouse-absent':5, 'Married-AF-spouse':6}
mappings['occupation'] = {'Tech-support':0, 'Craft-repair':1, 'Other-service':2, 'Sales':3, 'Exec-managerial':4, 'Prof-specialty':5, 'Handlers-cleaners':6, 'Machine-op-inspct':7, 'Adm-clerical':8, 'Farming-fishing':9, 'Transport-moving':10, 'Priv-house-serv':11, 'Protective-serv':12, 'Armed-Forces':13}
mappings['relationship'] = {'Not-in-family':0, 'Husband':1, 'Wife':2, 'Own-child':3, 'Unmarried':4, 'Other-relative':5}
mappings['race'] = {'White':0, 'Black':1, 'Asian-Pac-Islander':2, 'Amer-Indian-Eskimo':3, 'Other':4}
mappings['sex'] = {'Male':0, 'Female':1}
mappings['native_country'] = {'United-States':0, 'Cambodia':2, 'England':0, 'Puerto-Rico':2, 'Canada':0, 'Germany':0, 'Outlying-US(Guam-USVI-etc)':2, 'India':2, 'Japan':'8', 'Greece':1, 'South':1, 'China':1, 'Cuba':2, 'Iran':2, 'Honduras':2, 'Philippines':2, 'Italy':1, 'Poland':1, 'Jamaica':2, 'Vietnam':2, 'Mexico':2, 'Portugal':1, 'Ireland':0, 'France':0, 'Dominican-Republic':2, 'Laos':2, 'Ecuador':2, 'Taiwan':2, 'Haiti':2, 'Columbia':2, 'Hungary':1, 'Guatemala':2, 'Nicaragua':2, 'Scotland':0, 'Thailand':2, 'Yugoslavia':2, 'El-Salvador':2, 'Trinadad&Tobago':2, 'Peru':2, 'Hong':1, 'Holand-Netherlands':0}
mappings['class'] = {'<=50K':0, '>50K':1}
for k in mappings:
    df[k] = df[k].map(mappings[k])

bins = [-1000, df['capital_gain'].median(), df['capital_gain'].max(), 100000000]
df['capital_gain'] = pd.cut(df['capital_gain'], bins=bins, labels=[0,1,2]).to_frame()

bins = [-1000, df['capital_loss'].median(), df['capital_loss'].max(), 100000000]
df['capital_loss'] = pd.cut(df['capital_loss'], bins=bins, labels=[0,1,2]).to_frame()

bins = [-1000, 30, 55, 100000000]
df['hours_per_week'] = pd.cut(df['hours_per_week'], bins=bins, labels=[0,1,2]).to_frame()

bins = [-100, 23, 40, 65, 1000]
df['age'] = pd.cut(df['age'], bins=bins, labels=[0,1,2,3]).to_frame()
df.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class
0,1,2,6,2,8,0,0,0,1,0,1,0,0
1,2,1,6,0,4,1,0,0,0,0,0,0,0
2,1,0,2,1,6,0,0,0,0,0,1,0,0
3,2,0,1,0,6,1,1,0,0,0,1,0,0
4,1,0,6,0,5,2,1,1,0,0,1,2,0


In [3]:
from sklearn.model_selection import train_test_split

y=df.as_matrix(columns=[df.columns[-1]])
#df.drop('class',axis=1,inplace=True)
X=df.as_matrix(columns=df.columns[:-1])
#np.delete(X, np.s_[12], axis=1)  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(X_train)

[[2 0 2 ..., 1 1 0]
 [1 0 5 ..., 0 2 0]
 [1 2 2 ..., 0 1 0]
 ..., 
 [2 0 6 ..., 0 1 0]
 [2 1 6 ..., 0 2 0]
 [1 0 2 ..., 0 1 0]]


  This is separate from the ipykernel package so we can avoid doing imports until
  """


In [4]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
print(X_train_std)

[[ 0.94548441 -0.55407703 -0.85982452 ...,  4.4660314   0.12142347
  -0.25005579]
 [-0.41219853 -0.55407703  0.64597981 ..., -0.22391244  2.22274616
  -0.25005579]
 [-0.41219853  2.20363827 -0.85982452 ..., -0.22391244  0.12142347
  -0.25005579]
 ..., 
 [ 0.94548441 -0.55407703  1.14791459 ..., -0.22391244  0.12142347
  -0.25005579]
 [ 0.94548441  0.82478062  1.14791459 ..., -0.22391244  2.22274616
  -0.25005579]
 [-0.41219853 -0.55407703 -0.85982452 ..., -0.22391244  0.12142347
  -0.25005579]]




In [5]:
from sklearn.decomposition import PCA

pca = PCA()
X_train_pca = pca.fit_transform(X_train_std)

pca.explained_variance_ratio_

array([ 0.16867888,  0.10658178,  0.10226796,  0.08906505,  0.08685772,
        0.07442369,  0.07308282,  0.0695445 ,  0.066327  ,  0.05893579,
        0.05838208,  0.04585273])

In [6]:
plt.bar(range(12), pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.step(range(12), np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.show()



In [34]:
%matplotlib inline 
import matplotlib.pyplot as plt 
import matplotlib.gridspec as gridspec

from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version

from numpy import linalg as LA
from matplotlib.colors import ListedColormap
from sklearn.svm import LinearSVC


svm1=LinearSVC(C=1.0, max_iter=4000, tol=1e-05)

svm1=svm1.fit(X,y)
prediction = [d for d in svm1.predict(X_test_std)]
expected = [d[0] for d in y_test]

yay = 0
nay = 0
for i in range(0, len(prediction)):
    if expected[i] == prediction[i]:
        yay += 1
    else:
        nay += 1
print("Total tested: " + str(len(prediction)))
print("Yay: " + str(yay))
print("Nay: " + str(nay))
print("Precision: " + str(yay/len(prediction)))

  y = column_or_1d(y, warn=True)


Total tested: 9049
Yay: 6979
Nay: 2070
Precision: 0.771245441485247


