In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier
from sklearn.model_selection import train_test_split,cross_val_score

In [2]:
# Input file containing data
input_file = './files/income_data.txt'

In [3]:
X = list()
y = list()
count_class1 = 0
count_class2 = 0
max_datapoints = 25000


In [4]:
with open(input_file, 'r') as f:
    for line in f.readlines():
        # print(line)
        if count_class1 >= max_datapoints and count_class2 >= max_datapoints:
            break
        if '?' in line:
            continue
        
        # print(line.split(','))
        data = line.replace('\n','').split(',')
        data = [_data.strip() for _data in data]
        # [:-1]
        # print(data)
        if data[-1] == '<=50K' and count_class1 < max_datapoints:
            X.append(data)
            count_class1 += 1
        if data[-1] == '>50K' and count_class2 < max_datapoints:
            X.append(data)
            count_class2 += 1
X   

[['39',
  'State-gov',
  '77516',
  'Bachelors',
  '13',
  'Never-married',
  'Adm-clerical',
  'Not-in-family',
  'White',
  'Male',
  '2174',
  '0',
  '40',
  'United-States',
  '<=50K'],
 ['50',
  'Self-emp-not-inc',
  '83311',
  'Bachelors',
  '13',
  'Married-civ-spouse',
  'Exec-managerial',
  'Husband',
  'White',
  'Male',
  '0',
  '0',
  '13',
  'United-States',
  '<=50K'],
 ['38',
  'Private',
  '215646',
  'HS-grad',
  '9',
  'Divorced',
  'Handlers-cleaners',
  'Not-in-family',
  'White',
  'Male',
  '0',
  '0',
  '40',
  'United-States',
  '<=50K'],
 ['53',
  'Private',
  '234721',
  '11th',
  '7',
  'Married-civ-spouse',
  'Handlers-cleaners',
  'Husband',
  'Black',
  'Male',
  '0',
  '0',
  '40',
  'United-States',
  '<=50K'],
 ['28',
  'Private',
  '338409',
  'Bachelors',
  '13',
  'Married-civ-spouse',
  'Prof-specialty',
  'Wife',
  'Black',
  'Female',
  '0',
  '0',
  '40',
  'Cuba',
  '<=50K'],
 ['37',
  'Private',
  '284582',
  'Masters',
  '14',
  'Married-civ-s

In [5]:
# Convert the list into a numpy array so that it can be used as an input to the sklearn function:
# Convert to numpy array
X = np.array(X)
X

array([['39', 'State-gov', '77516', ..., '40', 'United-States', '<=50K'],
       ['50', 'Self-emp-not-inc', '83311', ..., '13', 'United-States',
        '<=50K'],
       ['38', 'Private', '215646', ..., '40', 'United-States', '<=50K'],
       ...,
       ['58', 'Private', '151910', ..., '40', 'United-States', '<=50K'],
       ['22', 'Private', '201490', ..., '20', 'United-States', '<=50K'],
       ['52', 'Self-emp-inc', '287927', ..., '40', 'United-States',
        '>50K']], dtype='<U26')

In [6]:
"""
If any attribute is a string, it needs to be encoded. If it is a number, it can be kept as is.
Note that we will end up with multiple label encoders and we need to keep track of
all of them:
"""
# print(X[0])
# print(X.shape)
# Convert string data to numerical data
label_encoder = []
X_encoded = np.empty(X.shape)
for i,item in enumerate(X[0]):
    if item.isdigit():
        X_encoded[:, i] = X[:, i]
    else:
        label_encoder.append(preprocessing.LabelEncoder())
        X_encoded[:, i] = label_encoder[-1].fit_transform(X[:, i])


In [7]:
label_encoder

[LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder()]

In [8]:
X_encoded

array([[3.90000e+01, 5.00000e+00, 7.75160e+04, ..., 4.00000e+01,
        3.80000e+01, 0.00000e+00],
       [5.00000e+01, 4.00000e+00, 8.33110e+04, ..., 1.30000e+01,
        3.80000e+01, 0.00000e+00],
       [3.80000e+01, 2.00000e+00, 2.15646e+05, ..., 4.00000e+01,
        3.80000e+01, 0.00000e+00],
       ...,
       [5.80000e+01, 2.00000e+00, 1.51910e+05, ..., 4.00000e+01,
        3.80000e+01, 0.00000e+00],
       [2.20000e+01, 2.00000e+00, 2.01490e+05, ..., 2.00000e+01,
        3.80000e+01, 0.00000e+00],
       [5.20000e+01, 3.00000e+00, 2.87927e+05, ..., 4.00000e+01,
        3.80000e+01, 1.00000e+00]])

In [9]:
X = X_encoded[:, :-1].astype(int)
X

array([[    39,      5,  77516, ...,      0,     40,     38],
       [    50,      4,  83311, ...,      0,     13,     38],
       [    38,      2, 215646, ...,      0,     40,     38],
       ...,
       [    58,      2, 151910, ...,      0,     40,     38],
       [    22,      2, 201490, ...,      0,     20,     38],
       [    52,      3, 287927, ...,      0,     40,     38]])

In [10]:
y = X_encoded[:, -1].astype(int)
y

array([0, 0, 0, ..., 0, 0, 1])

In [11]:
# Create the SVM classifier with a linear kernel:
classifier = OneVsOneClassifier(LinearSVC(random_state=0))

In [12]:
# Train the classifier
classifier.fit(X, y)

In [13]:
# Perform cross-validation using an 80/20 split for training and testing, and then predict the output for the training data:
# Cross validation
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=5)

In [14]:
classifier = OneVsOneClassifier(LinearSVC(random_state=0))
classifier.fit(X_train, y_train)

In [15]:
y_test_pred = classifier.predict(X_test)

In [16]:
# Compute the F1 score of the SVM classifier
f1 = cross_val_score(classifier, X, y, scoring='f1_weighted', cv=3)
print("F1 score: " + str(round(100*f1.mean(), 2)) + "%")

F1 score: 76.09%


In [17]:
# Now that the classifier is ready, let's see how to take a random input datapoint and predict the output. Let's define one such datapoint:
# Predict output for a test datapoint
input_data = ['37', 'Private', '215646', 'HS-grad', '9', 'Nevermarried','Handlers-cleaners', 'Not-in-family', 'White', 'Male', '0','0', '40', 'United-States']
input_data

['37',
 'Private',
 '215646',
 'HS-grad',
 '9',
 'Nevermarried',
 'Handlers-cleaners',
 'Not-in-family',
 'White',
 'Male',
 '0',
 '0',
 '40',
 'United-States']

In [18]:
# Before predictions can be performed, the datapoint needs to be encoded using the label encoders created earlier:
# Encode test datapoint
input_data_encoded = [-1] * len(input_data)
input_data_encoded

[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]

In [19]:
count = 0
for i, item in enumerate(input_data):
    if item.isdigit():
        input_data_encoded[i] = int(input_data[i])
    else:
        _label_encoder = label_encoder[count].transform(input_data[i])
        _label_encoder = int(_label_encoder)
        input_data_encoded[i] = _label_encoder
       
        count += 1
input_data_encoded = np.array(input_data_encoded)
input_data_encoded

ValueError: y should be a 1d array, got an array of shape () instead.

In [209]:
# Run classifier on encoded datapoint and print output
predicted_class = classifier.predict(input_data_encoded)
print(label_encoder[-1].inverse_transform(predicted_class)[0])

ValueError: Expected 2D array, got 1D array instead:
array=[37 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.