In [13]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
train_data = pd.read_csv('data/train.csv', low_memory=False)
test_data = pd.read_csv('data/test.csv', low_memory=False)

In [3]:
combined_set = pd.concat([train_data, test_data])

In [4]:
combined_set['combined_var'] = (combined_set.hair_length * .40) + (combined_set.has_soul * .40)

In [5]:
# Replace categorical variables with numbers
def label_encoding(df, col):
    label_map = { key: float(n) for n, key in enumerate(df[col].unique()) }
    label_reverse_map = { label_map[key]: key for key in label_map }
    df[col] = df[col].apply(lambda x: label_map[x])
    return df, label_map, label_reverse_map

combined_set, _, _ = label_encoding(combined_set, 'color')

In [7]:
train_set = combined_set[:len(train_data.index)]
test_set = combined_set[len(train_data.index):]

train_set, type_label_map, type_label_reverse_map = label_encoding(train_set, 'type')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [8]:
train_cols = ['combined_var', 'rotting_flesh', 'bone_length', 'has_soul']
target_var = 'type'
selected_cols = train_cols + [target_var]

In [10]:
linear_svm = LinearSVC(random_state=7)

In [14]:
train, val = train_test_split(train_set[selected_cols], stratify = train_set[target_var])

In [15]:
linear_svm.fit(train[train_cols], train[target_var])

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=7, tol=0.0001,
     verbose=0)

In [19]:
print(classification_report(linear_svm.predict(val[train_cols]), val[target_var]))

             precision    recall  f1-score   support

        0.0       0.88      0.64      0.74        45
        1.0       0.32      0.67      0.43        15
        2.0       0.90      0.79      0.84        33

avg / total       0.80      0.70      0.73        93



In [25]:
predictions = linear_svm.predict(test_set[train_cols])

In [23]:
linear_svm.coef_

array([[ 3.42424637, -0.57147762,  2.10928217,  1.05695275],
       [ 0.52519143, -1.61238525, -0.71638699, -0.17406447],
       [-4.3937979 ,  2.47724637, -1.79212444, -1.05840626]])

In [26]:
sub = pd.DataFrame({'id': test_set.id, 'type': predictions})
sub['type'] = sub.type.apply(lambda x: type_label_reverse_map[x])
sub.to_csv('submission6_linear_vsm.csv', index=False)