# Linear SVM

In [None]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

In [None]:
# Load the prepared dataset into a DataFrame.
# NOTE(review): unpickling executes arbitrary code embedded in the file, so
# only load this pickle if it comes from a trusted source.
dataset_path = 'cvr_prediction_20201228.pkl'
df = pd.read_pickle(dataset_path)

In [None]:
# Columns fed to the model as input features.
feature_col_names = [
    'ari_class',
    'bormuth_score', 'bormuth_class',
    'coleman_liau_class',
    'flesch_class', 'flesch_kincaid_class',
    'fog_score', 'fog_class',
    'lix_class',
    'rix_score', 'rix_class',
    'smog_class',
    'strain_class',
    'aws', 'pdw', 'pew', 'ppw', 'psw', 'puw',
    'sentences',
]
# Column holding the label to predict.
predicted_class_names = ['cvr_class']

# Selecting with a list keeps y two-dimensional (one column); later cells
# flatten it where a 1-D label vector is required.
x = df.loc[:, feature_col_names].values
y = df.loc[:, predicted_class_names].values

# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

## Validate distribution

In [None]:
# Report what share of all rows ended up in each split.
total_rows = len(df)
train_share = (len(x_train) / total_rows) * 100
test_share = (len(x_test) / total_rows) * 100

print('{:.2f}% in training set'.format(train_share))
print('{:.2f}% in test set'.format(test_share))

In [None]:
# Class balance of the full dataset: row count and percentage per label.
total_rows = len(df)
negatives = int((df.cvr_class == 0).sum())
positives = int((df.cvr_class == 1).sum())

print('Original 0: {} ({:.2f}%)'.format(negatives, (negatives / total_rows) * 100.0))
print('Original 1: {} ({:.2f}%)'.format(positives, (positives / total_rows) * 100.0))

In [None]:
# Class balance of the training labels: count and percentage per label.
train_rows = len(y_train)
train_negatives = int((y_train == 0).sum())
train_positives = int((y_train == 1).sum())

print('Training 0: {} ({:.2f}%)'.format(train_negatives,
                                        (train_negatives / train_rows * 100.0)))
print('Training 1: {} ({:.2f}%)'.format(train_positives,
                                        (train_positives / train_rows * 100.0)))

In [None]:
# Class balance of the held-out test labels: count and percentage per label.
test_rows = len(y_test)
test_negatives = int((y_test == 0).sum())
test_positives = int((y_test == 1).sum())

print('Test 0: {} ({:.2f}%)'.format(test_negatives,
                                    (test_negatives / test_rows * 100.0)))
print('Test 1: {} ({:.2f}%)'.format(test_positives,
                                    (test_positives / test_rows * 100.0)))

## Train

In [None]:
# Fit a linear SVM using the primal formulation (dual=False).
# The labels are stored as a single-column 2-D array, so flatten them to the
# 1-D vector that fit() expects. fit() returns the estimator itself.
train_labels = y_train.ravel()
model = LinearSVC(dual=False).fit(x_train, train_labels)

# Predicted labels for the held-out test rows.
svm_predict_test = model.predict(x_test)

## Results

In [None]:
# labels=[1, 0] orders rows/columns with class 1 first, then class 0.
confusion = metrics.confusion_matrix(
    y_test,
    svm_predict_test,
    labels=[1, 0],
)
print('Confusion Matrix:')
print('{}'.format(confusion))

In [None]:
# Per-class precision/recall/F1 summary, with class 0 listed before class 1.
report = metrics.classification_report(
    y_test,
    svm_predict_test,
    labels=[0, 1],
)
print(report)

In [None]:
# Overall accuracy on the held-out test set, rounded to 6 decimal places.
# The built-in round() is used instead of the `.round()` method because the
# method exists only on NumPy scalars; it raises AttributeError when
# accuracy_score returns a plain Python float.
# Value recorded from an earlier run (unrounded): 0.7397260273972602
round(metrics.accuracy_score(y_test, svm_predict_test), 6)