In [1]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# from keras.utils import to_categorical
from sklearn import metrics
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer

import pickle
import numpy as np # linear algebra
import pandas as pd

In [2]:
#read csv
dict_label = {
    'Datetime':0, 
    'Sentence':1, 
    'Custom Object': 2, 
    'URL': 3, 
    'Numbers': 4, 
    'List': 5}
data = pd.read_csv('data/needs_extraction_data/labelled_data.csv')

data['y_act'] = [dict_label[i] for i in data['y_act']]
y = data.loc[:,['y_act']]

In [3]:
data1 = data[['%_nans', 'mean_word_count', 'std_dev_word_count', 'has_delimiters']]
data1 = data1.fillna(0)

data1 = data1.rename(columns={'mean_word_count': 'scaled_mean_token_count', 'std_dev_word_count': 'scaled_std_dev_token_count', '%_nans': 'scaled_perc_nans'})
data1.loc[data1['scaled_mean_token_count'] > 10000, 'scaled_mean_token_count'] = 10000
data1.loc[data1['scaled_mean_token_count'] < -10000, 'scaled_mean_token_count'] = -10000
data1.loc[data1['scaled_std_dev_token_count'] > 10000, 'scaled_std_dev_token_count'] = 10000
data1.loc[data1['scaled_std_dev_token_count'] < -10000, 'scaled_std_dev_token_count'] = -10000
data1.loc[data1['scaled_perc_nans'] > 10000, 'scaled_perc_nans'] = 10000
data1.loc[data1['scaled_perc_nans'] < -10000, 'scaled_perc_nans'] = -10000
column_names_to_normalize = ['scaled_mean_token_count', 'scaled_std_dev_token_count','scaled_perc_nans']
x = data1[column_names_to_normalize].values
x = np.nan_to_num(x)
x_scaled = StandardScaler().fit_transform(x)
df_temp = pd.DataFrame(x_scaled, columns=column_names_to_normalize, index = data1.index)
data1[column_names_to_normalize] = df_temp

y.y_act = y.y_act.astype(float)

print(f"> Data mean: \n{data1.mean()}")
print(f"> Data median: \n{data1.median()}")
print(f"> Data stdev: \n{data1.std()}")

# data1.to_csv('before.csv')
# f = open('current.txt','w')
# d = enchant.Dict("en_US")

# for i in data.index:
#     ival = data.at[i,'Attribute_name']
#     if ival != 'id' and d.check(ivadf_tempdata1)
#         print(f,ival)
#         print(f,y.at[i,'y_act'])
#         data1.at[i,'dictionary_item'] = 1
#     else:
#         data1.at[i,'dictionary_item'] = 0

# data1.to_csv('after.csv')
# f.close()
# print(data1.columns)

> Data mean: 
scaled_perc_nans             -2.745801e-16
scaled_mean_token_count      -1.117919e-16
scaled_std_dev_token_count   -2.236863e-17
has_delimiters                3.105360e-01
dtype: float64
> Data median: 
scaled_perc_nans             -0.653046
scaled_mean_token_count      -0.144106
scaled_std_dev_token_count   -0.171320
has_delimiters                0.000000
dtype: float64
> Data stdev: 
scaled_perc_nans              1.000925
scaled_mean_token_count       1.000925
scaled_std_dev_token_count    1.000925
has_delimiters                0.463141
dtype: float64


In [4]:
print("===[VECTORIZATION]===")
arr = data['Attribute_name'].values
data = data.fillna(0)
arr1 = data['sample_1'].values
arr1 = [str(x) for x in arr1]
arr2 = data['sample_2'].values
arr2 = [str(x) for x in arr2]

vectorizer = CountVectorizer(ngram_range=(3, 3), analyzer='char')
X = vectorizer.fit_transform(arr)
X1 = vectorizer.fit_transform(arr1)
X2 = vectorizer.fit_transform(arr2)

print(f"> Length of vectorized feature_names: {len(vectorizer.get_feature_names())}")

data1.to_csv('data/preprocessing/before.csv')
tempdf = pd.DataFrame(X.toarray())
tempdf1 = pd.DataFrame(X1.toarray())
tempdf2 = pd.DataFrame(X2.toarray())

data2 = pd.concat([data1, tempdf, tempdf1, tempdf2], axis=1, sort=False)
data2.to_csv('data/preprocessing/after.csv')
data2.head()

X_train, X_test, y_train, y_test = train_test_split(
    data2, y, test_size=0.2, random_state=100)
atr_train,atr_test = train_test_split(data2, test_size=0.2,random_state=100)

# X_train_train, X_test_train,y_train_train,y_test_train = train_test_split(X_train,y_train, test_size=0.25)
# print(X_train.head())
# print(y_train.head())

X_train_new = X_train.reset_index(drop=True)
y_train_new = y_train.reset_index(drop=True)
print(f"X_train preview: {X_train.head()}")
print(f"y_train preview: {y_train.head()}")

X_train_new = X_train_new.values
y_train_new = y_train_new.values

===[VECTORIZATION]===
> Length of vectorized feature_names: 8528
X_train preview:      scaled_perc_nans  scaled_mean_token_count  scaled_std_dev_token_count  \
453         -0.653097                 0.686283                    3.364514   
43          -0.653120                 0.162079                   -0.054513   
133          1.978459                -0.148544                   -0.167108   
205         -0.653120                -0.141062                   -0.175870   
282         -0.653120                -0.148960                   -0.175870   

     has_delimiters  0  1  2  3  4  5  ...   8518  8519  8520  8521  8522  \
453            True  0  0  0  0  0  0  ...      0     0     0     0     0   
43             True  0  0  0  0  0  0  ...      0     0     0     0     0   
133            True  0  0  0  0  0  0  ...      0     0     0     0     0   
205           False  0  0  0  0  0  0  ...      0     0     0     0     0   
282           False  0  0  0  0  0  0  ...      0     0     0   

In [5]:
k = 5
kf = KFold(n_splits=k)
avg_train_acc, avg_test_acc = 0, 0

cvals = [0.1, 1, 10, 100, 1000]
gamavals = [0.0001, 0.001, 0.01, 0.1, 1, 10]


avgsc_lst, avgsc_train_lst, avgsc_hld_lst = [], [], []
avgsc, avgsc_train, avgsc_hld = 0, 0, 0

for train_index, test_index in kf.split(X_train_new):
    X_train_cur, X_test_cur = X_train_new[train_index], X_train_new[test_index]
    y_train_cur, y_test_cur = y_train_new[train_index], y_train_new[test_index]
    X_train_train, X_val, y_train_train, y_val = train_test_split(
        X_train_cur, y_train_cur, test_size=0.25, random_state=100)

    bestPerformingModel = svm.SVC(
        C=100, decision_function_shape="ovo", gamma=0.001, probability=True)
    bestscore = 0
    for cval in cvals:
        for gval in gamavals:
            clf = svm.SVC(C=cval, decision_function_shape="ovo", gamma=gval, probability=True)
            clf.fit(X_train_train, y_train_train)
            sc = clf.score(X_val, y_val)
            print(f"[C: {cval}, accuracy: {sc}]")
            if bestscore < sc:
                bestscore = sc
                bestPerformingModel = clf
#                 print(bestPerformingModel)

    bscr_train = bestPerformingModel.score(X_train_cur, y_train_cur)
    bscr = bestPerformingModel.score(X_test_cur, y_test_cur)
    bscr_hld = bestPerformingModel.score(X_test, y_test)

    avgsc_train_lst.append(bscr_train)
    avgsc_lst.append(bscr)
    avgsc_hld_lst.append(bscr_hld)

    avgsc_train = avgsc_train + bscr_train
    avgsc = avgsc + bscr
    avgsc_hld = avgsc_hld + bscr_hld

    print('='*10)
    print(f"> Best training score: {bscr_train}")
    print(f"> Best test score: {bscr}")
    print(f"> Best held score: {bscr_hld}")
print('='*10)

  y = column_or_1d(y, warn=True)


> Best training score: 0.9623188405797102
> Best test score: 0.7701149425287356
> Best held score: 0.8165137614678899
> Best training score: 0.9420289855072463
> Best test score: 0.7931034482758621
> Best held score: 0.8532110091743119
> Best training score: 0.9364161849710982
> Best test score: 0.7906976744186046
> Best held score: 0.8532110091743119
> Best training score: 0.9479768786127167
> Best test score: 0.7906976744186046
> Best held score: 0.8532110091743119
> Best training score: 0.9479768786127167
> Best test score: 0.8023255813953488
> Best held score: 0.8165137614678899


In [6]:
print(f"> Average training score list: {avgsc_train_lst}")
print(f"> Average testing score list: {avgsc_lst}")
print(f"> Average held score list: {avgsc_hld_lst}")
print()
print(f"> Average training score list/k: {avgsc_train/k}")
print(f"> Average testing score list/k: {avgsc/k}")
print(f"> Average held score list/k: {avgsc_hld/k}")
print()
y_pred = bestPerformingModel.predict(X_test)
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: Actual (Row) vs Predicted (Column)')
print(cnf_matrix)

> Average training score list: [0.9623188405797102, 0.9420289855072463, 0.9364161849710982, 0.9479768786127167, 0.9479768786127167]
> Average testing score list: [0.7701149425287356, 0.7931034482758621, 0.7906976744186046, 0.7906976744186046, 0.8023255813953488]
> Average held score list: [0.8165137614678899, 0.8532110091743119, 0.8532110091743119, 0.8532110091743119, 0.8165137614678899]

> Average training score list/k: 0.9473435536566976
> Average testing score list/k: 0.7893878642074312
> Average held score list/k: 0.8385321100917432

Confusion Matrix: Actual (Row) vs Predicted (Column)
[[24  0  3  0  0  0]
 [ 0 15  7  0  0  0]
 [ 0  4 48  0  0  0]
 [ 0  0  0  2  0  0]
 [ 0  0  1  0  0  0]
 [ 0  2  3  0  0  0]]


In [7]:
# save the model to disk
filename = 'data/pretrained/svm_finalized_model.pickle'
pickle.dump(bestPerformingModel, open(filename, 'wb'))

# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
y_prob = bestPerformingModel.predict_proba(X_test)

df = pd.DataFrame.from_records(y_prob)
print(df)
df.to_csv('data/model_predictions/svm_predictions.csv', index=False)

[[6.88232656e-02 1.96000095e-01 5.77570078e-01 1.94492187e-02
  5.48207777e-02 8.33365659e-02]
 [1.15221634e-03 4.25933140e-01 5.37585057e-02 4.55662726e-01
  9.21192339e-04 6.25722192e-02]
 [9.68496108e-01 9.96144982e-03 4.34072162e-03 5.03628343e-03
  7.01386725e-03 5.15157005e-03]
 [9.43835003e-01 1.23136220e-02 1.37453207e-02 5.49645085e-03
  1.74741769e-02 7.13542669e-03]
 [1.46545964e-02 2.40406198e-01 5.42337477e-01 2.65797248e-02
  1.09191671e-02 1.65102837e-01]
 [2.02686398e-01 7.22981549e-02 6.98703725e-01 4.71469547e-03
  1.14255611e-02 1.01714656e-02]
 [2.21161833e-02 4.79545083e-02 8.93136346e-01 4.41783910e-03
  2.09284464e-02 1.14466770e-02]
 [9.59720963e-01 1.38022432e-02 7.08591175e-03 5.67572317e-03
  7.39432631e-03 6.32083234e-03]
 [6.79211202e-01 7.24444541e-02 6.70622202e-02 1.66061680e-02
  9.03307487e-02 7.43452068e-02]
 [1.23680273e-02 5.16530990e-01 3.41744581e-01 1.80273074e-02
  1.37366976e-02 9.75923961e-02]
 [1.68988382e-01 8.24090242e-02 5.56500250e-01 1.3