In [1]:
from leven import levenshtein  
from sklearn.neighbors import DistanceMetric
from sklearn.metrics import accuracy_score
from statistics import mode
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from scipy import stats
import pandas as pd
import numpy as np 
import time
import editdistance

# Preprocessing

In [2]:
#read csv
dict_label = {
    'Datetime':0, 
    'Sentence':1, 
    'Custom Object': 2, 
    'URL': 3, 
    'Numbers': 4, 
    'List': 5}
data = pd.read_csv('data/needs_extraction_data/labelled_added.csv')

data['y_act'] = [dict_label[i] for i in data['y_act']]
y = data.loc[:,['y_act']]
key_name = data['Attribute_name']

In [3]:
data1 = data[['%_nans', 'mean_word_count',
              'std_dev_word_count', 'has_delimiters', 'mean_stopword_total',
              'mean_whitespace_count', 'mean_char_count', 'mean_delim_count',
              'stdev_stopword_total', 'stdev_whitespace_count', 'stdev_char_count',
              'stdev_delim_count', 'has_url', 'has_date']]
data1 = data1.fillna(0)

data1 = data1.rename(columns={
    'mean_word_count': 'scaled_mean_token_count',
    'std_dev_word_count': 'scaled_std_dev_token_count',
    '%_nans': 'scaled_perc_nans',
    'mean_stopword_total': 'scaled_mean_stopword_total',
    'mean_whitespace_count': 'scaled_mean_whitespace_count',
    'mean_char_count': 'scaled_mean_char_count',
    'mean_delim_count': 'scaled_mean_delim_count',
    'stdev_stopword_total': 'scaled_stdev_stopword_total',
    'stdev_whitespace_count': 'scaled_stdev_whitespace_count',
    'stdev_char_count': 'scaled_stdev_char_count',
    'stdev_delim_count': 'scaled_stdev_delim_count'
})
data1.loc[data1['scaled_mean_token_count'] >
          10000, 'scaled_mean_token_count'] = 10000
data1.loc[data1['scaled_mean_token_count'] < -
          10000, 'scaled_mean_token_count'] = -10000

data1.loc[data1['scaled_std_dev_token_count'] >
          10000, 'scaled_std_dev_token_count'] = 10000
data1.loc[data1['scaled_std_dev_token_count'] < -
          10000, 'scaled_std_dev_token_count'] = -10000

data1.loc[data1['scaled_perc_nans'] > 10000, 'scaled_perc_nans'] = 10000
data1.loc[data1['scaled_perc_nans'] < -10000, 'scaled_perc_nans'] = -10000

data1.loc[data1['scaled_mean_stopword_total'] >
          10000, 'scaled_mean_stopword_total'] = 10000
data1.loc[data1['scaled_mean_stopword_total'] < -
          10000, 'scaled_mean_stopword_total'] = -10000

data1.loc[data1['scaled_mean_whitespace_count'] >
          10000, 'scaled_mean_whitespace_count'] = 10000
data1.loc[data1['scaled_mean_whitespace_count'] < -
          10000, 'scaled_mean_whitespace_count'] = -10000

data1.loc[data1['scaled_mean_char_count'] >
          10000, 'scaled_mean_char_count'] = 10000
data1.loc[data1['scaled_mean_char_count'] < -
          10000, 'scaled_mean_char_count'] = -10000

data1.loc[data1['scaled_mean_delim_count'] >
          10000, 'scaled_mean_delim_count'] = 10000
data1.loc[data1['scaled_mean_delim_count'] < -
          10000, 'scaled_mean_delim_count'] = -10000

data1.loc[data1['scaled_stdev_stopword_total'] >
          10000, 'scaled_stdev_stopword_total'] = 10000
data1.loc[data1['scaled_stdev_stopword_total'] < -
          10000, 'scaled_stdev_stopword_total'] = -10000

data1.loc[data1['scaled_stdev_whitespace_count'] >
          10000, 'scaled_stdev_whitespace_count'] = 10000
data1.loc[data1['scaled_stdev_whitespace_count'] < -
          10000, 'scaled_stdev_whitespace_count'] = -10000

data1.loc[data1['scaled_stdev_char_count'] >
          10000, 'scaled_stdev_char_count'] = 10000
data1.loc[data1['scaled_stdev_char_count'] < -
          10000, 'scaled_stdev_char_count'] = -10000

data1.loc[data1['scaled_stdev_delim_count'] >
          10000, 'scaled_stdev_delim_count'] = 10000
data1.loc[data1['scaled_stdev_delim_count'] < -
          10000, 'scaled_stdev_delim_count'] = -10000

column_names_to_normalize = ['scaled_mean_token_count',
                             'scaled_std_dev_token_count',
                             'scaled_perc_nans',
                             'scaled_mean_stopword_total',
                             'scaled_mean_whitespace_count',
                             'scaled_mean_char_count',
                             'scaled_mean_delim_count',
                             'scaled_stdev_stopword_total',
                             'scaled_stdev_whitespace_count',
                             'scaled_stdev_char_count',
                             'scaled_stdev_delim_count']
x = data1[column_names_to_normalize].values
x = np.nan_to_num(x)
x_scaled = StandardScaler().fit_transform(x)
df_temp = pd.DataFrame(
    x_scaled, columns=column_names_to_normalize, index=data1.index)
data1[column_names_to_normalize] = df_temp

y.y_act = y.y_act.astype(float)

print(f"> Data mean: {data1.mean()}\n")
print(f"> Data median: {data1.median()}\n")
print(f"> Data stdev: {data1.std()}")

> Data mean: scaled_perc_nans                -2.745801e-16
scaled_mean_token_count         -1.117919e-16
scaled_std_dev_token_count      -2.236863e-17
has_delimiters                   3.105360e-01
scaled_mean_stopword_total       8.619107e-18
scaled_mean_whitespace_count    -1.126127e-16
scaled_mean_char_count           5.130421e-17
scaled_mean_delim_count         -5.915375e-17
scaled_stdev_stopword_total      6.413026e-17
scaled_stdev_whitespace_count   -2.236863e-17
scaled_stdev_char_count         -3.488686e-18
scaled_stdev_delim_count         9.516930e-17
has_url                          8.687616e-02
has_date                         7.560074e-01
dtype: float64

> Data median: scaled_perc_nans                -0.653046
scaled_mean_token_count         -0.144106
scaled_std_dev_token_count      -0.171320
has_delimiters                   0.000000
scaled_mean_stopword_total      -0.178121
scaled_mean_whitespace_count    -0.144106
scaled_mean_char_count          -0.166657
scaled_mean_delim_

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    data1, y, test_size=0.2, random_state=100)
atr_train,atr_test = train_test_split(key_name, test_size=0.2,random_state=100)

# X_train_train, X_test_train,y_train_train,y_test_train = train_test_split(X_train,y_train, test_size=0.25)
# print(X_train.head())
# print(y_train.head())

X_train_new = X_train.reset_index(drop=True)
y_train_new = y_train.reset_index(drop=True)
print(f"X_train preview: {X_train.head()}")
print(f"y_train preview: {y_train.head()}")

X_train_new = X_train_new.values
y_train_new = y_train_new.values

X_train.reset_index(inplace=True,drop=True)
y_train.reset_index(inplace=True,drop=True)
X_test.reset_index(inplace=True,drop=True)
y_test.reset_index(inplace=True,drop=True)
atr_train.reset_index(inplace=True,drop=True)
atr_test.reset_index(inplace=True,drop=True)

X_train preview:      scaled_perc_nans  scaled_mean_token_count  scaled_std_dev_token_count  \
453         -0.653097                 0.686283                    3.364514   
43          -0.653120                 0.162079                   -0.054513   
133          1.978459                -0.148544                   -0.167108   
205         -0.653120                -0.141062                   -0.175870   
282         -0.653120                -0.148960                   -0.175870   

     has_delimiters  scaled_mean_stopword_total  scaled_mean_whitespace_count  \
453            True                    0.945220                      0.686283   
43             True                    0.126161                      0.162079   
133            True                   -0.187400                     -0.148544   
205           False                   -0.178121                     -0.141062   
282           False                   -0.187845                     -0.148960   

     scaled_mean_char_count

In [5]:
st = time.time()

Matrix = [[0 for x in range(len(X_train))] for y in range(len(X_test))]
dist_euc = DistanceMetric.get_metric('euclidean')

# for i in range(len(X_test)):
#     for j in range(len(X_train)):
#         dist = numpy.linalg.norm(a-b)
#         Matrix[i][j] = dist_euc.pairwise([data1.loc[i],data1.loc[j]])[0][1]

np_X_train = X_train.as_matrix()
np_X_test = X_test.as_matrix()

for i in range(len(X_test)):
    if i % 100 == 0:
        print(f"> i: {i}")
    a = np_X_test[i]
    for j in range(len(X_train)):
        b = np_X_train[j]
        dist = np.linalg.norm(a-b)
        Matrix[i][j] = dist

# print(f"> {np_X_train[0]}")print(f"> Time elapsed: {time.time()-st} seconds")

> i: 0


  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


> i: 100


# 1-nearest neighbor on just Euclidean distance

In [6]:
y_pred = []
for i in range(len(X_test)):
    dist = np.argsort(Matrix[i])[:1]
    pred = y_train.iloc[dist[0]]
    y_pred.append(pred)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

Accuracy: 0.8807339449541285


In [7]:
for neighbr in range(1, 11):
    y_pred = []
    for i in range(len(X_test)):
        #     print('---')
        dist = np.argsort(Matrix[i])[:neighbr]
        ys = []
        for x in dist:
            ys.append(y_train.iloc[x])
    #     print(ys)
        ho = stats.mode(ys)
    #     print(ho)
    #     print(ho[0][0])
        pred = ho[0][0]
        y_pred.append(pred)
    # print(y_pred)
    # print(y_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{neighbr} neighbors: {acc}")

1 neighbors: 0.8807339449541285
2 neighbors: 0.7981651376146789
3 neighbors: 0.7614678899082569
4 neighbors: 0.7798165137614679
5 neighbors: 0.7981651376146789
6 neighbors: 0.7798165137614679
7 neighbors: 0.7798165137614679
8 neighbors: 0.7889908256880734
9 neighbors: 0.7431192660550459
10 neighbors: 0.7431192660550459


In [8]:
st = time.time()

Matrix_ed = [[0 for x in range(len(X_train))] for y in range(len(X_test))]
dist_euc = DistanceMetric.get_metric('euclidean')

np_X_train = atr_train.values
np_X_test = atr_test.values

for i in range(len(X_test)):
    if i % 100 == 0:
        print(f"> i: {i}")
    a = atr_test.iloc[i]
    for j in range(len(X_train)):
        b = atr_train.iloc[j]
        dist = editdistance.eval(a, b)
        Matrix_ed[i][j] = dist

# print(np_X_train[0])
print(f"> Time elapsed: {time.time()-st}")

> i: 0
> i: 100
> Time elapsed: 2.1541621685028076


# 1-nearest neighbor on just Attribute Name Levenstein distance

In [9]:
y_pred = []
for i in range(len(X_test)):
    dist = np.argsort(Matrix_ed[i])[:1]
    pred = y_train.iloc[dist[0]]
    y_pred.append(pred)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

Accuracy: 0.7981651376146789


In [10]:
for neighbr in range(1, 11):
    y_pred = []
    for i in range(len(X_test)):
        #     print('---')
        dist = np.argsort(Matrix_ed[i])[:neighbr]
        ys = []
        for x in dist:
            ys.append(y_train.iloc[x])
    #     print(ys)
        ho = stats.mode(ys)
    #     print(ho)
    #     print(ho[0][0])
        pred = ho[0][0]
        y_pred.append(pred)
    # print(y_pred)
    # print(y_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{neighbr} neighbors: {acc}")

1 neighbors: 0.7981651376146789
2 neighbors: 0.7889908256880734
3 neighbors: 0.7706422018348624
4 neighbors: 0.7889908256880734
5 neighbors: 0.7889908256880734
6 neighbors: 0.7339449541284404
7 neighbors: 0.7431192660550459
8 neighbors: 0.7247706422018348
9 neighbors: 0.7706422018348624
10 neighbors: 0.7339449541284404


# Majority vote using both Euclidean and Levenstein distance

In [11]:
for neighbr in range(1, 11):
    y_pred = []
    for i in range(len(X_test)):
        #     print('---')
        dist_leven = np.argsort(Matrix_ed[i])[:neighbr]
        ys_leven = []
        for x in dist_leven:
            ys_leven.append(y_train.iloc[x])

        dist_euc = np.argsort(Matrix[i])[:neighbr]
        for x in dist_euc:
            ys_leven.append(y_train.iloc[x])
        ho_leven = stats.mode(ys_leven)
        pred_leven = ho_leven[0][0]

        y_pred.append(pred_leven)

    acc = accuracy_score(y_test, y_pred)
    print(f"{neighbr} neighbors: {acc}")

1 neighbors: 0.8532110091743119
2 neighbors: 0.8623853211009175
3 neighbors: 0.8715596330275229
4 neighbors: 0.8623853211009175
5 neighbors: 0.8623853211009175
6 neighbors: 0.8256880733944955
7 neighbors: 0.7798165137614679
8 neighbors: 0.7706422018348624
9 neighbors: 0.8073394495412844
10 neighbors: 0.8073394495412844


# Majority vote using both Euclidean and Levenstein distance

In [12]:
for neighbr in range(1, 11):
    y_pred = []
    for i in range(len(X_test)):
        #         print('---')
        dist_leven = np.argsort(Matrix_ed[i])[:neighbr]
        ys_leven = []
        for x in dist_leven:
            ys_leven.append(y_train.iloc[x])
#             ys_leven.append(y_train.iloc[x])
        ho_leven = stats.mode(ys_leven)
        pred_leven1 = ho_leven[0][0]
#         print(pred_leven)

        dist_euc = np.argsort(Matrix[i])[:neighbr]
        ys_euc = []
        for x in dist_euc:
            ys_euc.append(y_train.iloc[x])
        ho_leven = stats.mode(ys_euc)
        pred_leven2 = ho_leven[0][0]
#         print(pred_leven)

        merged_list = ys_leven + ys_euc
        ho_merged = stats.mode(merged_list)
        pred_leven3 = ho_merged[0][0]

#         print(pred_leven1,pred_leven2,pred_leven3)
        label2return = 0
        if pred_leven1 == pred_leven2:
            label2return = pred_leven2
        else:
            label2return = 5

        y_pred.append(label2return)

    acc = accuracy_score(y_test, y_pred)
    print(f"{neighbr} neighbors: {acc}")

1 neighbors: 0.7798165137614679


ValueError: Classification metrics can't handle a mix of multiclass and unknown targets

# Hyperparameter testing

In [None]:
param_dict = {}
Matrix_net = [[0 for x in range(len(X_train))] for y in range(len(X_test))]
alpha = [0.001, 0.01, 0.05, 0.075, 0.1, 0.5, 0.75, 1, 10, 100, 1000]

for alp in alpha:
    print('='*50)
    for i in range(len(Matrix)):
        for j in range(len(Matrix[i])):
            Matrix_net[i][j] = Matrix[i][j] + alp*Matrix_ed[i][j]

    for neighbr in range(1, 11):
        y_pred = []
        for i in range(len(X_test)):
            #   print('---')
            #   print(Matrix_net[i])
            dist = np.argsort(Matrix_net[i])[:neighbr]
            ys = []
            for x in dist:
                ys.append(y_train.iloc[x])
            ho = stats.mode(ys)
            pred = ho[0][0]
            y_pred.append(pred)
        acc = accuracy_score(y_test, y_pred)
        param_dict[str(f"neighbors: {neighbr}, alpha: {alp}")] = acc
        print(f"{neighbr} neighbors ---> Alpha {alp} ---> {acc}")
print('='*50)

In [None]:
print("="*20,"[Best combination]","="*20)
best = max(param_dict, key=lambda i: param_dict[i])
print(f"> {best} : {param_dict[best]*100}%")

In [None]:
param_dict = {}
Matrix_net = [[0 for x in range(len(X_train))] for y in range(len(X_test))]
alpha = [0.001, 0.01, 0.05, 0.075, 0.1, 0.5, 0.75, 1, 10, 100, 1000]

for alp in alpha:
    print('='*50)
    for i in range(len(Matrix)):
        for j in range(len(Matrix[i])):
            Matrix_net[i][j] = alp*Matrix[i][j] + Matrix_ed[i][j]
    for neighbr in range(1, 11):
        y_pred = []
        for i in range(len(X_test)):
            #   print('---')
            #   print(Matrix_net[i])
            dist = np.argsort(Matrix_net[i])[:neighbr]
            ys = []
            for x in dist:
                ys.append(y_train.iloc[x])
            ho = stats.mode(ys)
            pred = ho[0][0]
            y_pred.append(pred)
        acc = accuracy_score(y_test, y_pred)
        param_dict[str(f"neighbors: {neighbr}, alpha: {alp}")] = acc
        print(f"{neighbr} neighbors ---> Alpha {alp} ---> {acc}")
print('='*50)

In [None]:
print("="*20,"[Best combination]","="*20)
best = max(param_dict, key=lambda i: param_dict[i])
print(f"> {best} : {param_dict[best]*100}%")