In [1]:
from leven import levenshtein  
from sklearn.neighbors import DistanceMetric
from sklearn.metrics import accuracy_score
from statistics import mode
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from scipy import stats
import pandas as pd
import numpy as np 
import time
import editdistance

In [2]:
#read csv
dict_label = {
    'Datetime':0, 
    'Sentence':1, 
    'Custom Object': 2, 
    'URL': 3, 
    'Numbers': 4, 
    'List': 5}
data = pd.read_csv('data/needs_extraction_data/labelled_data.csv')

data['y_act'] = [dict_label[i] for i in data['y_act']]
y = data.loc[:,['y_act']]

In [3]:
data1 = data[['%_nans', 'mean_word_count', 'std_dev_word_count', 'has_delimiters']]
data1 = data1.fillna(0)

data1 = data1.rename(columns={'mean_word_count': 'scaled_mean_token_count', 'std_dev_word_count': 'scaled_std_dev_token_count', '%_nans': 'scaled_perc_nans'})
data1.loc[data1['scaled_mean_token_count'] > 10000, 'scaled_mean_token_count'] = 10000
data1.loc[data1['scaled_mean_token_count'] < -10000, 'scaled_mean_token_count'] = -10000
data1.loc[data1['scaled_std_dev_token_count'] > 10000, 'scaled_std_dev_token_count'] = 10000
data1.loc[data1['scaled_std_dev_token_count'] < -10000, 'scaled_std_dev_token_count'] = -10000
data1.loc[data1['scaled_perc_nans'] > 10000, 'scaled_perc_nans'] = 10000
data1.loc[data1['scaled_perc_nans'] < -10000, 'scaled_perc_nans'] = -10000
column_names_to_normalize = ['scaled_mean_token_count', 'scaled_std_dev_token_count','scaled_perc_nans']
x = data1[column_names_to_normalize].values
x = np.nan_to_num(x)
x_scaled = StandardScaler().fit_transform(x)
df_temp = pd.DataFrame(x_scaled, columns=column_names_to_normalize, index = data1.index)
data1[column_names_to_normalize] = df_temp

y.y_act = y.y_act.astype(float)

print(f"> Data mean: \n{data1.mean()}")
print(f"> Data median: \n{data1.median()}")
print(f"> Data stdev: \n{data1.std()}")

# data1.to_csv('before.csv')
# f = open('current.txt','w')
# d = enchant.Dict("en_US")

# for i in data.index:
#     ival = data.at[i,'Attribute_name']
#     if ival != 'id' and d.check(ivadf_tempdata1)
#         print(f,ival)
#         print(f,y.at[i,'y_act'])
#         data1.at[i,'dictionary_item'] = 1
#     else:
#         data1.at[i,'dictionary_item'] = 0

# data1.to_csv('after.csv')
# f.close()
# print(data1.columns)

> Data mean: 
scaled_perc_nans             -2.745801e-16
scaled_mean_token_count      -1.117919e-16
scaled_std_dev_token_count   -2.236863e-17
has_delimiters                3.105360e-01
dtype: float64
> Data median: 
scaled_perc_nans             -0.653046
scaled_mean_token_count      -0.144106
scaled_std_dev_token_count   -0.171320
has_delimiters                0.000000
dtype: float64
> Data stdev: 
scaled_perc_nans              1.000925
scaled_mean_token_count       1.000925
scaled_std_dev_token_count    1.000925
has_delimiters                0.463141
dtype: float64


In [4]:
# print("===[VECTORIZATION]===")
# arr = data['Attribute_name'].values
# data = data.fillna(0)
# arr1 = data['sample_1'].values
# arr1 = [str(x) for x in arr1]
# arr2 = data['sample_2'].values
# arr2 = [str(x) for x in arr2]

# vectorizer = CountVectorizer(ngram_range=(3, 3), analyzer='char')
# X = vectorizer.fit_transform(arr)
# X1 = vectorizer.fit_transform(arr1)
# X2 = vectorizer.fit_transform(arr2)

# print(f"> Length of vectorized feature_names: {len(vectorizer.get_feature_names())}")

# data1.to_csv('data/preprocessing/before.csv')
# tempdf = pd.DataFrame(X.toarray())
# tempdf1 = pd.DataFrame(X1.toarray())
# tempdf2 = pd.DataFrame(X2.toarray())

# data2 = pd.concat([data1, tempdf, tempdf1, tempdf2], axis=1, sort=False)
# data2.to_csv('data/preprocessing/after.csv')
# data2.head()

X_train, X_test, y_train, y_test = train_test_split(
    data1, y, test_size=0.2, random_state=100)
atr_train,atr_test = train_test_split(data1, test_size=0.2,random_state=100)

# X_train_train, X_test_train,y_train_train,y_test_train = train_test_split(X_train,y_train, test_size=0.25)
# print(X_train.head())
# print(y_train.head())

X_train_new = X_train.reset_index(drop=True)
y_train_new = y_train.reset_index(drop=True)
print(f"X_train preview: {X_train.head()}")
print(f"y_train preview: {y_train.head()}")

X_train_new = X_train_new.values
y_train_new = y_train_new.values

X_train preview:      scaled_perc_nans  scaled_mean_token_count  scaled_std_dev_token_count  \
453         -0.653097                 0.686283                    3.364514   
43          -0.653120                 0.162079                   -0.054513   
133          1.978459                -0.148544                   -0.167108   
205         -0.653120                -0.141062                   -0.175870   
282         -0.653120                -0.148960                   -0.175870   

     has_delimiters  
453            True  
43             True  
133            True  
205           False  
282           False  
y_train preview:      y_act
453    1.0
43     1.0
133    2.0
205    0.0
282    0.0


In [5]:
st = time.time()

Matrix = [[0 for x in range(len(X_train))] for y in range(len(X_test))]
dist_euc = DistanceMetric.get_metric('euclidean')

# for i in range(len(X_test)):
#     for j in range(len(X_train)):
#         dist = numpy.linalg.norm(a-b)
#         Matrix[i][j] = dist_euc.pairwise([data1.loc[i],data1.loc[j]])[0][1]

np_X_train = X_train.as_matrix()
np_X_test = X_test.as_matrix()

for i in range(len(X_test)):
    if i % 100 == 0:
        print(f"> i: {i}")
    a = np_X_test[i]
    for j in range(len(X_train)):
        b = np_X_train[j]
        dist = np.linalg.norm(a-b)
        Matrix[i][j] = dist

# print(f"> {np_X_train[0]}")
print(f"> Time elapsed: {time.time()-st} seconds")

> i: 0


  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


> i: 100
> Time elapsed: 0.7656998634338379 seconds


In [6]:
# 1-nearest neighbor on just Euclidean distance
y_pred = []
for i in range(len(X_test)):
    dist = np.argsort(Matrix[i])[:1]
    pred = y_train.iloc[dist[0]]
    y_pred.append(pred)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

Accuracy: 0.7798165137614679


In [7]:
for neighbr in range(1, 11):
    y_pred = []
    for i in range(len(X_test)):
        #     print('---')
        dist = np.argsort(Matrix[i])[:neighbr]
        ys = []
        for x in dist:
            ys.append(y_train.iloc[x])
    #     print(ys)
        ho = stats.mode(ys)
    #     print(ho)
    #     print(ho[0][0])
        pred = ho[0][0]
        y_pred.append(pred)
    # print(y_pred)
    # print(y_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{neighbr} neighbors: {acc}")

1 neighbors: 0.7798165137614679
2 neighbors: 0.7431192660550459
3 neighbors: 0.7431192660550459
4 neighbors: 0.7064220183486238
5 neighbors: 0.6972477064220184
6 neighbors: 0.6972477064220184
7 neighbors: 0.6972477064220184
8 neighbors: 0.7064220183486238
9 neighbors: 0.7064220183486238
10 neighbors: 0.7155963302752294


In [8]:
st = time.time()

Matrix_ed = [[0 for x in range(len(X_train))] for y in range(len(X_test))]
dist_euc = DistanceMetric.get_metric('euclidean')

# for i in range(len(X_test)):
#     for j in range(len(X_train)):
#         dist = numpy.linalg.norm(a-b)
#         Matrix[i][j] = dist_euc.pairwise([data1.loc[i],data1.loc[j]])[0][1]

np_X_train = atr_train.values
np_X_test = atr_test.values

for i in range(len(X_test)):
    if i % 100 == 0:
        print(f"> i: {i}")
    a = atr_test.iloc[i]
    for j in range(len(X_train)):
        b = atr_train.iloc[j]
        dist = editdistance.eval(a, b)
        Matrix_ed[i][j] = dist

# print(np_X_train[0])
print(f"> Time elapsed: {time.time()-st}")

> i: 0
> i: 100
> Time elapsed: 16.87529182434082


In [9]:
# 1-nearest neighbor on just Attribute Name Levenstein distance
y_pred = []
for i in range(len(X_test)):
    dist = np.argsort(Matrix_ed[i])[:1]
    pred = y_train.iloc[dist[0]]
    y_pred.append(pred)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

Accuracy: 0.5412844036697247


In [10]:
for neighbr in range(1, 11):
    y_pred = []
    for i in range(len(X_test)):
        #     print('---')
        dist = np.argsort(Matrix_ed[i])[:neighbr]
        ys = []
        for x in dist:
            ys.append(y_train.iloc[x])
    #     print(ys)
        ho = stats.mode(ys)
    #     print(ho)
    #     print(ho[0][0])
        pred = ho[0][0]
        y_pred.append(pred)
    # print(y_pred)
    # print(y_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{neighbr} neighbors: {acc}")

1 neighbors: 0.5412844036697247
2 neighbors: 0.5229357798165137
3 neighbors: 0.5504587155963303
4 neighbors: 0.5321100917431193
5 neighbors: 0.5137614678899083
6 neighbors: 0.5137614678899083
7 neighbors: 0.5504587155963303
8 neighbors: 0.5504587155963303
9 neighbors: 0.45871559633027525
10 neighbors: 0.5137614678899083


In [11]:
# Majority vote using both Euclidean and Levenstein distance
for neighbr in range(1, 11):
    y_pred = []
    for i in range(len(X_test)):
        #     print('---')
        dist_leven = np.argsort(Matrix_ed[i])[:neighbr]
        ys_leven = []
        for x in dist_leven:
            ys_leven.append(y_train.iloc[x])
#         ho_leven = stats.mode(ys_leven)
#         pred_leven = ho_leven[0][0]

        dist_euc = np.argsort(Matrix[i])[:neighbr]
#         ys_euc = []
        for x in dist_euc:
            ys_leven.append(y_train.iloc[x])
        ho_leven = stats.mode(ys_leven)
        pred_leven = ho_leven[0][0]

        y_pred.append(pred_leven)

    acc = accuracy_score(y_test, y_pred)
    print(f"{neighbr} neighbors: {acc}")

1 neighbors: 0.5963302752293578
2 neighbors: 0.7614678899082569
3 neighbors: 0.7889908256880734
4 neighbors: 0.7247706422018348
5 neighbors: 0.6880733944954128
6 neighbors: 0.6880733944954128
7 neighbors: 0.6972477064220184
8 neighbors: 0.6880733944954128
9 neighbors: 0.6880733944954128
10 neighbors: 0.6972477064220184


In [12]:
# Majority vote using both Euclidean and Levenstein distance
for neighbr in range(1, 11):
    y_pred = []
    for i in range(len(X_test)):
        #         print('---')
        dist_leven = np.argsort(Matrix_ed[i])[:neighbr]
        ys_leven = []
        for x in dist_leven:
            ys_leven.append(y_train.iloc[x])
#             ys_leven.append(y_train.iloc[x])
        ho_leven = stats.mode(ys_leven)
        pred_leven1 = ho_leven[0][0]
#         print(pred_leven)

        dist_euc = np.argsort(Matrix[i])[:neighbr]
        ys_euc = []
        for x in dist_euc:
            ys_euc.append(y_train.iloc[x])
        ho_leven = stats.mode(ys_euc)
        pred_leven2 = ho_leven[0][0]
#         print(pred_leven)

        merged_list = ys_leven + ys_euc
        ho_merged = stats.mode(merged_list)
        pred_leven3 = ho_merged[0][0]

#         print(pred_leven1,pred_leven2,pred_leven3)
        label2return = 0
        if pred_leven1 == pred_leven2:
            label2return = pred_leven2
        else:
            label2return = 5

        y_pred.append(label2return)

    acc = accuracy_score(y_test, y_pred)
    print(f"{neighbr} neighbors: {acc}")

1 neighbors: 0.5321100917431193
2 neighbors: 0.4954128440366973
3 neighbors: 0.48623853211009177
4 neighbors: 0.44954128440366975
5 neighbors: 0.43119266055045874
6 neighbors: 0.43119266055045874
7 neighbors: 0.48623853211009177
8 neighbors: 0.48623853211009177
9 neighbors: 0.44036697247706424
10 neighbors: 0.44954128440366975


In [13]:
Matrix_net = [[0 for x in range(len(X_train))] for y in range(len(X_test))]
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

for alp in alpha:
    print('='*50)
    for i in range(len(Matrix)):
        for j in range(len(Matrix[i])):
            Matrix_net[i][j] = Matrix[i][j] + alp*Matrix_ed[i][j]

    for neighbr in range(1, 11):
        y_pred = []
        for i in range(len(X_test)):
            #   print('---')
            #   print(Matrix_net[i])
            dist = np.argsort(Matrix_net[i])[:neighbr]
            ys = []
            for x in dist:
                ys.append(y_train.iloc[x])
            ho = stats.mode(ys)
            pred = ho[0][0]
            y_pred.append(pred)
        acc = accuracy_score(y_test, y_pred)
        print(f"{neighbr} neighbors ---> Alpha {alp} ---> {acc}")
print('='*50)

1 neighbors ---> Alpha 0.001 ---> 0.7798165137614679
2 neighbors ---> Alpha 0.001 ---> 0.7431192660550459
3 neighbors ---> Alpha 0.001 ---> 0.7431192660550459
4 neighbors ---> Alpha 0.001 ---> 0.7064220183486238
5 neighbors ---> Alpha 0.001 ---> 0.6972477064220184
6 neighbors ---> Alpha 0.001 ---> 0.6788990825688074
7 neighbors ---> Alpha 0.001 ---> 0.6880733944954128
8 neighbors ---> Alpha 0.001 ---> 0.6972477064220184
9 neighbors ---> Alpha 0.001 ---> 0.6972477064220184
10 neighbors ---> Alpha 0.001 ---> 0.7064220183486238
1 neighbors ---> Alpha 0.01 ---> 0.6972477064220184
2 neighbors ---> Alpha 0.01 ---> 0.6513761467889908
3 neighbors ---> Alpha 0.01 ---> 0.6788990825688074
4 neighbors ---> Alpha 0.01 ---> 0.6238532110091743
5 neighbors ---> Alpha 0.01 ---> 0.6330275229357798
6 neighbors ---> Alpha 0.01 ---> 0.6330275229357798
7 neighbors ---> Alpha 0.01 ---> 0.6238532110091743
8 neighbors ---> Alpha 0.01 ---> 0.6972477064220184
9 neighbors ---> Alpha 0.01 ---> 0.6238532110091743
1

In [14]:
Matrix_net = [[0 for x in range(len(X_train))] for y in range(len(X_test))]
alpha = [0.05, 0.075, 0.1, 0.5, 0.75]

for alp in alpha:
    print('='*50)
    for i in range(len(Matrix)):
        for j in range(len(Matrix[i])):
            Matrix_net[i][j] = Matrix[i][j] + alp*Matrix_ed[i][j]

    for neighbr in range(1, 11):
        y_pred = []
        for i in range(len(X_test)):
            #   print('---')
            #   print(Matrix_net[i])
            dist = np.argsort(Matrix_net[i])[:neighbr]
            ys = []
            for x in dist:
                ys.append(y_train.iloc[x])
            ho = stats.mode(ys)
            pred = ho[0][0]
            y_pred.append(pred)
        acc = accuracy_score(y_test, y_pred)
        print(f"{neighbr} neighbors ---> Alpha {alp} ---> {acc}")
print('='*50)

1 neighbors ---> Alpha 0.05 ---> 0.7614678899082569
2 neighbors ---> Alpha 0.05 ---> 0.7339449541284404
3 neighbors ---> Alpha 0.05 ---> 0.7339449541284404
4 neighbors ---> Alpha 0.05 ---> 0.7155963302752294
5 neighbors ---> Alpha 0.05 ---> 0.7155963302752294
6 neighbors ---> Alpha 0.05 ---> 0.7155963302752294
7 neighbors ---> Alpha 0.05 ---> 0.7064220183486238
8 neighbors ---> Alpha 0.05 ---> 0.7064220183486238
9 neighbors ---> Alpha 0.05 ---> 0.7064220183486238
10 neighbors ---> Alpha 0.05 ---> 0.7155963302752294
1 neighbors ---> Alpha 0.075 ---> 0.7064220183486238
2 neighbors ---> Alpha 0.075 ---> 0.7247706422018348
3 neighbors ---> Alpha 0.075 ---> 0.7247706422018348
4 neighbors ---> Alpha 0.075 ---> 0.7155963302752294
5 neighbors ---> Alpha 0.075 ---> 0.6972477064220184
6 neighbors ---> Alpha 0.075 ---> 0.7064220183486238
7 neighbors ---> Alpha 0.075 ---> 0.7064220183486238
8 neighbors ---> Alpha 0.075 ---> 0.7064220183486238
9 neighbors ---> Alpha 0.075 ---> 0.7064220183486238
10

In [15]:
Matrix_net = [[0 for x in range(len(X_train))] for y in range(len(X_test))]
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

for alp in alpha:
    for i in range(len(Matrix)):
        for j in range(len(Matrix[i])):
            Matrix_net[i][j] = alp*Matrix[i][j] + Matrix_ed[i][j]
for neighbr in range(1, 11):
    print('='*50)
    y_pred = []
    for i in range(len(X_test)):
        #   print('---')
        #   print(Matrix_net[i])
        dist = np.argsort(Matrix_net[i])[:neighbr]
        ys = []
        for x in dist:
            ys.append(y_train.iloc[x])
        ho = stats.mode(ys)
        pred = ho[0][0]
        y_pred.append(pred)
    acc = accuracy_score(y_test, y_pred)
    print(f"{neighbr} neighbors ---> Alpha {alp} ---> {acc}")
print('='*50)

1 neighbors ---> Alpha 1000 ---> 0.7798165137614679
2 neighbors ---> Alpha 1000 ---> 0.7431192660550459
3 neighbors ---> Alpha 1000 ---> 0.7431192660550459
4 neighbors ---> Alpha 1000 ---> 0.7064220183486238
5 neighbors ---> Alpha 1000 ---> 0.6972477064220184
6 neighbors ---> Alpha 1000 ---> 0.6788990825688074
7 neighbors ---> Alpha 1000 ---> 0.6880733944954128
8 neighbors ---> Alpha 1000 ---> 0.6972477064220184
9 neighbors ---> Alpha 1000 ---> 0.6972477064220184
10 neighbors ---> Alpha 1000 ---> 0.7064220183486238
