In [1]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import metrics
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model

import numpy as np # linear algebra
import pandas as pd
import pickle

In [2]:
#read csv
dict_label = {
    'Datetime':0, 
    'Sentence':1, 
    'Custom Object': 2, 
    'URL': 3, 
    'Numbers': 4, 
    'List': 5}
data = pd.read_csv('data/needs_extraction_data/labelled_data.csv')

data['y_act'] = [dict_label[i] for i in data['y_act']]
y = data.loc[:,['y_act']]

In [3]:
data1 = data[['%_nans', 'mean_word_count', 'std_dev_word_count', 'has_delimiters']]
data1 = data1.fillna(0)

data1 = data1.rename(columns={'mean_word_count': 'scaled_mean_token_count', 'std_dev_word_count': 'scaled_std_dev_token_count', '%_nans': 'scaled_perc_nans'})
data1.loc[data1['scaled_mean_token_count'] > 10000, 'scaled_mean_token_count'] = 10000
data1.loc[data1['scaled_mean_token_count'] < -10000, 'scaled_mean_token_count'] = -10000
data1.loc[data1['scaled_std_dev_token_count'] > 10000, 'scaled_std_dev_token_count'] = 10000
data1.loc[data1['scaled_std_dev_token_count'] < -10000, 'scaled_std_dev_token_count'] = -10000
data1.loc[data1['scaled_perc_nans'] > 10000, 'scaled_perc_nans'] = 10000
data1.loc[data1['scaled_perc_nans'] < -10000, 'scaled_perc_nans'] = -10000
column_names_to_normalize = ['scaled_mean_token_count', 'scaled_std_dev_token_count','scaled_perc_nans']
x = data1[column_names_to_normalize].values
x = np.nan_to_num(x)
x_scaled = StandardScaler().fit_transform(x)
df_temp = pd.DataFrame(x_scaled, columns=column_names_to_normalize, index = data1.index)
data1[column_names_to_normalize] = df_temp

y.y_act = y.y_act.astype(float)

print(f"> Data mean: {data1.mean()}")
print(f"> Data median: {data1.median()}")
print(f"> Data stdev: {data1.std()}")

# data1.to_csv('before.csv')
# f = open('current.txt','w')
# d = enchant.Dict("en_US")

# for i in data.index:
#     ival = data.at[i,'Attribute_name']
#     if ival != 'id' and d.check(ivadf_tempdata1)
#         print(f,ival)
#         print(f,y.at[i,'y_act'])
#         data1.at[i,'dictionary_item'] = 1
#     else:
#         data1.at[i,'dictionary_item'] = 0

# data1.to_csv('after.csv')
# f.close()
# print(data1.columns)

> Data mean: scaled_perc_nans             -2.745801e-16
scaled_mean_token_count      -1.117919e-16
scaled_std_dev_token_count   -2.236863e-17
has_delimiters                3.105360e-01
dtype: float64
> Data median: scaled_perc_nans             -0.653046
scaled_mean_token_count      -0.144106
scaled_std_dev_token_count   -0.171320
has_delimiters                0.000000
dtype: float64
> Data stdev: scaled_perc_nans              1.000925
scaled_mean_token_count       1.000925
scaled_std_dev_token_count    1.000925
has_delimiters                0.463141
dtype: float64


In [4]:
print("===[VECTORIZATION]===")
arr = data['Attribute_name'].values
data = data.fillna(0)
arr1 = data['sample_1'].values
arr1 = [str(x) for x in arr1]
arr2 = data['sample_2'].values
arr2 = [str(x) for x in arr2]

vectorizer = CountVectorizer(ngram_range=(3, 3), analyzer='char')
X = vectorizer.fit_transform(arr)
X1 = vectorizer.fit_transform(arr1)
X2 = vectorizer.fit_transform(arr2)

print(f"> Length of vectorized feature_names: {len(vectorizer.get_feature_names())}")

data1.to_csv('data/preprocessing/before.csv')
attr_df = pd.DataFrame(X.toarray())
sample1_df = pd.DataFrame(X1.toarray())
sample2_df = pd.DataFrame(X2.toarray())

data2 = pd.concat([data1, attr_df, sample1_df, sample2_df], axis=1, sort=False)
data2.to_csv('data/preprocessing/after.csv')
data2.head()

X_train, X_test, y_train, y_test = train_test_split(
    data2, y, test_size=0.2, random_state=100)

# X_train_train, X_test_train,y_train_train,y_test_train = train_test_split(X_train,y_train, test_size=0.25)
# print(X_train.head())
# print(y_train.head())

X_train_new = X_train.reset_index(drop=True)
y_train_new = y_train.reset_index(drop=True)
print(f"X_train preview: {X_train.head()}")
print(f"y_train preview: {y_train.head()}")

X_train_new = X_train_new.values
y_train_new = y_train_new.values

===[VECTORIZATION]===
> Length of vectorized feature_names: 8528
X_train preview:      scaled_perc_nans  scaled_mean_token_count  scaled_std_dev_token_count  \
453         -0.653097                 0.686283                    3.364514   
43          -0.653120                 0.162079                   -0.054513   
133          1.978459                -0.148544                   -0.167108   
205         -0.653120                -0.141062                   -0.175870   
282         -0.653120                -0.148960                   -0.175870   

     has_delimiters  0  1  2  3  4  5  ...   8518  8519  8520  8521  8522  \
453            True  0  0  0  0  0  0  ...      0     0     0     0     0   
43             True  0  0  0  0  0  0  ...      0     0     0     0     0   
133            True  0  0  0  0  0  0  ...      0     0     0     0     0   
205           False  0  0  0  0  0  0  ...      0     0     0     0     0   
282           False  0  0  0  0  0  0  ...      0     0     0   

In [5]:
try:
    acc_df = pd.read_csv('data/model_data.csv')
    index = len(acc_df)
except FileNotFoundError:
    acc_df = pd.DataFrame(columns=['Model', 'Params', 'Feats', 'Train', 'Validation', 'Test', 'Precision'])
    index = 0

In [6]:
k = 5
kf = KFold(n_splits=k)
avg_train_acc, avg_test_acc = 0, 0

val_arr = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]

avgsc_lst, avgsc_train_lst, avgsc_hld_lst = [], [], []
avgsc, avgsc_train, avgsc_hld = 0, 0, 0

best_param_count = {'cval': {}}
for train_index, test_index in kf.split(X_train_new):
    X_train_cur, X_test_cur = X_train_new[train_index], X_train_new[test_index]
    y_train_cur, y_test_cur = y_train_new[train_index], y_train_new[test_index]
    X_train_train, X_val, y_train_train, y_val = train_test_split(
        X_train_cur, y_train_cur, test_size=0.25, random_state=100)

    bestPerformingModel = LogisticRegression(
        penalty='l2', multi_class='multinomial', solver='lbfgs', C=1)
    bestscore = 0
    print('='*10)
    for val in val_arr:
        clf = LogisticRegression(
            penalty='l2', multi_class='multinomial', solver='lbfgs', C=val)
        clf.fit(X_train_train, y_train_train)
        sc = clf.score(X_val, y_val)
        print(f"[C: {val}, accuracy: {sc}]")
        if bestscore < sc:
            bestcval = val
            bestscore = sc
            bestPerformingModel = clf
    
    if str(bestcval) in best_param_count['cval']:
        best_param_count['cval'][str(bestcval)] += 1
    else:
        best_param_count['cval'][str(bestcval)] = 1
        
    bscr_train = bestPerformingModel.score(X_train_cur, y_train_cur)
    bscr = bestPerformingModel.score(X_test_cur, y_test_cur)
    bscr_hld = bestPerformingModel.score(X_test, y_test)

    avgsc_train_lst.append(bscr_train)
    avgsc_lst.append(bscr)
    avgsc_hld_lst.append(bscr_hld)

    avgsc_train = avgsc_train + bscr_train
    avgsc = avgsc + bscr
    avgsc_hld = avgsc_hld + bscr_hld
    print()
    print(f"> Best C: {bestcval}")
    print(f"> Best training score: {bscr_train}")
    print(f"> Best test score: {bscr}")
    print(f"> Best held score: {bscr_hld}")
print('='*10)



  y = column_or_1d(y, warn=True)


[C: 0.0001, accuracy: 0.47126436781609193]
[C: 0.001, accuracy: 0.4827586206896552]
[C: 0.01, accuracy: 0.6896551724137931]
[C: 0.1, accuracy: 0.7701149425287356]
[C: 1, accuracy: 0.8045977011494253]
[C: 10, accuracy: 0.8045977011494253]
[C: 100, accuracy: 0.8275862068965517]
[C: 1000, accuracy: 0.8160919540229885]
[C: 10000, accuracy: 0.8275862068965517]
[C: 100000, accuracy: 0.8390804597701149]

> Best C: 100000
> Best training score: 0.9594202898550724
> Best test score: 0.7816091954022989
> Best held score: 0.7798165137614679
[C: 0.0001, accuracy: 0.4482758620689655]
[C: 0.001, accuracy: 0.47126436781609193]
[C: 0.01, accuracy: 0.6436781609195402]
[C: 0.1, accuracy: 0.7011494252873564]
[C: 1, accuracy: 0.7126436781609196]
[C: 10, accuracy: 0.7241379310344828]
[C: 100, accuracy: 0.7241379310344828]
[C: 1000, accuracy: 0.7241379310344828]
[C: 10000, accuracy: 0.7241379310344828]
[C: 100000, accuracy: 0.7241379310344828]

> Best C: 10
> Best training score: 0.9304347826086956
> Best t

In [7]:
y_pred = bestPerformingModel.predict(X_test)
prec = metrics.precision_score(y_test, y_pred, average=None)
cat_prec = {
    'Datetime': prec[0],
    'Sentence': prec[1],
    'Custom Object': prec[2],
    'URL': prec[3],
    'Numbers': prec[4],
    'List': prec[5],
}

  'precision', 'predicted', average, warn_for)


In [8]:
bestcval = max(best_param_count['cval'], key=lambda i: best_param_count['cval'][i])
bestparams = {'C': bestcval}
print(f"> Best C param : {bestcval}")
print(f"> Average training score list: {avgsc_train_lst}")
print(f"> Average testing score list: {avgsc_lst}")
print(f"> Average held score list: {avgsc_hld_lst}")
print()
avgsc_train = avgsc_train/k
avgsc = avgsc/k
avgsc_hld = avgsc_hld/k
print(f"> Average training score list: {avgsc_train}")
print(f"> Average testing score list: {avgsc}")
print(f"> Average held score list: {avgsc_hld}")
acc_df.loc[index] = ['logistic_regression', str(bestparams),"X_stats, X_name, X_sample1, X_sample2", avgsc_train, avgsc, avgsc_hld, str(cat_prec)]
index += 1
print()

y_pred = bestPerformingModel.predict(X_test)
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: Actual (Row) vs Predicted (Column)')
print(cnf_matrix)

> Best C param : 1
> Average training score list: [0.9594202898550724, 0.9304347826086956, 0.930635838150289, 0.9479768786127167, 0.9508670520231214]
> Average testing score list: [0.7816091954022989, 0.8160919540229885, 0.8488372093023255, 0.7790697674418605, 0.8023255813953488]
> Average held score list: [0.7798165137614679, 0.8440366972477065, 0.8256880733944955, 0.8073394495412844, 0.8073394495412844]

> Average training score list: 0.9438669682499791
> Average testing score list: 0.8055867415129645
> Average held score list: 0.8128440366972477

Confusion Matrix: Actual (Row) vs Predicted (Column)
[[24  0  3  0  0  0]
 [ 0 13  9  0  0  0]
 [ 1  1 49  0  0  1]
 [ 0  0  0  2  0  0]
 [ 0  0  1  0  0  0]
 [ 0  1  3  1  0  0]]


In [9]:
# save the model to disk
filename = 'data/pretrained/lr_finalized_model.pickle'
pickle.dump(bestPerformingModel, open(filename, 'wb+'))

# load the model from disk
loaded_model = pickle.load(open(filename, 'rb+'))
result = loaded_model.score(X_test, y_test)
y_prob = bestPerformingModel.predict_proba(X_test)

df = pd.DataFrame.from_records(y_prob)
print(df)
df.to_csv('data/model_predictions/lr_predictions.csv',index=False)

                0         1             2             3             4  \
0    1.774810e-01  0.111524  5.616456e-01  1.653244e-02  9.958602e-02   
1    1.647054e-05  0.000145  2.566129e-05  9.997353e-01  2.914670e-05   
2    9.951246e-01  0.000891  2.016093e-03  3.281248e-04  8.957817e-04   
3    9.746274e-01  0.003230  1.247266e-02  1.162901e-03  5.810947e-03   
4    1.584281e-01  0.102213  5.345663e-01  1.670232e-02  1.152096e-01   
5    2.496679e-01  0.026481  7.021329e-01  1.959321e-03  1.653270e-02   
6    5.423825e-03  0.005053  9.805596e-01  5.396741e-04  7.472988e-03   
7    9.961150e-01  0.000748  1.546386e-03  2.653828e-04  7.761667e-04   
8    8.982853e-01  0.015375  3.939505e-02  4.148656e-03  3.151666e-02   
9    4.791981e-02  0.410763  4.899998e-01  8.017798e-03  1.846321e-02   
10   2.988086e-01  0.042995  5.257421e-01  1.169134e-02  6.983392e-02   
11   1.401693e-02  0.800999  1.609837e-01  3.502841e-03  4.021134e-03   
12   9.989725e-01  0.000128  4.012316e-04  7.937463

# Feature combination testing

In [10]:
def test_feat_combos(index):
    combos = {
        "X_stats": data1,
        "X_name": attr_df,
        "X_stats, X_name": pd.concat([data1, attr_df], axis=1, sort=False),
        "X_sample1":  pd.concat([sample1_df], axis=1, sort=False),
        "X_name, X_sample1":  pd.concat([attr_df, sample1_df], axis=1, sort=False),
        "X_stats, X_sample1":  pd.concat([data1, sample1_df], axis=1, sort=False),
        "X_stats, X_name, X_sample1":  pd.concat([data1, attr_df, sample1_df], axis=1, sort=False)
    }
    

    for combo in combos:
        print("="*50, combo, "="*50)
        X_train, X_test, y_train, y_test = train_test_split(
            combos[combo], y, test_size=0.2, random_state=100)

        X_train_new = X_train.reset_index(drop=True)
        y_train_new = y_train.reset_index(drop=True)
        X_train_new = X_train_new.values
        y_train_new = y_train_new.values
        best_param_count = {'cval': {}}
        k = 5
        kf = KFold(n_splits=k)
        avg_train_acc, avg_test_acc = 0, 0

        val_arr = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]

        avgsc_lst, avgsc_train_lst, avgsc_hld_lst = [], [], []
        avgsc, avgsc_train, avgsc_hld = 0, 0, 0

        best_param_count = {'cval': {}}
        for train_index, test_index in kf.split(X_train_new):
            X_train_cur, X_test_cur = X_train_new[train_index], X_train_new[test_index]
            y_train_cur, y_test_cur = y_train_new[train_index], y_train_new[test_index]
            X_train_train, X_val, y_train_train, y_val = train_test_split(
                X_train_cur, y_train_cur, test_size=0.25, random_state=100)

            bestPerformingModel = LogisticRegression(
                penalty='l2', multi_class='multinomial', solver='lbfgs', C=1)
            bestscore = 0
            print('\t', '-'*10)
            for val in val_arr:
                clf = LogisticRegression(
                    penalty='l2', multi_class='multinomial', solver='lbfgs', C=val)
                clf.fit(X_train_train, y_train_train)
                sc = clf.score(X_val, y_val)
                print(f"\t[C: {val}, accuracy: {sc}]")
                if bestscore < sc:
                    bestcval = val
                    bestscore = sc
                    bestPerformingModel = clf

            if str(bestcval) in best_param_count['cval']:
                best_param_count['cval'][str(bestcval)] += 1
            else:
                best_param_count['cval'][str(bestcval)] = 1
            bscr_train = bestPerformingModel.score(X_train_cur, y_train_cur)
            bscr = bestPerformingModel.score(X_test_cur, y_test_cur)
            bscr_hld = bestPerformingModel.score(X_test, y_test)

            avgsc_train_lst.append(bscr_train)
            avgsc_lst.append(bscr)
            avgsc_hld_lst.append(bscr_hld)

            avgsc_train = avgsc_train + bscr_train
            avgsc = avgsc + bscr
            avgsc_hld = avgsc_hld + bscr_hld
            print()
            print(f"\t> Best C: {bestcval}")
            print(f"\t> Best training score: {bscr_train}")
            print(f"\t> Best test score: {bscr}")
            print(f"\t> Best held score: {bscr_hld}")
        print('\t', '-'*10)
        
        y_pred = bestPerformingModel.predict(X_test)
        prec = metrics.precision_score(y_test, y_pred, average=None)
        cat_prec = {
            'Datetime': prec[0],
            'Sentence': prec[1],
            'Custom Object': prec[2],
            'URL': prec[3],
            'Numbers': prec[4],
            'List': prec[5],
        }    
        bestcval = max(best_param_count['cval'], key=lambda i: best_param_count['cval'][i])
        bestparams = {'C': bestcval}
        print(f"\t> Best C param : {bestcval}")
        print(f"\t> Average training score list: {avgsc_train_lst}")
        print(f"\t> Average testing score list: {avgsc_lst}")
        print(f"\t> Average held score list: {avgsc_hld_lst}")
        print()
        avgsc_train = avgsc_train/k
        avgsc = avgsc/k
        avgsc_hld = avgsc_hld/k
        print(f"\t> Average training score list: {avgsc_train}")
        print(f"\t> Average testing score list: {avgsc}")
        print(f"\t> Average held score list: {avgsc_hld}")
        acc_df.loc[index] = ['logistic_regression', str(bestparams), combo, avgsc_train, avgsc, avgsc_hld, str(cat_prec)]
        index += 1
        print()

        y_pred = bestPerformingModel.predict(X_test)
        cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
        print('\tConfusion Matrix: Actual (Row) vs Predicted (Column)')
        print('\t', cnf_matrix)

In [11]:
test_feat_combos(index)
acc_df.to_csv('data/model_data.csv', index=False)

	 ----------
	[C: 0.0001, accuracy: 0.4367816091954023]
	[C: 0.001, accuracy: 0.4367816091954023]
	[C: 0.01, accuracy: 0.45977011494252873]
	[C: 0.1, accuracy: 0.5402298850574713]
	[C: 1, accuracy: 0.5287356321839081]
	[C: 10, accuracy: 0.5517241379310345]
	[C: 100, accuracy: 0.6206896551724138]


  y = column_or_1d(y, warn=True)


	[C: 1000, accuracy: 0.7241379310344828]
	[C: 10000, accuracy: 0.7471264367816092]
	[C: 100000, accuracy: 0.7586206896551724]

	> Best C: 100000
	> Best training score: 0.7304347826086957
	> Best test score: 0.6551724137931034
	> Best held score: 0.7247706422018348
	 ----------
	[C: 0.0001, accuracy: 0.41379310344827586]
	[C: 0.001, accuracy: 0.41379310344827586]
	[C: 0.01, accuracy: 0.4367816091954023]
	[C: 0.1, accuracy: 0.45977011494252873]
	[C: 1, accuracy: 0.47126436781609193]
	[C: 10, accuracy: 0.4942528735632184]
	[C: 100, accuracy: 0.5402298850574713]
	[C: 1000, accuracy: 0.6206896551724138]
	[C: 10000, accuracy: 0.6551724137931034]
	[C: 100000, accuracy: 0.6436781609195402]

	> Best C: 10000
	> Best training score: 0.672463768115942
	> Best test score: 0.6781609195402298
	> Best held score: 0.6330275229357798
	 ----------
	[C: 0.0001, accuracy: 0.4482758620689655]
	[C: 0.001, accuracy: 0.4482758620689655]
	[C: 0.01, accuracy: 0.45977011494252873]
	[C: 0.1, accuracy: 0.48275862

  'precision', 'predicted', average, warn_for)


	[C: 0.0001, accuracy: 0.4367816091954023]
	[C: 0.001, accuracy: 0.4367816091954023]
	[C: 0.01, accuracy: 0.5747126436781609]
	[C: 0.1, accuracy: 0.7241379310344828]
	[C: 1, accuracy: 0.7816091954022989]
	[C: 10, accuracy: 0.7701149425287356]
	[C: 100, accuracy: 0.7586206896551724]
	[C: 1000, accuracy: 0.7586206896551724]
	[C: 10000, accuracy: 0.7471264367816092]
	[C: 100000, accuracy: 0.7471264367816092]

	> Best C: 1
	> Best training score: 0.9072463768115943
	> Best test score: 0.8620689655172413
	> Best held score: 0.8073394495412844
	 ----------
	[C: 0.0001, accuracy: 0.41379310344827586]
	[C: 0.001, accuracy: 0.41379310344827586]
	[C: 0.01, accuracy: 0.5172413793103449]
	[C: 0.1, accuracy: 0.6896551724137931]
	[C: 1, accuracy: 0.7931034482758621]
	[C: 10, accuracy: 0.8160919540229885]
	[C: 100, accuracy: 0.7701149425287356]
	[C: 1000, accuracy: 0.7701149425287356]
	[C: 10000, accuracy: 0.7586206896551724]
	[C: 100000, accuracy: 0.7471264367816092]

	> Best C: 10
	> Best training 

	[C: 10000, accuracy: 0.6551724137931034]
	[C: 100000, accuracy: 0.6781609195402298]

	> Best C: 10
	> Best training score: 0.8782608695652174
	> Best test score: 0.7816091954022989
	> Best held score: 0.8165137614678899
	 ----------
	[C: 0.0001, accuracy: 0.45977011494252873]
	[C: 0.001, accuracy: 0.47126436781609193]
	[C: 0.01, accuracy: 0.5057471264367817]
	[C: 0.1, accuracy: 0.6666666666666666]
	[C: 1, accuracy: 0.6896551724137931]
	[C: 10, accuracy: 0.7011494252873564]
	[C: 100, accuracy: 0.7011494252873564]
	[C: 1000, accuracy: 0.7241379310344828]
	[C: 10000, accuracy: 0.7241379310344828]
	[C: 100000, accuracy: 0.7471264367816092]

	> Best C: 100000
	> Best training score: 0.8641618497109826
	> Best test score: 0.7674418604651163
	> Best held score: 0.8256880733944955
	 ----------
	[C: 0.0001, accuracy: 0.5287356321839081]
	[C: 0.001, accuracy: 0.5402298850574713]
	[C: 0.01, accuracy: 0.5862068965517241]
	[C: 0.1, accuracy: 0.7241379310344828]
	[C: 1, accuracy: 0.7241379310344828

	[C: 0.01, accuracy: 0.6781609195402298]
	[C: 0.1, accuracy: 0.7586206896551724]
	[C: 1, accuracy: 0.7126436781609196]
	[C: 10, accuracy: 0.6436781609195402]
	[C: 100, accuracy: 0.6666666666666666]
	[C: 1000, accuracy: 0.6436781609195402]
	[C: 10000, accuracy: 0.6091954022988506]
	[C: 100000, accuracy: 0.7011494252873564]

	> Best C: 0.1
	> Best training score: 0.8352601156069365
	> Best test score: 0.686046511627907
	> Best held score: 0.7798165137614679
	 ----------
	[C: 0.0001, accuracy: 0.5632183908045977]
	[C: 0.001, accuracy: 0.5862068965517241]
	[C: 0.01, accuracy: 0.7011494252873564]
	[C: 0.1, accuracy: 0.735632183908046]
	[C: 1, accuracy: 0.7126436781609196]
	[C: 10, accuracy: 0.6666666666666666]
	[C: 100, accuracy: 0.6551724137931034]
	[C: 1000, accuracy: 0.6551724137931034]
	[C: 10000, accuracy: 0.6666666666666666]
	[C: 100000, accuracy: 0.7701149425287356]

	> Best C: 100000
	> Best training score: 0.9161849710982659
	> Best test score: 0.813953488372093
	> Best held score: