In [1]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import metrics
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model

import numpy as np # linear algebra
import pandas as pd
import pickle

In [2]:
#read csv
dict_label = {
    'Datetime':0, 
    'Sentence':1, 
    'Custom Object': 2, 
    'URL': 3, 
    'Numbers': 4, 
    'List': 5}
data = pd.read_csv('data/needs_extraction_data/labelled_data.csv')

data['y_act'] = [dict_label[i] for i in data['y_act']]
y = data.loc[:,['y_act']]

In [3]:
data1 = data[['%_nans', 'mean_word_count', 'std_dev_word_count', 'has_delimiters']]
data1 = data1.fillna(0)

data1 = data1.rename(columns={'mean_word_count': 'scaled_mean_token_count', 'std_dev_word_count': 'scaled_std_dev_token_count', '%_nans': 'scaled_perc_nans'})
data1.loc[data1['scaled_mean_token_count'] > 10000, 'scaled_mean_token_count'] = 10000
data1.loc[data1['scaled_mean_token_count'] < -10000, 'scaled_mean_token_count'] = -10000
data1.loc[data1['scaled_std_dev_token_count'] > 10000, 'scaled_std_dev_token_count'] = 10000
data1.loc[data1['scaled_std_dev_token_count'] < -10000, 'scaled_std_dev_token_count'] = -10000
data1.loc[data1['scaled_perc_nans'] > 10000, 'scaled_perc_nans'] = 10000
data1.loc[data1['scaled_perc_nans'] < -10000, 'scaled_perc_nans'] = -10000
column_names_to_normalize = ['scaled_mean_token_count', 'scaled_std_dev_token_count','scaled_perc_nans']
x = data1[column_names_to_normalize].values
x = np.nan_to_num(x)
x_scaled = StandardScaler().fit_transform(x)
df_temp = pd.DataFrame(x_scaled, columns=column_names_to_normalize, index = data1.index)
data1[column_names_to_normalize] = df_temp

y.y_act = y.y_act.astype(float)

print(f"> Data mean: {data1.mean()}")
print(f"> Data median: {data1.median()}")
print(f"> Data stdev: {data1.std()}")

# data1.to_csv('before.csv')
# f = open('current.txt','w')
# d = enchant.Dict("en_US")

# for i in data.index:
#     ival = data.at[i,'Attribute_name']
#     if ival != 'id' and d.check(ivadf_tempdata1)
#         print(f,ival)
#         print(f,y.at[i,'y_act'])
#         data1.at[i,'dictionary_item'] = 1
#     else:
#         data1.at[i,'dictionary_item'] = 0

# data1.to_csv('after.csv')
# f.close()
# print(data1.columns)

> Data mean: scaled_perc_nans             -2.745801e-16
scaled_mean_token_count      -1.117919e-16
scaled_std_dev_token_count   -2.236863e-17
has_delimiters                3.105360e-01
dtype: float64
> Data median: scaled_perc_nans             -0.653046
scaled_mean_token_count      -0.144106
scaled_std_dev_token_count   -0.171320
has_delimiters                0.000000
dtype: float64
> Data stdev: scaled_perc_nans              1.000925
scaled_mean_token_count       1.000925
scaled_std_dev_token_count    1.000925
has_delimiters                0.463141
dtype: float64


In [4]:
print("===[VECTORIZATION]===")
arr = data['Attribute_name'].values
data = data.fillna(0)
arr1 = data['sample_1'].values
arr1 = [str(x) for x in arr1]
arr2 = data['sample_2'].values
arr2 = [str(x) for x in arr2]

vectorizer = CountVectorizer(ngram_range=(3, 3), analyzer='char')
X = vectorizer.fit_transform(arr)
X1 = vectorizer.fit_transform(arr1)
X2 = vectorizer.fit_transform(arr2)

print(f"> Length of vectorized feature_names: {len(vectorizer.get_feature_names())}")

data1.to_csv('data/preprocessing/before.csv')
tempdf = pd.DataFrame(X.toarray())
tempdf1 = pd.DataFrame(X1.toarray())
tempdf2 = pd.DataFrame(X2.toarray())

data2 = pd.concat([data1, tempdf, tempdf1, tempdf2], axis=1, sort=False)
data2.to_csv('data/preprocessing/after.csv')
data2.head()

X_train, X_test, y_train, y_test = train_test_split(
    data2, y, test_size=0.2, random_state=100)

# X_train_train, X_test_train,y_train_train,y_test_train = train_test_split(X_train,y_train, test_size=0.25)
# print(X_train.head())
# print(y_train.head())

X_train_new = X_train.reset_index(drop=True)
y_train_new = y_train.reset_index(drop=True)
print(f"X_train preview: {X_train.head()}")
print(f"y_train preview: {y_train.head()}")

X_train_new = X_train_new.values
y_train_new = y_train_new.values

===[VECTORIZATION]===
> Length of vectorized feature_names: 8528
X_train preview:      scaled_perc_nans  scaled_mean_token_count  scaled_std_dev_token_count  \
453         -0.653097                 0.686283                    3.364514   
43          -0.653120                 0.162079                   -0.054513   
133          1.978459                -0.148544                   -0.167108   
205         -0.653120                -0.141062                   -0.175870   
282         -0.653120                -0.148960                   -0.175870   

     has_delimiters  0  1  2  3  4  5  ...   8518  8519  8520  8521  8522  \
453            True  0  0  0  0  0  0  ...      0     0     0     0     0   
43             True  0  0  0  0  0  0  ...      0     0     0     0     0   
133            True  0  0  0  0  0  0  ...      0     0     0     0     0   
205           False  0  0  0  0  0  0  ...      0     0     0     0     0   
282           False  0  0  0  0  0  0  ...      0     0     0   

In [5]:
# logisticRegr = LogisticRegression(penalty='l2',multi_class='multinomial', solver='lbfgs',C = 100,max_iter=200)
# logisticRegr = LogisticRegressionCV(cv=5,penalty='l2',multi_class='multinomial', solver='lbfgs',Cs = 1,max_iter=200)

k = 5
kf = KFold(n_splits=k)
avg_train_acc, avg_test_acc = 0, 0

val_arr = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]
# bestPerformingModel = LogisticRegression(penalty='l2',multi_class='multinomial', solver='lbfgs',C = 1)
# bestscore = 0
# for val in val_arr:
#     logisticRegr = LogisticRegression(penalty='l2',multi_class='multinomial', solver='lbfgs',C = val)
#     avgsc = 0
#     for train_index, test_index in kf.split(X_train_new):
#         X_train_cur, X_test_cur = X_train_new[train_index], X_train_new[test_index]
#         y_train_cur, y_test_cur = y_train_new[train_index], y_train_new[test_index]

#         logisticRegr.fit(X_train_cur, y_train_cur)
#         sc = logisticRegr.score(X_test_cur, y_test_cur)
#         avgsc = avgsc + sc
#     avgsc = avgsc/k
#     print(avgsc)
#     if bestscore < avgsc:
#         bestscore = avgsc
#         bestPerformingModel = logisticRegr
#         print(bestPerformingModel)


avgsc_lst, avgsc_train_lst, avgsc_hld_lst = [], [], []
avgsc, avgsc_train, avgsc_hld = 0, 0, 0

for train_index, test_index in kf.split(X_train_new):
    X_train_cur, X_test_cur = X_train_new[train_index], X_train_new[test_index]
    y_train_cur, y_test_cur = y_train_new[train_index], y_train_new[test_index]
    X_train_train, X_val, y_train_train, y_val = train_test_split(
        X_train_cur, y_train_cur, test_size=0.25, random_state=100)

    bestPerformingModel = LogisticRegression(
        penalty='l2', multi_class='multinomial', solver='lbfgs', C=1)
    bestscore = 0
    for val in val_arr:
        clf = LogisticRegression(
            penalty='l2', multi_class='multinomial', solver='lbfgs', C=val)
        clf.fit(X_train_train, y_train_train)
        sc = clf.score(X_val, y_val)
        print(f"[C: {val}, accuracy: {sc}]")
        if bestscore < sc:
            bestscore = sc
            bestPerformingModel = clf
#                 print(bestPerformingModel)

    bscr_train = bestPerformingModel.score(X_train_cur, y_train_cur)
    bscr = bestPerformingModel.score(X_test_cur, y_test_cur)
    bscr_hld = bestPerformingModel.score(X_test, y_test)

    avgsc_train_lst.append(bscr_train)
    avgsc_lst.append(bscr)
    avgsc_hld_lst.append(bscr_hld)

    avgsc_train = avgsc_train + bscr_train
    avgsc = avgsc + bscr
    avgsc_hld = avgsc_hld + bscr_hld
    print('='*10)
    print(f"> Best training score: {bscr_train}")
    print(f"> Best test score: {bscr}")
    print(f"> Best held score: {bscr_hld}")
print('='*10)

  y = column_or_1d(y, warn=True)


> Best training score: 0.9594202898550724
> Best test score: 0.7816091954022989
> Best held score: 0.7798165137614679
> Best training score: 0.9304347826086956
> Best test score: 0.8160919540229885
> Best held score: 0.8440366972477065
> Best training score: 0.930635838150289
> Best test score: 0.8488372093023255
> Best held score: 0.8256880733944955
> Best training score: 0.9479768786127167
> Best test score: 0.7790697674418605
> Best held score: 0.8073394495412844
> Best training score: 0.9508670520231214
> Best test score: 0.8023255813953488
> Best held score: 0.8073394495412844


In [6]:
print(f"> Average training score list: {avgsc_train_lst}")
print(f"> Average testing score list: {avgsc_lst}")
print(f"> Average held score list: {avgsc_hld_lst}")
print()
print(f"> Average training score list/k: {avgsc_train/k}")
print(f"> Average testing score list/k: {avgsc/k}")
print(f"> Average held score list/k: {avgsc_hld/k}")
print()
y_pred = bestPerformingModel.predict(X_test)
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: Actual (Row) vs Predicted (Column)')
print(cnf_matrix)

> Average training score list: [0.9594202898550724, 0.9304347826086956, 0.930635838150289, 0.9479768786127167, 0.9508670520231214]
> Average testing score list: [0.7816091954022989, 0.8160919540229885, 0.8488372093023255, 0.7790697674418605, 0.8023255813953488]
> Average held score list: [0.7798165137614679, 0.8440366972477065, 0.8256880733944955, 0.8073394495412844, 0.8073394495412844]

> Average training score list/k: 0.9438669682499791
> Average testing score list/k: 0.8055867415129645
> Average held score list/k: 0.8128440366972477

Confusion Matrix: Actual (Row) vs Predicted (Column)
[[24  0  3  0  0  0]
 [ 0 13  9  0  0  0]
 [ 1  1 49  0  0  1]
 [ 0  0  0  2  0  0]
 [ 0  0  1  0  0  0]
 [ 0  1  3  1  0  0]]


In [7]:
# save the model to disk
filename = 'data/pretrained/lr_finalized_model.pickle'
pickle.dump(bestPerformingModel, open(filename, 'wb+'))

# load the model from disk
loaded_model = pickle.load(open(filename, 'rb+'))
result = loaded_model.score(X_test, y_test)
y_prob = bestPerformingModel.predict_proba(X_test)

df = pd.DataFrame.from_records(y_prob)
print(df)
df.to_csv('data/model_predictions/lr_predictions.csv',index=False)

> Y probabilities: [[1.77481015e-001 1.11523637e-001 5.61645555e-001 1.65324400e-002
  9.95860204e-002 3.32313337e-002]
 [1.64705375e-005 1.45111174e-004 2.56612939e-005 9.99735295e-001
  2.91467002e-005 4.83148543e-005]
 [9.95124649e-001 8.90535645e-004 2.01609320e-003 3.28124846e-004
  8.95781674e-004 7.44815716e-004]
 [9.74627350e-001 3.22952839e-003 1.24726633e-002 1.16290058e-003
  5.81094725e-003 2.69661004e-003]
 [1.58428141e-001 1.02213113e-001 5.34566275e-001 1.67023191e-002
  1.15209643e-001 7.28805087e-002]
 [2.49667932e-001 2.64809080e-002 7.02132939e-001 1.95932113e-003
  1.65326998e-002 3.22620050e-003]
 [5.42382483e-003 5.05286387e-003 9.80559643e-001 5.39674123e-004
  7.47298765e-003 9.51006475e-004]
 [9.96115043e-001 7.48212406e-004 1.54638634e-003 2.65382754e-004
  7.76166742e-004 5.48808621e-004]
 [8.98285320e-001 1.53749231e-002 3.93950505e-002 4.14865581e-003
  3.15166614e-002 1.12793896e-002]
 [4.79198127e-002 4.10762684e-001 4.89999760e-001 8.01779796e-003
  1.84