In [1]:
%matplotlib inline
from matplotlib import pyplot as plt

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import metrics
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
  
from statistics import mode

from scipy import stats

import numpy as np
import pandas as pd
import os
import glob
import math
import re
import enchant
import pickle
import time
import editdistance

  from numpy.core.umath_tests import inner1d


In [2]:
dict_label = {
    'Datetime':0, 
    'Sentence':1, 
    'Custom Object': 2, 
    'URL': 3, 
    'Numbers': 4, 
    'List': 5}

data = pd.read_csv('data/needs_extraction_data/labelled_data.csv')
data['y_act'] = [dict_label[i] for i in data['y_act']]
y = data.loc[:,['y_act']]
data.columns

Index(['Record_id', 'Attribute_name', 'y_pred', 'y_act', 'Reason',
       'total_vals', 'num_nans', '%_nans', 'mean_word_count',
       'std_dev_word_count', 'has_delimiters', 'sample_1', 'sample_2',
       'sample_3', 'sample_4', 'sample_5'],
      dtype='object')

In [3]:
data1 = data[['%_nans', 'mean_word_count', 'std_dev_word_count', 'has_delimiters']]
data1 = data1.fillna(0)

data1 = data1.rename(columns={'mean_word_count': 'scaled_mean_token_count', 'std_dev_word_count': 'scaled_std_dev_token_count', '%_nans': 'scaled_perc_nans'})
data1.loc[data1['scaled_mean_token_count'] > 10000, 'scaled_mean_token_count'] = 10000
data1.loc[data1['scaled_mean_token_count'] < -10000, 'scaled_mean_token_count'] = -10000
data1.loc[data1['scaled_std_dev_token_count'] > 10000, 'scaled_std_dev_token_count'] = 10000
data1.loc[data1['scaled_std_dev_token_count'] < -10000, 'scaled_std_dev_token_count'] = -10000
data1.loc[data1['scaled_perc_nans'] > 10000, 'scaled_perc_nans'] = 10000
data1.loc[data1['scaled_perc_nans'] < -10000, 'scaled_perc_nans'] = -10000
column_names_to_normalize = ['scaled_mean_token_count', 'scaled_std_dev_token_count','scaled_perc_nans']
x = data1[column_names_to_normalize].values
x = np.nan_to_num(x)
x_scaled = StandardScaler().fit_transform(x)
df_temp = pd.DataFrame(x_scaled, columns=column_names_to_normalize, index = data1.index)
data1[column_names_to_normalize] = df_temp

y.y_act = y.y_act.astype(float)

print(f"> Data mean: {data1.mean()}\n")
print(f"> Data median: {data1.median()}\n")
print(f"> Data stdev: {data1.std()}\n")

print("===[VECTORIZATION]===")
arr = data['Attribute_name'].values
data = data.fillna(0)
arr1 = data['sample_1'].values
arr1 = [str(x) for x in arr1]
arr2 = data['sample_2'].values
arr2 = [str(x) for x in arr2]

vectorizer = CountVectorizer(ngram_range=(3, 3), analyzer='char')
X = vectorizer.fit_transform(arr)
X1 = vectorizer.fit_transform(arr1)
X2 = vectorizer.fit_transform(arr2)

print(f"> Length of vectorized feature_names: {len(vectorizer.get_feature_names())}")

# data1.to_csv('data/preprocessing/before.csv')
attr_df = pd.DataFrame(X.toarray())
sample1_df = pd.DataFrame(X1.toarray())
sample2_df = pd.DataFrame(X2.toarray())

data2 = pd.concat([data1, attr_df, sample1_df, sample2_df], axis=1, sort=False)
# data2.to_csv('data/preprocessing/after.csv')
data2.head()

X_train, X_test_held, y_train, y_test_held = train_test_split(
    data2, y, test_size=0.2, random_state=100)

X_train_new = X_train.reset_index(drop=True)
y_train_new = y_train.reset_index(drop=True)
print(f"X_train preview:\n{X_train.head()}")
print(f"y_train preview:\n{y_train.head()}")

X_train_new = X_train_new.values
y_train_new = y_train_new.values

> Data mean: scaled_perc_nans             -2.745801e-16
scaled_mean_token_count      -1.117919e-16
scaled_std_dev_token_count   -2.236863e-17
has_delimiters                3.105360e-01
dtype: float64

> Data median: scaled_perc_nans             -0.653046
scaled_mean_token_count      -0.144106
scaled_std_dev_token_count   -0.171320
has_delimiters                0.000000
dtype: float64

> Data stdev: scaled_perc_nans              1.000925
scaled_mean_token_count       1.000925
scaled_std_dev_token_count    1.000925
has_delimiters                0.463141
dtype: float64

===[VECTORIZATION]===
> Length of vectorized feature_names: 8528
X_train preview:
     scaled_perc_nans  scaled_mean_token_count  scaled_std_dev_token_count  \
453         -0.653097                 0.686283                    3.364514   
43          -0.653120                 0.162079                   -0.054513   
133          1.978459                -0.148544                   -0.167108   
205         -0.653120           

In [4]:
X_train_train, X_val, y_train_train, y_val = train_test_split(
    X_train_new, y_train_new, test_size=0.25, random_state=100)

In [5]:
def record_failed(model, x_test, y_test):
    data = pd.read_csv('data/needs_extraction_data/labelled_data.csv')
    dict_label = {
        0: 'Datetime',
        1: 'Sentence',
        2: 'Custom Object',
        3: 'URL',
        4: 'Numbers',
        5: 'List'
    }
    preds = model.predict(x_test)
    ret_df = pd.DataFrame(columns=['Index', 'y_pred', 'y_act', 'sample_1', 'sample_2', 'sample_3'])
    count = 0
    for pred in preds:
        y_true = int(y_test.values[count])
        if pred != y_true:
            index = x_test.index[count]
            row = data.loc[index]
            samples = [
                row.sample_1,
                row.sample_2,
                row.sample_3
            ]
            ret_df.loc[count] = [index, pred, y_true] + samples
        count += 1
    ret_df['y_act'] = ret_df['y_act'].astype(int)
    ret_df['y_pred'] = ret_df['y_pred'].astype(int)
    ret_df['y_act'] = [dict_label[i] for i in ret_df['y_act']]
    ret_df['y_pred'] = [dict_label[i] for i in ret_df['y_pred']]
    return ret_df


def get_false_nums(df):
    false_labels = df.y_pred.unique()
    true_labels = df.y_act.unique()
    length = len(df)
    for label in false_labels:
        total = len(df.loc[df.y_pred == label])
        print(f">>> Incorrectly predicted as {label}: {total}, {'{0:.3g}'.format((total/length)*100)}%")
        
        temp_df = df.loc[df.y_pred == label]
        temp_labels = temp_df.y_act.unique()
        t_len = len(temp_df)
        for t_label in temp_labels:
            t_total = len(temp_df[temp_df.y_act == t_label])
            print(f"\t>>> {t_label} predicted as {label}: {t_total}, {'{0:.5g}'.format((t_total/t_len)*100)}%")
    print("\n")

In [6]:
rf_clf = RandomForestClassifier(n_estimators=50, max_depth=50, random_state=100)
lr_clf = LogisticRegression(C=1, random_state=100)
svm_clf = svm.SVC(C=10, gamma=0.1, random_state=100)

rf_clf.fit(X_train_train, y_train_train)
lr_clf.fit(X_train_train, y_train_train)
svm_clf.fit(X_train_train, y_train_train)

print("="*20,"[Random Forest]","="*20)
trainsc = rf_clf.score(X_train, y_train)
valsc = rf_clf.score(X_val, y_val)
heldsc = rf_clf.score(X_test_held, y_test_held)
held_pr = metrics.precision_recall_fscore_support(y_test_held, rf_clf.predict(X_test_held))
print(f"> Train: {trainsc}")
print(f"> Validate: {valsc}")
print(f"> Test: {heldsc}")
print(f"> Precision Numbers: {held_pr[0][1]}")
print(f"> Precision Not Numbers: {held_pr[0][0]}")
print(f"> Recall Numbers: {held_pr[1][1]}")
print(f"> Recall Not Numbers: {held_pr[1][0]}")
print()
print("="*20,"[Logistic Regression]","="*20)
trainsc = lr_clf.score(X_train, y_train)
valsc = lr_clf.score(X_val, y_val)
heldsc = lr_clf.score(X_test_held, y_test_held)
held_pr = metrics.precision_recall_fscore_support(y_test_held, lr_clf.predict(X_test_held))
print(f"> Train: {trainsc}")
print(f"> Validate: {valsc}")
print(f"> Test: {heldsc}")
print(f"> Precision Numbers: {held_pr[0][1]}")
print(f"> Precision Not Numbers: {held_pr[0][0]}")
print(f"> Recall Numbers: {held_pr[1][1]}")
print(f"> Recall Not Numbers: {held_pr[1][0]}")
print()
print("="*20,"[SVM]","="*20)
trainsc = svm_clf.score(X_train, y_train)
valsc = svm_clf.score(X_val, y_val)
heldsc = svm_clf.score(X_test_held, y_test_held)
held_pr = metrics.precision_recall_fscore_support(y_test_held, svm_clf.predict(X_test_held))
print(f"> Train: {trainsc}")
print(f"> Validate: {valsc}")
print(f"> Test: {heldsc}")
print(f"> Precision Numbers: {held_pr[0][1]}")
print(f"> Precision Not Numbers: {held_pr[0][0]}")
print(f"> Recall Numbers: {held_pr[1][1]}")
print(f"> Recall Not Numbers: {held_pr[1][0]}")

  """
  y = column_or_1d(y, warn=True)




  'precision', 'predicted', average, warn_for)


> Train: 0.9328703703703703
> Validate: 0.7592592592592593
> Test: 0.8073394495412844
> Precision Numbers: 0.9285714285714286
> Precision Not Numbers: 0.9230769230769231
> Recall Numbers: 0.5909090909090909
> Recall Not Numbers: 0.8888888888888888

> Train: 0.9351851851851852
> Validate: 0.7592592592592593
> Test: 0.8807339449541285
> Precision Numbers: 1.0
> Precision Not Numbers: 1.0
> Recall Numbers: 0.7727272727272727
> Recall Not Numbers: 0.9259259259259259

> Train: 0.8842592592592593
> Validate: 0.5370370370370371
> Test: 0.5963302752293578
> Precision Numbers: 1.0
> Precision Not Numbers: 0.8571428571428571
> Recall Numbers: 0.09090909090909091
> Recall Not Numbers: 0.4444444444444444


In [7]:
rf_clf = RandomForestClassifier(n_estimators=50, max_depth=50, random_state=100)
lr_clf = LogisticRegression(C=1, random_state=100)
svm_clf = svm.SVC(C=10, gamma=0.1, random_state=100)

rf_clf.fit(X_train_train, y_train_train)
lr_clf.fit(X_train_train, y_train_train)
svm_clf.fit(X_train_train, y_train_train)

print("="*20,"[Random Forest]","="*20)
fails = record_failed(rf_clf, X_test_held, y_test_held)
print(f">>> Total incorrect predictions: {len(fails)}, {'{0:.4g}'.format(((len(fails)/len(y_test_held))*100))}%")
get_false_nums(fails)
print(fails.head(10))
print("="*20,"[Logistic Regression]","="*20)
fails = record_failed(lr_clf, X_test_held, y_test_held)
print(f">>> Total incorrect predictions: {len(fails)}, {'{0:.4g}'.format(((len(fails)/len(y_test_held))*100))}%")
get_false_nums(fails)
print(fails.head(10))
print("="*20,"[SVM]","="*20)
fails = record_failed(svm_clf, X_test_held, y_test_held)
print(f">>> Total incorrect predictions: {len(fails)}, {'{0:.4g}'.format(((len(fails)/len(y_test_held))*100))}%")
get_false_nums(fails)
print(fails.head(10))

  """
  y = column_or_1d(y, warn=True)


>>> Total incorrect predictions: 21, 19.27%
>>> Incorrectly predicted as Custom Object: 18, 85.7%
	>>> List predicted as Custom Object: 5, 27.778%
	>>> Sentence predicted as Custom Object: 9, 50%
	>>> Numbers predicted as Custom Object: 1, 5.5556%
	>>> Datetime predicted as Custom Object: 3, 16.667%
>>> Incorrectly predicted as Datetime: 2, 9.52%
	>>> Custom Object predicted as Datetime: 2, 100%
>>> Incorrectly predicted as Sentence: 1, 4.76%
	>>> Custom Object predicted as Sentence: 1, 100%


   Index         y_pred          y_act  \
0    485  Custom Object           List   
1    188  Custom Object           List   
9    175  Custom Object       Sentence   
15   360  Custom Object        Numbers   
21   356  Custom Object       Datetime   
24   341       Datetime  Custom Object   
46    31  Custom Object       Sentence   
51   321  Custom Object           List   
56   393  Custom Object       Sentence   
59    56  Custom Object           List   

                                      