In [20]:
import numpy as np
import pandas as pd
import nltk
import warnings
warnings.filterwarnings('ignore')
# nltk.download('stopwords')
# nltk.download('wordnet')
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.porter import *
import string
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix, hstack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm

In [21]:
SEED = 42

In [22]:
list_businesses = [line.strip() for line in open("list_of_businesses.txt", 'r')]
list_hs_codes = [line.strip() for line in open("list_hs_codes.txt", 'r')]
list_strengthes = [line.strip() for line in open("list_strengthes.txt", 'r')]

In [23]:
def prepare_data(path, status):
    """
    path, str to dataset, e.g 'model_passed_review.csv'
    status, could be 'passed' or 'failed'
    e.g. prepare_data('model_passed_review.csv', 'passed')
    """
    df = pd.read_csv(path, header = None)
    
    df = df.drop([4, 5], axis=1)
    df = df.rename(columns=
                       {0 : "Business", 
                        1 : "Part_number",
                        2 : "Part_description",
                        3 : "HS_code",
                        6 : "Audit_trail",
                        7 : "Strength"})
    df = df.drop_duplicates(subset=['Audit_trail', 'HS_code']).reset_index(drop=True)
    df = df.fillna('missed_data')
    df['HS_code'] = df['HS_code'].astype('str')
    df['Strength'] = df['Strength'].astype('str')
    #converting the 
    df['Business'] = df['Business'].apply(lambda x: x if x in list_businesses else 'unknown')
    df['HS_code'] = df['HS_code'].apply(lambda x: x if x in list_hs_codes else 'unknown')
    df['Strength'] = df['Strength'].apply(lambda x: x if x in list_strengthes else 'unknown')
    #adding the label
    if status == "passed":
        df['Failed'] = 0
    else:
        df['Failed'] = 1
    df_text = df['Part_number'] + " " + df['Part_description'] + " " + df['Audit_trail']
    df_features = df[['Business', 'HS_code', 'Strength']]
    y = df['Failed']

    return df_text, df_features, y

In [24]:
passed_text, passed_features, y_passed = prepare_data('model_passed_review.csv', 'passed')
failed_text, failed_features, y_failed = prepare_data('model_failed_review.csv', 'failed')

In [25]:
df_train_text = pd.concat([passed_text, failed_text], ignore_index=True)
df_train_features = pd.concat([passed_features, failed_features], ignore_index=True)
y_train = pd.concat([y_passed, y_failed], ignore_index=True)

In [26]:
enc = OneHotEncoder(categories=(list_businesses, list_hs_codes, list_strengthes))

In [27]:
df_train_features = enc.fit_transform(df_train_features)
df_train_features

<66302x1366 sparse matrix of type '<class 'numpy.float64'>'
	with 198906 stored elements in Compressed Sparse Row format>

In [28]:
df_train_features.shape[1] == len(list_businesses) + len(list_hs_codes) + len(list_strengthes)

True

In [29]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_punct(text):
    table = str.maketrans('','',string.punctuation)
    return text.translate(table)

cachedStopWords = stopwords.words("english")

def remove_stopwords(text):
    text = ' '.join([word for word in text.split() if word not in cachedStopWords])
    return text

stemmer = SnowballStemmer('english')
#stemmer = PorterStemmer()
#lemmatizer = WordNetLemmatizer()

In [32]:
#The function takes as input dataset and columns to process
def clean_data(df):
    df = df.apply(lambda x : remove_URL(x))
    df = df.apply(lambda x : remove_punct(x))
    df = df.apply(lambda x : remove_stopwords(x))
    df = df.apply(lambda x : stemmer.stem(x))
    # df = df.apply(lambda x : lemmatizer.lemmatize(x))

    return df

In [33]:
df_cleaned_train_text = clean_data(df_train_text)
print('text reducing is ', len(" ".join(df_cleaned_train_text))/len(" ".join(df_train_text)))

text reducing is  0.7957101890654006


In [34]:
MAX_DF = 0.8
MIN_COUNT = 5
NGRAMS = (1, 1)

In [35]:
TOKEN_RE = re.compile(r'[a-z]+|-?\d*[-.,]?\d+|\S')

def tokenize_text_simple_regex(txt, min_token_size=2):
    txt = txt.lower()
    all_tokens = TOKEN_RE.findall(txt)

    return [token for token in all_tokens if len(token) >= min_token_size]

In [36]:
vector = TfidfVectorizer(tokenizer=tokenize_text_simple_regex,
                            min_df=MIN_COUNT, max_df=MAX_DF,
                            ngram_range = NGRAMS)
vector.fit(df_cleaned_train_text)

TfidfVectorizer(max_df=0.8, min_df=5,
                tokenizer=<function tokenize_text_simple_regex at 0x0000026412DBBE50>)

In [37]:
def get_top_n_words(corpus, n=None):

    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in     vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [38]:
get_top_n_words(df_cleaned_train_text, n=10)

[('per', 94695),
 ('it', 42820),
 ('not', 30120),
 ('steel', 27997),
 ('used', 26647),
 ('this', 24443),
 ('part', 20537),
 ('made', 19849),
 ('engineer', 17994),
 ('plm', 16537)]

In [39]:
df_cleaned_train_text = vector.transform(df_cleaned_train_text)
df_cleaned_train_text

<66302x18341 sparse matrix of type '<class 'numpy.float64'>'
	with 1700558 stored elements in Compressed Sparse Row format>

In [40]:
df_total = hstack([df_train_features, df_cleaned_train_text])
y = pd.concat([y_passed, y_failed])

In [41]:
X_train, X_holdout, y_train, y_holdout = train_test_split(df_total, y, 
                                                          test_size=0.3, 
                                                          random_state=SEED)

In [42]:
clf = LogisticRegression(random_state=SEED).fit(X_train, y_train)
print('F1 score (train) %.3f' % f1_score(y_train, clf.predict(X_train)))
print('F1 score (holdout) %.3f' % f1_score(y_holdout, clf.predict(X_holdout)))

F1 score (train) 0.778
F1 score (holdout) 0.702


In [43]:
clf2 = RidgeClassifier(random_state=SEED).fit(X_train, y_train)
print('F1 score (train) %.3f' % f1_score(y_train, clf2.predict(X_train)))
print('F1 score (holdout) %.3f' % f1_score(y_holdout, clf2.predict(X_holdout)))

F1 score (train) 0.834
F1 score (holdout) 0.711


In [44]:
clf3 = MultinomialNB().fit(X_train, y_train)
print('F1 score (train) %.3f' % f1_score(y_train, clf3.predict(X_train)))
print('F1 score (holdout) %.3f' % f1_score(y_holdout, clf3.predict(X_holdout)))

F1 score (train) 0.706
F1 score (holdout) 0.655


In [45]:
clf4 = svm.LinearSVC(random_state=SEED).fit(X_train, y_train)
print('F1 score (train) %.3f' % f1_score(y_train, clf4.predict(X_train)))
print('F1 score (holdout) %.3f' % f1_score(y_holdout, clf4.predict(X_holdout)))

F1 score (train) 0.859
F1 score (holdout) 0.717


In [46]:
test_passed_text, test_passed_features, y_passed_test = prepare_data('test_passed_review.csv', 'passed')
test_failed_text, test_failed_features, y_failed_test = prepare_data('test_failed_review.csv', 'failed')

In [47]:
df_test_text = pd.concat([test_passed_text, test_failed_text], ignore_index=True)
df_test_features = pd.concat([test_passed_features, test_failed_features], ignore_index=True)
y_test = pd.concat([y_passed_test, y_failed_test], ignore_index=True)

In [48]:
df_test_features = enc.transform(df_test_features)
df_test_features

<42767x1366 sparse matrix of type '<class 'numpy.float64'>'
	with 128301 stored elements in Compressed Sparse Row format>

In [49]:
df_test_features.shape[1] == len(list_businesses) + len(list_hs_codes) + len(list_strengthes)

True

In [50]:
df_cleaned_test_text = clean_data(df_test_text)
print('text reducing is ', len(" ".join(df_cleaned_test_text))/len(" ".join(df_test_text)))

text reducing is  0.7911105632121019


In [51]:
df_cleaned_test_text = vector.transform(df_cleaned_test_text)
df_cleaned_train_text

<66302x18341 sparse matrix of type '<class 'numpy.float64'>'
	with 1700558 stored elements in Compressed Sparse Row format>

In [52]:
X_test = hstack([df_test_features, df_cleaned_test_text])

In [53]:
print('Logistic regression F1 score (test) %.3f' % f1_score(y_test, clf.predict(X_test)))
print('Ridge Clasifier F1 score (test) %.3f' % f1_score(y_test, clf2.predict(X_test)))
print('Naive Bayes F1 score (test) %.3f' % f1_score(y_test, clf3.predict(X_test)))
print('Linear SVM F1 score (test) %.3f' % f1_score(y_test, clf4.predict(X_test)))

Logistic regression F1 score (test) 0.606
Ridge Clasifier F1 score (test) 0.593
Naive Bayes F1 score (test) 0.575
Linear SVM F1 score (test) 0.595
