In [None]:
import pandas as pd 
import numpy as np 
import spacy 
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [None]:
# Read spam.csv into a DataFrame. latin-1 is passed explicitly —
# presumably the file is not valid UTF-8 (confirm against the data source).
df = pd.read_csv(
    '../Data/spam.csv',
    encoding='latin-1',
)
df.head()

In [100]:
# Keep only the first two columns; any further columns in the CSV are discarded.
df = df.iloc[:, :2]

In [None]:
# Name the two retained columns: the first becomes 'label', the second 'text'.
df.columns = ['label','text']
df.head()

In [None]:
# Load spaCy's small English pipeline; `nlp` is reused later for inference.
nlp = spacy.load('en_core_web_sm')

# Enable the progress_* methods on pandas objects, then run every message
# through spaCy, producing one parsed result per row of df['text'].
tqdm.pandas(desc='Processing with spaCy')
spacy_results = df.text.progress_map(nlp)

In [103]:
# Sentence-Transformers model used to embed each message.
# Only the model is instantiated here; df['text'] is encoded in a later cell
# (an earlier tqdm-based attempt referenced an undefined `model` and was dropped).
sentence_bert = SentenceTransformer('paraphrase-distilroberta-base-v1')


In [None]:
import swifter

%time 

vectors_swifter = df['text'].swifter.apply(sentence_bert.encode)

In [None]:
# Attach the parsed spaCy results, then flatten each one into a
# space-separated string of its tokens' part-of-speech tags.
df['raw_spacy'] = spacy_results
df['raw_pos'] = df['raw_spacy'].swifter.apply(
    lambda doc: ' '.join(token.pos_ for token in doc)
)

In [None]:
# Store the per-message embeddings (one sentence_bert.encode result per row)
# in their own column so the ColumnTransformer can select them by name.
df['sentence-bert'] = vectors_swifter
df.head()

In [None]:
# Binary-encode the target: 1 for 'spam', 0 for anything else.
# Done as a vectorised membership test instead of a row-wise lambda, and the
# value 1 is accepted alongside 'spam' so that re-running this cell on the
# already-encoded column keeps the labels instead of resetting them all to 0.
df['label'] = df['label'].isin(['spam', 1]).astype(int)
df.head()

In [108]:
# 'raw_spacy' was only used to derive 'raw_pos'; remove it before modelling.
# Rebinding instead of inplace=True, together with errors='ignore', keeps the
# cell safe to re-run after the column has already been dropped.
df = df.drop(columns=['raw_spacy'], errors='ignore')

In [171]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier



In [289]:
# Work on an independent copy so the modelling cells leave `df` untouched.
train_df = df.copy(deep=True)

In [290]:
def stack_embeddings(embeddings):
    """Combine a column of per-row embedding vectors into one 2-D array.

    Parameters
    ----------
    embeddings : object exposing ``.values``
        In this notebook, the 'sentence-bert' column handed over by the
        ColumnTransformer; each element is one embedding vector.

    Returns
    -------
    numpy.ndarray
        The vectors stacked vertically with ``np.vstack``, one row per element.
    """
    # Imported locally rather than relying on the notebook-level import —
    # presumably so the function stays self-contained when the pipeline is
    # serialized; confirm before hoisting it.
    import numpy as np

    per_row_vectors = embeddings.values
    return np.vstack(per_row_vectors)

# Feature extraction: each entry routes one DataFrame column through its own
# transformer; any column not listed is passed through unchanged.
ct = ColumnTransformer(
    [
        ('bag of ngrams', TfidfVectorizer(ngram_range=(1, 2), max_features=3000), 'text'),
        ('bag of POS', CountVectorizer(ngram_range=(1, 2)), 'raw_pos'),
        # A named function is used here because lambda functions cannot be pickled.
        ('sentence bert', FunctionTransformer(stack_embeddings), 'sentence-bert'),
        # Further feature ideas, not wired in (their columns are not created in the cells shown):
        #   ('bag of NER types', CountVectorizer(ngram_range=(1, 2)), 'raw_ner')
        #   ('ngrams before', TfidfVectorizer(ngram_range=(1, 2), max_features=3000), 'raw_before')
        #   ('ngrams after', TfidfVectorizer(ngram_range=(1, 2), max_features=3000), 'raw_after')
    ],
    remainder='passthrough',
)

# Classifier. To try the linear baseline instead, build LogisticRegression()
# and use it as the 'classifier' step below.
xgb = XGBClassifier(random_state=0)

pipeline = Pipeline([('transformer', ct), ('classifier', xgb)])

# Separate target and features without mutating train_df: popping the column
# in place made this cell raise KeyError when it was run a second time.
y = train_df['label']
X = train_df.drop(columns=['label'])

# 70/30 split, stratified on the label so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:


# Fit the full pipeline (feature extraction + classifier) on the training
# split and report how long it takes; the result is bound to `model`.
%time model = pipeline.fit(X_train, y_train)

In [292]:
# Predict labels for the held-out 30% test split.
y_pred = model.predict(X_test)

In [None]:
# Print scikit-learn's classification report comparing the predictions with
# the true test labels.
print(metrics.classification_report(y_test,y_pred))

In [294]:
import dill

# Persist the fitted pipeline with dill. An earlier joblib-based save to
# 'model.sav' was replaced by this approach.
pkl_filename = "../Models/model.pkl"
with open(pkl_filename, mode='wb') as model_file:
    dill.dump(model, model_file)

In [None]:
ls ../Models

In [296]:
# Round-trip check: read the pipeline back from the file written above.
# dill, like pickle, executes code while loading — only load files you trust.
with open(pkl_filename, mode='rb') as model_file:
    loaded_model = dill.load(model_file)

In [297]:
def make_inference_df(input_text):
    """Build the one-row feature frame the pipeline expects for a single message.

    Parameters
    ----------
    input_text : str
        The raw message to classify.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns 'text' (the message itself), 'raw_pos'
        (space-separated part-of-speech tags from the notebook-level ``nlp``)
        and 'sentence-bert' (the embedding from the notebook-level
        ``sentence_bert`` model).
    """
    parsed = nlp(input_text)

    feature_row = {
        'text': input_text,
        'raw_pos': ' '.join(token.pos_ for token in parsed),
        'sentence-bert': sentence_bert.encode(input_text),
    }

    # A list holding one dict yields a one-row frame, with the embedding kept
    # as a single cell value.
    return pd.DataFrame([feature_row])

In [None]:
# Smoke-test the reloaded pipeline on a single promotional-style message.
sample_text = 'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C\'s'
# make_inference_df(sample_text)  # uncomment to inspect the generated feature row
loaded_model.predict(make_inference_df(sample_text))

In [None]:
# Second smoke test, this time with a conversational message.
sample_text_2 = 'Nah I don\'t think he goes to usf, he lives around here though'
print(sample_text_2)
# Score the message defined in this cell; the previous version passed
# `sample_text` again and therefore re-scored the first example.
loaded_model.predict(make_inference_df(sample_text_2))