In [13]:
import pandas as pd
import numpy as np

import nltk
import re
import string 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import gensim
from gensim.models import Word2Vec


In [2]:
def clean_string(text, stem="None"):

    final_string = ""

    # Make lower
    
    text = text.lower()

    # Remove line breaks

    text = re.sub('\n', '', text)

    # Remove puncuation

    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)

    # Remove stop words

    text = text.split()
    useless_words = nltk.corpus.stopwords.words("english")
    useless_words = useless_words + ['hi', 'im']

    text_filtered = [word for word in text if not word in useless_words]

    # Remove numbers

    text_filtered = [re.sub('\w*\d\w*', '', w) for w in text_filtered]

    ## Remove special chars

    text_filtered = [re.sub(r"[^a-zA-Z0-9 ]", '', w) for w in text_filtered]

    # Stem or Lemmatize

    if stem == 'Stem':
        stemmer = PorterStemmer() 
        text_stemmed = [stemmer.stem(y) for y in text_filtered]
    elif stem == 'Lem':
        lem = WordNetLemmatizer()
        text_stemmed = [lem.lemmatize(y) for y in text_filtered]
    elif stem == 'Spacy':
        text_filtered = nlp(' '.join(text_filtered))
        text_stemmed = [y.lemma_ for y in text_filtered]
    else:
        text_stemmed = text_filtered

    final_string = ' '.join(text_stemmed)

    final_string = final_string.replace("  ", ' ')

    return final_string

In [3]:
df = pd.read_csv('https://github.com/joshnicholas/article_reccomendation/blob/main/archive/binary_cleaned.csv?raw=true')

# columns:
# 'status', 'resolved_title', 'resolved_url', 'keywords', 'excerpt',
#        'cleaned_text', 'title_word_count', 'word_count'


In [4]:
df["cleaned_text"] = df["cleaned_text"].astype(str)

# Split data into train and test
    
X_train, X_test, y_train, y_test = train_test_split(df["cleaned_text"], df["status"],test_size=0.2,shuffle=True)

# Word2Vec on sentences

X_train_tok= [nltk.word_tokenize(i) for i in X_train]  
X_test_tok= [nltk.word_tokenize(i) for i in X_test]


In [5]:
#Tf-Idf

tfidf_vectorizer = TfidfVectorizer(use_idf=True)

X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)

# Word2Vec model

class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

df['clean_text_tok'] = [nltk.word_tokenize(i) for i in df['cleaned_text']]

model = Word2Vec(df['clean_text_tok'],min_count=1)   

w2v = dict(zip(model.wv.index2word, model.wv.syn0))  

modelw = MeanEmbeddingVectorizer(w2v)

# converting text to numerical data using Word2Vec

X_train_vectors_w2v = modelw.transform(X_train_tok)
X_test_vectors_w2v = modelw.transform(X_test_tok)

  w2v = dict(zip(model.wv.index2word, model.wv.syn0))


In [6]:
#FITTING THE CLASSIFICATION MODEL using Logistic Regression(tf-idf)

lr_tfidf=LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
lr_tfidf.fit(X_train_vectors_tfidf, y_train)  

#Predict y value for test dataset

y_predict = lr_tfidf.predict(X_test_vectors_tfidf)
y_prob = lr_tfidf.predict_proba(X_test_vectors_tfidf)[:,1]

print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

print('AUC:', roc_auc)

              precision    recall  f1-score   support

           0       0.72      0.83      0.77      3328
           1       0.43      0.28      0.34      1509

    accuracy                           0.66      4837
   macro avg       0.57      0.56      0.55      4837
weighted avg       0.63      0.66      0.64      4837

Confusion Matrix: [[2766  562]
 [1086  423]]
AUC: 0.607335155732273


In [7]:
#FITTING THE CLASSIFICATION MODEL using Logistic Regression (W2v)

lr_w2v=LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
lr_w2v.fit(X_train_vectors_w2v, y_train)

#Predict y value for test dataset

y_predict = lr_w2v.predict(X_test_vectors_w2v)
y_prob = lr_w2v.predict_proba(X_test_vectors_w2v)[:,1]

print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

print('AUC:', roc_auc)

              precision    recall  f1-score   support

           0       0.70      0.98      0.82      3328
           1       0.62      0.08      0.15      1509

    accuracy                           0.70      4837
   macro avg       0.66      0.53      0.48      4837
weighted avg       0.68      0.70      0.61      4837

Confusion Matrix: [[3251   77]
 [1383  126]]
AUC: 0.5935954784115818


In [8]:
#FITTING THE CLASSIFICATION MODEL using Naive Bayes(tf-idf)

nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_vectors_tfidf, y_train) 

#Predict y value for test dataset

y_predict = nb_tfidf.predict(X_test_vectors_tfidf)
y_prob = nb_tfidf.predict_proba(X_test_vectors_tfidf)[:,1]

print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

              precision    recall  f1-score   support

           0       0.69      1.00      0.82      3328
           1       0.56      0.01      0.01      1509

    accuracy                           0.69      4837
   macro avg       0.63      0.50      0.41      4837
weighted avg       0.65      0.69      0.56      4837

Confusion Matrix: [[3321    7]
 [1500    9]]
AUC: 0.6243402963628486


In [15]:
#Pre-processing the new dataset

df['clean_text'] = df['excerpt'].apply(lambda x: clean_string(x)) #preprocess the data

X_test= df['clean_text'] 

#converting words to numerical data using tf-idf

X_vector=tfidf_vectorizer.transform(X_test)

#use the best model to predict 'target' value for the new dataset 

y_predict = lr_tfidf.predict(X_vector)      
y_prob = lr_tfidf.predict_proba(X_vector)[:,1]

df['predict_prob'] = y_prob
df['target'] = y_predict

final = df[['clean_text','target' ,'predict_prob']].reset_index(drop=True)

print(final.head())

                                          clean_text  target  predict_prob
0  eminem opened restaurant detroit checked detro...       1      0.806617
1  teenaged girl periodically transforms giant pa...       1      0.789723
2  february   register subscribe save articles later       1      0.506887
3  abc grilled anticoalition tweets posted tv per...       1      0.892576
4  aussie comedian rental application cancelled r...       1      0.789556
