In [1]:
import pandas as pd
import numpy as np
import nltk as nltk
import spacy
import regex
import re
import sklearn
from nltk import *
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
import pickle

### Read in the data

In [2]:
train_data_path = "Data/webis-clickbait-22/train.jsonl"
valid_data_path = "Data/webis-clickbait-22/validation.jsonl"

df = pd.read_json(train_data_path, lines=True)
df2 = pd.read_json(valid_data_path, lines = True)
df = df.append(df2)
df = df.reset_index()
df = df.drop(columns = 'index')

In [3]:
pd.set_option('display.max_colwidth', None)

### Select important columns for analysis:

We are using the Posttext, Targetparagraph, spoiler and the postplatform to possibly classify the spoiler type (tag)

In [4]:
#df_selected = df[['postText','targetParagraphs', 'spoiler', 'tags']].astype(str)

### Text is cleaned and pre-processed in the following steps:

In [5]:
tokenizer = RegexpTokenizer(r"\w+")

In [6]:
stopwords = nltk.corpus.stopwords.words("english")

In [7]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]

In [8]:
def preprocess_data(df):
    df = df[['postText','targetParagraphs', 'spoiler', 'tags']]

    # convert all columns into strings
    df[['postText', 'targetParagraphs', 'spoiler', 'tags']] = df[['postText', 'targetParagraphs', 'spoiler', 'tags']].astype(str)
    #tokenize the relevant columns (not actually used for the Bag of Word approach)
    tokenizer = RegexpTokenizer(r"\w+")
    df["postText_tokens"] = df.apply(lambda row: tokenizer.tokenize(row["postText"]), axis = 1)
    df["paragraph_tokens"] = df.apply(lambda row: tokenizer.tokenize(row["targetParagraphs"]), axis = 1)
    df["spoiler_tokens"] = df.apply(lambda row: tokenizer.tokenize(row["spoiler"]), axis = 1)
    
    #removing stopwords
    stopwords = nltk.corpus.stopwords.words("english")
    df["postText_tokens"] = df.apply(lambda row: [element for element in row["postText_tokens"] if element not in stopwords], axis = 1)
    df["paragraph_tokens"] = df.apply(lambda row: [element for element in row["paragraph_tokens"] if element not in stopwords], axis = 1)
    df["spoiler_tokens"] = df.apply(lambda row: [element for element in row["spoiler_tokens"] if element not in stopwords], axis = 1)
    
    #lowercasing 
    df['postText_tokens'] = df['postText_tokens'].map(lambda row: list(map(str.lower, row)))
    df['paragraph_tokens'] = df['paragraph_tokens'].map(lambda row: list(map(str.lower, row)))
    df['spoiler_tokens'] = df['spoiler_tokens'].map(lambda row: list(map(str.lower, row)))
    
    # multiple space to single space
    df[['postText_tokens', 'paragraph_tokens', 'spoiler_tokens']] = df[['postText_tokens', 'paragraph_tokens', 'spoiler_tokens']].replace(r'\s+', ' ', regex=True)
    #special characters
    df[['postText_tokens', 'paragraph_tokens', 'spoiler_tokens']] = df[['postText_tokens', 'paragraph_tokens', 'spoiler_tokens']].replace(r'\W', ' ', regex = True)

    #lemmatize tokens
    df['postText_tokens'] = df['postText_tokens'].apply(lemmatize_text)
    df['paragraph_tokens'] = df['paragraph_tokens'].apply(lemmatize_text)
    df['spoiler_tokens'] = df['spoiler_tokens'].apply(lemmatize_text)
    
    #count column lengths
    df['postText_length'] = ""
    df['paragraph_length'] = ""
    df['spoiler_length'] = ""
    for i in range(len(df)):
        df['postText_length'][i] = len(df['postText_tokens'][i])
        df['paragraph_length'][i] = len(df['paragraph_tokens'][i])
        df['spoiler_length'][i] = len(df['spoiler_tokens'][i])
    
    for i in range(0, len(df)):
        questionmark = "?"
        df['has_questionmark'] = 'posthasquestionmark'
        if questionmark in df['postText'][i]:
            df['has_questionmark'] = 'posthasnoquestionmark'
            
            
    mean_postText_length = df['postText_length'].mean()
    mean_paragraph_length = df['paragraph_length'].mean()
    mean_spoiler_length = df['spoiler_length'].mean()

    df['postText_length'] = df['postText_length'].apply(lambda x: 'overavg_post_length' if x > mean_postText_length else 'underavg_post_length')
    df['paragraph_length'] = df['paragraph_length'].apply(lambda x: 'overavg_paragraph_length' if x > mean_paragraph_length else 'underavg_paragraph_length')
    df['spoiler_length'] = df['spoiler_length'].apply(lambda x: 'overavg_spoiler_length' if mean_spoiler_length > 49 else 'underavg_spoiler_length')

    
    for i in range(len(df)):
        df['has_numeric'] = any(str.isdigit(c) for c in df['targetParagraphs'][i])
    df['has_numeric'] = np.where(df['has_numeric'], 'hasnumeric', 'nonumeric')
   
    nlp = spacy.load('en_core_web_lg')
    df['Entities'] = df['postText'].apply(lambda sent: [(ent.text, ent.label_) for ent in nlp(sent).ents])  
    df['Entities'][0]

    for i in range(len(df)):
        tostring = str(df['Entities'][i])
        tostring = ' '.join(str(item) for tup in df['Entities'][i] for item in tup)
        df['Entities'][i] = tostring
  
    df['multi_signs'] = ""
    multi_signs = ['1.', '2.', '3.', '4.', '5.', '6.','7.', '8,', '9.', '10', 'first', 'second', 'third', 'list']
    df['multi_signs'] = df['targetParagraphs'].apply(lambda x: any([k in x for k in multi_signs]))

    df['combined_texts'] = ""
    df['combined_texts'] = df['postText'] + " " + df['targetParagraphs'] + " " + df['postText_length'] + " " + df['paragraph_length'] + " " + df['has_questionmark'] + " " + df['has_numeric'] + df['Entities']

    return df


In [9]:
df_selected = preprocess_data(df)
#df_selected

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["postText_tokens"] = df.apply(lambda row: tokenizer.tokenize(row["postText"]), axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["paragraph_tokens"] = df.apply(lambda row: tokenizer.tokenize(row["targetParagraphs"]), a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['has_questionmark'] = 'posthasquestionmark'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['has_questionmark'] = 'posthasnoquestionmark'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['postText_length'] = df['postText_length'].apply(lambda x: 'overavg_post_length' if x > mean_postText_leng

In [10]:
y = df_selected['tags']

### Convert our text into bag of word vectors


In [11]:
#?vectorizer

In [12]:
#max_features could be used to constrain the amount of words
# ngram_range 2,2 is too long
#ngram range 1,2 works without tf-idf
vectorizer = CountVectorizer(stop_words=stopwords, lowercase=True, tokenizer = tokenizer.tokenize, ngram_range=(1,1))

In [13]:
# Add the vectorized dataframes to another and transform them back
combined_bow = vectorizer.fit_transform(df_selected['combined_texts']).toarray()
bag_of_words = pd.DataFrame(combined_bow, columns=vectorizer.get_feature_names())

In [14]:
bag_of_words

Unnamed: 0,0,00,000,00000000000000000000000000000000000000625,00000000130,000001002003004005006,0000024,00007,0005,000s,...,いいね,コメント,ツ,件,寵物小精靈,比卡超,皮卡丘,神奇寶貝,精靈寶可夢,認証済み
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
X = combined_bow

In [16]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### Finding TFIDF of our bag of words

(n = 1,2) took over an hour and didnt compute.

In [17]:
tfidfconverter = TfidfTransformer()

In [18]:
X = tfidfconverter.fit_transform(X).toarray()

In [19]:
X = pd.DataFrame(X)

In [20]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Columns: 62332 entries, 0 to 62331
dtypes: float64(62332)
memory usage: 1.9 GB


In [21]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62322,62323,62324,62325,62326,62327,62328,62329,62330,62331
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.014798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3996,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3997,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3998,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Using the saved Logistic Regression for spoiler classification:

In [22]:
y = pd.DataFrame(y)
y = pd.get_dummies(y['tags'])
y = y.values.argmax(1)
y = pd.DataFrame(y)
y = y.rename(columns={0: "tags"})

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

In [32]:
# Load the Model back from file
PKl_Filename = 'Pickle_LR_Model.pkl'
with open(PKl_Filename, 'rb') as file:  
    Pickled_LR_Model = pickle.load(file)

Pickled_LR_Model

LogisticRegression()

In [33]:
Pickled_LR_Model.fit(X_train, y_train)
y_pred = Pickled_LR_Model.predict(X_test)
y_pred_proba = Pickled_LR_Model.predict_proba(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

  return f(*args, **kwargs)


[[ 23  59  61]
 [  7 178 120]
 [  2 114 236]]
              precision    recall  f1-score   support

           0       0.72      0.16      0.26       143
           1       0.51      0.58      0.54       305
           2       0.57      0.67      0.61       352

    accuracy                           0.55       800
   macro avg       0.60      0.47      0.47       800
weighted avg       0.57      0.55      0.52       800

0.54625


### Apply ML techniques

Random Forest and Logistic Regression were the most successful approaches. 

### Random Forest

In [24]:
classifier = RandomForestClassifier(n_estimators=500, random_state=8, max_depth=100)
classifier.fit(X_train, y_train) 

  classifier.fit(X_train, y_train)


RandomForestClassifier(max_depth=100, n_estimators=500, random_state=8)

In [25]:
y_pred = classifier.predict(X_test)

In [26]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[ 16  60  67]
 [  4 189 112]
 [  3 123 226]]
              precision    recall  f1-score   support

           0       0.70      0.11      0.19       143
           1       0.51      0.62      0.56       305
           2       0.56      0.64      0.60       352

    accuracy                           0.54       800
   macro avg       0.59      0.46      0.45       800
weighted avg       0.56      0.54      0.51       800

0.53875


### Logistic Regression

In [27]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

  return f(*args, **kwargs)


LogisticRegression()

In [28]:
y_pred = logreg.predict(X_test)
y_pred_proba = logreg.predict_proba(X_test)

In [29]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[ 23  59  61]
 [  7 178 120]
 [  2 114 236]]
              precision    recall  f1-score   support

           0       0.72      0.16      0.26       143
           1       0.51      0.58      0.54       305
           2       0.57      0.67      0.61       352

    accuracy                           0.55       800
   macro avg       0.60      0.47      0.47       800
weighted avg       0.57      0.55      0.52       800

0.54625


## Saving the best model as pickle file

In [30]:
# Save the model to file in the current working directory

Pkl_Filename = "Pickle_LR_Model.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(logreg, file)

### Naive Bayes

In [33]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

  return f(*args, **kwargs)


In [34]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[ 34  57  52]
 [ 61 129 115]
 [ 77 123 152]]
              precision    recall  f1-score   support

           0       0.20      0.24      0.22       143
           1       0.42      0.42      0.42       305
           2       0.48      0.43      0.45       352

    accuracy                           0.39       800
   macro avg       0.36      0.36      0.36       800
weighted avg       0.40      0.39      0.40       800

0.39375


### XgBoost

In [35]:
import xgboost as xgb

In [36]:
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

In [37]:
params = { "objective": "multi:softmax", 'num_class': 3, 'max_depth' : 30}

In [38]:
d_train = xgb.DMatrix(X_train, y_train)
d_test = xgb.DMatrix(X_test, y_test)

In [39]:
xgb_model = xgb.train(params, d_train, num_boost_round=10)

In [40]:
y_pred = xgb_model.predict(d_test)
#y_pred = np.where(np.array(y_pred) > 0.5, 1, 0)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[ 41  51  51]
 [ 12 179 114]
 [ 22 135 195]]
              precision    recall  f1-score   support

           0       0.55      0.29      0.38       143
           1       0.49      0.59      0.53       305
           2       0.54      0.55      0.55       352

    accuracy                           0.52       800
   macro avg       0.53      0.48      0.49       800
weighted avg       0.52      0.52      0.51       800

0.51875


### KNearestNeighbors

In [41]:
from sklearn.neighbors import KNeighborsClassifier

In [42]:
model = KNeighborsClassifier(n_neighbors = 3)
model.fit(X_train, y_train)

  return self._fit(X, y)


KNeighborsClassifier(n_neighbors=3)

In [43]:
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[ 70  37  36]
 [ 88 121  96]
 [ 93 111 148]]
              precision    recall  f1-score   support

           0       0.28      0.49      0.36       143
           1       0.45      0.40      0.42       305
           2       0.53      0.42      0.47       352

    accuracy                           0.42       800
   macro avg       0.42      0.44      0.42       800
weighted avg       0.45      0.42      0.43       800

0.42375
