In [1]:
#!/usr/local/bin/python3
import argparse
import json
import pandas as pd
import numpy as np
import nltk as nltk
import spacy
import regex
import re
import sklearn
from nltk import *
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pickle

In [8]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]

In [140]:
def preprocess(df):
    df = df[['uuid','postText','targetParagraphs']]

    # convert all columns into strings
    df[['postText', 'targetParagraphs']] = df[['postText', 'targetParagraphs']].astype(str)
    #tokenize the relevant columns (not actually used for the Bag of Word approach)
    tokenizer = RegexpTokenizer(r"\w+")
    df["postText_tokens"] = df.apply(lambda row: tokenizer.tokenize(row["postText"]), axis = 1)
    df["paragraph_tokens"] = df.apply(lambda row: tokenizer.tokenize(row["targetParagraphs"]), axis = 1)

    #removing stopwords
    stopwords = nltk.corpus.stopwords.words("english")
    df["postText_tokens"] = df.apply(lambda row: [element for element in row["postText_tokens"] if element not in stopwords], axis = 1)
    df["paragraph_tokens"] = df.apply(lambda row: [element for element in row["paragraph_tokens"] if element not in stopwords], axis = 1)
    
    #lowercasing 
    df['postText_tokens'] = df['postText_tokens'].map(lambda row: list(map(str.lower, row)))
    df['paragraph_tokens'] = df['paragraph_tokens'].map(lambda row: list(map(str.lower, row)))
    
    # multiple space to single space
    df[['postText_tokens', 'paragraph_tokens']] = df[['postText_tokens', 'paragraph_tokens']].replace(r'\s+', ' ', regex=True)
    #special characters
    df[['postText_tokens', 'paragraph_tokens']] = df[['postText_tokens', 'paragraph_tokens']].replace(r'\W', ' ', regex = True)

    #lemmatize tokens
    df['postText_tokens'] = df['postText_tokens'].apply(lemmatize_text)
    df['paragraph_tokens'] = df['paragraph_tokens'].apply(lemmatize_text)

    #count column lengths
    df['postText_length'] = ""
    df['paragraph_length'] = ""
    for i in range(len(df)):
        df['postText_length'][i] = len(df['postText_tokens'][i])
        df['paragraph_length'][i] = len(df['paragraph_tokens'][i])
    
    for i in range(0, len(df)):
        questionmark = "?"
        df['has_questionmark'] = 'posthasquestionmark'
        if questionmark in df['postText'][i]:
            df['has_questionmark'] = 'posthasnoquestionmark'
            
            
    mean_postText_length = df['postText_length'].mean()
    mean_paragraph_length = df['paragraph_length'].mean()

    df['postText_length'] = df['postText_length'].apply(lambda x: 'overavg_post_length' if x > mean_postText_length else 'underavg_post_length')
    df['paragraph_length'] = df['paragraph_length'].apply(lambda x: 'overavg_paragraph_length' if x > mean_paragraph_length else 'underavg_paragraph_length')
    
    for i in range(len(df)):
        df['has_numeric'] = any(str.isdigit(c) for c in df['targetParagraphs'][i])
    df['has_numeric'] = np.where(df['has_numeric'], 'hasnumeric', 'nonumeric')
   
    nlp = spacy.load('en_core_web_lg')
    df['Entities'] = df['postText'].apply(lambda sent: [(ent.text, ent.label_) for ent in nlp(sent).ents])  
    df['Entities'][0]

    for i in range(len(df)):
        tostring = str(df['Entities'][i])
        tostring = ' '.join(str(item) for tup in df['Entities'][i] for item in tup)
        df['Entities'][i] = tostring
  
    df['multi_signs'] = ""
    multi_signs = ['1.', '2.', '3.', '4.', '5.', '6.','7.', '8,', '9.', '10', 'first', 'second', 'third', 'list']
    df['multi_signs'] = df['targetParagraphs'].apply(lambda x: any([k in x for k in multi_signs]))

    df['combined_texts'] = ""
    df['combined_texts'] = df['postText'] + " " + df['targetParagraphs'] + " " + df['postText_length'] + " " + df['paragraph_length'] + " " + df['has_questionmark'] + " " + df['has_numeric'] + df['Entities']

    return df

In [141]:
train_data_path = "../Data/webis-clickbait-22/validation.jsonl"
df = pd.read_json(train_data_path, lines=True)


In [142]:
data = preprocess(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["postText_tokens"] = df.apply(lambda row: tokenizer.tokenize(row["postText"]), axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["paragraph_tokens"] = df.apply(lambda row: tokenizer.tokenize(row["targetParagraphs"]), a

In [144]:
def transform_data(df):
    stopwords = nltk.corpus.stopwords.words("english")
    tokenizer = RegexpTokenizer(r"\w+")

    vectorizer = CountVectorizer(stop_words=stopwords, lowercase=True, tokenizer = tokenizer.tokenize, ngram_range=(1,1))
    combined_bow = vectorizer.fit_transform(df['combined_texts']).toarray()
    bag_of_words = pd.DataFrame(combined_bow, columns=vectorizer.get_feature_names())

    tfidfconverter = TfidfTransformer()
    combined_bow = tfidfconverter.fit_transform(combined_bow).toarray()
    combined_bow = pd.DataFrame(combined_bow)
    return combined_bow

In [145]:
transform_data(data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27347,27348,27349,27350,27351,27352,27353,27354,27355,27356
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
796,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.114082,0.0,0.0,0.0,0.0,0.0
798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [146]:
def getModel():
    pickled_LR_Model = pickle.load(open("Pickle_LR_Model.pkl", "rb"))
    return pickled_LR_Model

In [147]:
def predict_type(df):
    model = getModel()
    data = transform_data(df)
    dummy_df = pd.DataFrame(0, index=data.index, columns=range(62332-data.shape[1]))
    merged_df = pd.concat([data, dummy_df], axis=1)
    prediction = model.predict(merged_df)
    return prediction

In [148]:
prediction = predict_type(data)
prediction = pd.DataFrame(prediction)

In [157]:
prediction

Unnamed: 0,0
0,2
1,2
2,1
3,1
4,2
...,...
795,2
796,2
797,2
798,2


In [155]:
uuid = data[['uuid', 'postText']]

In [159]:
uuid['prediction'] = prediction


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uuid['prediction'] = prediction


In [171]:
uuid.prediction.value_counts()


2    591
1    209
Name: prediction, dtype: int64