# Get the data

In [None]:
import json
import os
import pandas as pd

# https://github.com/ipython/ipython/issues/10123
# Paths are resolved relative to the current working directory.
directory_path = os.getcwd()
dataset_no_figures_path = directory_path + '/../data/dataset_no_figures/'

# Maps each instance id to its binary label:
# 0 for 'no-clickbait', 1 for any other truthClass value.
is_clickbait = {}

with open(dataset_no_figures_path + 'truth_train.jsonl', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        truth = json.loads(line)
        is_clickbait[truth['id']] = 0 if truth['truthClass'] == 'no-clickbait' else 1

# One small frame per instance, concatenated once after the loop.
# DataFrame.append copied the whole frame on every iteration (quadratic cost)
# and is no longer available in pandas 2.0+.
instance_frames = []

with open(dataset_no_figures_path + 'instances_train.jsonl', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue
        instance = json.loads(line)
        instance_frames.append(pd.DataFrame(
            {
                'post_text': instance['postText'],
                'is_clickbait': is_clickbait[instance['id']],
            },
            index=[instance['id']],
        ))

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(instance_frames) if instance_frames else pd.DataFrame()

# print(df)
print('finished')

# Preprocess the data

TODO: Should newlines be removed from `postText`? (e.g., the `\n` in 17560)

# Feature Selection

* Coleman-Liau score (CLScore)
* RIX and LIX indices
* Formality measure (fmeasure)
* The number of uppercase words, the presence of question marks and exclamation marks in headlines (titles), and the length of the title (number of words) are the most important content features


* The character n-gram features and the word 1-gram feature appear to contribute most to performance
    * Character n-grams are known to capture writing style


* headline: word count
* body: informality. We compute the frequencies of two informality indicators, namely internet slang and bait words. Additionally, the length of the news body is also an input feature.


* Sentence length, word length, ratio of stop words to content words

# Feature Engineering

In [None]:
from collections import Counter
from nltk import ngrams
# TODO
# from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.tokenize import sent_tokenize, word_tokenize
from string import ascii_lowercase, ascii_uppercase
import nltk


# https://stackoverflow.com/questions/10677020/real-word-count-in-nltk
def number_of_words(text):
    """Return the number of tokens ``word_tokenize`` produces for ``text``."""
    # TODO: evaluate RegexpTokenizer(r'\w+') as an alternative tokenizer.
    return len(word_tokenize(text))


def number_of_character_1_grams(text):
    """Return how many character 1-grams ``ngrams`` yields for ``text``."""
    unigrams = ngrams(list(text), 1)
    return sum(1 for _ in unigrams)


def number_of_character_2_grams(text):
    """Return how many character 2-grams ``ngrams`` yields for ``text``.

    Empty text has no 2-grams, so the result is the integer 0.
    """
    if not text:
        # Must be a number, not an empty list: the value is stored in a
        # numeric feature column.
        return 0
    bigrams = ngrams(list(text), 2)
    return sum(1 for _ in bigrams)


def number_of_character_3_grams(text):
    """Return how many character 3-grams ``ngrams`` yields for ``text``.

    Texts shorter than two characters short-circuit to 0.
    """
    if len(text) < 2:
        return 0
    trigrams = ngrams(list(text), 3)
    return sum(1 for _ in trigrams)


# https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index
def clindex(text):
    """Compute the Coleman-Liau index of ``text``.

    Letters are the ASCII letters a-z, counted after lower-casing the text.
    Sentences come from ``sent_tokenize`` and words from ``number_of_words``.

    Returns ``0.0588 * L - 0.296 * S - 15.8``, where L is letters per 100
    words and S is sentences per 100 words. When no words are found, L and S
    both stay at 0, so only the constant term remains.
    """
    letter_count = sum(1 for ch in text.lower() if ch in ascii_lowercase)
    sentence_count = len(sent_tokenize(text))
    word_count = number_of_words(text)

    # TODO should these stay 0 when the text has no words?
    letters_per_100_words = 0
    sentences_per_100_words = 0
    if word_count != 0:
        letters_per_100_words = letter_count / word_count * 100
        sentences_per_100_words = sentence_count / word_count * 100

    return 0.0588 * letters_per_100_words - 0.296 * sentences_per_100_words - 15.8


# https://stackoverflow.com/questions/10674832/count-verbs-nouns-and-other-parts-of-speech-with-pythons-nltk
def formality_measure(text):
    """Score ``text`` from the part-of-speech tags of its lower-cased tokens.

    The result is ``(added - subtracted + 100) / 2``, where ``added`` is the
    number of tokens tagged with one of ``added_tags`` and ``subtracted`` the
    number tagged with one of ``subtracted_tags``.
    """
    # NOTE(review): raw tag counts are used here; confirm whether the
    # intended measure expects percentage frequencies instead.
    tokens = nltk.word_tokenize(text.lower())
    tagged_tokens = nltk.pos_tag(nltk.Text(tokens))
    tag_counts = Counter(tag for _, tag in tagged_tokens)

    added_tags = (
        'NN', 'NNP', 'NNS',
        'JJ', 'JJR', 'JJS',
        'IN',
        'DT', 'PDT', 'WDT',
    )
    subtracted_tags = (
        'PRP', 'PRP$', 'WP', 'WP$',
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
        'RB', 'RBR', 'RBS', 'WRB',
        'UH',
    )

    added = sum(tag_counts[tag] for tag in added_tags)
    subtracted = sum(tag_counts[tag] for tag in subtracted_tags)
    return (added - subtracted + 100) / 2


def is_exclamation_question_mark_present(text):
    """Return 1 if ``text`` contains '!' or '?', otherwise 0."""
    if '!' in text or '?' in text:
        return 1
    return 0


def lix(text):
    """Compute the LIX readability index of ``text``.

    LIX = words / sentences + 100 * long_words / words, where a long word
    has more than six characters (the same ``len >= 7`` threshold that
    ``rix`` uses). The previous implementation returned only the first term
    (average sentence length), which is not on the LIX scale.

    Returns 0 when the text yields no sentences or no words.
    """
    sentence_count = len(sent_tokenize(text))
    words = word_tokenize(text)
    # TODO should we return 0?
    if sentence_count == 0 or not words:
        return 0
    word_count = len(words)
    long_word_count = sum(1 for word in words if len(word) >= 7)
    return word_count / sentence_count + 100 * long_word_count / word_count


def number_of_uppercase_words(text):
    """Count the tokens of ``text`` whose first character is an ASCII capital.

    Only the first character of each ``word_tokenize`` token is inspected.
    """
    return sum(1 for token in word_tokenize(text) if token[0] in ascii_uppercase)


def rix(text):
    """Return the number of long tokens (7+ characters) per sentence.

    Tokens come from ``word_tokenize`` and sentences from ``sent_tokenize``.
    Returns 0 when no sentence is found.
    """
    long_word_count = sum(1 for token in word_tokenize(text) if len(token) >= 7)
    sentence_count = len(sent_tokenize(text))
    # TODO should we return 0?
    if sentence_count == 0:
        return 0
    return long_word_count / sentence_count


def number_of_word_1_grams(text):
    """Return how many word 1-grams ``ngrams`` yields for the tokens of ``text``."""
    tokens = word_tokenize(text)
    return sum(1 for _ in ngrams(tokens, 1))


# (column name, feature function) pairs; the order fixes the column order.
feature_columns = (
    ('number_of_character_1_grams', number_of_character_1_grams),
    ('number_of_character_2_grams', number_of_character_2_grams),
    ('number_of_character_3_grams', number_of_character_3_grams),
    ('clindex', clindex),
    ('formality_measure', formality_measure),
    ('is_exclamation_question_mark_present', is_exclamation_question_mark_present),
    ('lix', lix),
    ('number_of_uppercase_words', number_of_uppercase_words),
    ('number_of_words', number_of_words),
    ('rix', rix),
    ('number_of_word_1_grams', number_of_word_1_grams),
)

# Compute each feature in one pass over the post texts and assign the whole
# column at once. This avoids object-dtype columns initialised with None,
# repeated df.loc[i][...] row lookups, and label-based cell writes that
# misbehave when an id occurs more than once in the index.
# df.get falls back to an empty sequence so an empty frame still works.
post_texts = df.get('post_text', ())
for column_name, compute_feature in feature_columns:
    df[column_name] = [compute_feature(post_text) for post_text in post_texts]

# print(df)
print('finished')

# Normalize features

* TODO get features in range of [0,1] or [-1,1]?
* TODO convert lix to five levels (0-4) based on this: very easy (0-24), easy (25-34), standard (35-44), difficult (45-54) and very difficult (55 and above)
* TODO convert rix to thirteen levels (0-12) using these twelve cut-off values: 0.2, 0.5, 0.8, 1.3, 1.8, 2.4, 3.0, 3.7, 4.5, 5.3, 6.2, 7.2

# Create train and test sets

In [None]:
def create_train_test_sets(df):
    """Split ``df`` into a leading train part and a trailing test part.

    The last 20% of the rows (rounded down) become the test set. Rows are
    taken in their existing order; nothing is shuffled.

    Returns:
        ``(train_set, test_set)``. ``train_set`` keeps every column;
        ``test_set`` has its 'is_clickbait' column removed.
    """
    test_ratio = 0.2
    test_set_size = int(len(df) * test_ratio)
    split_at = len(df) - test_set_size

    train_set = df.iloc[:split_at]
    # Pass axis by keyword: DataFrame.drop no longer accepts it positionally
    # in pandas 2.0+.
    test_set = df.iloc[split_at:].drop('is_clickbait', axis=1)

    return train_set, test_set

# Split the feature frame into train and test sets.
train_set, test_set = create_train_test_sets(df)

print('finished')

# Score the models

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoLars
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

# TODO try logistic regression, naive bayes, random forest

# Prefix of the CSV file name that score_models appends its results to.
MODEL_SCORES_FILENAME_PREPEND = 'model_scores_'

# Names of models that raised during scoring. score_models adds an entry on
# failure and skips any model already listed here on later calls.
failures_dict = {}


def mae(y_test, predictions):
    """Return the mean absolute error between ``y_test`` and ``predictions``."""
    return mean_absolute_error(y_test, predictions)


def mse(y_test, predictions):
    """Return the square root of the mean squared error.

    Despite the name, the value returned is the RMSE, not the MSE.
    """
    return np.sqrt(mean_squared_error(y_test, predictions))


def _candidate_models(max_features_one, max_features_two):
    """Return ``(name, factory)`` pairs in the order the models are scored.

    Each factory is a zero-argument callable, so a model is only constructed
    when it is actually about to be fitted.
    """
    def random_forest_grid_search():
        param_grid = [
            # first grid: three n_estimators values x every max_features_one value
            {'n_estimators': [3, 10, 30], 'max_features': max_features_one},
            # second grid: bootstrap off, two n_estimators values x every
            # max_features_two value
            {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': max_features_two},
        ]
        # every combination is trained across 5 folds
        return GridSearchCV(RandomForestRegressor(), param_grid, cv=5,
                            scoring='neg_mean_squared_error')

    return [
        ('linear_regression', LinearRegression),
        # Currently disabled:
        # ('svr_linear', lambda: SVR(kernel='linear', C=1e3)),
        # ('svr_polynomial', lambda: SVR(kernel='poly', C=1e3, degree=2)),
        ('svr_rbf', lambda: SVR(kernel='rbf', C=1e3, gamma=0.1)),
        ('svr', SVR),
        ('ridge', lambda: Ridge(alpha=.5)),
        ('ridge_cv', lambda: RidgeCV(alphas=[0.1, 1.0, 10.0])),
        ('lasso', lambda: Lasso(alpha=0.1)),
        ('bayesian_ridge', BayesianRidge),
        ('perceptron', Perceptron),
        ('lasso_lars', lambda: LassoLars(alpha=.1)),
        ('grid_search_cv', random_forest_grid_search),
    ]


def score_models(predictors_prepared, labels, X_test_prepared, y_test, max_features_one, max_features_two):
    """Fit every candidate model, score it on the test data and log the scores.

    A model whose name is already in ``failures_dict`` is skipped. A model
    that raises while being built, fitted or scored is recorded in
    ``failures_dict`` and the remaining models still run.

    Args:
        predictors_prepared: training features passed to each model's ``fit``.
        labels: training targets passed to each model's ``fit``.
        X_test_prepared: test features passed to each model's ``predict``.
        y_test: true test targets the predictions are compared against.
        max_features_one: ``max_features`` candidates for the first
            random-forest grid.
        max_features_two: ``max_features`` candidates for the second
            random-forest grid (bootstrap disabled).

    Side effects:
        Appends a header row plus one ``name, mse, mae`` row per successful
        model to ``MODEL_SCORES_FILENAME_PREPEND + '.csv'``. The ``mse``
        column holds whatever ``mse()`` returns, which is the root of the
        mean squared error.
    """
    # csv is not imported in any of the notebook's import cells; importing
    # it here keeps the CSV write below from failing with NameError.
    import csv

    print('scoring models')

    scores = []  # (name, mse() value, mae() value) for each model that succeeded

    for name, make_model in _candidate_models(max_features_one, max_features_two):
        if name in failures_dict:
            print('SKIPPING ' + name + ', failed before')
            continue
        try:
            model = make_model()
            model.fit(predictors_prepared, labels)
            # For the grid search, predict with the best estimator it found.
            if isinstance(model, GridSearchCV):
                model = model.best_estimator_
            predictions = model.predict(X_test_prepared)
            mse_value = mse(y_test, predictions)
            mae_value = mae(y_test, predictions)
        except Exception as error:
            # Best effort: one failing model must not abort the rest.
            # Catching Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit propagate.
            print(name + ' failed')
            print('  reason: ' + repr(error))
            failures_dict[name] = 1
        else:
            scores.append((name, mse_value, mae_value))
            print('finished ' + name)

    with open(MODEL_SCORES_FILENAME_PREPEND + '.csv', 'a', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['name', 'mse', 'mae'])
        writer.writerows(scores)

print('finished')

# Create DataFrameSelector

Scikit-Learn doesn't handle DataFrames yet

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks columns from a DataFrame and returns their values.

    Args:
        attribute_names: the column labels to select in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X[self.attribute_names].values``."""
        return X[self.attribute_names].values


# Cell-completion marker. It used to sit inside the class body, where it
# ran as part of the class definition.
print('finished')

# Set up pipeline

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


def get_pipeline(predictors):
    """Build the preprocessing pipeline for ``predictors``.

    The pipeline selects every column obtained by iterating ``predictors``
    and then fills missing values with the per-column median.
    """
    column_names = list(predictors)
    steps = [
        ('selector', DataFrameSelector(column_names)),
        ('imputer', SimpleImputer(strategy='median')),
    ]
    return Pipeline(steps)

print('finished')

In [None]:
import os

print('starting')

MODEL_FAILURES_FILENAME = 'model_failures.csv'

# if os.path.exists(MODEL_SCORES_FILENAME):
#     os.remove(MODEL_SCORES_FILENAME)
    
# Start each run without a stale failures file.
if os.path.exists(MODEL_FAILURES_FILENAME):
    os.remove(MODEL_FAILURES_FILENAME)

print('deleted?')
    
# separate predictors and labels
# The raw 'post_text' strings are dropped together with the label: the
# pipeline's median imputer only accepts numeric data, so leaving the text
# column in makes fit_transform raise.
predictors = train_set.drop(['is_clickbait', 'post_text'], axis=1)
labels = train_set.loc[:, 'is_clickbait']

print('separated predictors and labels')

# prepare predictors
pipeline = get_pipeline(predictors)
print('got pipeline')
predictors_prepared = pipeline.fit_transform(predictors)

print('done')

# NOTE(review): before re-enabling the code below, check that X_test is
# defined (it is not in the cells visible here), that test_set still has an
# 'is_clickbait' column (create_train_test_sets drops it), and that csv is
# imported.

# print('prepared predictors')

# # prepare X_test  
# X_test_prepared = pipeline.fit_transform(X_test)
# y_test = test_set.loc[:, 'is_clickbait']

# print('prepared X_test')

# max_features_one = [2, 4, 6]
# max_features_two = [2, 3, 4]
    
# score_models(predictors_prepared, labels, X_test_prepared, y_test, max_features_one, max_features_two)
    
# with open(MODEL_FAILURES_FILENAME, 'w+', newline='') as csvfile:
#         writer = csv.writer(csvfile, delimiter=',')

#         for key in failures_dict.items():
#             writer.writerow([key])

print('finished')

# Create error functions