## ML Challenge WS 2022/23

#### Task:

Your task is to train a clickbait filter that classifies articles as clickbait or not clickbait based on their headlines.

#### Dataset:

The data consists of two files: a text file with clickbait headlines and a text file with headlines from news sources. The hold-out dataset is organized the same way.

In [None]:
#Loading the clickbait data and assigning class 1

import pandas as pd
import numpy as np

# Read the clickbait headlines: one headline per line, trailing whitespace stripped.
with open('clickbait_yes') as fyes:
    linesyes = list(map(str.rstrip, fyes))

# One-column frame of headlines, with every row labelled as clickbait (class 1).
dfyes = pd.DataFrame(linesyes, columns=['headlines']).assign(**{'class': 1})
dfyes

#### Questions?

[kuglerk@uni-trier.de](mailto:kuglerk@uni-trier.de?subject=ML%20Challenge%20NLU)

In [None]:
# Load the non-clickbait (news) headlines and label every row as class 0.

with open('clickbait_no') as fno:
    linesno = list(map(str.rstrip, fno))

dfno = pd.DataFrame(linesno, columns=['headlines']).assign(**{'class': 0})
dfno

In [None]:
# Load the hold-out (test) headlines, one headline per line.
# NOTE(review): the file is named .csv but is read as plain text lines, so a header
# row (if the file has one) would end up as a headline -- confirm against the data.
with open('clickbait_hold_X.csv') as ftest:
    # Dedicated name: reusing `linesno` here would overwrite the non-clickbait lines.
    lines_test = [line.rstrip() for line in ftest]

X_test = pd.DataFrame(lines_test, columns=['headlines'])

X_test

In [None]:
# Combine the clickbait and non-clickbait frames into one labelled dataset.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0. ignore_index=True gives the combined frame a unique
# 0..n-1 index instead of repeating the row labels of the two source frames.
df = pd.concat([dfyes, dfno], ignore_index=True)

df

In [None]:
# Clean a text column in place: strip HTML-like tags, lower-case, expand English
# "n't" contractions and replace punctuation (except '?') with spaces.
# Stop words are NOT removed here.
def preprocess(df, col, lang):
    """Normalise the text in ``df[col]`` in place and return ``df``.

    Steps, in order:
      1. remove anything that looks like an HTML/XML tag (``<...>``);
      2. lower-case the text;
      3. if ``lang == 'en'``, expand negative contractions ("don't" -> "do not");
         the irregular forms "can't", "won't" and "shan't" are handled explicitly
         so they do not degrade to "ca not" / "wo not" / "sha not";
      4. pad selected punctuation with spaces, then replace every character that
         is not a word character, whitespace or '?' with a space;
      5. replace newlines (and a few leftover separators) with spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; it is modified in place.
    col : str
        Name of the string column to clean.
    lang : str
        Language code; only ``'en'`` triggers the contraction expansion.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    text = df[col].str.replace(r'<[^<>]*>', '', regex=True)
    text = text.str.lower()

    if lang == 'en':
        # Typographic apostrophes would otherwise never match the rules below.
        text = text.str.replace('\u2019', "'", regex=False)
        # Irregular contractions first: the generic rule only strips "n't".
        text = text.str.replace(r"\bcan't\b", "can not", regex=True)
        text = text.str.replace(r"\bwon't\b", "will not", regex=True)
        text = text.str.replace(r"\bshan't\b", "shall not", regex=True)
        text = text.str.replace(r"n\'t", " not", regex=True)
        text = text.str.replace(r"\'t", " not", regex=True)

    # Surround punctuation with spaces so that a kept '?' becomes its own token.
    text = text.str.replace(r'([\'\"\.\(\)\!\?\\/\,])', r' \1 ', regex=True)
    # Everything except word characters, whitespace and '?' becomes a space.
    text = text.str.replace(r'[^\w\s\?]', ' ', regex=True)
    # After the previous step this effectively only turns newlines into spaces.
    text = text.str.replace(r'([\;\:\|•«\n])', ' ', regex=True)

    # Callers rely on the frame being updated in place.
    df[col] = text
    return df

# Clean the combined headlines; preprocess edits `df` in place and hands the same frame back.
df = preprocess(df, 'headlines', 'en')
df

In [None]:
# Apply the same cleaning to the hold-out headlines (the frame is updated in place).
X_test = preprocess(X_test, 'headlines', 'en')
X_test

In [None]:
# Separate the label series y from the feature frame X (every column except 'class').
y = df['class']
X = df.drop('class', axis=1).copy()

X

In [None]:
# Display the label series (1 = clickbait, 0 = not clickbait).
y

In [None]:
import nltk
# Fetch the NLTK 'stopwords' resource that stopwords.words('english') reads in the next cell.
nltk.download('stopwords')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from scipy import sparse

# English stop-word list from NLTK, handed to the vectorizer below.
stopwords_list = stopwords.words('english')

# TF-IDF vectorization (unigrams and bigrams) for the labelled and hold-out headlines.
# The vectorizer is fitted on the labelled headlines only and then applied unchanged
# to the hold-out headlines.
# NOTE(review): the fit uses every labelled row, including the rows that are split
# off for validation afterwards, so the validation score may be slightly optimistic.
tfidf = TfidfVectorizer(stop_words = stopwords_list,ngram_range = (1, 2))
tfidf_headlines = tfidf.fit_transform(X['headlines'])
tfidf_headlines_test = tfidf.transform(X_test['headlines'])

# Whatever columns remain besides 'headlines' are carried along as extra features.
X_ef = X.drop(columns='headlines')
X_test_ef = X_test.drop(columns='headlines')

# Stack the extra-feature columns next to the TF-IDF matrix and convert to CSR.
# NOTE: X and X_test are rebound from DataFrames to sparse matrices here, so this
# cell cannot be re-run without first re-running the cells that build the DataFrames.
X = sparse.hstack([X_ef, tfidf_headlines]).tocsr()
X_test = sparse.hstack([X_test_ef, tfidf_headlines_test]).tocsr()

In [None]:
from sklearn.model_selection import train_test_split

# Split the labelled data into train and validation sets.
# train_size is the fraction of rows used for training; it is passed to the call
# so that changing it here actually takes effect. random_state fixes the shuffle.
train_size = 0.8
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=train_size, random_state=10)

In [None]:
# Group the validation labels by their own value; .groups maps each class to the
# index labels of the validation rows that carry it.
g = y_valid.groupby(y_valid)
g.groups

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

In [None]:
# Train a multinomial naive Bayes classifier (smoothing alpha = 0.05) on the training split.
mn = MultinomialNB(alpha=0.05).fit(X_train, y_train)

# Predict labels for the validation split.
validPredict = mn.predict(X_valid)

validPredict.shape

In [None]:
from sklearn.metrics import f1_score

# F1 score of the validation predictions against the true validation labels.
f1_score(y_true=y_valid, y_pred=validPredict)

In [None]:
# Predict labels for the hold-out set with the fitted classifier.
testPredict = mn.predict(X_test)

# Shape of the prediction array.
testPredict.shape

In [None]:
# Display the predicted labels for the hold-out data.
testPredict

In [None]:
# Write the hold-out predictions to predictions.txt, formatted as integers ('%d').
np.savetxt(r'predictions.txt', testPredict, fmt = '%d')