In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import urllib.request
from nltk.corpus import stopwords
import re
import numpy as np
from collections import Counter
from sklearn import preprocessing
import string

from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB


[nltk_data] Downloading package stopwords to /Users/iris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/iris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# load the data
# Source URLs of the two plain-text corpora (non-clickbait and clickbait
# headlines); they are downloaded with get_data() below.
non_clickbait_url = "http://www.cs.columbia.edu/~sarahita/CL/non_clickbait_data.txt"
clickbait_url = "http://www.cs.columbia.edu/~sarahita/CL/clickbait_data.txt"

def get_data(url):
  """Download the resource at ``url`` and return its body as text.

  Args:
    url: Location of a UTF-8 encoded text file; anything accepted by
      ``urllib.request.urlopen``.

  Returns:
    The complete response body decoded as UTF-8.

  Raises:
    urllib.error.URLError: If the resource cannot be opened.
    UnicodeDecodeError: If the body is not valid UTF-8.
  """
  # The context manager closes the connection even when read() or decode()
  # raises, instead of leaving the open response to the garbage collector.
  with urllib.request.urlopen(url) as response:
    return response.read().decode('utf-8')

# Download both corpora; each variable holds an entire file as one string.
non_clickbait_data = get_data(non_clickbait_url)
clickbait_data = get_data(clickbait_url)

In [3]:
# Split each corpus into one headline per line. Trailing newlines are stripped
# first so the split does not produce an empty final entry. The combined list
# holds the non-clickbait headlines first, followed by the clickbait ones.
non_clickbait_headlines = non_clickbait_data.rstrip('\n').split('\n')
clickbait_headlines = clickbait_data.rstrip('\n').split('\n')
all_headlines = [*non_clickbait_headlines, *clickbait_headlines]

In [4]:
# Build one label per headline: 0 for non-clickbait, 1 for clickbait.
# The non-clickbait labels come first, followed by the clickbait labels.
non_cb_labels = [0 for _ in non_clickbait_headlines]
cb_labels = [1 for _ in clickbait_headlines]
all_labels = [*non_cb_labels, *cb_labels]

In [5]:
# for i in range(len(all_headlines)):
#   return word_tokenize(all_headlines[i])

In [6]:
all_headlines[0]

'Bill Changing Credit Card Rules Is Sent to Obama With Gun Measure Included'

In [7]:
# extract features: bag of stop words
def stop_words(texts):
    """Build a bag-of-stop-words count matrix for a collection of texts.

    Each text is lower-cased and tokenized with ``nltk.word_tokenize``; the
    occurrences of every entry in NLTK's English stop-word list are counted.

    Args:
        texts: Iterable of strings (e.g. headlines).

    Returns:
        A float numpy array of shape ``(number of texts, number of stop words)``
        in which row ``i``, column ``j`` is the number of tokens of text ``i``
        equal to stop word ``j``. Column order follows
        ``stopwords.words('english')``.
    """
    eng_stopwords = stopwords.words('english')
    bow = []
    for text in texts:
        # Tally all tokens once; looking each stop word up afterwards avoids
        # rescanning the token list for every stop word.
        token_counts = Counter(nltk.word_tokenize(text.lower()))
        bow.append([token_counts[sw] for sw in eng_stopwords])
    # The explicit reshape keeps the result two-dimensional when `texts` is
    # empty; np.array([]) on its own would have shape (0,).
    return np.array(bow, dtype=float).reshape(len(bow), len(eng_stopwords))

In [8]:
# extract features: one row of stop-word counts per headline
stop_words_features = stop_words(all_headlines)

In [9]:
stop_words_features.shape
# shape is (number of headlines, number of stop words)

(31998, 179)

In [8]:
#stop_words_features[-2]
print(len(stopwords.words('english')))

# The NLTK English stop-word list has 179 entries.
# Each headline's feature vector has one slot per stop word; a slot holds the number of times that word occurs in the headline.

179


In [11]:
# Feature matrix and label vector for the classifier.
X = stop_words_features
Y = np.array(all_labels)  # target variable to predict

# Evaluate a multinomial Naive Bayes classifier with 10-fold cross-validation,
# then print the per-fold accuracies followed by their mean.
scores = cross_val_score(
    estimator=MultinomialNB(),
    X=X,
    y=Y,
    scoring='accuracy',
    cv=10,
)
print(scores)
print(scores.mean())

[0.88       0.8790625  0.866875   0.878125   0.8815625  0.8821875
 0.8834375  0.87125    0.8380744  0.87496093]
0.8735535323538606


In [12]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/iris/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [13]:
# # extract features: bag of stop words
# def my_pos_tags(texts):
#         bow = []
#         my_pos_tags_list = ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS','CC','PRP','VB','VBG']
#         for text in texts:      
#                 counts = []
#                 tokens = pos_tag(word_tokenize(text))
#                 #print(tokens[0][1])
#                 for pos in my_pos_tags_list:
#                         #if my_pos_tag
#                         pos_count = tokens.count(pos)
#                         counts.append(pos_count)
#                 bow.append(counts)
#         pos_bow_np = np.array(bow).astype(float)
#         return pos_bow_np


In [14]:
sample_train = all_headlines[2:10]

In [56]:
# extract features: bag of part-of-speech tags
def my_pos_tags(texts):
    """Count selected part-of-speech tags in each text.

    Each text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    for every tag in the fixed list below, the number of tokens carrying
    exactly that tag is recorded.

    Args:
        texts: Iterable of strings (e.g. headlines).

    Returns:
        A float numpy array of shape ``(number of texts, 10)``; column order
        is NN, NNP, DT, IN, JJ, NNS, CC, PRP, VB, VBG.
    """
    my_pos_tags_list = ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS', 'CC', 'PRP', 'VB', 'VBG']
    bow = []
    for text in texts:
        # pos_tag yields (token, tag) pairs. Tally the tags of ALL tokens and
        # match them exactly: inspecting only tokens[0][1] with str.count looked
        # at a single token, treated e.g. 'NNP' as containing 'NN', and raised
        # IndexError for a text without tokens.
        tag_counts = Counter(tag for _token, tag in pos_tag(word_tokenize(text)))
        bow.append([tag_counts[pos] for pos in my_pos_tags_list])
    # The reshape keeps the result two-dimensional even when `texts` is empty.
    return np.array(bow, dtype=float).reshape(len(bow), len(my_pos_tags_list))

In [268]:
def my_pos(texts):
    """Turn each text into a vector of part-of-speech tag counts.

    A text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``.
    For each tag in the fixed list below, the number of tokens carrying
    exactly that tag is counted.

    Args:
        texts: Iterable of strings (e.g. headlines).

    Returns:
        A float numpy array with one row per text and one column per tag, in
        the order NN, NNP, DT, IN, JJ, NNS, CC, PRP, VB, VBG.
    """
    tracked_tags = ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS', 'CC', 'PRP', 'VB', 'VBG']

    def tag_histogram(text):
        # pos_tag returns (token, tag) pairs; only the tag part is needed.
        tags_in_text = [pair[1] for pair in pos_tag(word_tokenize(text))]
        return [tags_in_text.count(wanted) for wanted in tracked_tags]

    rows = [tag_histogram(text) for text in texts]
    return np.array(rows).astype(float)

# AttributeError: 'list' object has no attribute 'isdigit'
# Cause: .isdigit() was called on the list itself; it should be called on each string inside the list.
# e.g. 'abc'.isdigit()   # fine
#      [].isdigit()      # raises AttributeError


In [280]:
# extract features: one row of part-of-speech tag counts per headline
pos_feature = my_pos(all_headlines)

In [281]:
pos_feature.shape

(31998, 10)

In [282]:
# Feature matrix (part-of-speech tag counts) and label vector for the classifier.
X_sync = pos_feature
Y_sync = np.array(all_labels)  # target variable to predict

# Evaluate a multinomial Naive Bayes classifier with 10-fold cross-validation,
# then print the per-fold accuracies followed by their mean.
pos_scores = cross_val_score(
    estimator=MultinomialNB(),
    X=X_sync,
    y=Y_sync,
    scoring='accuracy',
    cv=10,
)
print(pos_scores)
print(pos_scores.mean())

[0.763125   0.760625   0.748125   0.758125   0.76       0.7740625
 0.750625   0.7546875  0.7561738  0.75273523]
0.7578284034073148


In [None]:
#https://kavita-ganesan.com/how-to-use-countvectorizer/
def my_lexicon(texts):
    """Return part-of-speech tag counts for each text (same result as ``my_pos``).

    NOTE(review): the body of this function was a line-for-line copy of
    ``my_pos``, so it delegates to ``my_pos`` instead of duplicating the code.
    The function name and the CountVectorizer link above suggest that a
    lexical (word-based) feature extractor was intended here -- TODO confirm
    and implement.

    Args:
        texts: Iterable of strings (e.g. headlines).

    Returns:
        Whatever ``my_pos(texts)`` returns: a float numpy array with one row
        per text and one column per tracked part-of-speech tag.
    """
    return my_pos(texts)

# AttributeError: 'list' object has no attribute 'isdigit'
# Cause: .isdigit() was called on the list itself; it should be called on each string inside the list.
# e.g. 'abc'.isdigit()   # fine
#      [].isdigit()      # raises AttributeError
