In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
news_set = pd.read_csv('irishtimes-date-text.csv') # Reading in the data
news_set.head()

Unnamed: 0,publish_date,headline_category,headline_text
0,19960102,news,UUP sees possibility of voting Major out
1,19960102,news,Pubs targeted as curbs on smoking are extended
2,19960102,news,Papers reveal secret links with O'Neill cabinet
3,19960102,news,Domestic chaos as Italy takes EU presidency
4,19960102,news,Learning about the star to which we owe life


In [3]:
num = len(news_set['headline_text'])
print(f'There are {num} headline text')

There are 1425460 headline text


In [4]:
duplicates =news_set.duplicated().sum()
print(f'There are {duplicates} in the dataset')

There are 9026 in the dataset


In [5]:
#This drops the duplicates
news_set.drop_duplicates(inplace = True)

In [6]:
# I personally will not be using the dates.
news_set.drop(['publish_date'], axis = 1, inplace = True)
news_set.head()

Unnamed: 0,headline_category,headline_text
0,news,UUP sees possibility of voting Major out
1,news,Pubs targeted as curbs on smoking are extended
2,news,Papers reveal secret links with O'Neill cabinet
3,news,Domestic chaos as Italy takes EU presidency
4,news,Learning about the star to which we owe life


In [7]:
num = len(news_set['headline_text'])
print(f'Now, there are {num} headline text')

Now, there are 1416434 headline text


### Cleaning the data

In [8]:
# Removing all the words containing numbers
import re
cleaned_text = []
for i in news_set['headline_text']:
    hold = re.sub('\w*\d\w*', ' ', i)
    cleaned_text.append(hold)
news_set['cleaned_text'] = cleaned_text
#news_set.head()

In [9]:
# Converting  to lower case
lower_case_text =[]
for i in news_set['cleaned_text']:
    hold = i.lower()
    lower_case_text.append(hold)

news_set['lower_case_text'] =  lower_case_text
#news_set.head()

In [10]:
# Removing Punctuations and replacing them with white spaces.
import string
no_punctuation = []
for i in news_set['lower_case_text']:
    hold = re.sub('[%s]'%re.escape(string.punctuation), '',str(i))
    no_punctuation.append(hold)
    
news_set['no_punctuation'] = no_punctuation
#news_set.head()

In [11]:
# Tokenizing the words
from nltk.tokenize import  word_tokenize

tokenize= []
for i in news_set['no_punctuation']:
    hold = word_tokenize(i)
    tokenize.append(hold)
    
news_set['tokenized_words'] = tokenize
#news_set.head()

In [12]:
#Removing stop words
from nltk.corpus import stopwords

stopwords = stopwords.words('english')

stop_func = lambda x: [word for word in x if word not in stopwords]
news_set['removed_stopwords'] = news_set['tokenized_words'].apply(stop_func)
#news_set.head()

In [13]:
# Stemming the words
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
stemmed = lambda x:[ ps.stem(i) for i in x]

    
news_set['stemmed_words'] =  news_set['removed_stopwords'].apply(stemmed)
#news_set.head()

In [14]:
final_format = []
for i in news_set['stemmed_words']:
    hold = ' '.join(i)
    final_format.append(hold)
    
news_set['final_format'] = final_format

In [15]:
news_set.head()

Unnamed: 0,headline_category,headline_text,cleaned_text,lower_case_text,no_punctuation,tokenized_words,removed_stopwords,stemmed_words,final_format
0,news,UUP sees possibility of voting Major out,UUP sees possibility of voting Major out,uup sees possibility of voting major out,uup sees possibility of voting major out,"[uup, sees, possibility, of, voting, major, out]","[uup, sees, possibility, voting, major]","[uup, see, possibl, vote, major]",uup see possibl vote major
1,news,Pubs targeted as curbs on smoking are extended,Pubs targeted as curbs on smoking are extended,pubs targeted as curbs on smoking are extended,pubs targeted as curbs on smoking are extended,"[pubs, targeted, as, curbs, on, smoking, are, ...","[pubs, targeted, curbs, smoking, extended]","[pub, target, curb, smoke, extend]",pub target curb smoke extend
2,news,Papers reveal secret links with O'Neill cabinet,Papers reveal secret links with O'Neill cabinet,papers reveal secret links with o'neill cabinet,papers reveal secret links with oneill cabinet,"[papers, reveal, secret, links, with, oneill, ...","[papers, reveal, secret, links, oneill, cabinet]","[paper, reveal, secret, link, oneil, cabinet]",paper reveal secret link oneil cabinet
3,news,Domestic chaos as Italy takes EU presidency,Domestic chaos as Italy takes EU presidency,domestic chaos as italy takes eu presidency,domestic chaos as italy takes eu presidency,"[domestic, chaos, as, italy, takes, eu, presid...","[domestic, chaos, italy, takes, eu, presidency]","[domest, chao, itali, take, eu, presid]",domest chao itali take eu presid
4,news,Learning about the star to which we owe life,Learning about the star to which we owe life,learning about the star to which we owe life,learning about the star to which we owe life,"[learning, about, the, star, to, which, we, ow...","[learning, star, owe, life]","[learn, star, owe, life]",learn star owe life


# Using TF-IDF Vectorizer

In [16]:
#vectorizing the words
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer()
x = tf.fit_transform(news_set['final_format'])
y = news_set.iloc[:,0]

In [17]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.20, random_state = 0)

In [18]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [19]:
y_pred_1 = classifier.predict(x_test)
y_pred_1

array(['news', 'news', 'sport.rugby.international', ..., 'sport', 'sport',
       'news'], dtype=object)

In [20]:
from sklearn.metrics import accuracy_score
score_1 = accuracy_score(y_test, y_pred_1)
score_1*100

55.22914923734588

In [21]:
from sklearn.naive_bayes import MultinomialNB 
nb = MultinomialNB()
nb.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [22]:
y_pred_2 = nb.predict(x_test)
y_pred_2

array(['news', 'news', 'sport', ..., 'sport', 'news', 'news'],
      dtype='<U57')

In [23]:
score_2 = accuracy_score(y_test, y_pred_2)
score_2*100

47.65944077913918

# Using Count Vectorizer

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
x_1 = cv.fit_transform(news_set['final_format'])
y = news_set.iloc[:,0]

In [26]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_1,y, test_size = 0.20, random_state = 0)

In [27]:
from sklearn.linear_model import LogisticRegression
classifier_1 = LogisticRegression(random_state = 0)
classifier_1.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
y_pred_3 = classifier_1.predict(x_test)

In [29]:
score_3 = accuracy_score(y_test, y_pred_3)
score_3*100

57.201001104886565

In [30]:
from sklearn.naive_bayes import MultinomialNB 
nb_1 = MultinomialNB()
nb_1.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [31]:
y_pred_4 = nb_1.predict(x_test)

In [32]:
score_4 = accuracy_score(y_test, y_pred_4)
score_4*100

51.83294679953545

# The best classifier I have for the data  is the count vectorizer logistic regression classifer which got a 57.201 percent accuracy. 