In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the headline CSV into a DataFrame and preview the first rows.
DATA_PATH = 'irishtimes-date-text.csv'
news_set = pd.read_csv(DATA_PATH)
news_set.head()

Unnamed: 0,publish_date,headline_category,headline_text
0,19960102,news,UUP sees possibility of voting Major out
1,19960102,news,Pubs targeted as curbs on smoking are extended
2,19960102,news,Papers reveal secret links with O'Neill cabinet
3,19960102,news,Domestic chaos as Italy takes EU presidency
4,19960102,news,Learning about the star to which we owe life


In [7]:
# (number of rows, number of columns) of the loaded frame
news_set.shape

(1416434, 3)

In [8]:
# Count missing values per column.
news_set.isna().sum()


publish_date         0
headline_category    0
headline_text        0
dtype: int64

In [9]:
news_set.duplicated().sum() # number of rows that exactly repeat an earlier row

0

In [10]:
# Discard rows that exactly repeat an earlier row, keeping the first occurrence.
news_set = news_set.drop_duplicates()


In [11]:
# Pull the publish_date column out as a plain array of values.
dates = news_set['publish_date'].values

In [12]:
# Split each date into integer year / month / day by slicing its digits:
# characters 0-3 are the year, 4-5 the month and 6-7 the day.
date_strings = [str(date) for date in dates]
year = [int(digits[0:4]) for digits in date_strings]
month = [int(digits[4:6]) for digits in date_strings]
day = [int(digits[6:8]) for digits in date_strings]
    

In [13]:
# Attach the parsed date parts as three new columns.
news_set = news_set.assign(year=year, month=month, day=day)


In [14]:
# Preview the frame to confirm the year / month / day columns were added.
news_set.head()

Unnamed: 0,publish_date,headline_category,headline_text,year,month,day
0,19960102,news,UUP sees possibility of voting Major out,1996,1,2
1,19960102,news,Pubs targeted as curbs on smoking are extended,1996,1,2
2,19960102,news,Papers reveal secret links with O'Neill cabinet,1996,1,2
3,19960102,news,Domestic chaos as Italy takes EU presidency,1996,1,2
4,19960102,news,Learning about the star to which we owe life,1996,1,2


In [15]:
# publish_date has been split into year / month / day, so remove the original column.
news_set = news_set.drop(columns=['publish_date'])
news_set.head()

Unnamed: 0,headline_category,headline_text,year,month,day
0,news,UUP sees possibility of voting Major out,1996,1,2
1,news,Pubs targeted as curbs on smoking are extended,1996,1,2
2,news,Papers reveal secret links with O'Neill cabinet,1996,1,2
3,news,Domestic chaos as Italy takes EU presidency,1996,1,2
4,news,Learning about the star to which we owe life,1996,1,2


In [16]:
# Total number of category labels: one per headline row.
len(news_set['headline_category'])

1416434

In [17]:
# Number of distinct category labels (missing values, if any, count as one label).
news_set['headline_category'].nunique(dropna=False)

156

In [None]:
# Keep only the top-level category: the text before the first ".".
news_set['headline_category'] = news_set['headline_category'].map(
    lambda category: category.partition(".")[0]
)
news_set.head(100)

In [19]:
# Distinct category labels that contain no "." sub-category separator.
{category for category in news_set['headline_category'] if "." not in category}

{'business', 'culture', 'lifestyle', 'news', 'opinion', 'sport'}

In [24]:
# Reduce every category to its top-level name (the text before the first ".").
# Labels that contain no "." are left unchanged, so re-running this is harmless.
top_level = [category.split(".")[0] for category in news_set['headline_category']]
news_set['headline_category'] = top_level
news_set

Unnamed: 0,headline_category,headline_text,year,month,day
0,news,UUP sees possibility of voting Major out,1996,1,2
1,news,Pubs targeted as curbs on smoking are extended,1996,1,2
2,news,Papers reveal secret links with O'Neill cabinet,1996,1,2
3,news,Domestic chaos as Italy takes EU presidency,1996,1,2
4,news,Learning about the star to which we owe life,1996,1,2
5,news,EU proposal for 'skills cards' to detail educa...,1996,1,2
6,news,Roads authority asked to switch to use of conc...,1996,1,2
7,news,UCD cancels intake into evening course,1996,1,2
8,news,When days are seconds longer,1996,1,2
9,news,John Major's future may lie in unionist hands,1996,1,2


### Cleaning the dataset

In [None]:
import re
import nltk
import string
# Fetch every NLTK resource the cleaning cell relies on, so the notebook runs
# on a fresh environment instead of raising LookupError.
nltk.download('punkt')      # tokenizer models used by word_tokenize
# NOTE(review): recent NLTK releases load tokenizer data from 'punkt_tab'
# rather than 'punkt'; it is fetched as well so word_tokenize works with
# either - confirm against the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')  # corpus behind stopwords.words('english')
nltk.download('wordnet')    # dictionary used by WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


In [None]:
def clean_headline(text, stop_words, lemmatizer):
    """Return a normalised, space-joined version of a single headline.

    Steps, in order: replace punctuation with spaces, lower-case, blank out
    any word that contains a digit, tokenize, drop stopwords, lemmatize the
    remaining tokens.

    Parameters
    ----------
    text : the raw headline; coerced with ``str`` before processing.
    stop_words : container of lower-case words to discard.
    lemmatizer : object exposing ``lemmatize(token)``.

    Returns
    -------
    str : the surviving lemmatized tokens joined by single spaces.
    """
    # NOTE(review): punctuation stripping is restored from the line that had
    # been commented out; it was the only place `clean_text` was ever defined,
    # so without it the cell failed with NameError. Confirm it is wanted.
    no_punctuation = re.sub('[%s]' % re.escape(string.punctuation), ' ', str(text))
    lowered = no_punctuation.lower()
    # Raw string so the regex escapes are not treated as string escapes.
    no_digit_words = re.sub(r'\w*\d\w*', ' ', lowered)
    # Tokenize last so the filter below sees whole words, not characters.
    tokens = word_tokenize(no_digit_words)
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )


# Build the stopword set and the lemmatizer once rather than per token / per row.
stop_words = set(stopwords.words('english'))
lem = WordNetLemmatizer()

# Iterate over the column values directly: label lookups such as
# news_set['headline_text'][i] break as soon as the index has gaps.
new_text = [
    clean_headline(headline, stop_words, lem)
    for headline in news_set['headline_text']
]

# Keep the cleaned text alongside the headlines; the word-cloud cell reads
# it from the `filtered_text` column.
news_set['filtered_text'] = new_text


# End of cleaning.

In [None]:
# using word clouds
from wordcloud import WordCloud

def make_wordcloud(words, title):
    """Draw a word cloud built from `words` and show it under `title`.

    Parameters
    ----------
    words : text passed to ``WordCloud.generate``.
    title : heading drawn above the image.
    """
    cloud_options = dict(
        width=1920,
        height=1080,
        max_font_size=200,
        max_words=300,
        background_color="white",
    )
    cloud = WordCloud(**cloud_options).generate(words)

    fig, ax = plt.subplots(figsize=(20, 20))
    ax.imshow(cloud, interpolation="gaussian")
    ax.axis("off")  # the image needs no ticks or frame
    ax.set_title(title, fontsize=60)
    plt.show()


In [None]:
# Join the cleaned text of every "news" headline into one string and plot it.
# Assumes news_set already has a `filtered_text` column of cleaned headlines.
is_news = news_set['headline_category'] == "news"
all_text = " ".join(news_set.loc[is_news, 'filtered_text'])
make_wordcloud(all_text, "News")


In [None]:
# Integer class label for each top-level headline category.
CATEGORY_LABELS = {
    'news': 0,
    'culture': 1,
    'opinion': 2,
    'business': 3,
    'sport': 4,
    'lifestyle': 5,
}

# map() with an explicit dict gives an integer column directly. The previous
# replace() call on a string column relied on pandas' implicit downcasting,
# which is deprecated, and silently left any unlisted category as a string.
labels = news_set['headline_category'].map(CATEGORY_LABELS)
unlabelled = labels.isna()
if unlabelled.any():
    unknown = sorted(set(news_set.loc[unlabelled, 'headline_category'].astype(str)))
    raise ValueError(f"headline_category has values with no class label: {unknown}")
news_set['classification'] = labels.astype('int64')
