### Data Cleaning

In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import seaborn as sb
from collections import Counter
import tqdm
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import unicodedata
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Load the Express articles (tribune.com.pk URLs) from express.csv and preview the first rows.
express = pd.read_csv('express.csv')
express.head()

Unnamed: 0,URL,Headline,Date1,Date2,Content,Author
0,https://tribune.com.pk/story/1813087/6-asad-um...,Asad Umar checks Pakistan into a financial reh...,2018-09-28T03:58:09 GMT,"Published: September 28, 2018",One cannot expect a drug addict to find a job ...,M Bilal Lakhani
1,https://tribune.com.pk/story/1755479/3-sarah-p...,Sarah Palin tears into 'sick' Baron Cohen over...,2018-07-11T18:37:45 GMT,"Published: July 11, 2018",Ex-Alaska govern­or says she has fallen victim...,AFP .
2,https://tribune.com.pk/story/1693045/4-anwar-m...,Anwar Maqsood’s offensive satire on Sindhis ca...,2018-04-23T09:46:46 GMT,"Published: April 23, 2018",‘Aik Sindhi Ka Interv­iew’ takes dig at their ...,Rahul Aijaz
3,https://tribune.com.pk/story/1632147/1-making-...,Making sense of the times: Is reality stranger...,2018-02-12T04:27:29 GMT,"Published: February 12, 2018",In a world where realit­y compet­es with satir...,Niha Dagia
4,https://tribune.com.pk/story/1534757/russian-p...,Russian politicians fail to see funny side of ...,2017-10-18T11:59:59 GMT,"Published: October 18, 2017",A commun­ist politi­can brande­d it 'abomin­ab...,Reuters


In [3]:
# Load the Dawn articles (dawn.com URLs) from data.csv and preview the first rows.
dawn = pd.read_csv('data.csv')
dawn.head()

Unnamed: 0,URL,Headline,Date,Content,Author
0,https://www.dawn.com/news/1458127/mbbs-student...,MBBS student found dead in hostel room at Jams...,2019-01-17,A third-year MBBS student was found shot dead ...,Mohammad Hussain Khan
1,https://www.dawn.com/news/1458129/govt-decides...,"Govt decides to remove names of Bilawal, Murad...",2019-01-17,The federal cabinet has decided to remove the ...,Sanaullah Khan | Javed Hussain
2,https://www.dawn.com/news/1458125/afghan-presi...,Afghan president thanks PM Khan for Pakistan's...,2019-01-17,Afghan President Ashraf Ghani on Thursday tele...,Sanaullah Khan
3,https://www.dawn.com/news/1458123/mehwish-haya...,Mehwish Hayat acts out her musical dreams in n...,2019-01-17,Director and script writer Wajahat Rauf unveil...,
4,https://www.dawn.com/news/1458118/complete-tex...,Complete text of address by Justice Asif Saeed...,2019-01-17,Complete address by Justice Asif Saeed Khosa o...,Dawn.com


In [4]:
# Convert dates to datetime objects.
# Express timestamps carry a "GMT" suffix (e.g. "2018-09-28T03:58:09 GMT"). Depending on the
# pandas version this parses as timezone-aware, which would not mix with Dawn's timezone-naive
# dates once the two frames are concatenated. Parse explicitly as UTC and then drop the
# timezone so both sources end up as naive datetimes with the same wall-clock values.
express['Date'] = pd.to_datetime(express['Date1'], utc=True).dt.tz_localize(None)
dawn['Date'] = pd.to_datetime(dawn['Date'])

In [5]:
# Drop the raw Express date columns; the parsed 'Date' column replaces them.
express = express.drop(columns=['Date1', 'Date2'])

In [6]:
# Replace "None" and "Dawn.com" bylines with "dawn", since they carry no author name.
# The dot is escaped so the pattern matches the literal text "Dawn.com" only (an unescaped
# "." would match any character). Matching is still substring-based, so a combined byline
# that contains "Dawn.com" has just that part rewritten.
dawn['Author'] = dawn['Author'].replace(to_replace=r'None', value='dawn', regex=True)
dawn['Author'] = dawn['Author'].replace(to_replace=r'Dawn\.com', value='dawn', regex=True)

The following 5 author categories carry no author name, so we replace them with the name of the newspaper; this makes it easier to identify the source of these articles later.
  - The Newspaper's Staff Reporter
  - A Correspondent
  - The Newspaper's Correspondent
  - The Newspaper's Staff Correspondent
  - Editorial

In [7]:
# List every distinct author byline, then how many distinct bylines there are.
print(dawn['Author'].unique())
print('Number of Categories',len(dawn['Author'].unique()))

['Mohammad Hussain Khan' 'Sanaullah Khan | Javed Hussain' 'Sanaullah Khan'
 'dawn' 'dawn | Haseeb Bhatti' 'Rimmel Mohydin' 'Jamil Nagri'
 'Bureau Report' 'Kashif Abbasi' 'Ikram Junaidi'
 "The Newspaper's Staff Reporter" 'Naeem Sahoutara' 'Habib Khan Ghori'
 'Hasan Mansoor' 'Behram Baloch' 'Mohammad Asghar' 'Imran Ayub'
 'Syed Ali Shah' 'Haseeb Bhatti' 'Ali Akbar | AP' 'Rana Bilal'
 'Sara Malkani' 'Editorial' 'Munawer Azeem' 'A Correspondent'
 "The Newspaper's Correspondent" 'Zulfiqar Ali' 'Intikhab Hanif'
 'Tahir Siddiqui' 'Ishaq Tanoli' 'Faiza Ilyas' 'Tariq Naqash' 'Imtiaz Ali'
 'Tahir Naseer' 'Wajiha Khanain' 'Abdul Ghaffar' 'AP'
 'Amir Wasim | Javed Hussain | Nadir Guramani' 'Jawaid Bokhari'
 'Dr Niaz Murtaza' 'Amjad Iqbal' 'Hamid Asghar' 'APP'
 "The Newspaper's Staff Correspondent" 'Zulqernain Tahir'
 'Ghulam Hussain Khawaja' 'Dawn Report' 'Syed Irfan Raza'
 'Monitoring Desk' 'Malik Asad' 'Syed Ali Shah | APP' 'dawn | Amir Wasim'
 'Javed Hussain' 'Shakeel Qarar' 'Umair Javed' 'Nadi

In [8]:
# Fold the five anonymous staff/editorial bylines into the newspaper name in a single
# replace() call; each pattern is applied as a regex, exactly as before.
dawn['Author'] = dawn['Author'].replace(
    to_replace=[
        r'The Newspaper\'s Staff Reporter',
        r'A Correspondent',
        r'The Newspaper\'s Correspondent',
        r'Editorial',
        r'The Newspaper\'s Staff Correspondent',
    ],
    value='dawn',
    regex=True,
)

In [9]:
# Re-count the distinct bylines after folding the anonymous categories into 'dawn'.
print('Number of Categories are:',len(dawn['Author'].unique()))

Number of Categories are: 101


In [10]:
# Stack the Dawn rows on top of the Express rows with a fresh 0..n-1 index (ignore_index=True).
# The trailing reset_index() then copies that index into a regular column named 'index'.
data = pd.concat([dawn,express],axis=0,ignore_index=True,sort = False).reset_index()

In [11]:
# Release the per-newspaper frames (the combined rows live in `data`), then preview the result.
del dawn, express
data.head()

Unnamed: 0,index,URL,Headline,Date,Content,Author
0,0,https://www.dawn.com/news/1458127/mbbs-student...,MBBS student found dead in hostel room at Jams...,2019-01-17,A third-year MBBS student was found shot dead ...,Mohammad Hussain Khan
1,1,https://www.dawn.com/news/1458129/govt-decides...,"Govt decides to remove names of Bilawal, Murad...",2019-01-17,The federal cabinet has decided to remove the ...,Sanaullah Khan | Javed Hussain
2,2,https://www.dawn.com/news/1458125/afghan-presi...,Afghan president thanks PM Khan for Pakistan's...,2019-01-17,Afghan President Ashraf Ghani on Thursday tele...,Sanaullah Khan
3,3,https://www.dawn.com/news/1458123/mehwish-haya...,Mehwish Hayat acts out her musical dreams in n...,2019-01-17,Director and script writer Wajahat Rauf unveil...,dawn
4,4,https://www.dawn.com/news/1458118/complete-tex...,Complete text of address by Justice Asif Saeed...,2019-01-17,Complete address by Justice Asif Saeed Khosa o...,dawn


In [12]:
# Preview the last rows; these come from the Express frame, which was concatenated second.
data.tail()

Unnamed: 0,index,URL,Headline,Date,Content,Author
393,393,https://tribune.com.pk/story/1412053/sneak-pea...,A sneak peak into the ‘CPEC Master Plan’,2017-05-18 09:19:52,Karach­i will soon be called China Town under ...,Shehzad Ghias
394,394,https://tribune.com.pk/story/1410222/guide-pak...,A guide for Pakistani parents: How to kill you...,2017-05-16 09:34:29,A helpfu­l guide brough­t to you by the Pakist...,Khurram Siddiqui
395,395,https://tribune.com.pk/story/1410238/tv-show-b...,If the TV show ‘Black Mirror’ was based in Pak...,2017-05-16 06:18:20,The Britis­h televi­sion show deals with the r...,Rahul Aijaz
396,396,https://tribune.com.pk/story/1375867/denying-e...,Denying entry to Pakistani Fulbright scholar m...,2017-04-06 10:24:21,US expose­s Pakist­an’s grand plan to bring do...,Shehzad Ghias
397,397,https://tribune.com.pk/story/1336417/not-conte...,"Tired of fighting Taliban at home, PM flies to...",2017-02-23 10:33:54,He is not the leader we want but much like Bat...,Shehzad Ghias


In [13]:
# Keep only the calendar-date part of each timestamp for consistency: Express rows carry a
# time of day while Dawn rows do not (compare the head and tail previews).
data['Date'] = data['Date'].dt.date

In [14]:
# Save the cleaned data. index=False skips the DataFrame's row index; the regular
# 'index' column created earlier by reset_index() is still written.
data.to_csv('articles.csv',index=False)

In [59]:
# Reload the cleaned articles from disk. No date parsing is requested here, so 'Date'
# comes back as plain strings (object dtype in the data.info() output).
data = pd.read_csv('articles.csv')
data.head()

Unnamed: 0,index,URL,Headline,Date,Content,Author
0,0,https://www.dawn.com/news/1458127/mbbs-student...,MBBS student found dead in hostel room at Jams...,2019-01-17,A third-year MBBS student was found shot dead ...,Mohammad Hussain Khan
1,1,https://www.dawn.com/news/1458129/govt-decides...,"Govt decides to remove names of Bilawal, Murad...",2019-01-17,The federal cabinet has decided to remove the ...,Sanaullah Khan | Javed Hussain
2,2,https://www.dawn.com/news/1458125/afghan-presi...,Afghan president thanks PM Khan for Pakistan's...,2019-01-17,Afghan President Ashraf Ghani on Thursday tele...,Sanaullah Khan
3,3,https://www.dawn.com/news/1458123/mehwish-haya...,Mehwish Hayat acts out her musical dreams in n...,2019-01-17,Director and script writer Wajahat Rauf unveil...,dawn
4,4,https://www.dawn.com/news/1458118/complete-tex...,Complete text of address by Justice Asif Saeed...,2019-01-17,Complete address by Justice Asif Saeed Khosa o...,dawn


In [18]:
# Number of articles per author byline. Combined bylines such as 'A | B' are counted
# under their own key rather than being split per person.
Counter(data.Author)

Counter({'Mohammad Hussain Khan': 3,
         'Sanaullah Khan | Javed Hussain': 1,
         'Sanaullah Khan': 4,
         'dawn': 179,
         'dawn | Haseeb Bhatti': 1,
         'Rimmel Mohydin': 1,
         'Jamil Nagri': 2,
         'Bureau Report': 10,
         'Kashif Abbasi': 4,
         'Ikram Junaidi': 8,
         'Naeem Sahoutara': 6,
         'Habib Khan Ghori': 3,
         'Hasan Mansoor': 4,
         'Behram Baloch': 1,
         'Mohammad Asghar': 4,
         'Imran Ayub': 2,
         'Syed Ali Shah': 1,
         'Haseeb Bhatti': 18,
         'Ali Akbar | AP': 1,
         'Rana Bilal': 8,
         'Sara Malkani': 1,
         'Munawer Azeem': 2,
         'Zulfiqar Ali': 2,
         'Intikhab Hanif': 1,
         'Tahir Siddiqui': 4,
         'Ishaq Tanoli': 2,
         'Faiza Ilyas': 1,
         'Tariq Naqash': 1,
         'Imtiaz Ali': 5,
         'Tahir Naseer': 1,
         'Wajiha Khanain': 1,
         'Abdul Ghaffar': 1,
         'AP': 1,
         'Amir Wasim | Javed Hus

In [20]:
# Report the (rows, columns) shape of the combined dataset.
print('Total size of dataset is :',data.shape)

Total size of dataset is : (398, 6)


In [23]:
# Summarise column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 6 columns):
index       398 non-null int64
URL         398 non-null object
Headline    398 non-null object
Date        398 non-null object
Content     398 non-null object
Author      398 non-null object
dtypes: int64(1), object(5)
memory usage: 18.7+ KB


In [27]:
# 'Date' holds 'YYYY-MM-DD' strings after the CSV reload; for that format string
# min()/max() coincide with the earliest/latest calendar date.
print('Range of the published articles is',data.Date.min(),'-',data.Date.max())

Range of the published articles is 2017-02-23 - 2019-01-17


 - Find importance of words using headlines and content separately

### Preprocessing

In [44]:
def remove_non_ascii(words):
    """Strip non-ASCII characters from every token in ``words``.

    Each token is NFKD-normalised first, so accented letters decompose into a
    base letter plus combining marks. Encoding to ASCII with ``'ignore'`` then
    drops every character that has no ASCII representation.

    Returns a new list with one (possibly empty) string per input token.
    """
    ascii_words = []
    for word in words:
        decomposed = unicodedata.normalize('NFKD', word)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        ascii_words.append(ascii_bytes.decode('utf-8', 'ignore'))
    return ascii_words
# Lower-case every token so that differently-cased copies of the same word
# (e.g. "Pakistan" / "pakistan") are treated as one word.
def to_lowercase(words):
    """Return a new list with each token in ``words`` converted to lower case."""
    lowered = []
    for word in words:
        lowered.append(word.lower())
    return lowered
# Removing punctuation to reduce the amount of the training data
def remove_punctuation(words):
    """Strip punctuation from every token and drop tokens that end up empty.

    Every character that is neither a word character (letter, digit or
    underscore) nor whitespace is removed. Tokens made up only of punctuation
    (e.g. ``'.'``) therefore disappear from the result.

    Returns a new list of the surviving, cleaned tokens in their original order.
    """
    # Run the substitution once per token (lazily), then keep the non-empty results.
    cleaned = (re.sub(r'[^\w\s]', '', word) for word in words)
    return [word for word in cleaned if word]
# Removing numbers from data since they aren't useful in this context.
def replace_numbers(words):
    """Drop tokens that are numbers, keeping every other token unchanged.

    A token counts as a number when it consists solely of digits, optionally
    grouped by ``.`` or ``,`` (``'2019'``, ``'3.5'``, ``'1,000'``). Tokens that
    merely contain digits (``'covid19'``) are kept as they are.

    The previous implementation reused the punctuation-stripping pattern and so
    never removed any number.

    Returns a new list of the non-numeric tokens in their original order.
    """
    return [word for word in words if not re.fullmatch(r'\d+(?:[.,]\d+)*', word)]
def remove_stopwords(words, stop_words=None):
    """Return the tokens of ``words`` that are not stop words.

    Parameters
    ----------
    words : list of str
        Tokens to filter; comparison is exact and case-sensitive.
    stop_words : iterable of str, optional
        Words to discard. Defaults to ``stopwords.words('english')``.

    Returns
    -------
    list of str
        The surviving tokens in their original order.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup set once per call instead of re-fetching the stop-word
    # list and scanning it linearly for every single token.
    stop_set = set(stop_words)
    return [word for word in words if word not in stop_set]

In [45]:
def normalize(words):
    """Run a list of tokens through the full cleaning pipeline.

    The steps are applied in order, each one receiving the output of the
    previous one: remove_non_ascii, to_lowercase, remove_punctuation,
    replace_numbers, remove_stopwords.

    Returns the list produced by the final step.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

In [60]:
# Tokenise each article body and headline, then pass the tokens through normalize().
data['tokenized_Content'] = data['Content'].apply(word_tokenize).apply(normalize)
data['tokenized_Headline'] = data['Headline'].apply(word_tokenize).apply(normalize)

In [61]:
# Spot-check the normalised headline tokens of the row labelled 5.
data.loc[5, 'tokenized_Headline']

['profile', 'pakistan', 'next', 'chief', 'justice', 'asif', 'saeed', 'khosa']

In [62]:
# Preview the frame with the two new token-list columns.
data.head()

Unnamed: 0,index,URL,Headline,Date,Content,Author,tokenized_Content,tokenized_Headline
0,0,https://www.dawn.com/news/1458127/mbbs-student...,MBBS student found dead in hostel room at Jams...,2019-01-17,A third-year MBBS student was found shot dead ...,Mohammad Hussain Khan,"[thirdyear, mbbs, student, found, shot, dead, ...","[mbbs, student, found, dead, hostel, room, jam..."
1,1,https://www.dawn.com/news/1458129/govt-decides...,"Govt decides to remove names of Bilawal, Murad...",2019-01-17,The federal cabinet has decided to remove the ...,Sanaullah Khan | Javed Hussain,"[federal, cabinet, decided, remove, names, ppp...","[govt, decides, remove, names, bilawal, murad,..."
2,2,https://www.dawn.com/news/1458125/afghan-presi...,Afghan president thanks PM Khan for Pakistan's...,2019-01-17,Afghan President Ashraf Ghani on Thursday tele...,Sanaullah Khan,"[afghan, president, ashraf, ghani, thursday, t...","[afghan, president, thanks, pm, khan, pakistan..."
3,3,https://www.dawn.com/news/1458123/mehwish-haya...,Mehwish Hayat acts out her musical dreams in n...,2019-01-17,Director and script writer Wajahat Rauf unveil...,dawn,"[director, script, writer, wajahat, rauf, unve...","[mehwish, hayat, acts, musical, dreams, new, w..."
4,4,https://www.dawn.com/news/1458118/complete-tex...,Complete text of address by Justice Asif Saeed...,2019-01-17,Complete address by Justice Asif Saeed Khosa o...,dawn,"[complete, address, justice, asif, saeed, khos...","[complete, text, address, justice, asif, saeed..."


In [63]:
# Saving the clean data to a csv file.
# NOTE(review): the token columns hold Python lists; CSV stores them as their text
# representation (e.g. "['mbbs', 'student', ...]"), so they are strings when read back.
data.to_csv("preprocessed.csv",index=False)

In [64]:
# Reload the preprocessed data from disk.
# NOTE(review): tokenized_Content / tokenized_Headline come back as strings such as
# "['mbbs', 'student', ...]" rather than lists (see the quoted values in the preview);
# parse them (e.g. with ast.literal_eval) before treating them as token lists.
data = pd.read_csv('preprocessed.csv')
data.head()

Unnamed: 0,index,URL,Headline,Date,Content,Author,tokenized_Content,tokenized_Headline
0,0,https://www.dawn.com/news/1458127/mbbs-student...,MBBS student found dead in hostel room at Jams...,2019-01-17,A third-year MBBS student was found shot dead ...,Mohammad Hussain Khan,"['thirdyear', 'mbbs', 'student', 'found', 'sho...","['mbbs', 'student', 'found', 'dead', 'hostel',..."
1,1,https://www.dawn.com/news/1458129/govt-decides...,"Govt decides to remove names of Bilawal, Murad...",2019-01-17,The federal cabinet has decided to remove the ...,Sanaullah Khan | Javed Hussain,"['federal', 'cabinet', 'decided', 'remove', 'n...","['govt', 'decides', 'remove', 'names', 'bilawa..."
2,2,https://www.dawn.com/news/1458125/afghan-presi...,Afghan president thanks PM Khan for Pakistan's...,2019-01-17,Afghan President Ashraf Ghani on Thursday tele...,Sanaullah Khan,"['afghan', 'president', 'ashraf', 'ghani', 'th...","['afghan', 'president', 'thanks', 'pm', 'khan'..."
3,3,https://www.dawn.com/news/1458123/mehwish-haya...,Mehwish Hayat acts out her musical dreams in n...,2019-01-17,Director and script writer Wajahat Rauf unveil...,dawn,"['director', 'script', 'writer', 'wajahat', 'r...","['mehwish', 'hayat', 'acts', 'musical', 'dream..."
4,4,https://www.dawn.com/news/1458118/complete-tex...,Complete text of address by Justice Asif Saeed...,2019-01-17,Complete address by Justice Asif Saeed Khosa o...,dawn,"['complete', 'address', 'justice', 'asif', 'sa...","['complete', 'text', 'address', 'justice', 'as..."
