## Data Wrangling

### Imports

In [1]:
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O
import numpy as np # linear algebra

from sklearn.feature_extraction.text import CountVectorizer
from nltk import tokenize
import warnings
from save_utils import save_file


### Load Data

In [2]:
fake_news = pd.read_csv('source_folder/train.csv')

In [3]:
fake_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24353 entries, 0 to 24352
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0.1  24353 non-null  int64 
 1   Unnamed: 0    24353 non-null  int64 
 2   title         24353 non-null  object
 3   text          24353 non-null  object
 4   label         24353 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 951.4+ KB


In [4]:
fake_news.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,0,‘Maury’ Show Official Facebook Posts F*CKED U...,Maury is perhaps one of the trashiest shows on...,0
1,1,1,Trump’s Favorite News Channel Tries To Soothe...,"Yesterday, after the father of one of the UCLA...",0
2,2,2,"Russia warns Iraq, Kurds not to destabilize Mi...",MOSCOW (Reuters) - Russia on Wednesday warned ...,1
3,3,3,WATCH STEVE SCALISE Throw A Strike At The Nati...,"House Majority Whip Steve Scalise (R., La.) th...",0
4,4,4,Trump Will HATE What Stephen Colbert Just Did...,It can be said that Late Show host Stephen Col...,0


### Number of missing values by column

In [5]:
missing = pd.concat([fake_news.isnull().sum(), 100 * fake_news.isnull().mean()], axis=1)
missing.columns=['count', '%']
missing.sort_values(by='count', ascending=False)

Unnamed: 0,count,%
Unnamed: 0.1,0,0.0
Unnamed: 0,0,0.0
title,0,0.0
text,0,0.0
label,0,0.0


## Value counts for title

In [6]:
(fake_news['title']).value_counts()

no title                                                                         92
Factbox: Trump fills top jobs for his administration                             10
newsticker                                                                        8
Factbox: Contenders for senior jobs in Trump's administration                     5
Factbox: Contenders, picks for key jobs in Trump's administration                 5
                                                                                 ..
Pakistan's ruling party nominates ousted PM Sharif to lead it                     1
Trump's unpredictability already troubles U.S. friends                            1
OBAMABOT CONGRESSWOMAN: ISIS Beheadings Do Not Justify U.S Airstrikes [Video]     1
Uganda in anti-online pornography drive seen by critics as diversion              1
Ukraine's Tymoshenko expects fair U.S. ruling after Manafort indicted             1
Name: title, Length: 24088, dtype: int64

In [7]:
(fake_news['text']).value_counts()

Maury is perhaps one of the trashiest shows on television today. It s right in line with the likes of the gutter trash that is Jerry Springer, and the fact that those shows are still on the air with the shit they air really is a sad testament to what Americans find to be entertaining. However, Maury really crossed the line with a Facebook post regarding one of their guest s appearance with a vile, disgusting caption on Tuesday evening.There was a young woman on there doing one of their episodes regarding the paternity of her child. However, on the page, the show posted an image of the woman, who happens to bear a striking resemblance to Senator and presidential candidate Ted Cruz. The caption from the Maury Show page read: The Lie Detector Test determined .that was a LIE!  Ted Cruz is just NOT that SEXY! As if that weren t horrible enough, the caption underneath the Imgur upload reads,  Ted Cruz in drag on Maury. Here is an image from the official Maury Facebook page:Here is the embed 

In [8]:
fake_news = fake_news.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
fake_news.head()

Unnamed: 0,title,text,label
0,‘Maury’ Show Official Facebook Posts F*CKED U...,Maury is perhaps one of the trashiest shows on...,0
1,Trump’s Favorite News Channel Tries To Soothe...,"Yesterday, after the father of one of the UCLA...",0
2,"Russia warns Iraq, Kurds not to destabilize Mi...",MOSCOW (Reuters) - Russia on Wednesday warned ...,1
3,WATCH STEVE SCALISE Throw A Strike At The Nati...,"House Majority Whip Steve Scalise (R., La.) th...",0
4,Trump Will HATE What Stephen Colbert Just Did...,It can be said that Late Show host Stephen Col...,0


In [9]:
# save the data to a new csv file to work EDA
datapath = '../DataScienceCapstone2/source_folder'
save_file(fake_news, 'fake_news_cleaned.csv', datapath)

A file already exists with this name.



Do you want to overwrite? (Y/N) Y


Writing file.  "../DataScienceCapstone2/source_folder/fake_news_cleaned.csv"


## EDA

In [10]:
length = len(fake_news['text'][0])
print(f'Length of a text: {length}')

Length of a text: 2338


In [11]:
fake_news['Length'] = fake_news['text'].str.len()
fake_news.head()

Unnamed: 0,title,text,label,Length
0,‘Maury’ Show Official Facebook Posts F*CKED U...,Maury is perhaps one of the trashiest shows on...,0,2338
1,Trump’s Favorite News Channel Tries To Soothe...,"Yesterday, after the father of one of the UCLA...",0,4467
2,"Russia warns Iraq, Kurds not to destabilize Mi...",MOSCOW (Reuters) - Russia on Wednesday warned ...,1,450
3,WATCH STEVE SCALISE Throw A Strike At The Nati...,"House Majority Whip Steve Scalise (R., La.) th...",0,1224
4,Trump Will HATE What Stephen Colbert Just Did...,It can be said that Late Show host Stephen Col...,0,1224


#### Word Count
Number of words in a best answer.

In [12]:
word_count = fake_news['text'][0].split()
print(f'Word count in a title: {len(word_count)}')

Word count in a title: 430


In [13]:
def word_count(text):
    text_list = text.split()
    return len(text_list)

In [14]:
fake_news['Word_count'] = fake_news['text'].apply(word_count)
fake_news.head()

Unnamed: 0,title,text,label,Length,Word_count
0,‘Maury’ Show Official Facebook Posts F*CKED U...,Maury is perhaps one of the trashiest shows on...,0,2338,430
1,Trump’s Favorite News Channel Tries To Soothe...,"Yesterday, after the father of one of the UCLA...",0,4467,715
2,"Russia warns Iraq, Kurds not to destabilize Mi...",MOSCOW (Reuters) - Russia on Wednesday warned ...,1,450,70
3,WATCH STEVE SCALISE Throw A Strike At The Nati...,"House Majority Whip Steve Scalise (R., La.) th...",0,1224,210
4,Trump Will HATE What Stephen Colbert Just Did...,It can be said that Late Show host Stephen Col...,0,1224,204


Average length of words.

In [15]:
fake_news['mean_word_length'] = fake_news['text'].map(lambda x: np.mean([len(word) for word in x.split(" ")]))
fake_news.head()

Unnamed: 0,title,text,label,Length,Word_count,mean_word_length
0,‘Maury’ Show Official Facebook Posts F*CKED U...,Maury is perhaps one of the trashiest shows on...,0,2338,430,4.401848
1,Trump’s Favorite News Channel Tries To Soothe...,"Yesterday, after the father of one of the UCLA...",0,4467,715,4.989276
2,"Russia warns Iraq, Kurds not to destabilize Mi...",MOSCOW (Reuters) - Russia on Wednesday warned ...,1,450,70,5.263889
3,WATCH STEVE SCALISE Throw A Strike At The Nati...,"House Majority Whip Steve Scalise (R., La.) th...",0,1224,210,4.724299
4,Trump Will HATE What Stephen Colbert Just Did...,It can be said that Late Show host Stephen Col...,0,1224,204,4.889423


Average length of setences in the text column.

In [16]:
np.mean([len(sent) for sent in tokenize.sent_tokenize(fake_news['text'][0])])

122.0

In [17]:
warnings.filterwarnings(action='ignore', message='Mean of empty slice')

def calculate_mean_sent_length(text):
    return np.mean([len(sent) for sent in tokenize.sent_tokenize(text)])

fake_news['mean_sent_length'] = fake_news['text'].apply(calculate_mean_sent_length)
fake_news.head()

Unnamed: 0,title,text,label,Length,Word_count,mean_word_length,mean_sent_length
0,‘Maury’ Show Official Facebook Posts F*CKED U...,Maury is perhaps one of the trashiest shows on...,0,2338,430,4.401848,122.0
1,Trump’s Favorite News Channel Tries To Soothe...,"Yesterday, after the father of one of the UCLA...",0,4467,715,4.989276,100.25
2,"Russia warns Iraq, Kurds not to destabilize Mi...",MOSCOW (Reuters) - Russia on Wednesday warned ...,1,450,70,5.263889,224.0
3,WATCH STEVE SCALISE Throw A Strike At The Nati...,"House Majority Whip Steve Scalise (R., La.) th...",0,1224,210,4.724299,173.714286
4,Trump Will HATE What Stephen Colbert Just Did...,It can be said that Late Show host Stephen Col...,0,1224,204,4.889423,305.25


Displays the top 20 most frequently occurring words in the text data.

In [18]:
cv = CountVectorizer(stop_words='english')
counts = cv.fit_transform(fake_news.text)
pd.DataFrame(counts.sum(axis=0),columns=cv.get_feature_names_out()).T.sort_values(0,ascending=False).head(20)

Unnamed: 0,0
trump,75396
said,74978
president,29662
people,22960
state,18785
reuters,17474
new,17162
donald,16029
government,15611
house,15430


### Bigram
Shows the top 20 most frequent bigrams in the 'fake_news' text data, along with their respective counts. Provides insights into pairs of consecutive words that frequently appear together in the given text data.

In [19]:
cv = CountVectorizer(ngram_range=(2,2),stop_words='english')
counts = cv.fit_transform(fake_news.text)
pd.DataFrame(counts.sum(axis=0),columns=cv.get_feature_names_out()).T.sort_values(0,ascending=False).head(20)

Unnamed: 0,0
donald trump,15000
united states,10415
white house,8360
hillary clinton,5218
new york,4943
featured image,4634
president donald,3972
washington reuters,3952
north korea,3729
barack obama,3027
