# Stylometry and Text Analytics as Data Science Tools

In [None]:
# Mount Google Drive at /content/drive; the data and utils paths used in this notebook live under it
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# spaCy installations - uncomment this if you need to install spaCy
!pip install -U spacy --quiet
!python -m spacy download en_core_web_sm --quiet
!pip install spacy_fastlang --quiet
!pip install LeXmo --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.1/50.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.0/45.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-core-web-sm 3.6.0 requires spacy<3.7.0,>=3.6.0, but you have spacy 3.7.2 which is incompatible.[0m[31m
[0m2024-02-01 14:22:54.001652: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-01 14:22:54.001708: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempt

In [None]:
# Imports
import pandas as pd
import sys
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm_notebook
import pickle
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import requests
from LeXmo import LeXmo
from collections import defaultdict
import math
import re

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Secrets
from google.colab import userdata
# NOTE(review): the value returned here is discarded (never assigned or exported) —
# confirm whether the token should be stored for later use.
userdata.get('HF_TOKEN')

# Custom modules
# Make the project's utils directory on Drive importable
sys.path.append('/content/drive/MyDrive/Python/MSCAPP/utils/')
# Pick up edits to imported modules without restarting the kernel
%load_ext autoreload
%autoreload 2
# NOTE(review): the star import hides where names come from. `newsDeskRemover` and `nlp` are
# used later in this notebook without being defined in any visible cell, so they presumably
# come from here — confirm.
from utils import *

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Import data: load the fake and real news CSVs from Drive into separate DataFrames
fake = pd.read_csv('/content/drive/MyDrive/Python/MSCAPP/data/Fake.csv')
real = pd.read_csv('/content/drive/MyDrive/Python/MSCAPP/data/True.csv')

## Data Preparation

- Learn about data set structure
- Remove very small or empty strings
- Strip out news desk information
- Apply a text processing pipeline

### Learn about Data Set Structure

In [None]:
# Sizing: report the row count of each raw DataFrame
print(f'Count of fake news articles: {fake.shape[0]} | Count of real news articles: {real.shape[0]}')

Count of fake news articles: 23481 | Count of real news articles: 21417


In [None]:
# View details about the fake news articles (columns, non-null counts, dtypes)
fake.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB


In [None]:
# Sample three random fake articles; no random_state is passed, so the rows shown change on each run
fake.sample(3)

Unnamed: 0,title,text,subject,date
7547,WATCH: Rachel Maddow Just Skillfully Exposed ...,"On Friday, Donald Trump canceled a rally in Ch...",News,"March 12, 2016"
8301,The Right Celebrated Trayvon Martin’s Birthda...,The story of Trayvon Martin may be one of the ...,News,"February 6, 2016"
13520,SHE GREW UP BELIEVING BLACKS Could Only Suppor...,Keep your eye on Anita Moncreif If knowledge i...,politics,"Jul 14, 2016"


In [None]:
# View details about the real news articles (columns, non-null counts, dtypes)
real.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


In [None]:
# Sample three random real articles; no random_state is passed, so the rows shown change on each run
real.sample(3)

Unnamed: 0,title,text,subject,date
7080,Factbox: Trump picks women for U.N. ambassador...,(Reuters) - Republican U.S. President-elect Do...,politicsNews,"November 23, 2016"
17634,"U.S., Israel quit U.N. heritage agency citing ...",PARIS (Reuters) - The United States and Israel...,worldnews,"October 12, 2017"
5219,"Trump plan to slash State, foreign aid spendin...",WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"February 28, 2017"


### Remove Very Small or Empty Strings

For this step, look at the character count. We want to remove any text that is too short to be useful. In this case, we'll set a threshold of 140 characters, reflecting the maximum tweet length at the time this data set was originally compiled. In your own research, you can adjust this threshold based on domain knowledge or some other factor important to your work.

In [None]:
def _usable_article_mask(articles, min_text_chars=140, min_title_chars=10):
  """Return a boolean Series (aligned to ``articles``) marking the rows to keep.

  A row is kept when its ``text`` is longer than ``min_text_chars`` characters, is not
  whitespace-only, is not just a single http(s) URL, and its ``title`` is longer than
  ``min_title_chars`` characters. Rows with a missing text or title are dropped
  instead of raising.
  """
  text = articles['text']
  has_body = text.str.len() > min_text_chars
  # str.match anchors at the start of the string like re.match; na=False treats missing text as "no match"
  is_blank = text.str.match(r'^\s*$', na=False)
  is_bare_url = text.str.match(r'^(http:\/\/|https:\/\/)\S+$', na=False)
  has_title = articles['title'].str.len() > min_title_chars
  return has_body & ~is_blank & ~is_bare_url & has_title

# Updated fake dataframe (rows keep their original index labels; nothing is reset here)
fake_mask = _usable_article_mask(fake)
fake_df = fake[fake_mask]

# Updated real dataframe (same rules as for the fake articles)
real_mask = _usable_article_mask(real)
real_df = real[real_mask]

In [None]:
# Check for updates to data sets: row counts remaining after the length / blank / URL-only / title filters
print(f'Count of fake news articles: {fake_df.shape[0]} | Count of real news articles: {real_df.shape[0]}')

Count of fake news articles: 22231 | Count of real news articles: 21416


This is a lot of data to work with, which is great, but it will require a high-RAM runtime environment to work through it without using up all of the resources. Additionally, it will take a very long time to run our text processing pipeline on all of the data. To solve both of these problems, we will randomly sample a subset of each dataset.

In [None]:
# Subset: draw 10,000 rows from each filtered frame with a fixed seed, and keep each row's
# pre-sampling index as the leading `original_id` column
fake_sample = (
    fake_df
    .sample(n=10000, random_state=42)
    .rename_axis('original_id')
    .reset_index()
)
real_sample = (
    real_df
    .sample(n=10000, random_state=42)
    .rename_axis('original_id')
    .reset_index()
)

### Remove News Desk Information

News desk information takes the form 'CITY (News Outlet) - ' at the start of an article. This appears exclusively in the real data set, and as such, it should be removed.
- If we don't remove this detail, then whatever model you build could use it as a feature to judge an article as real or fake.

In [None]:
# Remove news desk information - fake articles
%%time
fake_sample['article_text'] = fake_sample.text.apply(lambda x: newsDeskRemover(x))

CPU times: user 20min 32s, sys: 3.68 s, total: 20min 36s
Wall time: 20min 45s


In [None]:
# Check: preview the first rows, including the new article_text column
fake_sample.head()

Unnamed: 0,original_id,title,text,subject,date,article_text
0,21221,86 YR OLD BARBARA WALTERS Tells 40 Yr Old ‘Ame...,"Real journalism ended a long time ago, but thi...",left-news,"Dec 15, 2015","Real journalism ended a long time ago, but thi..."
1,18041,Woman Just Rescued from #Harvey Flood Goes Off...,A clueless CNN reporter trying to interview a ...,left-news,"Aug 29, 2017",A clueless CNN reporter trying to interview a ...
2,7484,What Trump Promised Carson For An Endorsement...,Watching people line up behind Donald Trump is...,News,"March 15, 2016",Watching people line up behind Donald Trump is...
3,259,The Senate Banking Committee Held An Equifax ...,Remember when half of America had their credit...,News,"October 5, 2017",Remember when half of America had their credit...
4,6062,NY Attorney General On Trump University: ‘Thi...,Republican presidential nominee Donald Trump i...,News,"June 2, 2016",Republican presidential nominee Donald Trump i...


In [None]:
# Remove news desk information - real articles
%%time
real_sample['article_text'] = real_sample.text.apply(lambda x: newsDeskRemover(x))

CPU times: user 25min 12s, sys: 4.65 s, total: 25min 16s
Wall time: 25min 25s


In [None]:
# Check: compare `text` with `article_text` to see whether the news desk prefix was stripped.
# NOTE(review): in the recorded output, row 1 starts with "(Reuters) - " (no city) and its
# article_text is unchanged — newsDeskRemover appears to miss datelines without a city, which
# would leave a real-only marker in the data; confirm and extend the remover if so.
real_sample.head()

Unnamed: 0,original_id,title,text,subject,date,article_text
0,14315,China pledges to be more open to providing inf...,SHANGHAI (Reuters) - Central and local governm...,worldnews,"November 21, 2017",Central and local governments will be more op...
1,3277,Exclusive: Trump targets illegal immigrants wh...,"(Reuters) - In September 2014, Gilberto Velasq...",politicsNews,"June 9, 2017","(Reuters) - In September 2014, Gilberto Velasq..."
2,2876,"At G20 summit, Trump pledges $639 million in a...",HAMBURG (Reuters) - U.S. President Donald Trum...,politicsNews,"July 8, 2017",U.S. President Donald Trump on Saturday promi...
3,5160,Ex-Christie associates lose bid for new trial ...,NEW YORK (Reuters) - A federal judge rejected ...,politicsNews,"March 2, 2017",A federal judge rejected a request for a new ...
4,10841,"Clinton, Sanders clash over Obama as they vie ...",MILWAUKEE (Reuters) - Democratic presidential ...,politicsNews,"February 11, 2016",Democratic presidential candidates Hillary Cl...


### Apply a Text Processing Pipeline

Text processing pipelines clean up text data until it is ready for use in an analytics project. For our purposes, we will need to alter the pipeline a bit.

__Steps in Our Pipeline__
- Sentence tokenization
- Word tokenization
- Part-of-Speech tagging
- Lemmatization
- Named Entity Recognition

In [None]:
# Update the pipeline components
# NOTE(review): the call below is commented out, and no visible cell assigns `nlp`, which the
# following cells use — presumably it comes from `from utils import *`; confirm that a fresh
# Restart & Run All still produces the pipeline shown in the next cell's output.
# pipeBuilder()

In [None]:
# View the updated pipeline
# NOTE(review): per the spaCy docs, `components` also lists disabled components, so this output
# alone does not show which of 'parser', 'sentencizer' and 'senter' actually run —
# check `nlp.pipe_names` to confirm the active set.
nlp.components

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7e83e1af3e20>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7e83e1af2c80>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7e83e1c11770>),
 ('sentencizer', <spacy.pipeline.sentencizer.Sentencizer at 0x7e83e0e9e700>),
 ('senter', <spacy.pipeline.senter.SentenceRecognizer at 0x7e83e1af3be0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7e83e1a83500>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7e83e2f15c80>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7e83e1c118c0>)]

In [None]:
# For future use, export this pipeline: write it to the utils/pipeline directory on Drive
nlp.to_disk('/content/drive/MyDrive/Python/MSCAPP/utils/pipeline/')

Next, apply the pipeline to each data set to create Document objects. These objects will allow us to calculate metrics about our texts that will contribute to model building.

In [None]:
# Fake data set
%%time
fake_sample['text_docs'] = [doc for doc in nlp.pipe(fake_sample.article_text, batch_size=100)]

CPU times: user 10min 51s, sys: 13.1 s, total: 11min 4s
Wall time: 11min 25s


In [None]:
# Real data set
%%time
real_sample['text_docs'] = [doc for doc in nlp.pipe(real_sample.article_text, batch_size=100)]

CPU times: user 8min 44s, sys: 13.7 s, total: 8min 57s
Wall time: 9min 1s


In [None]:
# Pickle both sample DataFrames (including the text_docs column) so they can be reloaded
# later without rerunning the pipeline
fake_sample.to_pickle("/content/drive/MyDrive/Python/MSCAPP/data/fake_sample_data.pkl")
real_sample.to_pickle("/content/drive/MyDrive/Python/MSCAPP/data/real_sample_data.pkl")

### Assess Emotion

Due to the size of each data set, we will score the articles in batches of 500 and write each batch's results to disk.



In [None]:
# Import after disconnect: reload the pickled sample DataFrames from Drive.
# Unpickling executes code from the file, so only load pickles you created yourself.
fake_sample = pd.read_pickle("/content/drive/MyDrive/Python/MSCAPP/data/fake_sample_data.pkl")
real_sample = pd.read_pickle("/content/drive/MyDrive/Python/MSCAPP/data/real_sample_data.pkl")

In [None]:
# Empty lists reserved for emotion results (one per data set)
fake_emotions, true_emotions = [], []

# Batch sizing: rows per batch, and how many batches each sample needs (rounded up)
import math
batch_size = 500
fake_batch_range = math.ceil(len(fake_sample) / batch_size)
true_batch_range = math.ceil(len(real_sample) / batch_size)

In [None]:
# Score the emotions of every fake article with LeXmo, one batch at a time, and write each
# batch's results to its own pickle on Drive
for start in range(0, fake_sample.shape[0], batch_size):
  # Clamp the end so the final batch may be shorter than batch_size
  end = min(start + batch_size, fake_sample.shape[0])
  # Get the batch using iloc (positional, end-exclusive)
  batch = fake_sample.iloc[start:end]
  # Maps each row's index to its LeXmo result with the 'text' entry removed
  this_dict = defaultdict(dict)
  # Size the bar from the actual batch; the context manager closes it even if scoring raises
  with tqdm_notebook(total=len(batch), desc="Processing") as progress_bar:
    for index, row in batch.iterrows():
      # Process for emotions
      emotions = LeXmo.LeXmo(row.article_text)
      emotions.pop('text', None)
      this_dict[index] = emotions
      progress_bar.update(1)
  # Write to pickle; the file name carries the inclusive row range of the batch
  with open(f"/content/drive/MyDrive/Python/MSCAPP/data/fake_emotions_{start}_{end-1}.pkl", "wb") as file:
    pickle.dump(this_dict, file)

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
# Score the emotions of every real article with LeXmo, one batch at a time, and write each
# batch's results to its own pickle on Drive
for start in range(0, real_sample.shape[0], batch_size):
  # Clamp the end so the final batch may be shorter than batch_size
  end = min(start + batch_size, real_sample.shape[0])
  # Get the batch using iloc (positional, end-exclusive)
  batch = real_sample.iloc[start:end]
  # Maps each row's index to its LeXmo result with the 'text' entry removed
  this_dict = defaultdict(dict)
  # Size the bar from the actual batch; the context manager closes it even if scoring raises
  with tqdm_notebook(total=len(batch), desc="Processing") as progress_bar:
    for index, row in batch.iterrows():
      # Process for emotions
      emotions = LeXmo.LeXmo(row.article_text)
      emotions.pop('text', None)
      this_dict[index] = emotions
      progress_bar.update(1)
  # Write to pickle; the file name carries the inclusive row range of the batch
  with open(f"/content/drive/MyDrive/Python/MSCAPP/data/real_emotions_{start}_{end-1}.pkl", "wb") as file:
    pickle.dump(this_dict, file)

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

Processing:   0%|          | 0/500 [00:00<?, ?it/s]

## Process Titles

Next, the same processes will be repeated for the titles.

- Remove articles that have no title.
- Apply a text processing pipeline.

### Apply a Text Processing Pipeline

In [None]:
# Fake data set
%%time
fake_sample['title_docs'] = [doc for doc in nlp.pipe(fake_sample.title, batch_size=100)]

CPU times: user 33.1 s, sys: 155 ms, total: 33.3 s
Wall time: 33.3 s


In [None]:
# Real data set
%%time
real_sample['title_docs'] = [doc for doc in nlp.pipe(real_sample.title, batch_size=100)]

CPU times: user 23 s, sys: 48.1 ms, total: 23 s
Wall time: 23 s


In [None]:
# Pickle the updated dataframes (now including title_docs) under new file names, so the
# earlier *_sample_data.pkl files are not overwritten
fake_sample.to_pickle("/content/drive/MyDrive/Python/MSCAPP/data/fake_full_sample.pkl")
real_sample.to_pickle("/content/drive/MyDrive/Python/MSCAPP/data/real_full_sample.pkl")