# Scraped Data Analysis

In [7]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob import TextBlob
import spacy

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV

import torch
from transformers import AutoTokenizer, AutoModel, pipeline

import openpyxl
import xlrd

# Just in case
import requests
from bs4 import BeautifulSoup

# Show full cell contents (no truncation) so long sentences are readable in outputs.
pd.options.display.max_colwidth = None

In [8]:
# Fetch the NLTK 'punkt' and 'stopwords' packages into the local nltk_data directory.
# Presumably these back the word_tokenize / sent_tokenize and stopwords imports above — TODO confirm usage.
nltk.download('punkt')
# Last expression of the cell: its return value is what the notebook displays.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/betaknight/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/betaknight/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Load the small English spaCy pipeline and keep a handle on it.
# Without the assignment the pipeline is loaded and immediately discarded,
# so no later cell could actually use it.
nlp = spacy.load('en_core_web_sm')
nlp  # last expression: still displays the pipeline repr, as the cell did before

<spacy.lang.en.English at 0x15729b8d0>

In [10]:
# Scraped data lives in a 'scraped_data' directory one level above the working directory.
dataset_path = os.path.join(os.pardir, 'scraped_data')

# Wikipedia sentences with model bias predictions.
# NOTE(review): the file's first column is unnamed and is read in as 'Unnamed: 0'
# (looks like a saved index) — consider index_col=0 if nothing downstream relies on it.
predictions_csv = os.path.join(dataset_path, 'wiki_bias_predictions.csv')
df = pd.read_csv(predictions_csv)

In [11]:
# Preview the first five rows to check the columns and a few sample sentences.
df.head(n=5)

Unnamed: 0,combined_text,bias_word_count,lexicon_match_count,outlet,topic,type,label_opinion,bias_prediction,bias_probability,article_title,sentence_index
0,"Donald John Trump (born June 14, 1946) is an American politician, media personality, and businessman who is the 47th president of the United States.",0,0,usa-today,politics,center,Somewhat factual but also opinionated,0,0.283093,Donald Trump,0
1,"A member of the Republican Party, he served as the 45th president from 2017 to 2021.",0,0,usa-today,politics,center,Somewhat factual but also opinionated,0,0.270619,Donald Trump,1
2,"Born into a wealthy family in the New York City borough of Queens, Trump graduated from the University of Pennsylvania in 1968 with a bachelor's degree in economics.",0,0,usa-today,politics,center,Somewhat factual but also opinionated,0,0.415372,Donald Trump,2
3,"He became the president of his family's real estate business in 1971, renamed it the Trump Organization, and began acquiring and building skyscrapers, hotels, casinos, and golf courses.",0,0,usa-today,politics,center,Somewhat factual but also opinionated,0,0.2967,Donald Trump,3
4,"He launched side ventures, many licensing the Trump name, and filed for six business bankruptcies in the 1990s and 2000s.",0,0,usa-today,politics,center,Somewhat factual but also opinionated,0,0.272795,Donald Trump,4


In [16]:
# Lowest and highest predicted bias probability across the whole dataset.
bias_prob = df['bias_probability']
bias_prob.min(), bias_prob.max()

(np.float64(0.0814672196601992), np.float64(0.579719237932214))

────────────────────────────────────────────────────────────────────────────

This shows the range of bias probabilities in the new Wikipedia dataset.

The lowest predicted bias probability is about **0.081**, and the highest is around **0.580**.  
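Because even the highest score is only slightly above 0.5, the model never assigns a high-confidence bias score to any sentence in these Wikipedia articles, which makes sense given Wikipedia’s neutral writing style. (The range alone does not show how the scores are distributed; the prediction counts below do.)  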
However, the presence of higher values near **0.58** shows that the model is still able to detect subtle bias in some sentences.

────────────────────────────────────────────────────────────────────────────

In [17]:
# Number of rows per predicted class in the 'bias_prediction' column.
df.loc[:, 'bias_prediction'].value_counts()

bias_prediction
0    17686
1       15
Name: count, dtype: int64

────────────────────────────────────────────────────────────────────────────

This shows how many sentences were flagged as biased in the new Wikipedia dataset.

Out of **17,701 total sentences**, only **15** were labeled as biased (`1`).  
This is a very small portion, which matches expectations — Wikipedia aims for a neutral tone, and the model was trained to be careful about labeling something as biased.  
The few biased predictions likely represent sentences with subtle framing, emotionally charged language, or controversial phrasing.

────────────────────────────────────────────────────────────────────────────

In [20]:
# Keep only the rows whose bias_prediction equals 1, then preview up to 15 of them.
flagged_mask = df['bias_prediction'].eq(1)
biased_rows = df.loc[flagged_mask]
biased_rows.head(n=15)

Unnamed: 0,combined_text,bias_word_count,lexicon_match_count,outlet,topic,type,label_opinion,bias_prediction,bias_probability,article_title,sentence_index
4083,New York: Harper.,0,0,usa-today,politics,center,Somewhat factual but also opinionated,1,0.549528,QAnon,678
4112,"The abortion issue gained renewed attention in 2011 in a debate that The New York Times says ""has begun to sound like the debate in the United States"".",0,0,usa-today,politics,center,Somewhat factual but also opinionated,1,0.546173,Pro-life,19
4146,Anti-abortion groups like Students for Life of America and Susan B. Anthony Pro-Life America are at times associated with conservatism.,0,0,usa-today,politics,center,Somewhat factual but also opinionated,1,0.510336,Pro-life,53
6234,New York: Routledge.,0,0,usa-today,politics,center,Somewhat factual but also opinionated,1,0.549528,Immigration to the United States,480
6242,New York: E.P.,0,0,usa-today,politics,center,Somewhat factual but also opinionated,1,0.549528,Immigration to the United States,488
8810,"Trump.""",0,0,usa-today,politics,center,Somewhat factual but also opinionated,1,0.579719,Breitbart News,53
9013,The New York Times (NYT) is an American daily newspaper based in New York City.,0,0,usa-today,politics,center,Somewhat factual but also opinionated,1,0.54067,The New York Times,0
9029,"It has expanded to several other publications, including The New York Times Magazine, The New York Times International Edition, and The New York Times Book Review.",0,0,usa-today,politics,center,Somewhat factual but also opinionated,1,0.514546,The New York Times,16
9049,"The Great Depression forced Sulzberger to reduce The New York Times's operations, and developments in the New York newspaper landscape resulted in the formation of larger newspapers, such as the New York Herald Tribune and the New York World-Telegram.",0,0,usa-today,politics,center,Somewhat factual but also opinionated,1,0.509814,The New York Times,36
9079,"The exodus of readers to suburban New York newspapers, such as Newsday and Gannett papers, adversely affected The New York Times's circulation.",0,0,usa-today,politics,center,Somewhat factual but also opinionated,1,0.51506,The New York Times,66


────────────────────────────────────────────────────────────────────────────

These are the 15 sentences the model identified as biased across all the Wikipedia topics it scanned.

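Each sentence received a `bias_probability` above the model's decision threshold, meaning the model found subtle signs of bias — even if the sentence seems neutral at first glance. Note that the effective threshold appears to be about **0.5**, not 0.33: every flagged score shown above lies between roughly 0.51 and 0.58, while a sentence scoring 0.415 in the first rows of the dataset was left unflagged.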

Interestingly, several of the flagged sentences are **publisher references** (e.g., "New York: Harper.") which likely tripped the model due to patterns seen during training. Others, like the sentences from the **Pro-life** or **New York Times** articles, contain phrases that reflect **political framing, media associations, or ideological links**.

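This small set of predictions shows that while most of Wikipedia is predicted to be neutral, the model can still flag borderline content. However, fragments such as citation stubs ("New York: Harper.") and a stray quotation ending ("Trump."") carry no framing of their own, so at least some of these flags are likely false positives, and all of them sit close to the decision threshold; they should be reviewed manually before being treated as evidence of bias.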

────────────────────────────────────────────────────────────────────────────