# Assessing Wikipedia Bias

## 1. You will need to collect data from a source of your choosing (dataset, wikipedia API, web-scraping)

## Introduction

The project

## Data Overview

In [14]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import random

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


from wordcloud import WordCloud

import xgboost as xgb
from textblob import TextBlob

from sklearn.model_selection import GridSearchCV

from tqdm import tqdm


In [16]:
# Load the datasets
data = pd.read_csv('news_articles.csv')

# Display the first few rows of the dataset
display(data.head())

Unnamed: 0,author,published,title,text,language,site_url,main_img_url,type,label,title_without_stopwords,text_without_stopwords,hasImage
0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,muslims busted they stole millions in govt ben...,print they should pay all the back all the mon...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,muslims busted stole millions govt benefits,print pay back money plus interest entire fami...,1.0
1,reasoning with facts,2016-10-29T08:47:11.259+03:00,re why did attorney general loretta lynch plea...,why did attorney general loretta lynch plead t...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,attorney general loretta lynch plead fifth,attorney general loretta lynch plead fifth bar...,1.0
2,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,breaking weiner cooperating with fbi on hillar...,red state \nfox news sunday reported this mor...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,breaking weiner cooperating fbi hillary email ...,red state fox news sunday reported morning ant...,1.0
3,Fed Up,2016-11-01T05:22:00.000+02:00,pin drop speech by father of daughter kidnappe...,email kayla mueller was a prisoner and torture...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,pin drop speech father daughter kidnapped kill...,email kayla mueller prisoner tortured isis cha...,1.0
4,Fed Up,2016-11-01T21:56:00.000+02:00,fantastic trumps point plan to reform healthc...,email healthcare reform to make america great ...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,fantastic trumps point plan reform healthcare ...,email healthcare reform make america great sin...,1.0


### Data preprocessing

In [4]:
# Display the column names of the dataset
column_names = data.columns.tolist()
display(column_names)

['text',
 'news_link',
 'outlet',
 'topic',
 'type',
 'label_bias',
 'label_opinion',
 'biased_words']

In [18]:
# Display the shape of the dataset
n_rows, n_cols = data.shape
print(f"The DataFrame has {n_rows} rows and {n_cols} columns")

The DataFrame has 2096 rows and 12 columns


In [17]:
# Display the informative summary of the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2096 entries, 0 to 2095
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   author                   2096 non-null   object 
 1   published                2096 non-null   object 
 2   title                    2096 non-null   object 
 3   text                     2050 non-null   object 
 4   language                 2095 non-null   object 
 5   site_url                 2095 non-null   object 
 6   main_img_url             2095 non-null   object 
 7   type                     2095 non-null   object 
 8   label                    2095 non-null   object 
 9   title_without_stopwords  2094 non-null   object 
 10  text_without_stopwords   2046 non-null   object 
 11  hasImage                 2095 non-null   float64
dtypes: float64(1), object(11)
memory usage: 196.6+ KB


In [19]:
# Display the descriptive statistics of the dataset

data.describe()

Unnamed: 0,hasImage
count,2095.0
mean,0.777088
std,0.416299
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


## 2. You will conduct EDA that you see fit to appropriately investigate text of wikipedia articles you look to predict on for biased terms, sentiment, or other linguistic significance.

## Explorating Data Analysis

### Duplicates

In [8]:
# Display the number of duplicates in the dataset
duplicates = data[data.duplicated()]
display(f"Number of duplicated data: {duplicates.shape[0]}")

'Number of duplicated data: 0'

### Missing Values

In [9]:
# Display the number of missing values in the dataset
display(data.isna().sum())

# Check for missing values in the DataFrame as a percentage
display(data.isna().sum()/len(data)) 

text                0
news_link          32
outlet              0
topic               0
type             1000
label_bias          0
label_opinion       0
biased_words        0
dtype: int64

text             0.000000
news_link        0.008710
outlet           0.000000
topic            0.000000
type             0.272183
label_bias       0.000000
label_opinion    0.000000
biased_words     0.000000
dtype: float64

In [9]:
# Drop rows with missing values in the 'news_link' and 'article' columns
data.dropna(subset=['news_link'], inplace=True)
data.dropna(subset=['type'], inplace=True)

In [10]:
# Check for missing values in the DataFrame as a percentage
display(data.isna().sum()/len(data)) 

text             0.0
news_link        0.0
outlet           0.0
topic            0.0
type             0.0
label_bias       0.0
label_opinion    0.0
biased_words     0.0
dtype: float64

In [11]:
# Cleaning the text data in the 'text' column
# Define a function to clean the text data 
def clear_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-z\s]","", text)
    text = text.split()
    return " ".join(text)

In [12]:
data['label_bias'] = data['label_bias'].str.lower()

In [13]:
# Apply the clear_text function to the 'comment_text' column
data['clean_text'] = data['text'].astype(str).apply(clear_text) 
data= data.drop(columns=['text'])

# Display the first 5 rows of the comments DataFrame after cleaning
display(data.sample(5)) 


Unnamed: 0,news_link,outlet,topic,type,label_bias,label_opinion,biased_words,clean_text
2401,https://www.reuters.com/article/us-usa-electio...,Reuters,universal health care,center,non-biased,Entirely factual,[],some party leaders including house speaker nan...
3095,https://www.breitbart.com/2nd-amendment/2019/0...,Breitbart,gun-control,right,non-biased,No agreement,[],think about it universal background checks req...
2176,https://www.huffpost.com/entry/marta-brazil-wo...,HuffPost,sport,left,non-biased,Expresses writer’s opinion,[],rather its because of the sheer fact that she ...
238,https://thefederalist.com/2019/07/01/first-fac...,Federalist,abortion,right,non-biased,Somewhat factual but also opinionated,[],although facebook chief operating officer sher...
2420,https://eu.usatoday.com/story/entertainment/ce...,USA Today,trump-presidency,center,non-biased,Entirely factual,[],stephen colbert and jimmy kimmel are firing ba...


In [14]:
# Check for missing values
print(data['clean_text'].isna().sum())  

0


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2644 entries, 0 to 3673
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   news_link      2644 non-null   object
 1   outlet         2644 non-null   object
 2   topic          2644 non-null   object
 3   type           2644 non-null   object
 4   label_bias     2644 non-null   object
 5   label_opinion  2644 non-null   object
 6   biased_words   2644 non-null   object
 7   clean_text     2644 non-null   object
dtypes: object(8)
memory usage: 185.9+ KB


In [16]:
## Set of English stop words
stop_words =  set(stopwords.words('english')) 

In [17]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer() 

def lemmatize(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words]
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(lemmas)

In [18]:
# Apply the clear_text function to the 'comment_text' column
data['lemmatize_text'] = data['clean_text'].apply(lemmatize) 

In [19]:
# Display the first 5 rows of the comments DataFrame after cleaning
display(data[['clean_text', 'lemmatize_text']].head(20))

Unnamed: 0,clean_text,lemmatize_text
0,orange is the new black star yael stone is ren...,orange new black star yael stone renouncing u ...
1,we have one beautiful law trump recently said ...,one beautiful law trump recently said characte...
2,immigrants as criminals and eugenics all of wh...,immigrant criminal eugenics considered fringe ...
3,we sounded the alarm in the early months of tr...,sounded alarm early month trump presidency pri...
9,a new low washington post media critic blows u...,new low washington post medium critic blow tuc...
10,gangster capitalist trump is running a mafia s...,gangster capitalist trump running mafia state ...
11,the most progressive president since fdr biden...,progressive president since fdr bidens policy ...
18,the goal is to send a message of peace the yea...,goal send message peace yearold claimed insist...
20,you know theres over million people with preex...,know there million people preexisting conditio...
21,people were arrested for offences including as...,people arrested offence including assaulting p...


In [20]:
data.shape

(2644, 9)

## 3. You will conduct supervised learning to be able to predict if a given text is biased. You might want to be able to do this on the sentence by sentence level.

In [21]:
data.columns

Index(['news_link', 'outlet', 'topic', 'type', 'label_bias', 'label_opinion',
       'biased_words', 'clean_text', 'lemmatize_text'],
      dtype='object')

In [22]:
data['label_bias'].value_counts()

label_bias
non-biased      1344
biased          1299
no agreement       1
Name: count, dtype: int64

In [23]:
def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

data['biased_score'] = data['clean_text'].apply(get_sentiment)
data['biased_label'] = data['biased_score'].apply(lambda x: 'biased' if x > 0 else 'unbiased')

### TF-IDF & Logistic Regression:

In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    data['clean_text'], data['biased_label'], test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [27]:

model = LogisticRegression()
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.722117202268431
[[172  97]
 [ 50 210]]
              precision    recall  f1-score   support

      biased       0.77      0.64      0.70       269
    unbiased       0.68      0.81      0.74       260

    accuracy                           0.72       529
   macro avg       0.73      0.72      0.72       529
weighted avg       0.73      0.72      0.72       529



In [29]:
from lightgbm import LGBMClassifier

In [30]:

param_grid = {
    'num_leaves': [15, 31, 63],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100],
    'max_depth': [-1, 10, 20],
}

grid = GridSearchCV(LGBMClassifier(class_weight='balanced'), param_grid, cv=3)
grid.fit(X_train_vec, y_train)
best_model = grid.best_estimator_


[LightGBM] [Info] Number of positive: 749, number of negative: 661
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004877 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6708
[LightGBM] [Info] Number of data points in the train set: 1410, number of used features: 258
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 749, number of negative: 661
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003432 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6809
[LightGBM] [Info] Number of data points in the train set: 1410, number of used features: 267
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number

In [31]:
y_pred = best_model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.7315689981096408
[[164 105]
 [ 37 223]]
              precision    recall  f1-score   support

      biased       0.82      0.61      0.70       269
    unbiased       0.68      0.86      0.76       260

    accuracy                           0.73       529
   macro avg       0.75      0.73      0.73       529
weighted avg       0.75      0.73      0.73       529



## 4. You need to have a prediction function that can take in a new wikipedia article and predict how biased it is. You can do this by predicting if each sentence in an article is biased, then perhaps scaling the results by the length of the article to get somewhat of a“bias score”

In [None]:
# def get_sentiment(text):
#    return TextBlob(text).sentiment.polarity

#data['biased_score'] = data['clean_text'].apply(get_sentiment)
#data['biased_label'] = data['biased_score'].apply(lambda x: 'biased' if x > 0 else 'unbiased')