# Assessing Wikipedia Bias

## 1. You will need to collect data from a source of your choosing (dataset, wikipedia API, web-scraping)

## Data Overview

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import random

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


from wordcloud import WordCloud

import xgboost as xgb
from textblob import TextBlob

from tqdm import tqdm


In [2]:
# Read the two source files: the semicolon-delimited bias-label table and
# the comma-delimited news-articles table.
data = pd.read_csv('final_labels.csv', sep=';')
data_1 = pd.read_csv('news_articles.csv')

# Preview the first five rows of each frame, in load order.
display(data.head(), data_1.head())

Unnamed: 0,text,news_link,outlet,topic,type,group_id,num_sent,label_bias,label_opinion,article,biased_words
0,YouTube is making clear there will be no “birt...,https://eu.usatoday.com/story/tech/2020/02/03/...,usa-today,elections-2020,center,1,1,Biased,Somewhat factual but also opinionated,YouTube says no ‘deepfakes’ or ‘birther’ video...,"['belated', 'birtherism']"
1,So while there may be a humanitarian crisis dr...,https://www.alternet.org/2019/01/here-are-5-of...,alternet,immigration,left,1,1,Biased,Expresses writer’s opinion,Speaking to the country for the first time fro...,['crisis']
2,"Looking around the United States, there is nev...",https://thefederalist.com/2020/03/11/woman-who...,federalist,abortion,right,1,1,Biased,Somewhat factual but also opinionated,The left has a thing for taking babies hostage...,"['killing', 'never', 'developing', 'humans', '..."
3,The Republican president assumed he was helpin...,http://www.msnbc.com/rachel-maddow-show/auto-i...,msnbc,environment,left,1,1,Biased,Expresses writer’s opinion,"In Barack Obama’s first term, the administrati...","['rejects', 'happy', 'assumed']"
4,The explosion of the Hispanic population has l...,https://www.breitbart.com/politics/2015/02/26/...,breitbart,student-debt,right,1,1,Biased,No agreement,"Republicans should stop fighting amnesty, Pres...",['explosion']


Unnamed: 0,author,published,title,text,language,site_url,main_img_url,type,label,title_without_stopwords,text_without_stopwords,hasImage
0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,muslims busted they stole millions in govt ben...,print they should pay all the back all the mon...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,muslims busted stole millions govt benefits,print pay back money plus interest entire fami...,1.0
1,reasoning with facts,2016-10-29T08:47:11.259+03:00,re why did attorney general loretta lynch plea...,why did attorney general loretta lynch plead t...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,attorney general loretta lynch plead fifth,attorney general loretta lynch plead fifth bar...,1.0
2,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,breaking weiner cooperating with fbi on hillar...,red state \nfox news sunday reported this mor...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,breaking weiner cooperating fbi hillary email ...,red state fox news sunday reported morning ant...,1.0
3,Fed Up,2016-11-01T05:22:00.000+02:00,pin drop speech by father of daughter kidnappe...,email kayla mueller was a prisoner and torture...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,pin drop speech father daughter kidnapped kill...,email kayla mueller prisoner tortured isis cha...,1.0
4,Fed Up,2016-11-01T21:56:00.000+02:00,fantastic trumps point plan to reform healthc...,email healthcare reform to make america great ...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,fantastic trumps point plan reform healthcare ...,email healthcare reform make america great sin...,1.0


### Data preprocessing

In [3]:
# List the column names of each dataset, bias-label frame first.
# `column_names` ends up holding the columns of `data_1`, as before.
for frame in (data, data_1):
    column_names = frame.columns.tolist()
    display(column_names)

['text',
 'news_link',
 'outlet',
 'topic',
 'type',
 'group_id',
 'num_sent',
 'label_bias',
 'label_opinion',
 'article',
 'biased_words']

['author',
 'published',
 'title',
 'text',
 'language',
 'site_url',
 'main_img_url',
 'type',
 'label',
 'title_without_stopwords',
 'text_without_stopwords',
 'hasImage']

In [4]:
# Report the dimensions of each dataset, bias-label frame first.
# `n_rows` / `n_cols` end up describing `data_1`, as before.
for frame in (data, data_1):
    n_rows, n_cols = frame.shape
    print(f"The DataFrame has {n_rows} rows and {n_cols} columns")

The DataFrame has 1700 rows and 11 columns
The DataFrame has 2096 rows and 12 columns


In [5]:
# Summarize the bias-label frame: index range, per-column non-null counts,
# dtypes, and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1700 entries, 0 to 1699
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text           1700 non-null   object
 1   news_link      1681 non-null   object
 2   outlet         1700 non-null   object
 3   topic          1700 non-null   object
 4   type           1700 non-null   object
 5   group_id       1700 non-null   int64 
 6   num_sent       1700 non-null   int64 
 7   label_bias     1700 non-null   object
 8   label_opinion  1700 non-null   object
 9   article        1595 non-null   object
 10  biased_words   1700 non-null   object
dtypes: int64(2), object(9)
memory usage: 146.2+ KB


In [6]:
# Descriptive statistics for the numeric columns of the bias-label frame
# (count, mean, std, min, quartiles, max).
data.describe()

Unnamed: 0,group_id,num_sent
count,1700.0,1700.0
mean,43.0,1.124706
std,24.542908,0.414256
min,1.0,1.0
25%,22.0,1.0
50%,43.0,1.0
75%,64.0,1.0
max,85.0,5.0


## 2. You will conduct EDA that you see fit to appropriately investigate text of wikipedia articles you look to predict on for biased terms, sentiment, or other linguistic significance.

## Exploratory Data Analysis

### Duplicates

In [7]:
# Count rows that are exact duplicates of another row in every column.
duplicate_mask = data.duplicated()
duplicates = data[duplicate_mask]
display(f"Number of duplicated data: {duplicates.shape[0]}")

'Number of duplicated data: 0'

### Missing Values

In [8]:
# Per-column missing-value counts, followed by the same counts expressed as
# a fraction of all rows (0-1 scale, not multiplied by 100).
missing_counts = data.isna().sum()
display(missing_counts)
display(missing_counts / len(data))

text               0
news_link         19
outlet             0
topic              0
type               0
group_id           0
num_sent           0
label_bias         0
label_opinion      0
article          105
biased_words       0
dtype: int64

text             0.000000
news_link        0.011176
outlet           0.000000
topic            0.000000
type             0.000000
group_id         0.000000
num_sent         0.000000
label_bias       0.000000
label_opinion    0.000000
article          0.061765
biased_words     0.000000
dtype: float64

In [9]:
# Remove every row that lacks a source link or an article body; a row is
# dropped when either of the two columns is missing.
data.dropna(subset=['news_link', 'article'], inplace=True)

In [10]:
# Re-check missing values per column as a fraction of the remaining rows
# (0-1 scale); every column should now be 0.0.
display(data.isna().sum()/len(data))

text             0.0
news_link        0.0
outlet           0.0
topic            0.0
type             0.0
group_id         0.0
num_sent         0.0
label_bias       0.0
label_opinion    0.0
article          0.0
biased_words     0.0
dtype: float64

In [11]:
# Cleaning the text data in the 'text' column
# Define a function to clean the text data 
def clear_text(text):
    """Normalise a string to lowercase ASCII letters separated by single spaces.

    The text is lower-cased, anything starting with ``http`` up to the next
    whitespace is removed, every character that is not ``a``-``z`` or
    whitespace is deleted (not replaced by a space, so ``long-term`` becomes
    ``longterm``), and runs of whitespace are collapsed.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned text with no leading or trailing whitespace.
    """
    lowered = text.lower()
    without_urls = re.sub(r"http\S+", "", lowered)
    letters_only = re.sub(r"[^a-z\s]", "", without_urls)
    # split() with no argument discards empty chunks, which collapses
    # repeated whitespace and trims both ends.
    return " ".join(letters_only.split())

In [12]:
# Lower-case the bias labels so the target classes use one consistent casing.
data['label_bias'] = data['label_bias'].str.lower()

In [13]:
# Clean the raw 'text' column into 'clean_text', then drop the raw column.
# The guard keeps this cell re-runnable: once 'text' has been dropped, a second
# execution would otherwise fail with KeyError on data['text'].
if 'text' in data.columns:
    data['clean_text'] = data['text'].astype(str).apply(clear_text)
    data = data.drop(columns=['text'])

# Display 5 randomly sampled rows after cleaning (unseeded, so the rows shown
# differ from run to run).
display(data.sample(5))


Unnamed: 0,news_link,outlet,topic,type,group_id,num_sent,label_bias,label_opinion,article,biased_words,clean_text
772,https://thefederalist.com/2019/11/08/nationali...,federalist,white-nationalism,right,67,1,biased,Expresses writer’s opinion,"First Things editor R.R. Reno's book, 'Return ...","['intolerant', 'authoritarianism', 'haunting']",a specter is haunting the west our elites see ...
1092,https://www.foxnews.com/politics/trump-pokes-f...,fox-news,environment,right,38,1,no agreement,Somewhat factual but also opinionated,President Trump poked fun at Sen. Amy Klobucha...,"['poked', 'fun']",president trump poked fun at sen amy klobuchar...
41,https://www.reuters.com/article/us-usa-electio...,reuters,elections-2020,center,4,1,biased,Expresses writer’s opinion,"WILMINGTON, Del. (Reuters) - Democratic presid...",['contrast'],bidens appearance was a contrast with the appr...
1280,https://www.foxnews.com/politics/democrats-rej...,fox-news,immigration,right,14,1,non-biased,Entirely factual,Democrats this week approved legislation to re...,[],democrats this week approved legislation to re...
1154,https://www.alternet.org/2020/05/why-the-calls...,alternet,coronavirus,left,72,1,no agreement,Entirely factual,When the coronavirus pandemic was first declar...,"['racialized', 'epicenter']",in new york city the national epicenter of the...


In [14]:
# Print how many entries of the cleaned text column are missing (expected: 0).
print(data['clean_text'].isna().sum())

0


In [15]:
# Summarize the frame again after dropping rows and replacing 'text' with
# 'clean_text'.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1576 entries, 0 to 1699
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   news_link      1576 non-null   object
 1   outlet         1576 non-null   object
 2   topic          1576 non-null   object
 3   type           1576 non-null   object
 4   group_id       1576 non-null   int64 
 5   num_sent       1576 non-null   int64 
 6   label_bias     1576 non-null   object
 7   label_opinion  1576 non-null   object
 8   article        1576 non-null   object
 9   biased_words   1576 non-null   object
 10  clean_text     1576 non-null   object
dtypes: int64(2), object(9)
memory usage: 147.8+ KB


In [16]:
# Set of NLTK English stop words; read by lemmatize() to filter tokens.
stop_words =  set(stopwords.words('english'))

In [17]:
# Initialize a single module-level WordNet lemmatizer; lemmatize() below uses it.
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    """Tokenize, drop stop words, and lemmatize a piece of text.

    The text is lower-cased and split with ``word_tokenize``; tokens found in
    the module-level ``stop_words`` set are skipped and the remaining ones are
    passed through the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    kept_lemmas = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(kept_lemmas)

In [18]:
# Apply the lemmatize function to the 'clean_text' column and store the
# result in a new 'lemmatize_text' column.
data['lemmatize_text'] = data['clean_text'].apply(lemmatize)

In [19]:
# Display the first 20 rows of the cleaned and lemmatized text side by side.
display(data[['clean_text', 'lemmatize_text']].head(20))

Unnamed: 0,clean_text,lemmatize_text
0,youtube is making clear there will be no birth...,youtube making clear birtherism platform year ...
1,so while there may be a humanitarian crisis dr...,may humanitarian crisis driving vulnerable peo...
2,looking around the united states there is neve...,looking around united state never enough welfa...
3,the republican president assumed he was helpin...,republican president assumed helping industry ...
4,the explosion of the hispanic population has l...,explosion hispanic population longterm job pro...
5,the antivaccine movement made headlines last s...,antivaccine movement made headline last spring...
6,voting in quasimilitarized settings was not co...,voting quasimilitarized setting confined natio...
7,but one glaring absentee was trump who not onl...,one glaring absentee trump declined invitation...
9,track and field athletes dont typically earn t...,track field athlete dont typically earn lucrat...
10,in other words the agency responsible for prot...,word agency responsible protecting consumer wa...


In [20]:
# (rows, columns) of the frame after preprocessing.
data.shape

(1576, 12)

## 3. You will conduct supervised learning to be able to predict if a given text is biased. You might want to be able to do this on the sentence by sentence level.

In [21]:
# Columns available for modelling.
data.columns

Index(['news_link', 'outlet', 'topic', 'type', 'group_id', 'num_sent',
       'label_bias', 'label_opinion', 'article', 'biased_words', 'clean_text',
       'lemmatize_text'],
      dtype='object')

In [23]:
# Number of rows per target class in 'label_bias'.
data['label_bias'].value_counts()

label_bias
biased          975
non-biased      465
no agreement    136
Name: count, dtype: int64

In [24]:
# def get_sentiment(text):
#    return TextBlob(text).sentiment.polarity

#data['biased_score'] = data['clean_text'].apply(get_sentiment)
#data['biased_label'] = data['biased_score'].apply(lambda x: 'biased' if x > 0 else 'unbiased')

In [27]:
# Hold out 20% of the rows for evaluation, with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (975 biased / 465 non-biased / 136 no agreement); consider passing
# stratify=data['label_bias'] and re-running the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    data['clean_text'], data['label_bias'], test_size=0.2, random_state=42
)

# Fit the TF-IDF vocabulary on the training split only and reuse it for the
# test split, so no test-set statistics leak into the features.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Baseline multi-class classifier with default hyper-parameters.
model = LogisticRegression()
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
# Rows are true labels, columns are predicted labels, both in sorted label order.
print(confusion_matrix(y_test, y_pred))
# A class that is never predicted has an undefined precision. zero_division=0
# reports it as 0.0 (the same value the default prints) without emitting
# an UndefinedMetricWarning for every affected average.
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.6708860759493671
[[193   0  11]
 [ 31   0   2]
 [ 60   0  19]]
              precision    recall  f1-score   support

      biased       0.68      0.95      0.79       204
no agreement       0.00      0.00      0.00        33
  non-biased       0.59      0.24      0.34        79

    accuracy                           0.67       316
   macro avg       0.42      0.40      0.38       316
weighted avg       0.59      0.67      0.60       316



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 4. You need to have a prediction function that can take in a new wikipedia article and predict how biased it is. You can do this by predicting if each sentence in an article is biased, then perhaps scaling the results by the length of the article to get somewhat of a “bias score”.