# Assessing Wikipedia Bias

## 1. You will need to collect data from a source of your choosing (dataset, wikipedia API, web-scraping)

## Data Overview

In [31]:
# Import necessary libraries
import pandas as pd
import re
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats as st

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")


In [32]:
# Load the dataset from the Excel workbook 'data.xlsx' (path is relative to the working directory)
data = pd.read_excel('data.xlsx')
# Preview the first rows to sanity-check the load
display(data.head())

Unnamed: 0.1,Unnamed: 0,sentence,outlet,topic,type,article,biased_words2,text,text_low,pos,...,ne_NORP_context,ne_ORDINAL_context,ne_ORG_context,ne_PERCENT_context,ne_PERSON_context,ne_PRODUCT_context,ne_QUANTITY_context,ne_TIME_context,ne_WORK_OF_ART_context,ne_LANGUAGE_context
0,0,"""Orange Is the New Black"" star Yael Stone is r...",Fox News,environment,right,"""Orange Is the New Black"" star Yael Stone is r...",[],Orange,orange,PROPN,...,0,0,0,0,0,0,0,0,1,0
1,3,"""Orange Is the New Black"" star Yael Stone is r...",Fox News,environment,right,"""Orange Is the New Black"" star Yael Stone is r...",[],New,new,PROPN,...,0,0,0,0,0,0,0,0,1,0
2,4,"""Orange Is the New Black"" star Yael Stone is r...",Fox News,environment,right,"""Orange Is the New Black"" star Yael Stone is r...",[],Black,black,PROPN,...,0,0,0,0,1,0,0,0,1,0
3,5,"""Orange Is the New Black"" star Yael Stone is r...",Fox News,environment,right,"""Orange Is the New Black"" star Yael Stone is r...",[],star,star,NOUN,...,0,0,0,0,1,0,0,0,1,0
4,6,"""Orange Is the New Black"" star Yael Stone is r...",Fox News,environment,right,"""Orange Is the New Black"" star Yael Stone is r...",[],Yael,yael,PROPN,...,0,0,0,0,1,0,0,0,1,0


### Data preprocessing

In [33]:
# Collect the column labels into a plain Python list and show them
column_names = data.columns.tolist()
display(column_names)

['Unnamed: 0',
 'sentence',
 'outlet',
 'topic',
 'type',
 'article',
 'biased_words2',
 'text',
 'text_low',
 'pos',
 'lemma',
 'lemma_low',
 'tag',
 'dep',
 'is_stop',
 'glove_vec300_norm',
 'order',
 'tfidf_art',
 'label3',
 'label4',
 'label5',
 'is_ne',
 'ne_label',
 'negative_conc',
 'positive_conc',
 'weak_subj',
 'strong_subj',
 'MRCP_concretness_ratings',
 'MRCP_Imagability_ratings',
 'hyperbolic_terms',
 'attitude_markers',
 'kill_verbs',
 'bias_lexicon',
 'assertive_verbs',
 'factive_verbs',
 'report_verbs',
 'implicative_verbs',
 'hedges',
 'boosters',
 'affect ',
 'posemo ',
 'negemo ',
 'anx ',
 'anger ',
 'sad ',
 'social ',
 'family ',
 'friend ',
 'female ',
 'male ',
 'cogproc ',
 'insight ',
 'cause ',
 'discrep ',
 'tentat ',
 'certain ',
 'differ ',
 'percept ',
 'see ',
 'hear ',
 'feel ',
 'bio ',
 'body ',
 'health ',
 'sexual ',
 'ingest ',
 'drives ',
 'affiliation ',
 'achieve ',
 'power ',
 'reward ',
 'risk ',
 'focuspast ',
 'focuspresent ',
 'focusfuture 

In [34]:
# Report the dataset dimensions: data.shape is (number of rows, number of columns)
n_rows, n_cols = data.shape
print(f"The DataFrame has {n_rows} rows and {n_cols} columns")

The DataFrame has 65822 rows and 301 columns


In [35]:
# Print a concise summary of the frame: index range, column count, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65822 entries, 0 to 65821
Columns: 301 entries, Unnamed: 0 to ne_LANGUAGE_context
dtypes: bool(2), float64(4), int64(281), object(14)
memory usage: 150.3+ MB


In [36]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

Unnamed: 0.1,Unnamed: 0,glove_vec300_norm,order,tfidf_art,label3,label4,label5,negative_conc,positive_conc,weak_subj,...,ne_NORP_context,ne_ORDINAL_context,ne_ORG_context,ne_PERCENT_context,ne_PERSON_context,ne_PRODUCT_context,ne_QUANTITY_context,ne_TIME_context,ne_WORK_OF_ART_context,ne_LANGUAGE_context
count,65822.0,65822.0,65822.0,64632.0,65822.0,65822.0,65822.0,65822.0,65822.0,65822.0,...,65822.0,65822.0,65822.0,65822.0,65822.0,65822.0,65822.0,65822.0,65822.0,65822.0
mean,54925.591641,7.616181,16.46603,0.209871,0.051898,0.051898,0.051898,0.126447,0.117362,0.1594,...,0.07139,0.005469,0.130078,0.00316,0.132889,0.001899,0.000471,0.008325,0.005348,9.1e-05
std,31794.88123,0.864257,11.775391,0.068478,0.221822,0.221822,0.221822,0.332355,0.321854,0.366051,...,0.257476,0.073753,0.336392,0.056126,0.339457,0.043537,0.021697,0.090864,0.072933,0.009547
min,0.0,5.004817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27253.25,6.995967,7.0,0.163616,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,54976.5,7.550039,15.0,0.202041,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,82310.5,8.141136,24.0,0.245051,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,110350.0,11.844529,89.0,0.699726,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## 2. You will conduct EDA that you see fit to appropriately investigate text of wikipedia articles you look to predict on for biased terms, sentiment, or other linguistic significance.

## Exploratory Data Analysis

### Duplicates

In [37]:
# Select rows that are exact duplicates of an earlier row (all columns equal)
duplicates = data[data.duplicated()]
# Use print() for the message: display() on a plain str renders its repr,
# which leaves stray quotes around the text in the cell output.
print(f"Number of duplicated data: {duplicates.shape[0]}")

'Number of duplicated data: 0'

### Missing Values

In [38]:
# Number of missing values per column
display(data.isna().sum())

# Missing values per column as a fraction of all rows (0-1 scale, not a percentage)
display(data.isna().sum()/len(data)) 

Unnamed: 0                    0
sentence                      0
outlet                        0
topic                         0
type                      16206
                          ...  
ne_PRODUCT_context            0
ne_QUANTITY_context           0
ne_TIME_context               0
ne_WORK_OF_ART_context        0
ne_LANGUAGE_context           0
Length: 301, dtype: int64

Unnamed: 0                0.000000
sentence                  0.000000
outlet                    0.000000
topic                     0.000000
type                      0.246209
                            ...   
ne_PRODUCT_context        0.000000
ne_QUANTITY_context       0.000000
ne_TIME_context           0.000000
ne_WORK_OF_ART_context    0.000000
ne_LANGUAGE_context       0.000000
Length: 301, dtype: float64

In [39]:
# Drop rows with missing values in the 'type' column.
# Rebind the result instead of using inplace=True so the step reads as an
# explicit transformation; the original index labels are kept (no reset).
data = data.dropna(subset=['type'])

In [40]:
# Re-check missing values per column as a fraction of all rows (0-1 scale, not a percentage)
display(data.isna().sum()/len(data)) 

Unnamed: 0                0.0
sentence                  0.0
outlet                    0.0
topic                     0.0
type                      0.0
                         ... 
ne_PRODUCT_context        0.0
ne_QUANTITY_context       0.0
ne_TIME_context           0.0
ne_WORK_OF_ART_context    0.0
ne_LANGUAGE_context       0.0
Length: 301, dtype: float64

In [41]:
# Text normalisation helper used to build the cleaned sentence column
def clear_text(text):
    """Return a lower-cased, letters-only, single-spaced version of ``text``.

    Steps, in order:
      1. lower-case the input;
      2. remove every run of non-whitespace characters that starts with "http";
      3. delete every character that is not a-z or whitespace (characters are
         removed, not replaced by a space, so "left-wing" becomes "leftwing");
      4. collapse all whitespace runs to single spaces and trim the ends.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The normalised text; an empty string if nothing survives.
    """
    lowered = text.lower()
    without_urls = re.sub(r"http\S+", "", lowered)
    letters_only = re.sub(r"[^a-z\s]", "", without_urls)
    return " ".join(letters_only.split())

In [42]:
# Build 'clean_text' by applying clear_text to the raw 'sentence' column,
# then drop the raw column.
# The guard keeps the cell re-runnable: once 'sentence' has been dropped,
# running these lines again would otherwise raise a KeyError.
if 'sentence' in data.columns:
    data['clean_text'] = data['sentence'].astype(str).apply(clear_text)
    data = data.drop(columns=['sentence'])

# Show 5 random rows after cleaning (fixed seed so the preview is reproducible)
display(data.sample(5, random_state=42))


Unnamed: 0.1,Unnamed: 0,outlet,topic,type,article,biased_words2,text,text_low,pos,lemma,...,ne_ORDINAL_context,ne_ORG_context,ne_PERCENT_context,ne_PERSON_context,ne_PRODUCT_context,ne_QUANTITY_context,ne_TIME_context,ne_WORK_OF_ART_context,ne_LANGUAGE_context,clean_text
50863,84833,USA Today,trump-presidency,center,The president left the composition of the task...,[],composition,composition,NOUN,composition,...,0,0,0,0,0,0,0,0,0,the president left the composition of the task...
52675,87921,Alternet,immigration,left,The Trump administration gave the Border Patro...,[],asylum,asylum,NOUN,asylum,...,0,0,0,0,0,0,0,0,0,the trump administration gave the border patro...
37890,63193,Alternet,universal health care,left,"President Trump, who repeatedly has lied to th...","['claiming', 'lied']",working,working,VERB,work,...,0,0,0,0,0,0,0,0,0,president trump who repeatedly has lied to the...
25002,41703,Reuters,black lives matter,center,"In recent weeks, Saks’ problems have been comp...",[],included,included,VERB,include,...,0,0,0,0,0,0,0,0,0,in recent weeks saks problems have been compou...
53951,89975,MSNBC,abortion,left,The votes come as a new conservative majority ...,['nervous'],come,come,VERB,come,...,0,0,0,0,0,0,0,0,0,the votes come as a new conservative majority ...


In [43]:
# Number of missing values in the 'clean_text' column
print(data['clean_text'].isna().sum())  

0


In [44]:
# Concise summary of the current frame: index, column count, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49616 entries, 0 to 65821
Columns: 301 entries, Unnamed: 0 to clean_text
dtypes: bool(2), float64(4), int64(281), object(14)
memory usage: 113.7+ MB


In [45]:
# English stop words from NLTK, stored as a set for membership tests in lemmatize()
stop_words =  set(stopwords.words('english')) 

In [46]:
# Single WordNet lemmatizer instance shared by every call to lemmatize()
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    """Tokenize ``text``, drop stop words, and lemmatize what remains.

    The input is lower-cased and split with ``word_tokenize``; tokens found in
    the module-level ``stop_words`` set are discarded, and each remaining token
    is passed to ``lemmatizer.lemmatize`` without a part-of-speech argument.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        The lemmas joined by single spaces.
    """
    # NOTE(review): no POS tag is supplied; the sample output shows "us"
    # coming back as "u", presumably from noun-style lemmatization - confirm.
    kept_tokens = (tok for tok in word_tokenize(text.lower()) if tok not in stop_words)
    return " ".join(lemmatizer.lemmatize(tok) for tok in kept_tokens)

In [47]:
# Apply the lemmatize function to the 'clean_text' column
data['lemmatize_text'] = data['clean_text'].apply(lemmatize) 

In [48]:
# Display 5 random rows comparing the cleaned text with its lemmatized form
display(data[['clean_text', 'lemmatize_text']].sample(5))

Unnamed: 0,clean_text,lemmatize_text
40603,schlapps apology comes as the us is convulsed ...,schlapps apology come u convulsed protest poli...
2548,a us official speaking on condition of anonymi...,u official speaking condition anonymity confir...
34458,once powerful hollywood producer harvey weinst...,powerful hollywood producer harvey weinstein c...
48880,the leftwing mob had gathered in parliament sq...,leftwing mob gathered parliament square london...
36679,president donald trump has characterized those...,president donald trump characterized clashing ...


In [49]:
# Current dimensions of the frame as (rows, columns)
data.shape

(49616, 302)

## 3. You will conduct supervised learning to be able to predict if a given text is biased. You might want to be able to do this on the sentence by sentence level.

## 4. You need to have a prediction function that can take in a new Wikipedia article and predict how biased it is. You can do this by predicting if each sentence in an article is biased, then perhaps scaling the results by the length of the article to get somewhat of a “bias score”