# Assessing Wikipedia Bias

## 1. You will need to collect data from a source of your choosing (dataset, wikipedia API, web-scraping)

## Introduction

The project 

## Data Overview

In [2]:
# Import necessary libraries
import pandas as pd
import re
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats as st

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")


In [3]:
# Load the datasets
data = pd.read_csv('news_articles.csv')
# Display the first few rows of the dataset
display(data.head())

Unnamed: 0,author,published,title,text,language,site_url,main_img_url,type,label,title_without_stopwords,text_without_stopwords,hasImage
0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,muslims busted they stole millions in govt ben...,print they should pay all the back all the mon...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,muslims busted stole millions govt benefits,print pay back money plus interest entire fami...,1.0
1,reasoning with facts,2016-10-29T08:47:11.259+03:00,re why did attorney general loretta lynch plea...,why did attorney general loretta lynch plead t...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,attorney general loretta lynch plead fifth,attorney general loretta lynch plead fifth bar...,1.0
2,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,breaking weiner cooperating with fbi on hillar...,red state \nfox news sunday reported this mor...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,breaking weiner cooperating fbi hillary email ...,red state fox news sunday reported morning ant...,1.0
3,Fed Up,2016-11-01T05:22:00.000+02:00,pin drop speech by father of daughter kidnappe...,email kayla mueller was a prisoner and torture...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,pin drop speech father daughter kidnapped kill...,email kayla mueller prisoner tortured isis cha...,1.0
4,Fed Up,2016-11-01T21:56:00.000+02:00,fantastic trumps point plan to reform healthc...,email healthcare reform to make america great ...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,fantastic trumps point plan reform healthcare ...,email healthcare reform make america great sin...,1.0


### Data preprocessing

In [4]:
# Display the column names of the dataset
column_names = data.columns.tolist()
display(column_names)

['author',
 'published',
 'title',
 'text',
 'language',
 'site_url',
 'main_img_url',
 'type',
 'label',
 'title_without_stopwords',
 'text_without_stopwords',
 'hasImage']

In [5]:

import re

data.columns = [re.sub(r'(?<!^)(?=[A-Z])', '_', col).lower() for col in data.columns]
print(data.columns)


Index(['author', 'published', 'title', 'text', 'language', 'site_url',
       'main_img_url', 'type', 'label', 'title_without_stopwords',
       'text_without_stopwords', 'has_image'],
      dtype='object')


In [6]:
# Display the column names of the dataset
column_names = data.columns.tolist()
display(column_names)

['author',
 'published',
 'title',
 'text',
 'language',
 'site_url',
 'main_img_url',
 'type',
 'label',
 'title_without_stopwords',
 'text_without_stopwords',
 'has_image']

In [7]:
# Display the shape of the dataset
n_rows, n_cols = data.shape
print(f"The DataFrame has {n_rows} rows and {n_cols} columns")

The DataFrame has 2096 rows and 12 columns


In [8]:
# Display the informative summary of the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2096 entries, 0 to 2095
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   author                   2096 non-null   object 
 1   published                2096 non-null   object 
 2   title                    2096 non-null   object 
 3   text                     2050 non-null   object 
 4   language                 2095 non-null   object 
 5   site_url                 2095 non-null   object 
 6   main_img_url             2095 non-null   object 
 7   type                     2095 non-null   object 
 8   label                    2095 non-null   object 
 9   title_without_stopwords  2094 non-null   object 
 10  text_without_stopwords   2046 non-null   object 
 11  has_image                2095 non-null   float64
dtypes: float64(1), object(11)
memory usage: 196.6+ KB


In [9]:
# Display the descriptive statistics of the dataset
data.describe()

Unnamed: 0,has_image
count,2095.0
mean,0.777088
std,0.416299
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


## 2. You will conduct EDA that you see fit to appropriately investigate text of wikipedia articles you look to predict on for biased terms, sentiment, or other linguistic significance.

## Explorating Data Analysis

### Duplicates

In [10]:
# Display the number of duplicates in the dataset
duplicates = data[data.duplicated()]
display(f"Number of duplicated data: {duplicates.shape[0]}")

'Number of duplicated data: 10'

### Missing Values

In [11]:
# Display the number of missing values in the dataset
display(data.isna().sum())

# Check for missing values in the DataFrame as a percentage
display(data.isna().sum()/len(data)) 

author                      0
published                   0
title                       0
text                       46
language                    1
site_url                    1
main_img_url                1
type                        1
label                       1
title_without_stopwords     2
text_without_stopwords     50
has_image                   1
dtype: int64

author                     0.000000
published                  0.000000
title                      0.000000
text                       0.021947
language                   0.000477
site_url                   0.000477
main_img_url               0.000477
type                       0.000477
label                      0.000477
title_without_stopwords    0.000954
text_without_stopwords     0.023855
has_image                  0.000477
dtype: float64

In [12]:
# Drop rows with missing values in the 'type' column
data.dropna(subset=['type'], inplace=True)

In [13]:
# Check for missing values in the DataFrame as a percentage
display(data.isna().sum()/len(data)) 

author                     0.000000
published                  0.000000
title                      0.000000
text                       0.021480
language                   0.000000
site_url                   0.000000
main_img_url               0.000000
type                       0.000000
label                      0.000000
title_without_stopwords    0.000477
text_without_stopwords     0.023389
has_image                  0.000000
dtype: float64

In [14]:
# Cleaning the text data in the 'text' column
# Define a function to clean the text data 
def clear_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-z\s]","", text)
    text = text.split()
    return " ".join(text)

In [15]:
# Apply the clear_text function to the 'comment_text' column
data['clean_text'] = data['sentence'].astype(str).apply(clear_text) 
data= data.drop(columns=['sentence'])

# Display the first 5 rows of the comments DataFrame after cleaning
display(data.sample(5)) 


KeyError: 'sentence'

In [None]:
# Check for missing values
print(data['clean_text'].isna().sum())  

0


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49616 entries, 0 to 65821
Columns: 301 entries, Unnamed: 0 to clean_text
dtypes: bool(2), float64(4), int64(281), object(14)
memory usage: 113.7+ MB


In [None]:
## Set of English stop words
stop_words =  set(stopwords.words('english')) 

In [None]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer() 

def lemmatize(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words]
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(lemmas)

In [None]:
# Apply the clear_text function to the 'comment_text' column
data['lemmatize_text'] = data['clean_text'].apply(lemmatize) 

In [None]:
# Display the first 5 rows of the comments DataFrame after cleaning
display(data[['clean_text', 'lemmatize_text']].sample(5))

Unnamed: 0,clean_text,lemmatize_text
40603,schlapps apology comes as the us is convulsed ...,schlapps apology come u convulsed protest poli...
2548,a us official speaking on condition of anonymi...,u official speaking condition anonymity confir...
34458,once powerful hollywood producer harvey weinst...,powerful hollywood producer harvey weinstein c...
48880,the leftwing mob had gathered in parliament sq...,leftwing mob gathered parliament square london...
36679,president donald trump has characterized those...,president donald trump characterized clashing ...


In [None]:
data.shape

(49616, 302)

## 3. You will conduct supervised learning to be able to predict if a given text is biased. You might want to be able to do this on the sentence by sentence level.

## 4. You need to have a prediction function that can take in a new wikipedia article and predict how biased it is. You can do this by predicting if each sentence in an article is biased, then perhaps scaling the results by the length of the article to get somewhat of a“bias score”