# COGS 108 - EDA Checkpoint

# Names

- James Larsen
- Alejandro Servin
- Lily Steiner
- Mayra Trejo
- Lucy Lennemann

<a id='research_question'></a>
# Research Question

How has the sentiment of the language surrounding Deafness, as used by popular online news sources (ABC, New York Times, USA Today, The Guardian, Associated Press), changed since the 1980s?

# Setup

In [None]:
#import necessary packages, some will be used during analysis
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
import unicodedata
import nltk
from textblob import TextBlob, Word
from nltk.corpus import stopwords
nltk.download('stopwords')
from datetime import date

In [None]:
# Load the raw JSON export of every news source into a plain Python object.
# Each file is opened explicitly as UTF-8: without `encoding`, open() uses the
# platform's locale encoding, which can mis-decode (or fail on) non-ASCII
# article text on systems whose default encoding is not UTF-8.

# ABC dataset
with open('dataset/abc_data.json', encoding='utf-8') as abc_ds:
    abc_data = json.load(abc_ds)

# Associated Press dataset
with open('dataset/ap_data.json', encoding='utf-8') as ap_ds:
    ap_data = json.load(ap_ds)

# The Guardian dataset
with open('dataset/guard_data.json', encoding='utf-8') as guard_ds:
    guard_data = json.load(guard_ds)

# New York Times dataset
with open('dataset/nyt_data.json', encoding='utf-8') as nyt_ds:
    nyt_data = json.load(nyt_ds)

# USA Today dataset
with open('dataset/usa_data.json', encoding='utf-8') as usa_ds:
    usa_data = json.load(usa_ds)

In [None]:
# Build one pandas DataFrame per news source directly from its JSON file
abc_df, ap_df, guard_df, nyt_df, usa_df = map(
    pd.read_json,
    [
        'dataset/abc_data.json',
        'dataset/ap_data.json',
        'dataset/guard_data.json',
        'dataset/nyt_data.json',
        'dataset/usa_data.json',
    ],
)

In [None]:
# Keep notebook output compact: at most 6 rows and 5 columns per DataFrame
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 5)

# Truncate every displayed cell to 40 characters. While looking for text
# errors this was temporarily set to None (no truncation) and then reverted
# for the cleaning steps.
pd.set_option('display.max_colwidth', 40)

In [None]:
# Space for TextBlob code

# Data Cleaning

Describe your data cleaning steps here.

1. We are reordering the columns of all the dataframes so that they match.
2. We are converting the date strings into pd.datetime format
3. We are removing all articles before 1980-01-01
4. We are removing unicode artifacts from the text using unicodedata.normalize
5. We are removing any extraneous articles
6. We are removing any extraneous pieces of article text

### ABC Dataset

In [None]:
# Preview the raw ABC DataFrame before any cleaning is applied
abc_df

In [None]:
# Reorder the columns so that every source DataFrame shares the same layout.
# .copy() makes the result an independent DataFrame, so the column assignment
# below does not trigger pandas' SettingWithCopyWarning.
abc_df = abc_df[['headline','date','source','url','text']].copy()

# Parse 'date' into datetime values; unparseable entries become NaT
abc_df['date'] = pd.to_datetime(abc_df['date'], errors='coerce')

# Remove articles published before 1980-01-01. The strict '<' keeps articles
# dated exactly 1980-01-01, and the negated mask also keeps rows whose date is
# NaT (a comparison with NaT is False).
abc_df = abc_df[~(abc_df['date'] < '1980-01-01')].copy()

# Display only: show the frame without the 'source' column. The result is not
# assigned, so abc_df itself still contains 'source'.
abc_df.drop(columns=['source'])

In [None]:
# Count the missing values in each column of abc_df
abc_df.isnull().sum()

In [None]:
# List the distinct values of the 'headline' column for manual inspection
abc_df['headline'].unique()

In [None]:
# List the distinct values of the 'text' column for manual inspection
abc_df['text'].unique()

In [None]:
# Normalize the article text to Unicode NFKD form, which replaces
# compatibility characters (e.g. non-breaking spaces) with their plain
# equivalents. assign() returns a new DataFrame instead of writing into the
# date-filtered abc_df slice, which avoids pandas' SettingWithCopyWarning.
abc_df = abc_df.assign(
    text=abc_df['text'].map(lambda t: unicodedata.normalize('NFKD', t))
)

### Associated Press Dataset

In [None]:
# Preview the raw Associated Press DataFrame before any cleaning is applied
ap_df

In [None]:
# Reorder the columns so that every source DataFrame shares the same layout.
# .copy() makes the result an independent DataFrame, so the column assignment
# below does not trigger pandas' SettingWithCopyWarning.
ap_df = ap_df[['headline','date','source','url','text']].copy()

# Parse 'date' into datetime values
ap_df['date'] = pd.to_datetime(ap_df['date'])

# Remove articles published before 1980-01-01. The strict '<' keeps articles
# dated exactly 1980-01-01, and the negated mask also keeps rows whose date is
# NaT (a comparison with NaT is False).
ap_df = ap_df[~(ap_df['date'] < '1980-01-01')].copy()

# Display only: show the frame without the 'source' column. The result is not
# assigned, so ap_df itself still contains 'source'.
ap_df.drop(columns=['source'])


In [None]:
# Count the missing values in each column of ap_df
ap_df.isnull().sum()

In [None]:
# List the distinct values of the 'headline' column for manual inspection
ap_df['headline'].unique()

In [None]:
# List the distinct values of the 'text' column for manual inspection
ap_df['text'].unique()

In [None]:
# Remove the extraneous daily sports-score articles ("Monday's Scores", ...).
# na=False treats a missing headline as "not a scores article", so such rows
# are kept rather than being silently dropped by a NaN comparison.
# .copy() makes the filtered result an independent DataFrame so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
is_scores = ap_df['headline'].str.contains("Monday's Scores|Tuesday's Scores|Wednesday's Scores|Thursday's Scores|Friday's Scores|Saturday's Scores|Sunday's Scores", na=False)
ap_df = ap_df[~is_scores].copy()

# Normalize the article text to Unicode NFKD form, which replaces
# compatibility characters (e.g. non-breaking spaces) with their plain
# equivalents.
ap_df['text'] = ap_df['text'].map(lambda t: unicodedata.normalize('NFKD', t))

### The Guardian Dataset

In [None]:
# Preview the raw Guardian DataFrame before any cleaning is applied
guard_df

In [None]:
# Reorder the columns so that every source DataFrame shares the same layout.
# .copy() makes the result an independent DataFrame, so the column assignment
# below does not trigger pandas' SettingWithCopyWarning.
guard_df = guard_df[['headline','date','source','url','text']].copy()

# Parse 'date' into datetime values
guard_df['date'] = pd.to_datetime(guard_df['date'])

# Remove articles published before 1980-01-01. The strict '<' keeps articles
# dated exactly 1980-01-01, and the negated mask also keeps rows whose date is
# NaT (a comparison with NaT is False).
guard_df = guard_df[~(guard_df['date'] < '1980-01-01')].copy()

# Display only: show the frame without the 'source' column. The result is not
# assigned, so guard_df itself still contains 'source'.
guard_df.drop(columns=['source'])

In [None]:
# Count the missing values in each column of guard_df
guard_df.isnull().sum()

In [None]:
# List the distinct values of the 'headline' column for manual inspection
guard_df['headline'].unique()

In [None]:
# List the distinct values of the 'text' column for manual inspection
guard_df['text'].unique()

In [None]:
# Normalize the article text to Unicode NFKD form, which replaces
# compatibility characters (e.g. non-breaking spaces) with their plain
# equivalents. assign() returns a new DataFrame instead of writing into the
# date-filtered guard_df slice, which avoids pandas' SettingWithCopyWarning.
guard_df = guard_df.assign(
    text=guard_df['text'].map(lambda t: unicodedata.normalize('NFKD', t))
)

### New York Times Dataset

In [None]:
# Preview the raw New York Times DataFrame before any cleaning is applied
nyt_df

In [None]:
# Reorder the columns so that every source DataFrame shares the same layout.
# .copy() makes the result an independent DataFrame, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
nyt_df = nyt_df[['headline','date','source','url','text']].copy()

# Parse 'date' into datetime values
nyt_df['date'] = pd.to_datetime(nyt_df['date'])

# Remove articles published before 1980-01-01. The strict '<' keeps articles
# dated exactly 1980-01-01, and the negated mask also keeps rows whose date is
# NaT (a comparison with NaT is False).
nyt_df = nyt_df[~(nyt_df['date'] < '1980-01-01')].copy()

# Normalize the article text to Unicode NFKD form, as is done for the other
# four sources; without this step the NYT text is the only one left
# un-normalized. Non-string values (e.g. missing text) are passed through
# unchanged because the null check for this dataset only runs afterwards.
nyt_df['text'] = nyt_df['text'].map(
    lambda t: unicodedata.normalize('NFKD', t) if isinstance(t, str) else t
)

# Display only: show the frame without the 'source' column. The result is not
# assigned, so nyt_df itself still contains 'source'.
nyt_df.drop(columns=['source'])

# To search the article text for errors, evaluate nyt_df['text'] here instead

In [None]:
# Count the missing values in each column of nyt_df
nyt_df.isnull().sum()

In [None]:
# List the distinct values of the 'headline' column for manual inspection
nyt_df['headline'].unique()

In [None]:
# List the distinct values of the 'text' column for manual inspection
nyt_df['text'].unique()

### USA Today Dataset

In [None]:
# Preview the raw USA Today DataFrame (created earlier with pd.read_json)
# before any cleaning is applied
usa_df

In [None]:
# Reorder the columns so that every source DataFrame shares the same layout.
# .copy() makes the result an independent DataFrame, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning. This replaces the
# previous approach of switching the global `mode.chained_assignment` option
# off and back on around them.
usa_df = usa_df[['headline','date','source','url','text']].copy()

# Keep only the text that follows "Published" (up to an optional trailing
# "Updated ..." part). expand=False returns a Series rather than a one-column
# DataFrame.
usa_df['date'] = usa_df['date'].str.extract(r'Published:? (.*?)(?:Updated:?.*)?$', expand=False)
# Remove the literal "ET" marker before parsing
usa_df['date'] = usa_df['date'].str.replace('ET', '', regex=False)
usa_df['date'] = pd.to_datetime(usa_df['date'])

# Remove articles published before 1980-01-01. The strict '<' keeps articles
# dated exactly 1980-01-01, and the negated mask also keeps rows whose date is
# NaT (a comparison with NaT is False).
usa_df = usa_df[~(usa_df['date'] < '1980-01-01')].copy()

# NOTE: the former `usa_df.drop(columns=['source'])` statement was removed.
# Its result was neither assigned nor displayed (it was not the last
# expression of the cell), so usa_df keeps its 'source' column as before.

# Show the column data types
usa_df.dtypes

In [None]:
# Report the number of missing values in each column of usa_df
print(usa_df.isnull().sum())

# Drop every row that contains a missing value. Rebinding the result (instead
# of inplace=True) avoids mutating the date-filtered slice in place, which
# would trigger pandas' SettingWithCopyWarning.
usa_df = usa_df.dropna()

In [None]:
# List the distinct values of the 'headline' column for manual inspection
usa_df['headline'].unique()

In [None]:
# List the distinct values of the 'text' column for manual inspection
usa_df['text'].unique()

In [None]:
# Normalize the article text to Unicode NFKD form, which replaces
# compatibility characters (e.g. non-breaking spaces) with their plain
# equivalents. assign() returns a new DataFrame instead of writing into the
# filtered usa_df slice, which avoids pandas' SettingWithCopyWarning.
usa_df = usa_df.assign(
    text=usa_df['text'].map(lambda t: unicodedata.normalize('NFKD', t))
)

In [None]:
# Collect the five cleaned source DataFrames in one list so they can be
# processed together (they are concatenated in the next cell)
df_list = [abc_df, ap_df, guard_df, nyt_df, usa_df]

In [None]:
# Stack the five source DataFrames into one. ignore_index=True gives the
# combined frame a fresh 0..n-1 index; otherwise each source's original row
# labels are kept and the same label appears several times, which makes
# label-based selection and aligned column assignment ambiguous.
combined_df = pd.concat(df_list, ignore_index=True)

# Preview the first rows of the combined frame
combined_df.head()

# Data Analysis & Results (EDA)

Carry out EDA on your dataset(s); Describe in this section

### Sentiment Analysis 

In this next section, we are creating new dataframes which will have the analysis results in addition to defining a few helper functions for our analysis. 

In [None]:
# Create independent copies of the cleaned DataFrames to hold the sentiment
# results. Plain assignment would only bind a second name to the same object,
# so the sentiment columns would also be added to the cleaned frames.
abc_sent = abc_df.copy()
ap_sent = ap_df.copy()
guard_sent = guard_df.copy()
nyt_sent = nyt_df.copy()
usa_sent = usa_df.copy()

In [None]:
def get_sentiment(text):
    """Return the TextBlob sentiment of *text* as a (polarity, subjectivity) tuple."""
    scores = TextBlob(text).sentiment
    return scores.polarity, scores.subjectivity

In [None]:
def cleaned_blob(text):
    """Clean *text* and return it as a lowercase TextBlob for keyword analysis.

    The text is stripped of curly quotation marks; periods, commas, en dashes
    and hyphens become spaces. English stopwords and non-alphabetic tokens are
    removed, the remaining words are lemmatized, and the result is joined with
    single spaces, trimmed, and lowercased.

    Args:
        text: The raw article text.

    Returns:
        A TextBlob wrapping the cleaned, lowercased text.
    """
    # Curly quotes are deleted outright ...
    for mark in '‘’“”':
        text = text.replace(mark, '')
    # ... while sentence punctuation and dashes become word separators
    for mark in '.,–-':
        text = text.replace(mark, ' ')

    # Load the stopword list once per call instead of once per word, and keep
    # it in a set for constant-time lookups. The corpus file id is the
    # lowercase 'english'; 'English' only resolves on case-insensitive
    # file systems.
    stop_words = frozenset(stopwords.words('english'))

    lemmas = [
        Word(token).lemmatize()
        for token in TextBlob(text).words
        # The stopword list is all lowercase, so compare case-insensitively;
        # otherwise capitalized stopwords (e.g. a sentence-initial "The")
        # slip through. Tokens containing digits or symbols are dropped.
        if token.lower() not in stop_words and token.isalpha()
    ]

    # Join into one string, trim surrounding whitespace, and lowercase
    return TextBlob(' '.join(lemmas).strip().lower())

Now we will apply `get_sentiment` to each news dataframe and add two columns: the polarity score (ranging from -1 to 1) and the subjectivity score (ranging from 0 to 1).

In [None]:
# Score every article of every source and store the results in two new
# columns, 'polarity' and 'subjectivity'.
news_data_sent = [abc_sent, ap_sent, guard_sent, nyt_sent, usa_sent]
for df in news_data_sent:
    # Iterate over the 'text' column directly rather than applying a function
    # to whole rows: it avoids building a Series per row, and it still yields
    # a correctly shaped two-column result when a frame has no rows.
    scores = [get_sentiment(text) for text in df['text']]
    df[['polarity', 'subjectivity']] = pd.DataFrame(
        scores, index=df.index, columns=['polarity', 'subjectivity']
    )

In [None]:
# Sanity check: display abc_sent to confirm that the 'polarity' and
# 'subjectivity' columns were added
abc_sent

### Data Analysis 