In [1]:
# imports
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import pipeline

# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [4]:
# load the first article export and show its (rows, columns) shape
articles1_path = './data/articles1.csv'
news1 = pd.read_csv(articles1_path)
news1.shape

(50000, 10)

In [5]:
# load the second article export and show its (rows, columns) shape
articles2_path = './data/articles2.csv'
news2 = pd.read_csv(articles2_path)
news2.shape

(49999, 10)

In [6]:
# load the third article export and show its (rows, columns) shape
articles3_path = './data/articles3.csv'
news3 = pd.read_csv(articles3_path)
news3.shape

(42571, 10)

In [7]:
# stack the three article frames on top of each other into one dataframe
frames = [news1, news2, news3]
df = pd.concat(objs=frames)
df.shape

(142570, 10)

In [8]:
# Rebuild the row index so every row has a unique label. The concatenated
# frames had overlapping index labels, so dropping publications by index
# label would otherwise also remove unrelated rows that share a label.
# The old labels are kept as an 'index' column.
df.reset_index(drop=False, inplace=True)

In [9]:
# remove the columns that are not needed for the analysis
unneeded_columns = ['index', 'Unnamed: 0', 'id', 'url']
df.drop(unneeded_columns, axis='columns', inplace=True)

In [10]:
# remove every article that comes from a publication we are not studying
excluded_publications = ['Business Insider', 'Talking Points Memo']
is_excluded = df['publication'].isin(excluded_publications)
df.drop(index=df.loc[is_excluded].index, inplace=True)

In [11]:
# normalise column types:
#   title -> str (at least one title had been read in as a float)
#   date  -> datetime
title_as_text = df['title'].astype(str)
date_as_datetime = pd.to_datetime(df['date'])

df['title'] = title_as_text
df['date'] = date_as_datetime

In [12]:
# number of articles per news source, largest first
df.loc[:, 'publication'].value_counts()

Breitbart          23781
New York Post      17493
NPR                11992
CNN                11488
Washington Post    11114
Reuters            10710
Guardian            8681
New York Times      7803
Atlantic            7179
National Review     6203
Vox                 4947
Buzzfeed News       4854
Fox News            4354
Name: publication, dtype: int64

In [13]:
# initializing our text processors
sent = SentimentIntensityAnalyzer()
# Raw string: the pattern contains regex escapes (\w, \$, \d, \., \S) that are
# invalid escape sequences in a normal string literal and trigger a
# deprecation/syntax warning there. The pattern itself is unchanged: a token
# is a run of word characters, a dollar amount, or any run of non-whitespace.
tokenizer_1 = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
lemmatizer = WordNetLemmatizer()

In [14]:
# dividing our dataset by publication and saving it to a dictionary where the
# key is the standardized publication name and the value is its dataframe
def _add_text_features(frame, column, suffix):
    """Add sentiment, token and lemma columns for one text column, in place.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame to extend; must contain ``column``.
    column : str
        Name of the text column to analyse ('content' or 'title').
    suffix : str
        Short tag used in the new column names ('c' or 't').

    Columns added (in this order): ``sent_<column>``, ``neg_<suffix>``,
    ``neu_<suffix>``, ``pos_<suffix>``, ``comp_<suffix>``,
    ``tokens_<suffix>``, ``lem_<suffix>``.
    """
    # polarity_scores returns a dict per text; keep the raw dict and also
    # spread its four entries into their own columns
    scores = frame[column].apply(sent.polarity_scores)
    frame[f'sent_{column}'] = scores
    score_columns = (('neg', 'neg'), ('neu', 'neu'), ('pos', 'pos'), ('comp', 'compound'))
    for column_prefix, score_key in score_columns:
        frame[f'{column_prefix}_{suffix}'] = [score.get(score_key) for score in scores]

    # tokenizing and lemmatizing the text
    tokens = frame[column].apply(tokenizer_1.tokenize)
    frame[f'tokens_{suffix}'] = tokens
    frame[f'lem_{suffix}'] = [[lemmatizer.lemmatize(token) for token in row] for row in tokens]


t0 = time.time()
data_dict = {}
for publication in df['publication'].value_counts().index:
    # standardized name: lower-case with the spaces removed
    key = publication.replace(' ', '').lower()

    # rows of this publication only, renumbered from 0
    pub_frame = df.loc[df['publication'] == publication].reset_index(drop=True)

    _add_text_features(pub_frame, 'content', 'c')
    _add_text_features(pub_frame, 'title', 't')

    # Register the frame only once it is complete, so an interrupted run
    # never leaves a partially processed frame in data_dict.
    data_dict[key] = pub_frame

print(f'Creating the Data Dictionary took {time.time() - t0} seconds to run')

KeyboardInterrupt: 

In [None]:
# write each publication's dataframe to its own CSV file under ./data
for pub_key, pub_frame in data_dict.items():
    pub_frame.to_csv(f'./data/{pub_key}.csv')

In [3]:
# standardized publication names; each one is also the stem of a ./data/<name>.csv file
names = [
    'buzzfeednews',
    'cnn',
    'vox',
    'guardian',
    'atlantic',
    'washingtonpost',
    'newyorktimes',
    'npr',
    'reuters',
    'newyorkpost',
    'foxnews',
    'nationalreview',
    'breitbart',
]

In [4]:
# creating our publication training dataframe: 3000 random articles per publication
# A fixed seed makes the sample reproducible. Keep it different from the seeds
# of the other sampling cells so the draws from the same files are not correlated.
bert_pub_train = pd.concat(
    [pd.read_csv(f'./data/{name}.csv').sample(3000, random_state=42) for name in names]
)
# .copy() gives an independent frame, so the column assignments below do not
# act on a slice of the concatenated frame (avoids SettingWithCopyWarning)
bert_pub_train = bert_pub_train[['publication', 'content']].copy()

# standardize the publication names: no spaces, lower-case
bert_pub_train['publication'] = (
    bert_pub_train['publication'].str.replace(' ', '', regex=False).str.lower()
)

# removing the literal "(CNN)" tag from our articles; the .str accessor leaves
# missing content as NaN instead of raising on a non-string cell
bert_pub_train['content'] = bert_pub_train['content'].str.replace('(CNN)', '', regex=False)

In [69]:


# creating our publication test dataframe: 200 random articles per publication
# A fixed seed makes the sample reproducible. Keep it different from the seeds
# of the other sampling cells so the draws from the same files are not correlated.
# NOTE(review): this sample is drawn from the same files as bert_pub_train and
# independently of it, so some test articles may also be in the training data.
bert_pub_test = pd.concat(
    [pd.read_csv(f'./data/{name}.csv').sample(200, random_state=7) for name in names]
)
# .copy() gives an independent frame, so the column assignments below do not
# act on a slice of the concatenated frame (avoids SettingWithCopyWarning)
bert_pub_test = bert_pub_test[['publication', 'content']].copy()

# standardize the publication names: no spaces, lower-case
bert_pub_test['publication'] = (
    bert_pub_test['publication'].str.replace(' ', '', regex=False).str.lower()
)

# removing the literal "(CNN)" tag from our articles; the .str accessor leaves
# missing content as NaN instead of raising on a non-string cell
bert_pub_test['content'] = bert_pub_test['content'].str.replace('(CNN)', '', regex=False)


In [5]:
# creating a smaller dataframe for parameter testing: 500 random articles per publication
# A fixed seed makes the sample reproducible. Keep it different from the seeds
# of the other sampling cells so the draws from the same files are not correlated.
bert_pub_train_test = pd.concat(
    [pd.read_csv(f'./data/{name}.csv').sample(500, random_state=13) for name in names]
)
# .copy() gives an independent frame, so the column assignments below do not
# act on a slice of the concatenated frame (avoids SettingWithCopyWarning)
bert_pub_train_test = bert_pub_train_test[['publication', 'content']].copy()

# standardize the publication names: no spaces, lower-case
bert_pub_train_test['publication'] = (
    bert_pub_train_test['publication'].str.replace(' ', '', regex=False).str.lower()
)

# removing the literal "(CNN)" tag from our articles; the .str accessor leaves
# missing content as NaN instead of raising on a non-string cell
bert_pub_train_test['content'] = (
    bert_pub_train_test['content'].str.replace('(CNN)', '', regex=False)
)

In [37]:
# creating the pool of articles for the bias data: every article of every
# publication (no sampling here, as we first need to filter for political articles)
bert_bias_test = pd.concat([pd.read_csv(f'./data/{name}.csv') for name in names])

# .copy() gives an independent frame, so the column assignments below do not
# act on a slice of the concatenated frame (avoids SettingWithCopyWarning)
bert_bias_test = bert_bias_test[['publication', 'content']].copy()

# standardize the publication names: no spaces, lower-case
bert_bias_test['publication'] = (
    bert_bias_test['publication'].str.replace(' ', '', regex=False).str.lower()
)

# removing the literal "(CNN)" tag from our articles; the .str accessor leaves
# missing content as NaN instead of raising on a non-string cell
bert_bias_test['content'] = bert_bias_test['content'].str.replace('(CNN)', '', regex=False)

In [39]:
# keywords whose presence marks an article as political
poli_names = [
    'Trump',
    'Hillary',
    'Clinton',
    'Obama',
    'Biden',
    'Sanders',
    'GOP',
    'Republicans',
    'Democrats',
    'Liberals',
]

In [40]:
# filtering for political articles: keep rows whose content mentions at least
# one of the keywords in poli_names (case-sensitive regex alternation)
# The pattern is built from poli_names so the list and the filter cannot drift
# apart; it is identical to the previously hard-coded expression.
political_pattern = '|'.join(poli_names)
# na=False treats missing content as "not political" instead of producing a
# NaN entry in the mask, which cannot be used for boolean indexing
is_political = bert_bias_test['content'].str.contains(political_pattern, na=False)
bert_bias_filtered = bert_bias_test[is_political]

In [42]:
# from here on bert_bias_test is an independent (deep) copy of the political-only articles
bert_bias_test = bert_bias_filtered.copy()

In [5]:
# turning our publications into numbers (one integer id per publication)
publication_ids = {'buzzfeednews' : 0, 'cnn' : 1, 'vox' : 2,
                   'guardian' : 3, 'atlantic' : 4, 'washingtonpost' : 5,
                   'newyorktimes' : 6, 'npr' : 7, 'reuters' : 8,
                   'newyorkpost' : 9, 'foxnews' : 10, 'nationalreview' : 11,
                   'breitbart' : 12}

# Assign the result back instead of calling .replace(..., inplace=True) on the
# selected column: an in-place call on a column selection is chained
# assignment, which pandas warns about and which does not update the frame
# when Copy-on-Write is enabled.
bert_pub_train['publication'] = bert_pub_train['publication'].replace(publication_ids)

# Encoding of bert_pub_test was left disabled in the original notebook:
# bert_pub_test['publication'] = bert_pub_test['publication'].replace(publication_ids)

In [6]:
# turning the publications of the parameter-testing frame into numbers
# (one integer id per publication)
publication_ids = {'buzzfeednews' : 0, 'cnn' : 1, 'vox' : 2,
                   'guardian' : 3, 'atlantic' : 4, 'washingtonpost' : 5,
                   'newyorktimes' : 6, 'npr' : 7, 'reuters' : 8,
                   'newyorkpost' : 9, 'foxnews' : 10, 'nationalreview' : 11,
                   'breitbart' : 12}

# Assign the result back instead of calling .replace(..., inplace=True) on the
# selected column: an in-place call on a column selection is chained
# assignment, which pandas warns about and which does not update the frame
# when Copy-on-Write is enabled.
bert_pub_train_test['publication'] = bert_pub_train_test['publication'].replace(publication_ids)

In [47]:
# turning our publications into bias numbers: several publications share one
# bias class, giving five classes numbered 0 to 4
bias_by_publication = {'buzzfeednews' : 0, 'cnn' : 0, 'vox' : 0,
                       'guardian' : 1, 'atlantic' : 1, 'washingtonpost' : 1,
                       'newyorktimes' : 1, 'npr' : 2, 'reuters' : 2,
                       'newyorkpost' : 3, 'foxnews' : 4, 'nationalreview' : 4,
                       'breitbart' : 4}

# Assign the result back instead of calling .replace(..., inplace=True) on the
# selected column: an in-place call on a column selection is chained
# assignment, which pandas warns about and which does not update the frame
# when Copy-on-Write is enabled.
bert_bias_test['publication'] = bert_bias_test['publication'].replace(bias_by_publication)

In [60]:
# sampling from our df to create testing and training data:
# 2500 training and 200 test articles per bias class
bias_sample_seed = 42  # fixed so the split is reproducible

# The pool was concatenated from several files without renumbering, so index
# labels repeat. Give every row a unique label so that the rows chosen for
# training can be excluded exactly.
bias_pool = bert_bias_test.reset_index(drop=True)
bias_labels = bias_pool['publication'].value_counts().index

bert_bias_train = pd.concat([
    bias_pool.loc[bias_pool['publication'] == label].sample(2500, random_state=bias_sample_seed)
    for label in bias_labels
])

# Draw the test set only from rows that were NOT selected for training, so no
# article appears in both sets. Sampling both sets from the full pool allowed
# such overlap.
# NOTE(review): this needs at least 2700 articles per bias class; .sample
# raises ValueError if a class has fewer.
bias_unused = bias_pool.drop(index=bert_bias_train.index)
bert_bias_test = pd.concat([
    bias_unused.loc[bias_unused['publication'] == label].sample(200, random_state=bias_sample_seed)
    for label in bias_labels
])

In [63]:
# renaming the label column: it now holds the bias class, not the publication
bias_column_name = {'publication': 'bias'}
bert_bias_train = bert_bias_train.rename(columns=bias_column_name)
bert_bias_test = bert_bias_test.rename(columns=bias_column_name)

In [6]:
# write the publication training frame to disk
pub_train_path = './data/bert_pub_train.csv'
bert_pub_train.to_csv(pub_train_path)
# saving of the publication test frame is disabled:
#bert_pub_test.to_csv('./data/bert_pub_test.csv')

In [79]:
# write the bias training and test frames to disk
bias_outputs = (
    (bert_bias_train, './data/bert_bias_train.csv'),
    (bert_bias_test, './data/bert_bias_test.csv'),
)
for bias_frame, bias_path in bias_outputs:
    bias_frame.to_csv(bias_path)

In [8]:
# write the smaller parameter-testing frame to disk
param_test_path = './data/bert_pub_train_test.csv'
bert_pub_train_test.to_csv(param_test_path)