In [3]:
import pymongo
from pymongo import MongoClient
import pandas as pd
import numpy as np
import re

In [4]:
# Load the CNN and Fox article CSVs (paths are relative to the working
# directory). Later cells read the `content` column of each frame.
cnn_df = pd.read_csv('data/cnn.csv')
fox_df = pd.read_csv('data/fox.csv')

In [77]:
# Connect with MongoClient's default settings and query every document in
# the `onion1` collection of the `capstone` database.
client = MongoClient()
db = client.capstone
collection = db.onion1
# NOTE(review): `docs` is a cursor, so it can presumably be iterated only
# once -- confirm, and re-query before handing it to a second consumer.
docs = collection.find()


In [56]:
def onion_cleaner(mongo_cursor):
    '''
    Builds a labelled dataframe of cleaned Onion articles.
    input: mongo_cursor - cursor (or any iterable) of onion documents
    that need cleaning; apart from the optional '_id' field, every
    key/value pair of a document is treated as one (title, article) row
    output: dataframe with 'Title' and 'Article' columns, the articles
    cleaned and stripped of unwanted characters
    Also adds Satire (1), CNN (0), and Fox (0) columns for testing purposes
    '''
    # Literal (old, new) replacements, applied to each article in this order
    replacements = (
        ('<br/>', ''),
        ('</p>', ''),
        ('—', ' '),
        ('<em>', ''),
        ('</em>', ''),
        ('\xa0', ''),
        ('<p>', ''),
    )
    rows = []
    # Collect (title, article) pairs directly instead of going through a
    # numpy array: this works for an empty cursor, does not mutate the
    # documents, and does not require exactly one field per document.
    for doc in mongo_cursor:
        for title, article in doc.items():
            if title == '_id':
                continue
            for old, new in replacements:
                article = article.replace(old, new)
            # Word boundaries so only a standalone 'sic' is removed and
            # words that merely contain it ('music', 'basic') stay intact
            article = re.sub(r'\bsic\b', '', article)
            rows.append((title, article))
    # Creates DF with Title and Article columns
    df = pd.DataFrame(rows, columns=['Title', 'Article'])
    # Creates dummies columns for future testing
    df['Satire'] = 1
    df['CNN'] = 0
    df['Fox'] = 0
    return df

In [37]:
def cnn_cleaner(cnn_df):
    '''
    Builds a labelled dataframe of CNN articles with the network tag removed.
    input: cnn_df - dataframe with a 'content' column of article strings
    output: new dataframe (fresh 0..n-1 index) with an 'Article' column in
    which every '(CNN)' tag and every other 'CNN' mention is removed,
    plus the dummy columns Satire (0), CNN (1) and Fox (0)
    '''
    # Match the parenthesised tag first so that no empty '()' is left
    # behind; the bare alternative still strips every other mention.
    cnn_tag = re.compile(r'\(CNN\)|CNN')
    clean_cnn_list = [cnn_tag.sub('', article) for article in cnn_df.content]
    clean_cnn_df = pd.DataFrame(clean_cnn_list, columns=['Article'])
    clean_cnn_df['Satire'] = 0
    clean_cnn_df['CNN'] = 1
    clean_cnn_df['Fox'] = 0
    return clean_cnn_df
        

In [43]:
def fox_cleaner(fox_df):
    '''
    Wraps the Fox article text in a labelled dataframe.
    input: fox_df - dataframe with a 'content' column of article text
    output: new dataframe with an 'Article' column holding that text
    unchanged, plus the dummy columns Satire (0), CNN (0) and Fox (1)
    '''
    articles = list(fox_df.content)
    labelled_fox_df = pd.DataFrame(articles, columns=['Article'])
    # Add the label columns in a fixed order: Satire, CNN, Fox
    for label, flag in (('Satire', 0), ('CNN', 0), ('Fox', 1)):
        labelled_fox_df[label] = flag
    return labelled_fox_df

In [44]:
# Preview only the first rows instead of rendering the whole frame
fox_cleaner(fox_df).head(10)

Unnamed: 0,Article,Satire,CNN,Fox
0,More than a dozen people were hurt after mass...,0,0,1
1,Republican presidential candidate Ben Carson’...,0,0,1
2,Democratic presidential candidate Martin O’Ma...,0,0,1
3,A nasty battle has broken out in the Republic...,0,0,1
4,Donald Trump launched new attacks against Bil...,0,0,1
5,Two Munich train stations were evacuated Thur...,0,0,1
6,Authorities are stepping up security for New ...,0,0,1
7,"In Bangkok, partygoers will ring in the new...",0,0,1
8,The plunge in oil prices has given a needed b...,0,0,1
9,Oil prices capped a second straight year as on...,0,0,1


In [57]:
# Clean the Onion documents queried above; this call iterates over `docs`.
df = onion_cleaner(docs)

In [61]:
# Preview without the Title column. The result is only displayed, not
# assigned, so `df` itself still contains Title.
df.drop('Title', axis=1)

Unnamed: 0,Article,Satire,CNN,Fox
0,"PALO ALTO, CA Frustrated at their lack of prog...",1,0,0
1,WASHINGTON After passage of a bill to block Pr...,1,0,0
2,WASHINGTON Deciding it was time to “let the dr...,1,0,0
3,"LAWTON, OK Brought to the brink of tears by th...",1,0,0
4,"ROCKFORD, MD Instinctively exerting his domina...",1,0,0
5,"HANOI, VIETNAM Tearing up as he described the ...",1,0,0
6,"AUSTIN, TX Expressing his concern that the rel...",1,0,0
7,VATICAN CITY Hoping to gain new insights into ...,1,0,0
8,WASHINGTON Demonstrating their findings with a...,1,0,0
9,"PORTLAND, OR Speculating that he probably shou...",1,0,0


In [63]:
def build_full_df(mongo_cursor, cnn_df, fox_df):
    '''
    Runs the three cleaners and returns their results separately.
    input: mongo_cursor - cursor for the onion documents
           cnn_df, fox_df - dataframes passed to cnn_cleaner / fox_cleaner
    output: tuple (onion, cnn, fox) of cleaned dataframes; the onion
    frame has its Title column dropped. The frames are not concatenated.
    '''
    return (
        onion_cleaner(mongo_cursor).drop('Title', axis=1),
        cnn_cleaner(cnn_df),
        fox_cleaner(fox_df),
    )

In [64]:
# Work with at most the first 500 CNN rows
sample_cnn_df = cnn_df[0:500]

In [66]:
# Work with at most the first 500 Fox rows
sample_fox_df = fox_df[0:500]

In [78]:
# Query a fresh cursor here: `docs` was already iterated by the earlier
# onion_cleaner(docs) call, and an exhausted cursor yields no documents.
df_onion = onion_cleaner(collection.find())

In [80]:
# errors='ignore' keeps this cell re-runnable once Title is already gone
df_onion = df_onion.drop('Title', axis=1, errors='ignore')

In [83]:
# Clean and label the CNN sample (Satire=0, CNN=1, Fox=0)
df_cnn = cnn_cleaner(sample_cnn_df)

In [84]:
# Label the Fox sample (Satire=0, CNN=0, Fox=1)
df_fox = fox_cleaner(sample_fox_df)

In [97]:
# Stack the three labelled frames row-wise. Each cleaner builds its frame
# with its own 0..n-1 index, so renumber the rows to keep labels unique.
df_final = pd.concat([df_onion, df_cnn, df_fox], axis=0, ignore_index=True)

In [98]:
# Preview only the first rows instead of rendering the whole frame
df_final.head(10)

Unnamed: 0,Article,Satire,CNN,Fox
0,"PALO ALTO, CA Frustrated at their lack of prog...",1,0,0
1,WASHINGTON After passage of a bill to block Pr...,1,0,0
2,WASHINGTON Deciding it was time to “let the dr...,1,0,0
3,"LAWTON, OK Brought to the brink of tears by th...",1,0,0
4,"ROCKFORD, MD Instinctively exerting his domina...",1,0,0
5,"HANOI, VIETNAM Tearing up as he described the ...",1,0,0
6,"AUSTIN, TX Expressing his concern that the rel...",1,0,0
7,VATICAN CITY Hoping to gain new insights into ...,1,0,0
8,WASHINGTON Demonstrating their findings with a...,1,0,0
9,"PORTLAND, OR Speculating that he probably shou...",1,0,0


In [None]:
def fox_word_count(content_list):
    '''
    Takes in a list of fox articles that is a list of strings
    and then returns an average word count for each article.
    Also prints the number of articles and the total number of words.
    input: list of strings
    output: average word count per article (0.0 for an empty list)
    '''
    # Come back for more preprocessing
    # Right now this is just a rough estimate: words are whatever
    # str.split() yields, so numbers in the articles and characters
    # such as (), "" are messing up the count
    lengths = [len(article.split()) for article in content_list]
    count = len(lengths)
    words = sum(lengths)
    print(count)
    print(words)
    # No articles: return 0.0 instead of dividing by zero
    if count == 0:
        return 0.0
    return words/count

In [None]:
def word_count(content_list):
    '''
    Takes in a list of articles (from any source) that is a list of
    strings and then returns an average word count for each article.
    Also prints the number of articles and the total number of words.
    input: list of strings
    output: average word count per article (0.0 for an empty list)
    '''
    count = 0
    words = 0
    # Come back for more preprocessing
    # Right now this is just a rough estimate: words are whatever
    # str.split() yields, so numbers in the articles and characters
    # such as (), "" are messing up the count
    for article in content_list:
        words += len(article.split())
        count += 1
    print(count)
    print(words)
    # No articles: return 0.0 instead of dividing by zero
    if not count:
        return 0.0
    return words/count

In [None]:
def cnn_word_count(content_list):
    '''
    Takes in content of all cnn articles as a list.
    Then drops every standalone '(CNN)' token (the tag found at the
    beginning of the articles) and gets a word count.
    Also prints the number of articles and the total number of words.
    input: list of strings
    output: float - average number of words per article
    (0.0 for an empty list)
    '''
    count = 0
    words = 0
    # Come back for more preprocessing
    # Right now this is just a rough estimate: words are whatever
    # str.split() yields, so numbers in the articles and characters
    # such as (), "" are messing up the count
    for article in content_list:
        # Filter into a new list: removing items from a list while
        # iterating over it skips the element after each removal, so
        # consecutive '(CNN)' tokens were not all dropped
        kept_words = [word for word in article.split() if word != '(CNN)']
        words += len(kept_words)
        count += 1
    print(count)
    print(words)
    # No articles: return 0.0 instead of dividing by zero
    if count == 0:
        return 0.0
    return words/count