# Poll Data - Web Scraping of Real Clear Politics Site

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Let pandas locate and parse every HTML table on the RealClearPolitics
# Trump-vs-Biden polling page; read_html returns a list of DataFrames.
rcp_poll_url = 'https://www.realclearpolitics.com/epolls/2020/president/us/general_election_trump_vs_biden-6247.html'
dfs = pd.read_html(rcp_poll_url)

# Show the whole list so the table of interest can be picked out by position.
dfs

[                             Poll         Date   Sample  MoE  Biden (D)  \
 0                     RCP Average   7/9 - 7/24       --   --       50.0   
 1         CBS News/YouGovCBS News  7/21 - 7/24  1401 LV   --       51.0   
 2      Rasmussen ReportsRasmussen  7/15 - 7/21  2500 LV  2.0       47.0   
 3        The Hill/HarrisXThe Hill  7/17 - 7/20  2829 RV  1.8       45.0   
 4                FOX NewsFOX News  7/12 - 7/15  1104 RV  3.0       49.0   
 5        ABC News/Wash PostABC/WP  7/12 - 7/15   673 LV   --       54.0   
 6  CNBC/Change Research (D)*CNBC*  7/10 - 7/12  1258 LV  2.8       51.0   
 7            QuinnipiacQuinnipiac   7/9 - 7/13  1273 RV  2.8       52.0   
 8   NBC News/Wall St. JrnlNBC/WSJ   7/9 - 7/12   900 RV  3.3       51.0   
 
    Trump (R)      Spread  
 0       40.9  Biden +9.1  
 1       41.0   Biden +10  
 2       45.0    Biden +2  
 3       38.0    Biden +7  
 4       41.0    Biden +8  
 5       44.0   Biden +10  
 6       41.0   Biden +10  
 7       37.0 

In [3]:
# Download the polling page by hand so the markup can be normalised with
# BeautifulSoup before pandas parses its tables into DataFrames.
import urllib.request  # a bare `import urllib` does not guarantee the `request` submodule is loaded
from io import StringIO
from bs4 import BeautifulSoup

# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen('https://www.realclearpolitics.com/epolls/2020/president/us/general_election_trump_vs_biden-6247.html') as response:
    html_table = response.read()

# fix HTML
soup = BeautifulSoup(html_table, "html.parser")
# warn! the id 'ratings-table' is page specific; change it when targeting another page
for table in soup.find_all(attrs={'id': 'ratings-table'}):
    # Iterate over a snapshot: unwrap() edits the very child list being walked.
    for c in list(table.children):
        if c.name in ['tbody', 'thead']:
            c.unwrap()

# Hand pandas a file-like object: passing literal HTML as a plain string is
# deprecated in newer pandas releases, while file-like input is always accepted.
df = pd.read_html(StringIO(str(soup)), flavor="bs4")

# Row count of the table at position 2, which the next cell saves as poll_data.
len(df[2])

166

In [4]:
# Keep only the table at position 2 of the scraped list; this is the frame
# treated as the poll data from here on.
poll_data = df[2]

# Persist the scraped table to a CSV file in the working directory.
poll_data.to_csv(path_or_buf='poll_data.csv')

In [123]:
# Load the poll table that carries explicit date columns and drop any row
# with a missing value.
poll_data_dates = pd.read_csv('data/poll_data_dates.csv').dropna()

# Parse the two date columns used downstream into real datetimes.
for date_col in ('Date', 'Start Date'):
    poll_data_dates[date_col] = pd.to_datetime(poll_data_dates[date_col])

# sort_values returns a NEW frame, so the result has to be assigned back;
# calling it without assignment leaves the frame in its original order.
# pd.merge_asof later requires this key to be sorted.
poll_data_dates = (
    poll_data_dates
    .sort_values(by=['Date'], ascending=True)
    .reset_index(drop=True)
)

# Validate the frame that was just built (not the scraped poll_data table):
# the merge key must be free of nulls and in ascending order.
assert poll_data_dates['Date'].notna().all(), "poll dates contain nulls"
assert poll_data_dates['Date'].is_monotonic_increasing, "poll dates are not sorted"

poll_data_dates.dtypes

Poll                  object
Start Date    datetime64[ns]
End Date              object
Date          datetime64[ns]
Sample                object
MoE                   object
Biden (D)            float64
Trump (R)            float64
Spread                object
dtype: object

In [124]:
# Load the tweet table that the poll numbers will be attached to, keeping
# only the three columns needed and parsing Date into a real datetime.
Tweet_notebook = (
    pd.read_csv('data/Tweet_notebook.csv')
    [['Date', 'username', 'Sentiment']]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

# Quick health checks. Only info() prints anything: the two expressions before
# it are evaluated, but a cell displays just its final expression.
Tweet_notebook.isnull().sum()
Tweet_notebook.dtypes
Tweet_notebook.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8514 entries, 0 to 8513
Data columns (total 3 columns):
Date         8514 non-null datetime64[ns]
username     8514 non-null object
Sentiment    8514 non-null float64
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 199.6+ KB


In [138]:
# Prepare the inputs for attaching poll data to the tweets.
# The plan is to use pd.merge_asof so that each tweet is paired with the poll
# whose date is closest to it. merge_asof kept failing with a "keys must be
# sorted" error even though nulls were checked, so Date-ordered copies of both
# frames are built here. Note that sort_values returns new frames: `left` and
# `right` themselves keep their original row order.

left, right = Tweet_notebook, poll_data_dates
left_sorted, right_sorted = (
    frame.sort_values(by='Date') for frame in (left, right)
)

# A plain merge of the two sorted frames was tried and left disabled:
#poll_df = pd.merge(left_sorted,right_sorted)



In [139]:
#poll_df

In [143]:
# Attach poll data to every tweet with an as-of join on Date.
# merge_asof requires BOTH frames to be sorted on the key, so the Date-ordered
# copies (left_sorted / right_sorted) are used rather than the raw left / right
# frames; unsorted input raises "keys must be sorted".
# With the default backward direction and allow_exact_matches=False, each tweet
# gets the latest poll row dated strictly before the tweet, and only if that
# poll is at most 10 days older; otherwise the poll columns are left as NaN.
poll_df = pd.merge_asof(
    left_sorted,
    right_sorted,
    on='Date',
    tolerance=pd.Timedelta('10d'),
    allow_exact_matches=False,
)

#Saving CSV
poll_df.to_csv('data/poll_df.csv')
poll_df.head()

Unnamed: 0,Date,username,Sentiment,Poll,Start Date,End Date,Sample,MoE,Biden (D),Trump (R),Spread
0,2020-02-11,nytimes,0.136,QuinnipiacQuinnipiac,2020-02-05,2/9/20,1519 RV,2.5,50.0,43.0,Biden +7
1,2020-02-13,tamiann02,0.256,QuinnipiacQuinnipiac,2020-02-05,2/9/20,1519 RV,2.5,50.0,43.0,Biden +7
2,2020-02-13,1val1richy,0.256,QuinnipiacQuinnipiac,2020-02-05,2/9/20,1519 RV,2.5,50.0,43.0,Biden +7
3,2020-02-13,kolsaw,0.061,QuinnipiacQuinnipiac,2020-02-05,2/9/20,1519 RV,2.5,50.0,43.0,Biden +7
4,2020-02-15,_watch_observe_,0.217,NPR/PBS/MaristNPR/PBS,2020-02-13,2/16/20,1164 RV,3.7,50.0,44.0,Biden +6


In [98]:
import plotly.express as px

# Scatter plot with one point per row of poll_df: the "Biden (D)" poll column
# on the x-axis against the tweet "Sentiment" column on the y-axis.
# NOTE(review): the title says "Over Time" but no date column is plotted;
# confirm whether the axes or the title reflect the intent.
fig = px.scatter(
    data_frame=poll_df,
    x="Biden (D)",
    y="Sentiment",
    title='Biden Poll Performance Sentiment Over Time',
)
fig.show()

In [100]:
import plotly.express as px

# Scatter plot with one point per row of poll_df: the "Trump (R)" poll column
# on the x-axis against the tweet "Sentiment" column on the y-axis.
# NOTE(review): the title says "Over Time" but no date column is plotted;
# confirm whether the axes or the title reflect the intent.
fig = px.scatter(
    data_frame=poll_df,
    x="Trump (R)",
    y="Sentiment",
    title='Trump Poll Performance Sentiment Over Time',
)
fig.show()

In [None]:
import seaborn as sns

# Kernel density estimate of the tweet sentiment scores in the merged frame.
# kdeplot needs data to draw anything; called with no arguments it either
# fails or yields an empty axes, depending on the seaborn version.
# NOTE(review): plotting the Sentiment column is an assumption about what this
# unfinished cell was meant to show; swap in another column if needed.
sns.kdeplot(poll_df["Sentiment"])