In [1]:
# Import dependency
import pandas as pd

In [2]:
# RealClearPolitics page that hosts the Trump favorable/unfavorable polling tables
url = (
    'https://www.realclearpolitics.com/epolls/other/trump_favorableunfavorable-5493.html#polls'
)

In [3]:
# Let pandas' read_html fetch the page and parse every HTML table it finds.
# The result is a list of DataFrames (one per table); echo it to see what was found.
table = pd.read_html(io=url)
table

[                               Poll         Date   Sample  Favorable  \
 0                       RCP Average   8/9 - 8/25       --       42.1   
 1            Economist/YouGovYouGov  8/23 - 8/25  1254 RV       44.0   
 2      CNBC/Change Research (D)CNBC  8/21 - 8/23  2362 LV       42.0   
 3  Politico/Morning ConsultPolitico  8/21 - 8/23  1992 RV       42.0   
 4                            CNNCNN  8/12 - 8/15   987 RV       43.0   
 5          ABC News/Wash PostABC/WP  8/12 - 8/15   868 RV       41.0   
 6     NBC News/Wall St. JrnlNBC/WSJ   8/9 - 8/12   900 RV       40.0   
 7                  FOX NewsFOX News   8/9 - 8/12  1000 RV       43.0   
 
    Unfavorable  Spread  
 0         55.0   -12.9  
 1         55.0   -11.0  
 2         55.0   -13.0  
 3         56.0   -14.0  
 4         55.0   -12.0  
 5         57.0   -16.0  
 6         52.0   -12.0  
 7         55.0   -12.0  ,
                        0
 0          42.1Favorable
 1  55.0Unfavorable +12.9,
                           

In [4]:
# The complete polling history is the third table parsed from the page
# (position 2 in the list above); bind it to 'df' and display it.
full_history_position = 2
df = table[full_history_position]
df

Unnamed: 0,Poll,Date,Sample,Favorable,Unfavorable,Spread
0,RCP Average,8/9 - 8/25,--,42.1,55.0,-12.9
1,Economist/YouGovYouGov,8/23 - 8/25,1254 RV,44.0,55.0,-11
2,CNBC/Change Research (D)CNBC,8/21 - 8/23,2362 LV,42.0,55.0,-13
3,Politico/Morning ConsultPolitico,8/21 - 8/23,1992 RV,42.0,56.0,-14
4,Economist/YouGovYouGov,8/16 - 8/18,1246 RV,43.0,57.0,-14
...,...,...,...,...,...,...
742,The Economist/YouGovEconomist,6/27 - 6/29,1000 A,30.0,60.0,-30
743,The Economist/YouGovEconomist,6/20 - 6/22,1000 A,32.0,60.0,-28
744,MonmouthMonmouth,6/11 - 6/14,829 RV,18.0,57.0,-39
745,The Economist/YouGovEconomist,6/13 - 6/15,1000 A,26.0,60.0,-34


In [5]:
# Align column names with the other polling dataset: Favorable -> Approve, Unfavorable -> Disapprove
renamed_columns = {"Favorable": "Approve", "Unfavorable": "Disapprove"}
df2 = df.rename(columns=renamed_columns)

# 'Sample' holds values such as "1254 RV": split on whitespace so the count stays in
# 'Sample' and the population code lands in a new 'Population' column.
sample_parts = df2["Sample"].str.split(expand=True)
df2[["Sample", "Population"]] = sample_parts

# Display the result
df2

Unnamed: 0,Poll,Date,Sample,Approve,Disapprove,Spread,Population
0,RCP Average,8/9 - 8/25,--,42.1,55.0,-12.9,
1,Economist/YouGovYouGov,8/23 - 8/25,1254,44.0,55.0,-11,RV
2,CNBC/Change Research (D)CNBC,8/21 - 8/23,2362,42.0,55.0,-13,LV
3,Politico/Morning ConsultPolitico,8/21 - 8/23,1992,42.0,56.0,-14,RV
4,Economist/YouGovYouGov,8/16 - 8/18,1246,43.0,57.0,-14,RV
...,...,...,...,...,...,...,...
742,The Economist/YouGovEconomist,6/27 - 6/29,1000,30.0,60.0,-30,A
743,The Economist/YouGovEconomist,6/20 - 6/22,1000,32.0,60.0,-28,A
744,MonmouthMonmouth,6/11 - 6/14,829,18.0,57.0,-39,RV
745,The Economist/YouGovEconomist,6/13 - 6/15,1000,26.0,60.0,-34,A


In [6]:
# Carve out the block of rows holding the 2016 polls.
# NOTE(review): the bounds are positional and tied to the table as scraped when this
# notebook was run; confirm them if the source page has gained rows since.
first_row, stop_row = 508, 716
pre_election = df2.iloc[first_row:stop_row]

pre_election

Unnamed: 0,Poll,Date,Sample,Approve,Disapprove,Spread,Population
508,Economist/YouGovYouGov,12/24 - 12/27,1412,45.0,51.0,-6,RV
509,Rasmussen ReportsRasmussen,12/22 - 12/22,1000,51.0,47.0,+4,LV
510,Economist/YouGovYouGov,12/17 - 12/20,1185,46.0,50.0,-4,RV
511,Reuters/IpsosReuters,12/16 - 12/20,2065,52.0,48.0,+4,A
512,USA Today/SuffolkUSA Today,12/14 - 12/18,1000,41.0,46.0,-5,RV
...,...,...,...,...,...,...,...
711,QuinnipiacQuinnipiac,2/10 - 2/15,1342,37.0,57.0,-20,RV
712,PPP (D)PPP (D),2/2 - 2/3,1236,30.0,63.0,-33,RV
713,The Economist/YouGovEconomist,2/11 - 2/15,2000,38.0,57.0,-19,A
714,The Economist/YouGovEconomist,1/27 - 1/30,2000,36.0,58.0,-22,A


In [7]:
# Renumber the rows from 0, discarding the row labels inherited from the full table
pre_election_approval = pre_election.reset_index(drop=True)

# 'Date' looks like "12/24 - 12/27": break it on " - " into 'Start_Date' and 'End_Date'
# so the layout matches the other polling datasets
date_parts = pre_election_approval["Date"].str.split(" - ", expand=True)
pre_election_approval[["Start_Date", "End_Date"]] = date_parts

# Keep the columns in the order used by the dataframes this one will be merged with
merge_column_order = ["Poll", "Start_Date", "End_Date", "Approve", "Disapprove",
                      "Spread", "Sample", "Population"]
pre_election_approval = pre_election_approval[merge_column_order]

pre_election_approval

Unnamed: 0,Poll,Start_Date,End_Date,Approve,Disapprove,Spread,Sample,Population
0,Economist/YouGovYouGov,12/24,12/27,45.0,51.0,-6,1412,RV
1,Rasmussen ReportsRasmussen,12/22,12/22,51.0,47.0,+4,1000,LV
2,Economist/YouGovYouGov,12/17,12/20,46.0,50.0,-4,1185,RV
3,Reuters/IpsosReuters,12/16,12/20,52.0,48.0,+4,2065,A
4,USA Today/SuffolkUSA Today,12/14,12/18,41.0,46.0,-5,1000,RV
...,...,...,...,...,...,...,...,...
203,QuinnipiacQuinnipiac,2/10,2/15,37.0,57.0,-20,1342,RV
204,PPP (D)PPP (D),2/2,2/3,30.0,63.0,-33,1236,RV
205,The Economist/YouGovEconomist,2/11,2/15,38.0,57.0,-19,2000,A
206,The Economist/YouGovEconomist,1/27,1/30,36.0,58.0,-22,2000,A


In [8]:
# List each column's dtype; at this point the date columns are still plain strings (object)
pre_election_approval.dtypes

Poll           object
Start_Date     object
End_Date       object
Approve       float64
Disapprove    float64
Spread         object
Sample         object
Population     object
dtype: object

In [9]:
# Convert 'Start_Date' & 'End_Date' from "M/D" strings into datetimes in the polling year.
#
# The year is appended to each string *before* parsing. Parsing a bare '%m/%d' makes the
# parser assume the year 1900, which is not a leap year, so a 2016 poll dated "2/29"
# would raise a ValueError before any year offset could be applied.
#
# NOTE(review): every row is assumed to fall in POLL_YEAR; a poll that straddles
# New Year (e.g. "12/30 - 1/2") would need separate handling -- confirm none exist.
POLL_YEAR = 2016
for date_col in ['Start_Date', 'End_Date']:
    month_day = pre_election_approval[date_col].str.strip()
    pre_election_approval[date_col] = pd.to_datetime(
        month_day + '/' + str(POLL_YEAR), format='%m/%d/%Y'
    )

In [10]:
# Show the cleaned-up dataframe to check the result of the date conversion
pre_election_approval

Unnamed: 0,Poll,Start_Date,End_Date,Approve,Disapprove,Spread,Sample,Population
0,Economist/YouGovYouGov,2016-12-24,2016-12-27,45.0,51.0,-6,1412,RV
1,Rasmussen ReportsRasmussen,2016-12-22,2016-12-22,51.0,47.0,+4,1000,LV
2,Economist/YouGovYouGov,2016-12-17,2016-12-20,46.0,50.0,-4,1185,RV
3,Reuters/IpsosReuters,2016-12-16,2016-12-20,52.0,48.0,+4,2065,A
4,USA Today/SuffolkUSA Today,2016-12-14,2016-12-18,41.0,46.0,-5,1000,RV
...,...,...,...,...,...,...,...,...
203,QuinnipiacQuinnipiac,2016-02-10,2016-02-15,37.0,57.0,-20,1342,RV
204,PPP (D)PPP (D),2016-02-02,2016-02-03,30.0,63.0,-33,1236,RV
205,The Economist/YouGovEconomist,2016-02-11,2016-02-15,38.0,57.0,-19,2000,A
206,The Economist/YouGovEconomist,2016-01-27,2016-01-30,36.0,58.0,-22,2000,A


In [12]:
# Write the cleaned polling data out as a new CSV file
output_csv = 'Output_data/pre_election_approval.csv'
pre_election_approval.to_csv(output_csv)